## Detecting Malicious URLs with Machine Learning in Python



In [None]:
# EDA Packages
import pandas as pd
import numpy as np
import random
import pickle

# Machine Learning Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Read the labelled URL dataset from CSV into a DataFrame
DATA_PATH = "/content/urldata.csv"
urls_data = pd.read_csv(DATA_PATH)

In [None]:
# Confirm that the loaded object is a pandas DataFrame
type(urls_data)

pandas.core.frame.DataFrame

In [None]:
# Preview the first rows of the dataset (the later cells use the "url" and "label" columns)
urls_data.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


### Data Vectorization Using TfidfVectorizer
#### Create a Tokenizer
 + Split the URL, remove repeated tokens, and drop "com"

In [None]:
def makeTokens(f):
    """Split a URL into a de-duplicated list of tokens for the vectorizer.

    The URL is split on '/', every piece is split on '-', and every one of
    those pieces is split on '.'.  Both the dash-level pieces and the
    dot-level pieces are kept, so "www.google.com" contributes
    "www.google.com", "www", "google" and "com".

    Parameters
    ----------
    f : str or bytes
        The URL. Bytes are decoded as UTF-8; any other object is converted
        with ``str()``.

    Returns
    -------
    list of str
        The unique tokens in order of first appearance, with the token
        'com' removed.
    """
    # Work on the text itself. Wrapping the UTF-8 *bytes* in str() yields the
    # bytes repr on Python 3 ("b'...'"), which glues "b'" onto the first token
    # and "'" onto the last one (so a trailing "com'" would never be removed).
    if isinstance(f, bytes):
        f = f.decode('utf-8', 'replace')
    text = str(f)

    total_Tokens = []
    for slash_part in text.split('/'):	# make tokens after splitting by slash
        dash_parts = slash_part.split('-')	# make tokens after splitting by dash
        dot_parts = []
        for piece in dash_parts:
            dot_parts.extend(piece.split('.'))	# make tokens after splitting by dot
        total_Tokens.extend(dash_parts)
        total_Tokens.extend(dot_parts)

    # Remove redundant tokens; dict.fromkeys keeps first-seen order, so the
    # result is the same on every run (a plain set has no stable order).
    total_Tokens = list(dict.fromkeys(total_Tokens))
    if 'com' in total_Tokens:
        # 'com' occurs in a very large share of URLs, so it is excluded from the features
        total_Tokens.remove('com')
    return total_Tokens

In [None]:
# Target values: the "label" column of the dataset
y = urls_data.loc[:, "label"]

In [None]:
# Raw inputs: the "url" column, vectorized in the cells that follow
url_list = urls_data.loc[:, "url"]

In [None]:
# Alternative kept for reference: the vectorizer's default tokenizer
# (a model using it is trained further down in this notebook).
#vectorizer = TfidfVectorizer()

# Use the custom URL tokenizer, makeTokens, to split each URL into TF-IDF terms
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [None]:
# Fit the vectorizer on every URL and store the resulting TF-IDF feature matrix in X.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split below,
# so the test URLs contribute to the vocabulary and IDF weights — confirm this is intended.
X = vectorizer.fit_transform(url_list)



#### Split into training and testing datasets (80/20 ratio)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Check the shape of X_train and y_train
print(X_train.shape)  # (n_training_rows, n_features)
print(y_train.shape)  # (n_training_rows,)

# Row dropping is disabled: the two lines below are commented out, so nothing
# is removed between the two pairs of prints.
# NOTE(review): X_train comes from the vectorizer rather than a DataFrame, so
# .dropna() presumably would not work on it anyway — confirm before re-enabling.
#X_train = X_train.dropna(axis=0)
#y_train = y_train.dropna(axis=0)

# Check the shape again
print(X_train.shape)  # same as the first print, since no rows were dropped
print(y_train.shape)  # same as the first print, since no rows were dropped


(242077, 521642)
(242077,)
(242077, 521642)
(242077,)


In [None]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first code cell.
from sklearn.impute import SimpleImputer

# Mean-impute missing values in the training features.
# NOTE(review): X_train_imputed is not used by any later cell visible here (the model
# below is fitted on X_train), so this step looks like leftover work — confirm and remove if unused.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)


In [None]:
# Model Building
# Logistic regression on the TF-IDF features. With the default iteration limit the
# solver stops before converging on this data (scikit-learn emits a ConvergenceWarning),
# so a larger max_iter is set, as that warning recommends.
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Mean accuracy of the fitted classifier on the held-out test split
test_accuracy = logit.score(X_test, y_test)
print("Accuracy ", test_accuracy)

Accuracy  0.9697455386649042


### Predicting With Our Model

In [None]:
# Sample URLs to classify with the trained model (raw strings, vectorized in the next cell)
X_predict = ["www.google.com/search=dhanish",
"google.com/search=faizanahmad",
"pakistanifacebookforever.com/getpassword.php/",
"www.radsport-voggel.de/wp-admin/includes/log.exe",
"https://www.canva.com/design/DAFevoRJbDU/njVoufKkttRsgrF20M-hCQ/view?utm_content=DAFevoRJbDU&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton ",
"colab.research.google.com/drive/" ]

In [None]:
# Vectorize the sample URLs with the already-fitted vectorizer and classify them.
# The features get their own name so the raw URL list in X_predict is left intact;
# overwriting X_predict with the matrix would make this cell fail when run a second time.
X_predict_features = vectorizer.transform(X_predict)
New_predict = logit.predict(X_predict_features)

NameError: ignored

In [None]:
# Show the predicted label for each sample URL, in input order
print(New_predict)

['good' 'good' 'good' 'bad' 'bad' 'bad']


In [None]:
# Second batch of sample URLs to classify.
# Presumably taken from the list at https://db.aa419.org/fakebankslist.php — TODO confirm.
# NOTE(review): the last two entries are identical, yet the recorded output shows
# different labels for them; the stored output may be stale — re-run to verify.
X_predict1 = ["www.buyfakebillsonlinee.blogspot3.com",
"www.unitedairlineslogistics.com",
"colab.research.google.com",
"colab.research.google.com" ]

In [None]:
# Vectorize the second batch with the fitted vectorizer, classify it and show the labels.
# The features get their own name so the raw URL list in X_predict1 is left intact;
# overwriting X_predict1 with the matrix would make this cell fail when run a second time.
X_predict1_features = vectorizer.transform(X_predict1)
New_predict1 = logit.predict(X_predict1_features)
print(New_predict1)

['good' 'good' 'bad' 'good']


In [None]:
# Using Default Tokenizer
# NOTE(review): this rebinds `vectorizer`, replacing the custom-tokenizer vectorizer
# created earlier; any earlier cell that calls vectorizer.transform will use this
# one instead if it is re-run after this cell.
vectorizer = TfidfVectorizer()

In [None]:
# Re-vectorize the URLs with the vectorizer currently bound to `vectorizer`,
# then repeat the 80/20 split (same random_state) on the new feature matrix
X = vectorizer.fit_transform(url_list)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Model Building
# Logistic regression on the default-tokenizer TF-IDF features. With the default
# iteration limit the solver stops before converging on this data (scikit-learn emits
# a ConvergenceWarning), so a larger max_iter is set, as that warning recommends.
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Accuracy of the model trained on features from the default tokenizer
print("Accuracy ",logit.score(X_test, y_test))

Accuracy  0.9739590218109716


In [None]:
# Persist the trained classifier. The with-block closes the file handle as soon as
# the dump finishes, so the pickle is fully written to disk before it is read back.
# NOTE(review): only the classifier is saved; URLs must still be transformed by the
# fitted `vectorizer` before predict, so it has to be saved separately or rebuilt.
filename = 'phishing_detector'
with open(filename, 'wb') as model_file:
    pickle.dump(logit, model_file)

In [None]:
# Reload the pickled classifier; the with-block closes the file handle afterwards.
# Only unpickle files from a trusted source — pickle.load can execute arbitrary code.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)


In [None]:
# Show the stored TF-IDF entries of the test row at position 1.
# NOTE(review): X_test[1] presumably already yields a one-row matrix, making the extra [0] redundant — confirm.
print(X_test[1][0])

  (0, 227600)	0.41782366831815265
  (0, 83160)	0.3872571917799924
  (0, 207865)	0.2899705918310137
  (0, 248205)	0.3363953188895187
  (0, 6294)	0.34258049963238163
  (0, 300916)	0.2841222806742981
  (0, 224918)	0.2718355804212035
  (0, 184780)	0.2611246965934722
  (0, 156343)	0.2258408154277996
  (0, 185380)	0.24687669220043676
  (0, 239388)	0.1205603251226841
  (0, 176910)	0.1074986920558343


In [None]:
# Classify a single hand-written URL with the reloaded model
sample_features = vectorizer.transform(["google.exe"])
ans = model.predict(sample_features)

In [None]:
# Show the label predicted for "google.exe"
print(ans)

['bad']
