In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sklearn
import numpy as np
import pandas as pd
import gc

In [3]:

# read in the csv containing the urls and type
df = pd.read_csv('/content/drive/Shareddrives/Machine Learning Final Project/data/malicious_urls.csv')
df['type'].unique()

array(['phishing', 'benign', 'defacement', 'malware'], dtype=object)

In [4]:
# encode benign to be 0 and all the malicious types to be 1
mal_type = {'benign': 0, 'defacement': 1, 'phishing': 2, 'malware': 3}
df['label'] = [mal_type[item] for item in df['type']]
df

Unnamed: 0,url,type,label
0,br-icloud.com.br,phishing,2
1,mp3raid.com/music/krizz_kaliko.html,benign,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1
...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2


In [5]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
encoder = LabelEncoder()
encoder.fit(df['type'])
test = encoder.transform(df['type'])
hot = np_utils.to_categorical(test)

In [6]:
df['url_length'] = [len(item) for item in df['url']]
df

Unnamed: 0,url,type,label,url_length
0,br-icloud.com.br,phishing,2,16
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235
...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45


In [7]:
df['backslash_count'] = [item.count('/') for item in df['url']]
df

Unnamed: 0,url,type,label,url_length,backslash_count
0,br-icloud.com.br,phishing,2,16,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3
...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2


In [8]:
# encode none to be 0, http to be 1, and https to be 2
df['http_type'] = [2 if ('https://') in item else (1 if ('http://') in item else 0) for item in df['url']]
df

Unnamed: 0,url,type,label,url_length,backslash_count,http_type
0,br-icloud.com.br,phishing,2,16,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1
...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0


In [9]:
!pip install tldextract
import tldextract
url_info = [tldextract.extract(item) for item in df['url']]
df['subdomain'] = [item.subdomain if len(item.subdomain) > 0 else None for item in url_info]
df['suffix'] = [item.suffix if len(item.suffix) > 0 else None for item in url_info]
del url_info
df

Collecting tldextract
  Downloading tldextract-3.2.1-py3-none-any.whl (87 kB)
[?25l[K     |███▊                            | 10 kB 18.3 MB/s eta 0:00:01[K     |███████▌                        | 20 kB 24.4 MB/s eta 0:00:01[K     |███████████▏                    | 30 kB 28.5 MB/s eta 0:00:01[K     |███████████████                 | 40 kB 30.7 MB/s eta 0:00:01[K     |██████████████████▋             | 51 kB 5.8 MB/s eta 0:00:01[K     |██████████████████████▍         | 61 kB 6.8 MB/s eta 0:00:01[K     |██████████████████████████▏     | 71 kB 7.7 MB/s eta 0:00:01[K     |█████████████████████████████▉  | 81 kB 4.4 MB/s eta 0:00:01[K     |████████████████████████████████| 87 kB 2.9 MB/s 
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-3.2.1


Unnamed: 0,url,type,label,url_length,backslash_count,http_type,subdomain,suffix
0,br-icloud.com.br,phishing,2,16,0,0,,com.br
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0,,com
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0,,org
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1,www,be
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1,,net
...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0,xbox360,com
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0,games,com
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0,www,com
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0,en,org


In [10]:
df['periods_in_suffix'] = [item.count('.') if not item == None else (0) for item in df['suffix']]
df

Unnamed: 0,url,type,label,url_length,backslash_count,http_type,subdomain,suffix,periods_in_suffix
0,br-icloud.com.br,phishing,2,16,0,0,,com.br,1
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0,,com,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0,,org,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1,www,be,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1,,net,0
...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0,xbox360,com,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0,games,com,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0,www,com,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0,en,org,0


In [11]:
df['trimmed_suffix'] = [item.split('.')[0] if not item == None else (None) for item in df['suffix']]
df

Unnamed: 0,url,type,label,url_length,backslash_count,http_type,subdomain,suffix,periods_in_suffix,trimmed_suffix
0,br-icloud.com.br,phishing,2,16,0,0,,com.br,1,com
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0,,com,0,com
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0,,org,0,org
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1,www,be,0,be
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1,,net,0,net
...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0,xbox360,com,0,com
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0,games,com,0,com
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0,www,com,0,com
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0,en,org,0,org


In [12]:
df['contains_percent'] = [1 if ('%') in item else (0) for item in df['url']]
df

Unnamed: 0,url,type,label,url_length,backslash_count,http_type,subdomain,suffix,periods_in_suffix,trimmed_suffix,contains_percent
0,br-icloud.com.br,phishing,2,16,0,0,,com.br,1,com,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0,,com,0,com,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0,,org,0,org,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1,www,be,0,be,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1,,net,0,net,0
...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0,xbox360,com,0,com,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0,games,com,0,com,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0,www,com,0,com,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0,en,org,0,org,0


In [13]:

from urllib.parse import urlparse
from os.path import splitext

def get_ext(url):
    """Return the filename extension from url, or ''."""
    parsed = urlparse(url)
    root, ext = splitext(parsed.path)
    return ext[1:]  # or ext[1:] if you don't want the leading '.'


In [14]:
df['file_extension'] = [get_ext(item) if len(get_ext(item)) > 0 else None for item in df['url']]
df

Unnamed: 0,url,type,label,url_length,backslash_count,http_type,subdomain,suffix,periods_in_suffix,trimmed_suffix,contains_percent,file_extension
0,br-icloud.com.br,phishing,2,16,0,0,,com.br,1,com,0,br
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0,,com,0,com,0,html
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0,,org,0,org,0,htm
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1,www,be,0,be,0,php
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1,,net,0,net,0,php
...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0,xbox360,com,0,com,0,html
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0,games,com,0,com,0,
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0,www,com,0,com,0,
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0,en,org,0,org,0,


In [15]:
from collections import defaultdict

# encode all the various domain suffixes with a default value of 0
suffix = defaultdict(lambda: 0)
suffix['com'] = 1
suffix['org'] = 2
suffix['net'] = 3
suffix['gov'] = 4
suffix['edu'] = 5

df['trimmed_suffix'] = [suffix[item] for item in df['trimmed_suffix']]
df

Unnamed: 0,url,type,label,url_length,backslash_count,http_type,subdomain,suffix,periods_in_suffix,trimmed_suffix,contains_percent,file_extension
0,br-icloud.com.br,phishing,2,16,0,0,,com.br,1,1,0,br
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0,,com,0,1,0,html
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0,,org,0,2,0,htm
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1,www,be,0,0,0,php
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1,,net,0,3,0,php
...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0,xbox360,com,0,1,0,html
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0,games,com,0,1,0,
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0,www,com,0,1,0,
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0,en,org,0,2,0,


In [16]:
# encode all the various domain suffixes with a default value of 0
subdomain = defaultdict(lambda: 0)
subdomain['www'] = 1

df['subdomain'] = [subdomain[item] for item in df['subdomain']]
df

Unnamed: 0,url,type,label,url_length,backslash_count,http_type,subdomain,suffix,periods_in_suffix,trimmed_suffix,contains_percent,file_extension
0,br-icloud.com.br,phishing,2,16,0,0,0,com.br,1,1,0,br
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0,0,com,0,1,0,html
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0,0,org,0,2,0,htm
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1,1,be,0,0,0,php
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1,0,net,0,3,0,php
...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0,0,com,0,1,0,html
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0,0,com,0,1,0,
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0,1,com,0,1,0,
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0,0,org,0,2,0,


In [17]:
# encode the various file extensions with a default value of 0
file_ext = defaultdict(lambda: 0)
file_ext['html'] = 1
file_ext['php'] = 2
file_ext['aspx'] = 3

df['file_extension'] = [file_ext[item] for item in df['file_extension']]
df

Unnamed: 0,url,type,label,url_length,backslash_count,http_type,subdomain,suffix,periods_in_suffix,trimmed_suffix,contains_percent,file_extension
0,br-icloud.com.br,phishing,2,16,0,0,0,com.br,1,1,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,35,2,0,0,com,0,1,0,1
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,31,3,0,0,org,0,2,0,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,1,88,3,1,1,be,0,0,0,2
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,235,3,1,0,net,0,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,2,39,3,0,0,com,0,1,0,1
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,2,44,4,0,0,com,0,1,0,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,2,42,4,0,1,com,0,1,0,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,2,45,2,0,0,org,0,2,0,0


In [18]:
y = df['label']
df.drop(['type', 'label', 'url', 'suffix'], axis=1, inplace=True)
df

Unnamed: 0,url_length,backslash_count,http_type,subdomain,periods_in_suffix,trimmed_suffix,contains_percent,file_extension
0,16,0,0,0,1,1,0,0
1,35,2,0,0,0,1,0,1
2,31,3,0,0,0,2,0,0
3,88,3,1,1,0,0,0,2
4,235,3,1,0,0,3,0,2
...,...,...,...,...,...,...,...,...
651186,39,3,0,0,0,1,0,1
651187,44,4,0,0,0,1,0,0
651188,42,4,0,1,0,1,0,0
651189,45,2,0,0,0,2,0,0


In [None]:
#some model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split

normalized_df=(df-df.min())/(df.max()-df.min())
normalized_df

#X_train, X_test, y_train, y_test = train_test_split(normalized_df, hot, test_size=0.33, random_state=100)

In [None]:
#Model with 5 layers
model = Sequential()
model.add(Dense(90, input_shape = (None,X_train.shape[1]), activation = 'softmax'))
model.add(Dense(50, activation='relu', kernel_initializer="uniform"))
model.add(Dense(24, activation='relu', kernel_initializer="uniform"))
model.add(Dense(12, activation='relu', kernel_initializer="uniform"))
model.add(Dense(4, activation='sigmoid', kernel_initializer='uniform'))

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics= ['accuracy'])

In [None]:
#fit our data to model
history = model.fit(X_train, y_train, validation_split = 0.33, epochs = 250, batch_size = 5000)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [None]:
#test/predict 
from sklearn.metrics import confusion_matrix

label_predict = model.predict(X_test)

rounded = (label_predict  > 0.5).astype(int)

cm = confusion_matrix(y_test, rounded)

scores = model.evaluate(X_test,y_test)



ValueError: ignored

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, rounded, target_names=['Benign', 'Defacement', 'Phishing', 'Malware']))

In [None]:
def print_cl_scores(scores):
  print('Accuracy: ' + str(np.mean(scores['test_accuracy'])))
  print('Precision: ' + str(np.mean(scores['test_precision'])))
  print('Recall: ' + str(np.mean(scores['test_recall'])))
  print('F1-Score: ' + str(np.mean(scores['test_f1'])))

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

X_train, X_holdout, y_train, y_holdout = train_test_split(df.values, y, test_size=0.3,random_state=17)

# for kNN, we need to scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, KFold

cl = RandomForestClassifier(criterion='gini', max_depth=10)
scores = cross_validate(cl, df, y, cv=KFold(n_splits=5, shuffle=True), scoring=["accuracy", "precision", "recall", "f1"])
print_cl_scores(scores)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate, KFold

cl = KNeighborsClassifier(n_neighbors=5)
scores = cross_validate(cl, df, y, cv=KFold(n_splits=5, shuffle=True), scoring=["accuracy", "precision", "recall", "f1"])
print_cl_scores(scores)

In [None]:
from sklearn.naive_bayes import GaussianNB
cl = GaussianNB()
scores = cross_validate(cl, df, y, cv=KFold(n_splits=5, shuffle=True), scoring=["accuracy", "precision", "recall", "f1"])
print_cl_scores(scores)

In [None]:
from sklearn.linear_model import LogisticRegression
cl = LogisticRegression()
scores = cross_validate(cl, df, y, cv=KFold(n_splits=5, shuffle=True), scoring=["accuracy", "precision", "recall", "f1"])
print_cl_scores(scores)

In [None]:
print(history.history.keys())

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
