In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
from sklearn.metrics import *

#-- Pytorch specific libraries import -----#
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Read the dataset from Google Drive
dataset_path = '/content/drive/MyDrive/Dataset/malicious_phish.csv'
df_data = pd.read_csv(dataset_path)

# Display the shape of the dataset
print(df_data.shape)

Mounted at /content/drive
(651191, 2)


In [5]:
df_data.head(6)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign


In [7]:
types = df_data['type'].unique()
num_types = len(types)

print("Number of unique types: ", num_types)
print("Types of data: ", types)

Number of unique types:  4
Types of data:  ['phishing' 'benign' 'defacement' 'malware']


In [10]:
import pandas as pd

# Assuming your DataFrame is named 'df_data'
df_data['type'] = df_data['type'].replace({'defacement': 'phishing', 'malware': 'phishing', 'benign': 'legitimate'})

types = df_data['type'].unique()
num_types = len(types)

print("Number of unique types: ", num_types)
print("Types of data: ", types)

Number of unique types:  2
Types of data:  ['phishing' 'legitimate']


In [11]:
# Assuming you have a DataFrame named df_data
print(df_data.columns)

Index(['url', 'type'], dtype='object')


In [12]:
import numpy as np

print(df_data['type'].isna().sum())
print(df_data['type'].isin([np.inf, -np.inf]).sum())

0
0


In [13]:
#Encoding 'status' as label 1 & 0 , naming the field as target
df_data['target'] = pd.get_dummies(df_data['type'])['legitimate'].astype('int')
df_data.drop('type',axis = 1, inplace=True)
df_data[['url','target']].head(5)

Unnamed: 0,url,target
0,br-icloud.com.br,0
1,mp3raid.com/music/krizz_kaliko.html,1
2,bopsecrets.org/rexroth/cr/1.htm,1
3,http://www.garage-pirenne.be/index.php?option=...,0
4,http://adventure-nicaragua.net/index.php?optio...,0


In [14]:
tmp = df_data.isnull().sum().reset_index(name='missing_val')
tmp[tmp['missing_val']!= 0]

Unnamed: 0,index,missing_val


In [15]:
print(df_data.columns)

Index(['url', 'target'], dtype='object')


In [16]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse

# Assuming you have a DataFrame named df_data with 'url' and 'target' columns

# Extracting the desired features from the 'url' column
df_features = pd.DataFrame()
df_features['url'] = df_data['url']

parsed_url = df_data['url'].apply(lambda x: urlparse(x))

df_features['length_url'] = df_data['url'].apply(len)
df_features['length_hostname'] = parsed_url.apply(lambda x: len(x.hostname) if x.hostname else 0)
df_features['ip'] = parsed_url.apply(lambda x: int(bool(x.hostname) and x.hostname.replace('.', '').isdigit()))
df_features['nb_dots'] = parsed_url.apply(lambda x: x.netloc.count('.'))
df_features['nb_hyphens'] = parsed_url.apply(lambda x: x.netloc.count('-'))
df_features['nb_at'] = parsed_url.apply(lambda x: x.netloc.count('@'))
df_features['nb_qm'] = df_data['url'].apply(lambda x: x.count('?'))
df_features['nb_and'] = df_data['url'].apply(lambda x: x.count('&'))
df_features['nb_or'] = df_data['url'].apply(lambda x: x.count('|'))
df_features['nb_eq'] = df_data['url'].apply(lambda x: x.count('='))
df_features['nb_underscore'] = df_data['url'].apply(lambda x: x.count('_'))
df_features['nb_tilde'] = df_data['url'].apply(lambda x: x.count('~'))
df_features['nb_percent'] = df_data['url'].apply(lambda x: x.count('%'))
df_features['nb_slash'] = df_data['url'].apply(lambda x: x.count('/'))
df_features['nb_star'] = df_data['url'].apply(lambda x: x.count('*'))
df_features['nb_colon'] = df_data['url'].apply(lambda x: x.count(':'))
df_features['nb_comma'] = df_data['url'].apply(lambda x: x.count(','))
df_features['nb_semicolon'] = df_data['url'].apply(lambda x: x.count(';'))
df_features['nb_dollar'] = df_data['url'].apply(lambda x: x.count('$'))
df_features['nb_space'] = df_data['url'].apply(lambda x: x.count(' '))
df_features['nb_www'] = df_data['url'].apply(lambda x: x.count('www'))
df_features['nb_com'] = df_data['url'].apply(lambda x: x.count('.com'))
df_features['nb_dslash'] = df_data['url'].apply(lambda x: x.count('//'))
df_features['http_in_path'] = parsed_url.apply(lambda x: int('http' in x.path.lower()))
df_features['https_token'] = df_data['url'].apply(lambda x: int('https' in x))
df_features['ratio_digits_url'] = df_data['url'].apply(lambda x: sum(char.isdigit() for char in x)) / df_features['length_url']
#df_features['ratio_digits_host'] = parsed_url.apply(lambda x: sum(char.isdigit() for char in x.hostname)) / df_features['length_hostname']
df_features['punycode'] = df_data['url'].apply(lambda x: int('xn--' in x))
df_features['port'] = parsed_url.apply(lambda x: x.port if x.port else 0)
df_features['tld_in_path'] = parsed_url.apply(lambda x: int(bool(x.path) and x.path.endswith(('.org', '.net', '.com'))))
#df_features['tld_in_subdomain'] = parsed_url.apply(lambda x: int(bool(x.subdomain) and x.subdomain.endswith(('.org', '.net', '.com'))))
#df_features['abnormal_subdomain'] = parsed_url.apply(lambda x: int(bool(x.subdomain) and not x.subdomain.endswith(('.org', '.net', '.com'))))
df_features['nb_subdomains'] = parsed_url.apply(lambda x: x.hostname.count('.') if x.hostname else 0)
df_features['prefix_suffix'] = df_data['url'].apply(lambda x: int(bool('-' in x) or '_' in x))
df_features['random_domain'] = df_data['url'].apply(lambda x: int(bool('random' in x or 'r4nd0m' in x)))
df_features['shortening_service'] = df_data['url'].apply(lambda x: int(bool('bit.ly' in x or 'goo.gl' in x or 'tinyurl' in x)))
df_features['path_extension'] = parsed_url.apply(lambda x: x.path.split('/')[-1].split('.')[-1].lower() if x.path.split('/')[-1].split('.')[-1:] else '')
df_features['nb_redirection'] = df_data['url'].apply(lambda x: x.count('//'))
df_features['nb_external_redirection'] = df_data['url'].apply(lambda x: x.count('http') - 1)
df_features['length_words_raw'] = df_data['url'].apply(lambda x: len(x.split()))
#df_features['char_repeat'] = df_data['url'].apply(lambda x: int(bool('..', '//' in x)))
df_features['shortest_words_raw'] = df_data['url'].apply(lambda x: min(len(word) for word in x.split()))
df_features['shortest_word_host'] = parsed_url.apply(lambda x: min(len(word) for word in x.hostname.split('.')) if x.hostname else 0)
df_features['shortest_word_path'] = parsed_url.apply(lambda x: min(len(word) for word in x.path.split('/')) if x.path else 0)
df_features['longest_words_raw'] = df_data['url'].apply(lambda x: max(len(word) for word in x.split()))
df_features['longest_word_host'] = parsed_url.apply(lambda x: max(len(word) for word in x.hostname.split('.')) if x.hostname else 0)
df_features['longest_word_path'] = parsed_url.apply(lambda x: max(len(word) for word in x.path.split('/')) if x.path else 0)
df_features['avg_words_raw'] = df_data['url'].apply(lambda x: np.mean([len(word) for word in x.split()]))
df_features['avg_word_host'] = parsed_url.apply(lambda x: np.mean([len(word) for word in x.hostname.split('.')]) if x.hostname else 0)
df_features['avg_word_path'] = parsed_url.apply(lambda x: np.mean([len(word) for word in x.path.split('/')]) if x.path else 0)
df_features['phish_hints'] = df_data['url'].apply(lambda x: int(bool('signin' in x or 'login' in x or 'secure' in x or 'account' in x)))
df_features['domain_in_brand'] = df_data['url'].apply(lambda x: int(bool('paypal' in x or 'facebook' in x or 'google' in x)))
#df_features['brand_in_subdomain'] = parsed_url.apply(lambda x: int(bool('paypal' in x.subdomain or 'facebook' in x.subdomain or 'google' in x.subdomain)) if x.subdomain else 0)
df_features['brand_in_path'] = parsed_url.apply(lambda x: int(bool('paypal' in x.path or 'facebook' in x.path or 'google' in x.path)) if x.path else 0)
df_features['suspecious_tld'] = parsed_url.apply(lambda x: int(bool(x.hostname) and x.hostname.endswith(('.tk', '.ml', '.ga', '.cf', '.gq'))))
df_features['statistical_report'] = df_data['url'].apply(lambda x: int(bool('statisticalreport' in x or 'statistics' in x)))
df_features['nb_hyperlinks'] = df_data['url'].apply(lambda x: x.count('href='))
df_features['ratio_intHyperlinks'] = df_data['url'].apply(lambda x: x.count('href="http'))
df_features['ratio_extHyperlinks'] = df_data['url'].apply(lambda x: x.count('href="https') + x.count('href="//'))
df_features['ratio_nullHyperlinks'] = df_data['url'].apply(lambda x: x.count('href=""'))
df_features['nb_extCSS'] = df_data['url'].apply(lambda x: x.count('.css'))
df_features['ratio_intRedirection'] = df_data['url'].apply(lambda x: x.count('window.location.href="http'))
df_features['ratio_extRedirection'] = df_data['url'].apply(lambda x: x.count('window.location.href="https') + x.count('window.location.href="//'))
df_features['ratio_intErrors'] = df_data['url'].apply(lambda x: x.count('<script>document.location.href="http'))
df_features['ratio_extErrors'] = df_data['url'].apply(lambda x: x.count('<script>document.location.href="https') + x.count('<script>document.location.href="//'))
df_features['login_form'] = df_data['url'].apply(lambda x: int(bool('login' in x or 'signin' in x or 'log-in' in x or 'sign-in' in x)))
df_features['external_favicon'] = df_data['url'].apply(lambda x: int(bool('favicon' in x)))
df_features['links_in_tags'] = df_data['url'].apply(lambda x: int(bool('<a href=' in x)))
df_features['submit_email'] = df_data['url'].apply(lambda x: int(bool('mailto:' in x or 'email:' in x)))
df_features['ratio_intMedia'] = df_data['url'].apply(lambda x: x.count('<img src="http'))
df_features['ratio_extMedia'] = df_data['url'].apply(lambda x: x.count('<img src="https') + x.count('<img src="//'))
df_features['sfh'] = df_data['url'].apply(lambda x: int(bool('action="' not in x or 'about:blank' in x)))
df_features['iframe'] = df_data['url'].apply(lambda x: int(bool('<iframe' in x)))
df_features['popup_window'] = df_data['url'].apply(lambda x: int(bool('prompt(' in x or 'alert(' in x or 'confirm(' in x)))
df_features['safe_anchor'] = df_data['url'].apply(lambda x: int(bool('onmouseover="window.status' in x or 'onmouseover="document.location' in x)))
df_features['onmouseover'] = df_data['url'].apply(lambda x: int(bool('onmouseover=' in x)))
df_features['right_clic'] = df_data['url'].apply(lambda x: int(bool('right-click' in x)))
df_features['empty_title'] = df_data['url'].apply(lambda x: int(bool('<title></title>' in x)))
#df_features['domain_in_title'] = df_data['url'].apply(lambda x: int(bool(parsed_url.apply(lambda y: y.hostname if y.hostname else '').isin(x))))
df_features['domain_with_copyright'] = df_data['url'].apply(lambda x: int(bool('copyright' in x or 'cprt' in x)))
df_features['whois_registered_domain'] = df_data['url'].apply(lambda x: int(bool('whois' in x)))
df_features['domain_registration_length'] = df_data['url'].apply(lambda x: int(bool('http' not in x)))
df_features['domain_age'] = df_data['url'].apply(lambda x: int(bool('http' in x)))
df_features['web_traffic'] = df_data['url'].apply(lambda x: int(bool('http' not in x)))
df_features['dns_record'] = df_data['url'].apply(lambda x: int(bool('https' in x)))
df_features['google_index'] = df_data['url'].apply(lambda x: int(bool('https' in x)))
df_features['page_rank'] = df_data['url'].apply(lambda x: int(bool('https' in x)))

# Include the 'target' column
#df_features['target'] = df_data['type']

# Print the extracted features
print(df_features)


                                                      url  length_url  \
0                                        br-icloud.com.br          16   
1                     mp3raid.com/music/krizz_kaliko.html          35   
2                         bopsecrets.org/rexroth/cr/1.htm          31   
3       http://www.garage-pirenne.be/index.php?option=...          88   
4       http://adventure-nicaragua.net/index.php?optio...         235   
...                                                   ...         ...   
651186            xbox360.ign.com/objects/850/850402.html          39   
651187       games.teamxbox.com/xbox-360/1860/Dead-Space/          44   
651188         www.gamespot.com/xbox360/action/deadspace/          42   
651189      en.wikipedia.org/wiki/Dead_Space_(video_game)          45   
651190          www.angelfire.com/goth/devilmaycrytonite/          41   

        length_hostname  ip  nb_dots  nb_hyphens  nb_at  nb_qm  nb_and  nb_or  \
0                     0   0        0      

In [17]:
print(df_features.columns)

Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore',
       'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolon', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'punycode', 'port', 'tld_in_path', 'nb_subdomains', 'prefix_suffix',
       'random_domain', 'shortening_service', 'path_extension',
       'nb_redirection', 'nb_external_redirection', 'length_words_raw',
       'shortest_words_raw', 'shortest_word_host', 'shortest_word_path',
       'longest_words_raw', 'longest_word_host', 'longest_word_path',
       'avg_words_raw', 'avg_word_host', 'avg_word_path', 'phish_hints',
       'domain_in_brand', 'brand_in_path', 'suspecious_tld',
       'statistical_report', 'nb_hyperlinks', 'ratio_intHyperlinks',
       'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS',
       '

In [18]:
import pandas as pd

# Assuming you have the df_data DataFrame and df_features DataFrame

# Concatenate df_data and df_features horizontally
df_combined = pd.concat([df_data, df_features], axis=1)

# Print the combined DataFrame
print(df_combined)

                                                      url  target  \
0                                        br-icloud.com.br       0   
1                     mp3raid.com/music/krizz_kaliko.html       1   
2                         bopsecrets.org/rexroth/cr/1.htm       1   
3       http://www.garage-pirenne.be/index.php?option=...       0   
4       http://adventure-nicaragua.net/index.php?optio...       0   
...                                                   ...     ...   
651186            xbox360.ign.com/objects/850/850402.html       0   
651187       games.teamxbox.com/xbox-360/1860/Dead-Space/       0   
651188         www.gamespot.com/xbox360/action/deadspace/       0   
651189      en.wikipedia.org/wiki/Dead_Space_(video_game)       0   
651190          www.angelfire.com/goth/devilmaycrytonite/       0   

                                                      url  length_url  \
0                                        br-icloud.com.br          16   
1                     mp3

In [19]:
print(df_combined.columns)

Index(['url', 'target', 'url', 'length_url', 'length_hostname', 'ip',
       'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq',
       'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star',
       'nb_colon', 'nb_comma', 'nb_semicolon', 'nb_dollar', 'nb_space',
       'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token',
       'ratio_digits_url', 'punycode', 'port', 'tld_in_path', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'shortest_words_raw', 'shortest_word_host',
       'shortest_word_path', 'longest_words_raw', 'longest_word_host',
       'longest_word_path', 'avg_words_raw', 'avg_word_host', 'avg_word_path',
       'phish_hints', 'domain_in_brand', 'brand_in_path', 'suspecious_tld',
       'statistical_report', 'nb_hyperlinks', 'ratio_intHyperlinks',
       'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_