## Notes


- Remove www from url at the start
- Created a url length function (remove http and https)

In [1]:
import pandas as pd

urls_df = pd.read_csv("malicious_phish_with_whitelist.csv")

****

## Data Identification


****

In [2]:
urls_df.drop_duplicates(inplace = True) #inplace = True to make sure original df is modified

print(urls_df.duplicated().value_counts()) #Check the modified df

False    642125
Name: count, dtype: int64


****

## Feature Extraction

In [3]:
import tldextract #Helps to splits url into its subdomain, domain, and suffix
import re
from urllib.parse import urlparse
import ipaddress
import requests

**Remove WWW from URLs**

In [4]:
#Normalize the URL by throwing "www." prefix from the url to make it easier to compare and process consistently  
urls_df['url'] = urls_df['url'].replace(to_replace = 'www.', value = '', regex = True)

- **Abnormal URL** 

i) Help to identify URLs where domain part (netloc) appears in an unexpected part of the URL

ii) The function checks if the domain part of the URL appears elsewhere in the URL, which is a useful heuristic for identifying potentially suspicious URLs

In [5]:
def is_abnormal_url(url):
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    if netloc:
        netloc = str(netloc)
        #To check if netloc (domain appears in an unexpected part of the URL)
        match = re.search(netloc, url)
        if match:
            return 1
            #if netloc found in URL, but not at the expected position, it is abnormal
    return 0

In [6]:
urls_df['abnormal_url'] = urls_df['url'].apply(is_abnormal_url)

In [7]:
abnormal_url_count = urls_df[urls_df['abnormal_url'] == 1]['type'].value_counts()
print(abnormal_url_count)

abnormal_url_count = urls_df[urls_df['abnormal_url'] == 0]['type'].value_counts()
print(abnormal_url_count)

urls_df[urls_df["abnormal_url"] == 1].head()

type
defacement    95308
benign        36378
phishing      24835
malware       22435
Name: count, dtype: int64
type
benign      392702
phishing     69257
malware       1210
Name: count, dtype: int64


Unnamed: 0,url,type,abnormal_url
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign,1
8,http://pashminaonline.com/pure-pashminas,defacement,1
11,http://ikenmijnkunst.nl/index.php/exposities/e...,defacement,1


- **HTTP Secure**

In [8]:
# Function to extract whether the URL is secure (uses HTTPS)
def extract_is_https(url):
    parsed_url = urlparse(url)
    return 1 if parsed_url.scheme == 'https' else 0

In [9]:
# Apply the function to each URL and create a new column 'is_https'
urls_df['is_https'] = urls_df['url'].apply(extract_is_https)

In [10]:
# print(urls_df[urls_df['is_https'] == 1]['type'].value_counts())
# urls_df[urls_df['is_https'] == 1].head()

- **http and https removal function**

In [11]:
# Helper function to clean URLs
def clean_url(url):
    return re.sub(r'^(http://|https://)', '', url)

urls_df['cleaned_url'] = urls_df['url'].apply(clean_url)

- **Special characters count**

In [12]:
feature = ['@','?','-','=','.','#','%','+','$','!','*',',','//']
for x in feature:
     urls_df[x] = urls_df['cleaned_url'].apply(lambda i: i.count(x))

- **Letters & Digits**

In [13]:
def count_letters(url):
    no_of_letters = sum(char.isalpha() for char in url) #isalpha() checks if it is alphabets
    return no_of_letters

def count_digits(url):
    no_of_digits = sum(char.isdigit() for char in url) 
    return no_of_digits

In [14]:
urls_df['letters'] = urls_df['cleaned_url'].apply(count_letters)
urls_df['digits'] = urls_df['cleaned_url'].apply(count_digits)

urls_df.head()

Unnamed: 0,url,type,abnormal_url,is_https,cleaned_url,@,?,-,=,.,#,%,+,$,!,*,",",//,letters,digits
0,br-icloud.com.br,phishing,0,0,br-icloud.com.br,0,0,1,0,2,0,0,0,0,0,0,0,0,13,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,0,mp3raid.com/music/krizz_kaliko.html,0,0,0,0,2,0,0,0,0,0,0,0,0,29,1
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,0,bopsecrets.org/rexroth/cr/1.htm,0,0,0,0,2,0,0,0,0,0,0,0,0,25,1
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,0,garage-pirenne.be/index.php?option=com_content...,0,1,1,4,2,0,0,0,0,0,0,0,0,56,7
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,0,adventure-nicaragua.net/index.php?option=com_m...,0,1,1,3,2,0,0,0,0,0,0,0,0,195,22


- **Url length**

In [15]:
urls_df['url_length'] = urls_df['cleaned_url'].apply(lambda x: len(str(x)))

- **Primary Domain**

In [16]:
from tld import get_tld, is_tld

#function to extract the primary domain of urls with try and except to intercept unextractable urls
def extract_primary_domain(url):
        try:
            ext = tldextract.extract(url)
            pri_domain = ext.domain+"."+ext.suffix #concatenate only the domain and suffix, thus, the primary domain
        except:  
            pri_domain = None #in case there is no primary domain to be found
        
        return pri_domain

In [17]:
urls_df['primary_domain'] = urls_df['url'].apply(extract_primary_domain) #applied extraction of pri domain to each url and creating a new column for it

- **URL Region & Root Domain** (Dependent on primary domain)

In [18]:
import hashlib

def get_url_region(primary_domain):
    ccTLD_to_region = {
    ".ac": "Ascension Island",
    ".ad": "Andorra",
    ".ae": "United Arab Emirates",
    ".af": "Afghanistan",
    ".ag": "Antigua and Barbuda",
    ".ai": "Anguilla",
    ".al": "Albania",
    ".am": "Armenia",
    ".an": "Netherlands Antilles",
    ".ao": "Angola",
    ".aq": "Antarctica",
    ".ar": "Argentina",
    ".as": "American Samoa",
    ".at": "Austria",
    ".au": "Australia",
    ".aw": "Aruba",
    ".ax": "Åland Islands",
    ".az": "Azerbaijan",
    ".ba": "Bosnia and Herzegovina",
    ".bb": "Barbados",
    ".bd": "Bangladesh",
    ".be": "Belgium",
    ".bf": "Burkina Faso",
    ".bg": "Bulgaria",
    ".bh": "Bahrain",
    ".bi": "Burundi",
    ".bj": "Benin",
    ".bm": "Bermuda",
    ".bn": "Brunei Darussalam",
    ".bo": "Bolivia",
    ".br": "Brazil",
    ".bs": "Bahamas",
    ".bt": "Bhutan",
    ".bv": "Bouvet Island",
    ".bw": "Botswana",
    ".by": "Belarus",
    ".bz": "Belize",
    ".ca": "Canada",
    ".cc": "Cocos Islands",
    ".cd": "Democratic Republic of the Congo",
    ".cf": "Central African Republic",
    ".cg": "Republic of the Congo",
    ".ch": "Switzerland",
    ".ci": "Côte d'Ivoire",
    ".ck": "Cook Islands",
    ".cl": "Chile",
    ".cm": "Cameroon",
    ".cn": "China",
    ".co": "Colombia",
    ".cr": "Costa Rica",
    ".cu": "Cuba",
    ".cv": "Cape Verde",
    ".cw": "Curaçao",
    ".cx": "Christmas Island",
    ".cy": "Cyprus",
    ".cz": "Czech Republic",
    ".de": "Germany",
    ".dj": "Djibouti",
    ".dk": "Denmark",
    ".dm": "Dominica",
    ".do": "Dominican Republic",
    ".dz": "Algeria",
    ".ec": "Ecuador",
    ".ee": "Estonia",
    ".eg": "Egypt",
    ".er": "Eritrea",
    ".es": "Spain",
    ".et": "Ethiopia",
    ".eu": "European Union",
    ".fi": "Finland",
    ".fj": "Fiji",
    ".fk": "Falkland Islands",
    ".fm": "Federated States of Micronesia",
    ".fo": "Faroe Islands",
    ".fr": "France",
    ".ga": "Gabon",
    ".gb": "United Kingdom",
    ".gd": "Grenada",
    ".ge": "Georgia",
    ".gf": "French Guiana",
    ".gg": "Guernsey",
    ".gh": "Ghana",
    ".gi": "Gibraltar",
    ".gl": "Greenland",
    ".gm": "Gambia",
    ".gn": "Guinea",
    ".gp": "Guadeloupe",
    ".gq": "Equatorial Guinea",
    ".gr": "Greece",
    ".gs": "South Georgia and the South Sandwich Islands",
    ".gt": "Guatemala",
    ".gu": "Guam",
    ".gw": "Guinea-Bissau",
    ".gy": "Guyana",
    ".hk": "Hong Kong",
    ".hm": "Heard Island and McDonald Islands",
    ".hn": "Honduras",
    ".hr": "Croatia",
    ".ht": "Haiti",
    ".hu": "Hungary",
    ".id": "Indonesia",
    ".ie": "Ireland",
    ".il": "Israel",
    ".im": "Isle of Man",
    ".in": "India",
    ".io": "British Indian Ocean Territory",
    ".iq": "Iraq",
    ".ir": "Iran",
    ".is": "Iceland",
    ".it": "Italy",
    ".je": "Jersey",
    ".jm": "Jamaica",
    ".jo": "Jordan",
    ".jp": "Japan",
    ".ke": "Kenya",
    ".kg": "Kyrgyzstan",
    ".kh": "Cambodia",
    ".ki": "Kiribati",
    ".km": "Comoros",
    ".kn": "Saint Kitts and Nevis",
    ".kp": "Democratic People's Republic of Korea (North Korea)",
    ".kr": "Republic of Korea (South Korea)",
    ".kw": "Kuwait",
    ".ky": "Cayman Islands",
    ".kz": "Kazakhstan",
    ".la": "Laos",
    ".lb": "Lebanon",
    ".lc": "Saint Lucia",
    ".li": "Liechtenstein",
    ".lk": "Sri Lanka",
    ".lr": "Liberia",
    ".ls": "Lesotho",
    ".lt": "Lithuania",
    ".lu": "Luxembourg",
    ".lv": "Latvia",
    ".ly": "Libya",
    ".ma": "Morocco",
    ".mc": "Monaco",
    ".md": "Moldova",
    ".me": "Montenegro",
    ".mf": "Saint Martin (French part)",
    ".mg": "Madagascar",
    ".mh": "Marshall Islands",
    ".mk": "North Macedonia",
    ".ml": "Mali",
    ".mm": "Myanmar",
    ".mn": "Mongolia",
    ".mo": "Macao",
    ".mp": "Northern Mariana Islands",
    ".mq": "Martinique",
    ".mr": "Mauritania",
    ".ms": "Montserrat",
    ".mt": "Malta",
    ".mu": "Mauritius",
    ".mv": "Maldives",
    ".mw": "Malawi",
    ".mx": "Mexico",
    ".my": "Malaysia",
    ".mz": "Mozambique",
    ".na": "Namibia",
    ".nc": "New Caledonia",
    ".ne": "Niger",
    ".nf": "Norfolk Island",
    ".ng": "Nigeria",
    ".ni": "Nicaragua",
    ".nl": "Netherlands",
    ".no": "Norway",
    ".np": "Nepal",
    ".nr": "Nauru",
    ".nu": "Niue",
    ".nz": "New Zealand",
    ".om": "Oman",
    ".pa": "Panama",
    ".pe": "Peru",
    ".pf": "French Polynesia",
    ".pg": "Papua New Guinea",
    ".ph": "Philippines",
    ".pk": "Pakistan",
    ".pl": "Poland",
    ".pm": "Saint Pierre and Miquelon",
    ".pn": "Pitcairn",
    ".pr": "Puerto Rico",
    ".ps": "Palestinian Territory",
    ".pt": "Portugal",
    ".pw": "Palau",
    ".py": "Paraguay",
    ".qa": "Qatar",
    ".re": "Réunion",
    ".ro": "Romania",
    ".rs": "Serbia",
    ".ru": "Russia",
    ".rw": "Rwanda",
    ".sa": "Saudi Arabia",
    ".sb": "Solomon Islands",
    ".sc": "Seychelles",
    ".sd": "Sudan",
    ".se": "Sweden",
    ".sg": "Singapore",
    ".sh": "Saint Helena",
    ".si": "Slovenia",
    ".sj": "Svalbard and Jan Mayen",
    ".sk": "Slovakia",
    ".sl": "Sierra Leone",
    ".sm": "San Marino",
    ".sn": "Senegal",
    ".so": "Somalia",
    ".sr": "Suriname",
    ".ss": "South Sudan",
    ".st": "São Tomé and Príncipe",
    ".sv": "El Salvador",
    ".sx": "Sint Maarten (Dutch part)",
    ".sy": "Syria",
    ".sz": "Eswatini",
    ".tc": "Turks and Caicos Islands",
    ".td": "Chad",
    ".tf": "French Southern Territories",
    ".tg": "Togo",
    ".th": "Thailand",
    ".tj": "Tajikistan",
    ".tk": "Tokelau",
    ".tl": "Timor-Leste",
    ".tm": "Turkmenistan",
    ".tn": "Tunisia",
    ".to": "Tonga",
    ".tr": "Turkey",
    ".tt": "Trinidad and Tobago",
    ".tv": "Tuvalu",
    ".tw": "Taiwan",
    ".tz": "Tanzania",
    ".ua": "Ukraine",
    ".ug": "Uganda",
    ".uk": "United Kingdom",
    ".us": "United States",
    ".uy": "Uruguay",
    ".uz": "Uzbekistan",
    ".va": "Vatican City",
    ".vc": "Saint Vincent and the Grenadines",
    ".ve": "Venezuela",
    ".vg": "British Virgin Islands",
    ".vi": "U.S. Virgin Islands",
    ".vn": "Vietnam",
    ".vu": "Vanuatu",
    ".wf": "Wallis and Futuna",
    ".ws": "Samoa",
    ".ye": "Yemen",
    ".yt": "Mayotte",
    ".za": "South Africa",
    ".zm": "Zambia",
    ".zw": "Zimbabwe"
    }
    
    for ccTLD in ccTLD_to_region:
        if primary_domain.endswith(ccTLD):
            return ccTLD_to_region[ccTLD]
    
    return "Global"

urls_df['url_region'] = urls_df['primary_domain'].apply(lambda x: get_url_region(str(x)))

def extract_root_domain(url):
    extracted = tldextract.extract(url)
    root_domain = extracted.domain
    return root_domain

urls_df['root_domain'] = urls_df['primary_domain'].apply(lambda x: extract_root_domain(str(x)))

- **Hash Encode both URL Region & Root Domain**

In [19]:
def hash_encode(category):
    if category is None:
        return 0 
    
    hash_object = hashlib.md5(category.encode())
    return int(hash_object.hexdigest(), 16) % (10 ** 8)

urls_df['root_domain'] = urls_df['root_domain'].apply(hash_encode)
urls_df['url_region'] = urls_df['url_region'].apply(hash_encode)

In [20]:
urls_df.head()

Unnamed: 0,url,type,abnormal_url,is_https,cleaned_url,@,?,-,=,.,...,!,*,",",//,letters,digits,url_length,primary_domain,url_region,root_domain
0,br-icloud.com.br,phishing,0,0,br-icloud.com.br,0,0,1,0,2,...,0,0,0,0,13,0,16,br-icloud.com.br,27739261,1310791
1,mp3raid.com/music/krizz_kaliko.html,benign,0,0,mp3raid.com/music/krizz_kaliko.html,0,0,0,0,2,...,0,0,0,0,29,1,35,mp3raid.com,32604616,58335668
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,0,bopsecrets.org/rexroth/cr/1.htm,0,0,0,0,2,...,0,0,0,0,25,1,31,bopsecrets.org,32604616,28611805
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,0,garage-pirenne.be/index.php?option=com_content...,0,1,1,4,2,...,0,0,0,0,56,7,77,garage-pirenne.be,71484583,89045308
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,0,adventure-nicaragua.net/index.php?option=com_m...,0,1,1,3,2,...,0,0,0,0,195,22,228,adventure-nicaragua.net,32604616,76838614


- **Shorterned url**
(Checks to see whether URL contains a shortening service)

In [21]:
def shortening_service(url):
    match = re.search(r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                    r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                    r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                    r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                    r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                    r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                    r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                    r'tr\.im|link\.zip\.net', url)
    if match:
        return 1
    else:
        return 0

In [22]:
urls_df['have_shortening_service'] = urls_df['url'].apply(shortening_service)

print(urls_df[urls_df["have_shortening_service"]== 1]["type"].value_counts())
print(urls_df[urls_df["have_shortening_service"]== 0]["type"].value_counts())

type
benign        29678
phishing       6832
defacement     2529
malware         445
Name: count, dtype: int64
type
benign        399402
defacement     92779
phishing       87260
malware        23200
Name: count, dtype: int64


- **IP Address**

In [23]:
def have_ip_address(url):
    ipv4_pattern = re.compile(      
        r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
        r'(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)')

    ipv6_pattern = re.compile(r'\[?([A-Fa-f0-9]{1,4}:){7}([A-Fa-f0-9]{1,4}|:)\]?')

    # Check for IPv4 address
    if ipv4_pattern.search(url):
        return 1
    # Check for IPv6 address
    elif ipv6_pattern.search(url):
        return 1
    else:
        return 0

In [24]:
urls_df['have_ip_address'] = urls_df['url'].apply(have_ip_address)

print(urls_df[urls_df["have_ip_address"]== 1]["type"].value_counts())
print(urls_df[urls_df["have_ip_address"]== 0]["type"].value_counts())

type
malware     11790
phishing      483
benign        204
Name: count, dtype: int64
type
benign        428876
defacement     95308
phishing       93609
malware        11855
Name: count, dtype: int64


****

## Data splitting (Train and test data)

In [25]:
x = urls_df.drop(['type', 'url', 'primary_domain', 'cleaned_url'],  axis = 1) #added primary domain to x
y = urls_df['type']
urls_df.head()

Unnamed: 0,url,type,abnormal_url,is_https,cleaned_url,@,?,-,=,.,...,",",//,letters,digits,url_length,primary_domain,url_region,root_domain,have_shortening_service,have_ip_address
0,br-icloud.com.br,phishing,0,0,br-icloud.com.br,0,0,1,0,2,...,0,0,13,0,16,br-icloud.com.br,27739261,1310791,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,0,0,mp3raid.com/music/krizz_kaliko.html,0,0,0,0,2,...,0,0,29,1,35,mp3raid.com,32604616,58335668,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,0,bopsecrets.org/rexroth/cr/1.htm,0,0,0,0,2,...,0,0,25,1,31,bopsecrets.org,32604616,28611805,0,0
3,http://garage-pirenne.be/index.php?option=com_...,defacement,1,0,garage-pirenne.be/index.php?option=com_content...,0,1,1,4,2,...,0,0,56,7,77,garage-pirenne.be,71484583,89045308,0,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,1,0,adventure-nicaragua.net/index.php?option=com_m...,0,1,1,3,2,...,0,0,195,22,228,adventure-nicaragua.net,32604616,76838614,0,0


In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42) #Stratify helps to balance the category (1, 2, 3, 4)

****

## Machine Learning

- **Random Forest**

In [27]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

with open('maliciousURL_classifier_official.pkl', 'wb') as model_file:
    pickle.dump(random_forest_model, model_file)


In [28]:
from sklearn.metrics import classification_report

rf_pred = random_forest_model.predict(X_test)
accuracy = accuracy_score(y_test, rf_pred)
print(accuracy)

y_pred = random_forest_model.predict(X_test)
print(classification_report(y_test, y_pred))

0.9251703328791123
              precision    recall  f1-score   support

      benign       0.94      0.97      0.95     85816
  defacement       0.96      0.98      0.97     19062
     malware       0.96      0.90      0.93      4729
    phishing       0.79      0.68      0.73     18818

    accuracy                           0.93    128425
   macro avg       0.91      0.88      0.90    128425
weighted avg       0.92      0.93      0.92    128425



In [29]:
# Analyze feature importances
importances = random_forest_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': x.columns, 'Importance': importances})
feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

print(feature_importance_df)

                    Feature  Importance
0              abnormal_url    0.230370
19              root_domain    0.192282
17               url_length    0.111317
15                  letters    0.101032
18               url_region    0.065028
5                         =    0.058141
16                   digits    0.056855
6                         .    0.041695
4                         -    0.038591
21          have_ip_address    0.030595
1                  is_https    0.026126
3                         ?    0.020647
8                         %    0.012245
20  have_shortening_service    0.005292
9                         +    0.004265
14                       //    0.002078
13                        ,    0.001097
2                         @    0.001044
7                         #    0.000762
11                        !    0.000354
12                        *    0.000093
10                        $    0.000088


****