In [1]:
from urllib.parse import urlparse, urlencode
import pandas as pd
import ipaddress
import regex as re

In [2]:
# Load the labelled URL dataset and preview its first rows.
urls = pd.read_csv("datasets/urls.csv")
urls.head()

Unnamed: 0,url,isPhishing
0,http://1337x.to/torrent/1048648/American-Snipe...,0
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,0
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,0
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,0
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,0


*Features to Extract:*
https://www.researchgate.net/publication/333166694_Phishing_URL_detection_system_based_on_URL_features_using_SVM

1. Protocol
- to check if SSL is enabled

2. IP Address is used 

3. Length of URL
- Longer URLs can be used to hide the suspicious part of the address in the address bar

4. Number of symbols (special characters) to total character ratio

5. Use of '@' symbol
- The browser ignores everything preceding the '@' symbol; the real address is the part that follows it

6. Using redirection with '//' symbol
- redirecting users to another website
- check the position at which the symbol occurs

8. Length of path to length of URL

9. Check for suspicious keywords 

10. Number of subdomains

In [3]:
def use_https(url):
    """Return 1 if ``url`` uses the HTTPS scheme, otherwise 0.

    The comparison is case-insensitive and requires the complete ``https://``
    prefix, so a scheme-less URL whose host merely begins with "https"
    (e.g. ``httpsecure.example.com/login``) is not reported as HTTPS.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    int
        1 when the URL starts with ``https://`` (any letter case), else 0.
    """
    return int(url.lower().startswith("https://"))

In [4]:
def has_ip(url):
    """Return 1 if the host of ``url`` is a literal IP address, otherwise 0.

    The host is taken from ``urlparse(url).hostname`` rather than ``netloc``
    so that a port (``1.2.3.4:8080``), userinfo (``user@1.2.3.4``) or IPv6
    brackets (``[::1]``) do not prevent the address from being recognised.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    int
        1 when the host parses as an IPv4 or IPv6 address, else 0 (including
        when the URL has no host or cannot be parsed).
    """
    try:
        host = urlparse(url).hostname
        if not host:
            return 0
        ipaddress.ip_address(host)
    except ValueError:
        # Raised by ip_address() for non-IP hosts and by urlparse() for
        # malformed bracketed hosts; both mean "not an IP address".
        return 0
    return 1


In [5]:
def length_of_url(url):
    """Return the total number of characters in ``url``."""
    n_chars = len(url)
    return n_chars

In [6]:
def symbols_to_totalch(url):
    """Return the fraction of characters in ``url`` that are symbols.

    A symbol is any character not matched by the regex class ``\\w``
    (so letters, digits and the underscore are *not* counted as symbols).

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    float or None
        Symbol count divided by the URL length, rounded to 5 decimal places,
        or ``None`` for an empty string.
    """
    total = len(url)
    if total == 0:
        return None
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    word_chars = len(re.findall(r'\w', url))
    return round((total - word_chars) / total, 5)

In [7]:
def have_at(url):
    """Return 1 if ``url`` contains an '@' character anywhere, otherwise 0."""
    return int("@" in url)

In [8]:
def have_redirection(url):
    """Return 1 if the last '//' in ``url`` starts after index 7, otherwise 0.

    The '//' that follows the scheme sits at index 5 for ``http://`` and at
    index 6 for ``https://``, so a later occurrence indicates an extra '//'
    further along in the URL.
    """
    last_double_slash = url.rfind("//")
    return 0 if last_double_slash <= 7 else 1

In [9]:
def path_to_url_length(url):
    """Return the number of path segments divided by the URL length.

    The path component of ``url`` is split on '/', empty pieces are ignored,
    and the count of remaining segments is divided by ``len(url)``.

    Returns
    -------
    float or None
        The ratio rounded to 5 decimal places, or ``None`` for an empty string.
    """
    total_chars = len(url)
    if not total_chars:
        return None

    # Keep only non-empty segments (leading/trailing/double slashes yield "").
    segments = [part for part in urlparse(url).path.split("/") if part]
    return round(len(segments) / total_chars, 5)

In [10]:
def subdomains(url):
    """Return the number of dot-separated labels in the host of ``url``.

    The host is read from ``urlparse(url).hostname`` so that userinfo and port
    are excluded; otherwise dots in userinfo (e.g.
    ``http://www.paypal.com@evil.com/``) would be counted as host labels.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    int
        Count of labels in the host (``www.example.com`` -> 3), or 0 when the
        URL has no host component.
    """
    host = urlparse(url).hostname
    if not host:
        return 0
    return len(host.split("."))

Final Dataset

In [1]:
def feature_extraction(url, is_phishing):
    """Build the feature row for a single URL.

    Returns a list holding the URL itself, the value of each feature function
    applied to it (in the order listed below), and finally ``is_phishing``.
    """
    # Order matters: it defines the column order of the resulting row.
    extractors = (
        use_https,
        has_ip,
        length_of_url,
        symbols_to_totalch,
        have_at,
        have_redirection,
        path_to_url_length,
        subdomains,
    )
    return [url, *(extract(url) for extract in extractors), is_phishing]

In [12]:
# Column names for the feature table, in the same order as the values in each
# row produced by feature_extraction().
feature_names = [
    "url",
    "use_https",
    "has_ip",
    "length_of_url",
    "symbols_to_totalch",
    "have_at",
    "have_redirection",
    "path_to_url_length",
    "subdomains",
    "is_phishing"
]

# Pair each URL with its label by position instead of looking rows up by index
# label, so this does not depend on `urls` having a default 0..n-1 index.
features = [
    feature_extraction(url, is_phishing)
    for url, is_phishing in zip(urls["url"], urls["isPhishing"])
]

df = pd.DataFrame(features, columns=feature_names)
df.head()

Unnamed: 0,url,use_https,has_ip,length_of_url,symbols_to_totalch,have_at,have_redirection,path_to_url_length,subdomains,is_phishing
0,http://1337x.to/torrent/1048648/American-Snipe...,0,0,83,0.19277,0,0,0.03614,2,0
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,0,0,83,0.20482,0,0,0.03614,2,0
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,0,0,83,0.20482,0,0,0.03614,2,0
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,0,0,83,0.22892,0,0,0.03614,2,0
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,0,0,83,0.20482,0,0,0.03614,2,0


In [13]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
use_https,70756.0,0.193581,0.395107,0.0,0.0,0.0,0.0,1.0
has_ip,70756.0,0.000452,0.021262,0.0,0.0,0.0,0.0,1.0
length_of_url,70756.0,89.78081,46.794274,16.0,68.0,88.0,104.0,2081.0
symbols_to_totalch,70756.0,0.170029,0.048846,0.01726,0.14013,0.16216,0.19355,0.41176
have_at,70756.0,0.007703,0.087426,0.0,0.0,0.0,0.0,1.0
have_redirection,70756.0,0.007604,0.086867,0.0,0.0,0.0,0.0,1.0
path_to_url_length,70756.0,0.026242,0.019528,0.0,0.01667,0.02564,0.03226,0.16505
subdomains,70756.0,3.15487,1.329141,2.0,2.0,3.0,5.0,10.0
is_phishing,70756.0,0.5,0.500004,0.0,0.0,0.5,1.0,1.0


In [14]:
# Write the feature table to CSV, omitting the DataFrame index.
df.to_csv("datasets/final_model.csv", index=False)