In [48]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse,urlencode
import re
from bs4 import BeautifulSoup
import requests
import whois
import urllib.request
from datetime import datetime
import time
import socket
from urllib.error import HTTPError
from cython.parallel import prange


In [49]:
pip install python-whois

Note: you may need to restart the kernel to use updated packages.


In [50]:
help(urlencode)

Help on function urlencode in module urllib.parse:

urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=<function quote_plus at 0x7f2001bd9b70>)
    Encode a dict or sequence of two-element tuples into a URL query string.
    
    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.
    
    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    
    The components of a query arg may each be either a string or a bytes type.
    
    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).



In [51]:
# `socket` is part of the Python standard library, so `import socket` needs no installation.
# The PyPI distribution named `socket.py` is a separate third-party package and is not required here.

Note: you may need to restart the kernel to use updated packages.


In [52]:
help(prange)

Help on method prange in module Cython.Shadow:

prange(start=0, stop=None, step=1, nogil=False, schedule=None, chunksize=None, num_threads=None) method of Cython.Shadow.CythonDotParallel instance



In [53]:
data = pd.read_csv('https://raw.githubusercontent.com/chamanthmvs/Phishing-Website-Detection/master/raw_datasets/100-legitimate-art.txt')

In [54]:
data['websites'].str.split('://').head()
# Here the protocol is split off from the rest of each URL, but the parts are still held as a list in a single column; we need them in separate columns.


0         [http, www.emuck.com:3000/archive/egan.html]
1                     [http, danoday.com/summit.shtml]
2    [http, groups.yahoo.com/group/voice_actor_appr...
3                     [http, voice-international.com/]
4                    [http, www.livinglegendsltd.com/]
Name: websites, dtype: object

In [55]:
# Split only on the first '://' so that a URL embedding a second '://' keeps its
# whole remainder in column 1 instead of spilling into extra columns.
seperation_of_protocol = data['websites'].str.split('://', n=1, expand=True)

In [56]:
seperation_of_protocol.head()

Unnamed: 0,0,1
0,http,www.emuck.com:3000/archive/egan.html
1,http,danoday.com/summit.shtml
2,http,groups.yahoo.com/group/voice_actor_appreciatio...
3,http,voice-international.com/
4,http,www.livinglegendsltd.com/


In [57]:
seperation_of_protocol[1].str.split('/', n=1, expand=True)  # separation of the domain name
# split(separator, n=maximum number of splits, expand=return the parts as DataFrame columns)
# `n` is passed by keyword because recent pandas versions no longer accept it positionally.

Unnamed: 0,0,1
0,www.emuck.com:3000,archive/egan.html
1,danoday.com,summit.shtml
2,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...
3,voice-international.com,
4,www.livinglegendsltd.com,
...,...,...
96,www.post-gazette.com,magazine/19990223voicetalent1.asp
97,www.serkworks.com,roommates/index.htm
98,www.armory.com,~keeper/jesshirt.html
99,www.voicechasers.com,database/showactor.php?actorid=1220


In [58]:
# Split once on the first '/' to separate the domain name from the rest of the address.
# `n` is passed by keyword because recent pandas versions no longer accept it positionally.
seperation_of_domain_name = seperation_of_protocol[1].str.split('/', n=1, expand=True)

In [59]:
seperation_of_domain_name

Unnamed: 0,0,1
0,www.emuck.com:3000,archive/egan.html
1,danoday.com,summit.shtml
2,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...
3,voice-international.com,
4,www.livinglegendsltd.com,
...,...,...
96,www.post-gazette.com,magazine/19990223voicetalent1.asp
97,www.serkworks.com,roommates/index.htm
98,www.armory.com,~keeper/jesshirt.html
99,www.voicechasers.com,database/showactor.php?actorid=1220


In [60]:
seperation_of_protocol

Unnamed: 0,0,1
0,http,www.emuck.com:3000/archive/egan.html
1,http,danoday.com/summit.shtml
2,http,groups.yahoo.com/group/voice_actor_appreciatio...
3,http,voice-international.com/
4,http,www.livinglegendsltd.com/
...,...,...
96,http,www.post-gazette.com/magazine/19990223voicetal...
97,http,www.serkworks.com/roommates/index.htm
98,http,www.armory.com/~keeper/jesshirt.html
99,http,www.voicechasers.com/database/showactor.php?ac...


In [61]:
seperation_of_domain_name.columns = ['domain_name','address']

In [62]:
seperation_of_domain_name

Unnamed: 0,domain_name,address
0,www.emuck.com:3000,archive/egan.html
1,danoday.com,summit.shtml
2,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...
3,voice-international.com,
4,www.livinglegendsltd.com,
...,...,...
96,www.post-gazette.com,magazine/19990223voicetalent1.asp
97,www.serkworks.com,roommates/index.htm
98,www.armory.com,~keeper/jesshirt.html
99,www.voicechasers.com,database/showactor.php?actorid=1220


In [63]:
splitted_data = pd.concat([seperation_of_protocol[0],seperation_of_domain_name],axis = 1)

In [64]:
splitted_data.columns = ['protocol','domain_name','address']

In [65]:
splitted_data

Unnamed: 0,protocol,domain_name,address
0,http,www.emuck.com:3000,archive/egan.html
1,http,danoday.com,summit.shtml
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...
3,http,voice-international.com,
4,http,www.livinglegendsltd.com,
...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp
97,http,www.serkworks.com,roommates/index.htm
98,http,www.armory.com,~keeper/jesshirt.html
99,http,www.voicechasers.com,database/showactor.php?actorid=1220


# feature 1 long urls

In [66]:
# This function is defined in order to differentiate websites based on the length of the URL:
# 0 indicates legitimate
# 1 indicates phishing
# 2 indicates suspicious
def long_url(url):
    """Classify a URL by its character length.

    Returns 0 (legitimate) for fewer than 54 characters, 2 (suspicious) for
    54 to 75 characters inclusive, and 1 (phishing) for anything longer.
    """
    length = len(url)
    if length > 75:
        return 1
    return 2 if length >= 54 else 0
        

In [67]:
# applying the above defined function to divide the website into three different categories
splitted_data['long_url'] = data['websites'].apply(long_url)

In [68]:
splitted_data[splitted_data['long_url'] == 0]

Unnamed: 0,protocol,domain_name,address,long_url
0,http,www.emuck.com:3000,archive/egan.html,0
1,http,danoday.com,summit.shtml,0
3,http,voice-international.com,,0
4,http,www.livinglegendsltd.com,,0
5,http,voicechasers.com,forum/viewforum.php?f=8,0
...,...,...,...,...
92,http,citypaper.net,articles/022996/article001.shtml,0
95,http,www.kylehebert.com,,0
97,http,www.serkworks.com,roommates/index.htm,0
98,http,www.armory.com,~keeper/jesshirt.html,0


# feature 2 using @ symbol

In [69]:
# Using the '@' symbol in a URL leads the browser to ignore everything preceding the '@' symbol; the real address often follows it.
# This function is used to check whether the URL contains an '@' symbol or not.
# 1 phishing
# 0 not phishing
def have_at_symbol(url):
    """Return 1 when the URL contains an '@' character, otherwise 0."""
    return int('@' in url)

In [70]:
splitted_data['have_at_symbols'] = data['websites'].apply(have_at_symbol)

In [71]:
splitted_data

Unnamed: 0,protocol,domain_name,address,long_url,have_at_symbols
0,http,www.emuck.com:3000,archive/egan.html,0,0
1,http,danoday.com,summit.shtml,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0
3,http,voice-international.com,,0,0
4,http,www.livinglegendsltd.com,,0,0
...,...,...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp,2,0
97,http,www.serkworks.com,roommates/index.htm,0,0
98,http,www.armory.com,~keeper/jesshirt.html,0,0
99,http,www.voicechasers.com,database/showactor.php?actorid=1220,2,0


# feature 3 redirecting using '//'

In [72]:
# The existence of '//' in the URL path means the user will be redirected to another website.
# Examining where '//' appears: if the URL starts with 'HTTP', the '//' of the protocol
# appears at the sixth position.
# If the URL has the symbol '//' after the protocol, it is classified as phishing.
def redirection(url):
    """Return 1 when '//' occurs anywhere in the given text, otherwise 0.

    The check is a plain substring test, so the caller should pass the URL
    without its leading protocol (otherwise the '//' of '://' always matches).
    """
    has_double_slash = '//' in url
    return 1 if has_double_slash else 0

In [73]:
splitted_data['redirection_//_symbol'] = seperation_of_protocol[1].apply(redirection)


In [74]:
splitted_data

Unnamed: 0,protocol,domain_name,address,long_url,have_at_symbols,redirection_//_symbol
0,http,www.emuck.com:3000,archive/egan.html,0,0,0
1,http,danoday.com,summit.shtml,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0
3,http,voice-international.com,,0,0,0
4,http,www.livinglegendsltd.com,,0,0,0
...,...,...,...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp,2,0,0
97,http,www.serkworks.com,roommates/index.htm,0,0,0
98,http,www.armory.com,~keeper/jesshirt.html,0,0,0
99,http,www.voicechasers.com,database/showactor.php?actorid=1220,2,0,0


In [75]:
# Prefix or suffix separated by a dash (-) in the domain name.
# The dash symbol is rarely used in legitimate URLs. Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that
# users feel that they are dealing with a legitimate webpage.
def prefix_suffix_seperation(url):
    """Return 1 when the given text contains a dash ('-'), otherwise 0."""
    contains_dash = '-' in url
    return int(contains_dash)

In [76]:
splitted_data['prefix_suffix_seperation']=splitted_data['domain_name'].apply(prefix_suffix_seperation)

In [77]:
splitted_data

Unnamed: 0,protocol,domain_name,address,long_url,have_at_symbols,redirection_//_symbol,prefix_suffix_seperation
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0
3,http,voice-international.com,,0,0,0,1
4,http,www.livinglegendsltd.com,,0,0,0,0
...,...,...,...,...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp,2,0,0,1
97,http,www.serkworks.com,roommates/index.htm,0,0,0,0
98,http,www.armory.com,~keeper/jesshirt.html,0,0,0,0
99,http,www.voicechasers.com,database/showactor.php?actorid=1220,2,0,0,0


In [78]:
# A legitimate domain name normally contains at most two dots. If the number of dots is exactly three, the URL is classified as suspicious (2),
# since it carries an extra sub-domain; if there are more than three dots, it is classified as phishing (1).
def sub_domains(url):
    """Rate the given text by the number of dots it contains.

    Returns 0 for fewer than three dots, 2 for exactly three dots and 1 for
    more than three dots.
    """
    dots = url.count('.')
    if dots > 3:
        return 1
    if dots == 3:
        return 2
    return 0

In [79]:
splitted_data['sub_domains'] = splitted_data['domain_name'].apply(sub_domains)

In [80]:
splitted_data

Unnamed: 0,protocol,domain_name,address,long_url,have_at_symbols,redirection_//_symbol,prefix_suffix_seperation,sub_domains
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0
...,...,...,...,...,...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp,2,0,0,1,0
97,http,www.serkworks.com,roommates/index.htm,0,0,0,0,0
98,http,www.armory.com,~keeper/jesshirt.html,0,0,0,0,0
99,http,www.voicechasers.com,database/showactor.php?actorid=1220,2,0,0,0,0


In [81]:
# feature 6
# checking ip address in the url
import re 
def having_ip_address(url):
    """Return 1 when the URL contains an IP address, otherwise 0.

    Three forms are recognised anywhere in ``url``:

    * dotted-decimal IPv4 followed by '/', e.g. ``192.168.0.1/``
    * dotted-hexadecimal IPv4 followed by '/', e.g. ``0x58.0xCC.0xCA.0x62/``
    * full (uncompressed) eight-group IPv6

    The three alternatives are joined with '|'. Previously the separator
    between the hexadecimal and the IPv6 alternative was missing, which fused
    them into one pattern that neither form could match on its own.
    """
    ip_pattern = (
        r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
        r'([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\/)|'  # IPv4
        r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.'
        r'(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\/)|'  # IPv4 in hexadecimal
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}'  # IPv6
    )
    return 1 if re.search(ip_pattern, url) else 0

In [82]:
splitted_data['having_ip_address'] = data['websites'].apply(having_ip_address)

In [83]:
splitted_data

Unnamed: 0,protocol,domain_name,address,long_url,have_at_symbols,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp,2,0,0,1,0,0
97,http,www.serkworks.com,roommates/index.htm,0,0,0,0,0,0
98,http,www.armory.com,~keeper/jesshirt.html,0,0,0,0,0,0
99,http,www.voicechasers.com,database/showactor.php?actorid=1220,2,0,0,0,0,0


In [84]:
# Host names of known URL-shortening services (lower-case, de-duplicated).
_SHORTENER_HOSTS = frozenset({
    'bit.ly', 'goo.gl', 'shorte.st', 'go2l.ink', 'x.co', 'ow.ly', 't.co',
    'tinyurl.com', 'tr.im', 'is.gd', 'cli.gs', 'yfrog.com', 'migre.me',
    'ff.im', 'tiny.cc', 'url4.eu', 'twit.ac', 'su.pr', 'twurl.nl',
    'snipurl.com', 'short.to', 'budurl.com', 'ping.fm', 'post.ly', 'just.as',
    'bkite.com', 'snipr.com', 'fic.kr', 'loopt.us', 'doiop.com', 'short.ie',
    'kl.am', 'wp.me', 'rubyurl.com', 'om.ly', 'to.ly', 'bit.do', 'lnkd.in',
    'db.tt', 'qr.ae', 'adf.ly', 'bitly.com', 'cur.lv', 'ity.im', 'q.gs',
    'po.st', 'bc.vc', 'twitthis.com', 'u.to', 'j.mp', 'buzurl.com', 'cutt.us',
    'u.bb', 'yourls.org', 'prettylinkpro.com', 'scrnch.me', 'filoops.info',
    'vzturl.com', 'qr.net', '1url.com', 'tweez.me', 'v.gd', 'link.zip.net',
})


def shortning_service(url):
    """Return 1 when the URL is hosted on a known URL-shortening service, else 0.

    The host name is extracted from ``url`` and compared, case-insensitively,
    against the list of shortener hosts; a sub-domain of a listed host (for
    example ``www.bit.ly``) also counts. Matching on the host rather than on
    any substring of the URL avoids false positives such as ``microsoft.com``
    (which contains ``t.co``) or ``dropbox.com`` (which contains ``x.co``).

    A URL given without a protocol is still parsed. A URL that cannot be
    parsed at all yields 0.
    """
    # urlparse only fills in the host when it sees a '//' network-location marker.
    target = url if '://' in url else '//' + url
    try:
        host = (urlparse(target).hostname or '').lower()
    except ValueError:
        return 0
    labels = host.split('.')
    # Test the host itself and every parent domain against the known list.
    for start in range(len(labels)):
        if '.'.join(labels[start:]) in _SHORTENER_HOSTS:
            return 1
    return 0

In [85]:
splitted_data['shortning_service'] = data['websites'].apply(shortning_service)

In [86]:
splitted_data

Unnamed: 0,protocol,domain_name,address,long_url,have_at_symbols,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address,shortning_service
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0,0,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp,2,0,0,1,0,0,0
97,http,www.serkworks.com,roommates/index.htm,0,0,0,0,0,0,0
98,http,www.armory.com,~keeper/jesshirt.html,0,0,0,0,0,0,0
99,http,www.voicechasers.com,database/showactor.php?actorid=1220,2,0,0,0,0,0,0


In [87]:
def http_tokens(url):
    """Return 1 when 'http' appears in the URL beyond its leading protocol, else 0.

    A leading 'http://' or 'https://' is removed first, then the remainder is
    searched for the token 'http' (which also covers 'https'). A URL that has
    no protocol prefix is searched as it is; previously such input raised
    AttributeError because the missing regex match was dereferenced.
    """
    scheme = re.match(r'https?://', url)
    remainder = url[scheme.end():] if scheme else url
    return 1 if 'http' in remainder else 0

In [88]:
splitted_data['http_tokens'] = data['websites'].apply(http_tokens)

In [89]:
splitted_data

Unnamed: 0,protocol,domain_name,address,long_url,have_at_symbols,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address,shortning_service,http_tokens
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0,0,0
1,http,danoday.com,summit.shtml,0,0,0,0,0,0,0,0
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0,0,0
3,http,voice-international.com,,0,0,0,1,0,0,0,0
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp,2,0,0,1,0,0,0,0
97,http,www.serkworks.com,roommates/index.htm,0,0,0,0,0,0,0,0
98,http,www.armory.com,~keeper/jesshirt.html,0,0,0,0,0,0,0,0
99,http,www.voicechasers.com,database/showactor.php?actorid=1220,2,0,0,0,0,0,0,0


In [90]:
def abnormal_url_whois(domain):
    """Return 1 when the WHOIS lookup for ``domain`` fails, otherwise 0.

    Only the success or failure of ``whois.whois(domain)`` matters; the
    returned record is not inspected.
    """
    try:
        whois.whois(domain)
    except Exception:
        # Any lookup error counts as abnormal. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit stop a long `.apply` run.
        return 1
    return 0

In [91]:
splitted_data['abnormal_data_whois'] = splitted_data['domain_name'].apply(abnormal_url_whois)

In [97]:
import google
from googlesearch import search
def google_index(url):
    """Return 0 when a Google search for ``url`` yields a result, else 1.

    The result of ``search(...)`` is iterated instead of being tested for
    truthiness: if ``search`` returns a generator (the usual case for the
    googlesearch packages), the object itself is always truthy, which made
    every URL look indexed.

    NOTE(review): the meaning of the second positional argument (10) depends
    on which `googlesearch` distribution is installed - confirm that it is the
    number of results and not another parameter.
    """
    for _ in search(url, 10):
        # At least one hit: the URL is indexed.
        return 0
    return 1
    

In [102]:
splitted_data['google_index'] = data['websites'].apply(google_index)

In [122]:
from bs4 import BeautifulSoup
import urllib.request
def alexa_ranking(url):
    """Rate a URL by the rank reported by the Alexa data endpoint.

    Returns 0 when the reported rank is below 100000, 2 when it is 100000 or
    higher, and 1 when the response carries no usable REACH/RANK entry.

    The target URL is percent-encoded into the query string so that its own
    '?', '&' and '#' characters cannot alter the request, and the HTTP
    response is closed once it has been read.

    Raises whatever ``urllib.request.urlopen`` raises on network or HTTP
    errors; these are not translated into a rating.

    NOTE(review): the data.alexa.com service may no longer be available -
    confirm before relying on this feature.
    """
    query = urlencode({'cli': 10, 'dat': 's', 'url': url})
    with urllib.request.urlopen("http://data.alexa.com/data?" + query) as response:
        reach = BeautifulSoup(response.read(), 'xml').find("REACH")
    try:
        rank = reach['RANK']
    except (TypeError, KeyError):
        # TypeError: no REACH element (find returned None); KeyError: no RANK attribute.
        return 1
    rank = int(rank)
    return 0 if rank < 100000 else 2

In [123]:
splitted_data['alexa_ranking'] = data['websites'].apply(alexa_ranking)

In [124]:
splitted_data

Unnamed: 0,protocol,domain_name,address,long_url,have_at_symbols,redirection_//_symbol,prefix_suffix_seperation,sub_domains,having_ip_address,shortning_service,http_tokens,abnormal_data_whois,google_index,alexa_ranking
0,http,www.emuck.com:3000,archive/egan.html,0,0,0,0,0,0,0,0,0,0,1
1,http,danoday.com,summit.shtml,0,0,0,0,0,0,0,0,0,0,2
2,http,groups.yahoo.com,group/voice_actor_appreciation/links/events_an...,1,0,0,0,0,0,0,0,1,0,0
3,http,voice-international.com,,0,0,0,1,0,0,0,0,0,0,2
4,http,www.livinglegendsltd.com,,0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,http,www.post-gazette.com,magazine/19990223voicetalent1.asp,2,0,0,1,0,0,0,0,0,0,0
97,http,www.serkworks.com,roommates/index.htm,0,0,0,0,0,0,0,0,0,0,1
98,http,www.armory.com,~keeper/jesshirt.html,0,0,0,0,0,0,0,0,0,0,2
99,http,www.voicechasers.com,database/showactor.php?actorid=1220,2,0,0,0,0,0,0,0,0,0,1


In [181]:
import whois
from datetime import datetime
import time
def domain_reg_time(domain):
    """Rate a WHOIS record by how long the domain stays registered from today.

    ``domain`` is a WHOIS result exposing an ``expiration_date`` attribute.

    Returns:
        1 - no expiration date, or the domain expires within one year
            (already-expired domains included);
        2 - the expiration date is not a single datetime (for example a list
            of dates);
        0 - the domain expires more than one year from today.

    The remaining time is signed: previously the absolute value was used, so
    a domain that expired years ago was rated like a long registration.
    """
    expiration_date = domain.expiration_date
    if expiration_date is None:
        return 1
    if not isinstance(expiration_date, datetime):
        return 2
    # Midnight today, in the same timezone-awareness as the expiration date so
    # that naive and aware datetimes can both be subtracted.
    today = datetime.now(expiration_date.tzinfo).replace(
        hour=0, minute=0, second=0, microsecond=0)
    remaining_days = (expiration_date - today).days
    return 1 if remaining_days / 365 <= 1 else 0
def domain_reg_time_main(domain):
    """Look up ``domain`` via WHOIS and rate its registration length.

    Returns 1 when the WHOIS lookup fails; otherwise returns the rating that
    ``domain_reg_time`` gives for the WHOIS record.
    """
    try:
        whois_record = whois.whois(domain)
    except Exception:
        # `Exception` (not a bare `except`) keeps Ctrl-C able to stop the run.
        return 1
    return domain_reg_time(whois_record)

In [183]:
splitted_data['domain_reg_length'] = splitted_data['domain_name'].apply(domain_reg_time_main)

In [198]:
def domain_age(domain):
    """Rate a WHOIS record by the age of the domain.

    ``domain`` is a WHOIS result supporting ``domain['creation_date']`` and
    ``domain['expiration_date']``.

    Returns:
        1 - either date is missing, or the domain was created less than six
            months (counted as 30-day months) ago;
        2 - either date is not a single datetime (for example a list);
        0 - the domain was created six months ago or earlier.

    The age is measured from the creation date to now. Previously it was
    measured from creation to expiration, i.e. the length of the registration
    period rather than the age of the domain.
    """
    creation_date = domain['creation_date']
    expiration_date = domain['expiration_date']
    if expiration_date is None or creation_date is None:
        return 1
    if not isinstance(creation_date, datetime) or not isinstance(expiration_date, datetime):
        return 2
    # Use the creation date's tzinfo so naive and aware datetimes both subtract.
    age_in_days = (datetime.now(creation_date.tzinfo) - creation_date).days
    return 1 if age_in_days / 30 < 6 else 0
def domain_age_main(domain):
    """Look up ``domain`` via WHOIS and rate the age of the domain.

    Returns 1 when the WHOIS lookup fails; otherwise returns the rating that
    ``domain_age`` gives for the WHOIS record.
    """
    try:
        whois_record = whois.whois(domain)
    except Exception:
        # `Exception` (not a bare `except`) keeps Ctrl-C able to stop the run.
        return 1
    return domain_age(whois_record)

In [200]:
splitted_data['domain_age'] = splitted_data['domain_name'].apply(domain_age_main)

In [201]:
# `socket` is part of the Python standard library, so `import socket` needs no installation.
# The PyPI distribution named `socket.py` is a separate third-party package and is not required here.

Note: you may need to restart the kernel to use updated packages.


In [249]:
def abnormal_domain_name(domain):
    """Return 0 when ``domain`` belongs to the name registered in WHOIS, else 1.

    The WHOIS ``domain_name`` entry (a single name or a list of names) is
    compared with the host part of ``domain``. The host matches when it equals
    a registered name or is a sub-domain of it; the comparison ignores case
    and any ':port' suffix. A record without a registered name yields 1.

    Previously the registered name was used as a case-sensitive regular
    expression anchored at the start of the host, so 'www.example.com' never
    matched 'EXAMPLE.COM', and a missing or list-valued name raised TypeError.

    Raises whatever ``whois.whois`` raises when the lookup itself fails.
    """
    registered = whois.whois(domain)['domain_name']
    if registered is None:
        return 1
    if isinstance(registered, str):
        registered = [registered]
    # Drop an optional ':port' and normalise case before comparing.
    host = domain.split(':', 1)[0].lower().rstrip('.')
    for name in registered:
        name = name.lower()
        if host == name or host.endswith('.' + name):
            return 0
    return 1
def abnormal_domain_name_main(domain):
    """Rate ``domain`` with the abnormal-domain-name check.

    Returns the result of ``abnormal_domain_name(domain)``, or 1 when that
    check (which performs the WHOIS lookup itself) fails with an error.

    Previously this function returned ``domain_age(...)`` - a copy-paste slip
    that produced the domain-age rating under the abnormal-domain name.
    """
    try:
        return abnormal_domain_name(domain)
    except Exception:
        # A failed lookup or unusable WHOIS record is treated as abnormal.
        return 1

In [252]:
splitted_data['domain_name'].apply(abnormal_domain_name_main)

NameError: name 'abnormal_domain_main' is not defined