In [1]:
import json
import warnings
from urllib.parse import quote, urlparse

import pandas as pd
import requests

In [2]:
# URL components: FQDN (fully qualified domain name), mld (main level domain), FreeURL (subdomains, path and query), RDN (registered domain name)
# ref https://arxiv.org/pdf/1510.06501.pdf
def parse_url_components(url):
    """Split a URL into the components used for phishing feature extraction.

    Parameters
    ----------
    url : str
        Absolute URL (``scheme://host/...``). Without the ``//`` part
        ``urlparse`` reports no host and the domain fields come back empty.

    Returns
    -------
    dict
        ``'FQDN'``: host name, lower-cased, without userinfo or port.
        ``'RDN'``: last two labels of the FQDN joined by ``'.'``.
        ``'mld'``: second-to-last label of the FQDN.
        ``'FreeURL'``: the labels left of the RDN joined by ``'.'``, then
        ``','`` + path when a path is present, then the raw query string
        appended directly (no ``'?'`` separator).

    Notes
    -----
    The RDN is always taken as the last two labels, so multi-label public
    suffixes are not recognised: ``a.example.co.uk`` yields RDN ``co.uk``.
    """
    parse_result = urlparse(url)
    # .hostname (unlike .netloc) drops "user:pass@" and ":port", so a URL such
    # as http://paypal.com@evil.com:8080/ is attributed to evil.com.
    # It is None when the URL has no network location.
    FQDN = parse_result.hostname or ''
    FQDN_parts = FQDN.split('.') if FQDN else []

    if len(FQDN_parts) >= 2:
        mld = FQDN_parts[-2]
        RDN = '.'.join(FQDN_parts[-2:])
        FreeURL = '.'.join(FQDN_parts[:-2])
    else:
        # Single-label host ("localhost") or no host at all: there is no
        # second-to-last label, so fall back to the host itself instead of
        # raising IndexError.
        mld = FQDN
        RDN = FQDN
        FreeURL = ''

    # The ',' marks the boundary between the subdomain part and the path;
    # the format is kept as-is because consumers split FreeURL on ','.
    if parse_result.path:
        FreeURL += ',' + parse_result.path
    if parse_result.query:
        FreeURL += parse_result.query

    return {'FQDN': FQDN, 'RDN': RDN, 'mld': mld, 'FreeURL': FreeURL}

In [3]:
def get_domain_age_in_days(domain, timeout=10):
    """Look up the age of ``domain`` through the payapi.io domain-age endpoint.

    Parameters
    ----------
    domain : str
        Domain name to query; it is percent-encoded and appended to the
        endpoint path.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    The value of the ``'result'`` field of the JSON response (presumably the
    age in days as a number -- confirm against the API), or ``None`` when the
    response has no such field, is not a JSON object, or the request fails.
    A failed request additionally emits a warning.
    """
    # Percent-encode so characters such as '/', '?' or '#' in the input cannot
    # change which endpoint is requested.
    show = "https://input.payapi.io/v1/api/fraud/domain/age/" + quote(domain, safe='')
    try:
        # Without an explicit timeout, requests waits indefinitely on an
        # unresponsive server.
        data = requests.get(show, timeout=timeout).json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # Network failure or a body that is not JSON: surface it, but treat the
        # age as unknown so one bad lookup does not abort a whole batch.
        warnings.warn("domain age lookup failed for %r: %s" % (domain, exc))
        return None

    if isinstance(data, dict):
        return data.get('result')
    return None

#### Phishing URL and domain name obfuscation techniques tend to produce long URLs composed of many terms. [ref](https://arxiv.org/pdf/1510.06501.pdf)

In [36]:
def analyze_url(url, Majestic_million_list):
    """Extract lexical and reputation features from a single URL.

    Parameters
    ----------
    url : str
        URL to analyse.
    Majestic_million_list : pandas.DataFrame
        Ranking table with at least the columns ``'Domain'`` and
        ``'GlobalRank'``.

    Returns
    -------
    dict
        Keys, in insertion order:
        ``url``, ``age_in_days_feature`` (``None`` when the lookup yields
        nothing), ``FreeURL_dot_cnt``, ``level_domain_cnt``, ``url_length``,
        ``FQDN_length``, ``mld_length``, ``url_terms_cnt`` and ``RDNRank``
        (1000001 when the RDN is not in the ranking table).
    """
    url_features = {'url': url}

    url_components = parse_url_components(url)
    FQDN = url_components['FQDN']
    FreeURL = url_components['FreeURL']

    # First feature: a recently registered domain may have been bought for the
    # attack. The looked-up value is stored unchanged so that an age of 0 days
    # (the most suspicious case) is kept; only a missing lookup is None.
    url_features['age_in_days_feature'] = get_domain_age_in_days(url_components['RDN'])

    url_features['FreeURL_dot_cnt'] = FreeURL.count('.')

    # Number of dot-separated labels in the host name.
    level_domain_cnt = FQDN.count('.') + 1
    url_features['level_domain_cnt'] = level_domain_cnt

    url_features['url_length'] = len(url)

    # FQDN: fully qualified domain name
    url_features['FQDN_length'] = len(FQDN)

    # mld: main level domain
    url_features['mld_length'] = len(url_components['mld'])

    # Terms = host labels + path segments. Everything after the first ',' in
    # FreeURL is path/query; each '/' opens one segment, except a trailing '/'.
    url_terms_cnt = level_domain_cnt
    for part in FreeURL.split(',')[1:]:
        slash_cnt = part.count('/')
        # endswith() also copes with an empty segment (e.g. a path ending in
        # ','), where indexing part[-1] would raise IndexError.
        if part.endswith('/'):
            slash_cnt -= 1
        url_terms_cnt += slash_cnt
    url_features['url_terms_cnt'] = url_terms_cnt

    # Majestic ranking of the RDN, can also use Alexa ranking but the complete list cost
    RDNRank = 1000001  # default to 1000001 for those websites not in the Majestic million list
    RDN_row = Majestic_million_list[Majestic_million_list['Domain'] == url_components['RDN']]
    if len(RDN_row):
        RDNRank = RDN_row.iloc[0]['GlobalRank']
    url_features['RDNRank'] = RDNRank

    return url_features

In [5]:
# Only the rank and domain columns are used; usecols keeps pandas from parsing
# the remaining columns of the file. The trailing selection pins column order.
Majestic_million_list = pd.read_csv(
    "majestic_million.csv", usecols=['GlobalRank', 'Domain']
)[['GlobalRank', 'Domain']]

In [6]:
# Sanity check: preview the first rows of the rank/domain table.
Majestic_million_list.head()

Unnamed: 0,GlobalRank,Domain
0,1,facebook.com
1,2,google.com
2,3,youtube.com
3,4,twitter.com
4,5,instagram.com


In [37]:
# CAUTION: some of these URLs were live phishing sites (as of 2019-03-21); use with care!
# Further samples are available at https://www.phishtank.com/
example_urls = [
    "https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus",
    "http://cartaobndes.gov.br.cv31792.tmweb.ru/",
    "https://paypal.co.uk.yatn.eu/m/",
    "http://college-eisk.ru/cli/",
    "https://dotpay-platnosc3.eu/dotpay/",
]

# One feature dict per URL, in the same order as example_urls.
urls_features = [analyze_url(url, Majestic_million_list) for url in example_urls]

print(urls_features)

[{'url': 'https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus', 'age_in_days_feature': 5104, 'FreeURL_dot_cnt': 0, 'level_domain_cnt': 3, 'url_length': 76, 'FQDN_length': 18, 'mld_length': 10, 'url_terms_cnt': 5, 'RDNRank': 91}, {'url': 'http://cartaobndes.gov.br.cv31792.tmweb.ru/', 'age_in_days_feature': 5018, 'FreeURL_dot_cnt': 3, 'level_domain_cnt': 6, 'url_length': 43, 'FQDN_length': 35, 'mld_length': 5, 'url_terms_cnt': 6, 'RDNRank': 3910}, {'url': 'https://paypal.co.uk.yatn.eu/m/', 'age_in_days_feature': None, 'FreeURL_dot_cnt': 2, 'level_domain_cnt': 5, 'url_length': 31, 'FQDN_length': 20, 'mld_length': 4, 'url_terms_cnt': 6, 'RDNRank': 1000001}, {'url': 'http://college-eisk.ru/cli/', 'age_in_days_feature': 3086, 'FreeURL_dot_cnt': 0, 'level_domain_cnt': 2, 'url_length': 27, 'FQDN_length': 15, 'mld_length': 12, 'url_terms_cnt': 3, 'RDNRank': 1000001}, {'url': 'https://dotpay-platnosc3.eu/dotpay/', 'age_in_days_feature': None, 'FreeURL_dot_cnt': 0, 'level

In [38]:
# Build one row per URL from the list of feature dicts. Missing ages (None)
# become NaN, which is why age_in_days_feature shows up as a float column.
urls_features_df = pd.DataFrame(urls_features)

In [39]:
# Preview the feature table (one row per example URL).
urls_features_df.head()

Unnamed: 0,url,age_in_days_feature,FreeURL_dot_cnt,level_domain_cnt,url_length,FQDN_length,mld_length,url_terms_cnt,RDNRank
0,https://www.slideshare.net/weaveworks/client-s...,5104.0,0,3,76,18,10,5,91
1,http://cartaobndes.gov.br.cv31792.tmweb.ru/,5018.0,3,6,43,35,5,6,3910
2,https://paypal.co.uk.yatn.eu/m/,,2,5,31,20,4,6,1000001
3,http://college-eisk.ru/cli/,3086.0,0,2,27,15,12,3,1000001
4,https://dotpay-platnosc3.eu/dotpay/,,0,2,35,19,16,3,1000001


In [41]:
# Persist the feature table as CSV: column names included, row index omitted.
urls_features_df.to_csv('url_features.csv', header=True, index=False)