```!pip install tldextract```

In [20]:
import requests
import json
from urllib.parse import urlparse
import pandas as pd
import tldextract

In [2]:
def validate_ip(s):
    """Return True if ``s`` is a dotted-quad IPv4 address, otherwise False.

    A valid address has exactly four '.'-separated parts, each made only of
    ASCII digits and each with an integer value between 0 and 255.

    Args:
        s: the host string to test.

    Returns:
        bool: True for strings such as '127.0.0.1', False otherwise.
    """
    octets = s.split('.')
    if len(octets) != 4:
        return False
    for octet in octets:
        # str.isdigit() also accepts characters such as '²' that int() rejects
        # with ValueError, so only the ten ASCII digits are allowed here.
        if not octet or any(ch not in '0123456789' for ch in octet):
            return False
        # A digits-only string cannot be negative, so only the upper bound matters.
        if int(octet) > 255:
            return False
    return True

In [3]:
# Quick sanity checks; the notebook only echoes the value of the last expression.
validate_ip('www.slideshare.net')  # three dot-separated parts, not four
validate_ip('127.0.0.1')
validate_ip('222.999.0.1')  # 999 is above the 255 limit

False

A URL is made up of the following components, illustrated in this image: ![URL structure](url_structure.png)

- FQDN: Fully Qualified Domain Name
- mld: main level domain
- FreeURL
- RDN: Registered Domain Name

[ref](https://arxiv.org/pdf/1510.06501.pdf)

In [84]:
def parse_url_components(url):
    """Split ``url`` into the components used for feature extraction.

    Args:
        url: the URL string to decompose.

    Returns:
        dict with the keys:
          - 'protocol': the text before '://', or '' when the URL has no '://'
          - 'FQDN': sub-domain, domain and suffix joined with '.', empty parts skipped
          - 'RDN': ``registered_domain`` as reported by tldextract
          - 'mld': ``domain`` as reported by tldextract
          - 'FreeURL': '<subdomain>,<everything in url after the FQDN>'
    """
    ext_result = tldextract.extract(url)
    # Read the three name parts by attribute instead of iterating the result
    # object, so the FQDN does not depend on which fields the result exposes.
    name_parts = (ext_result.subdomain, ext_result.domain, ext_result.suffix)
    FQDN = '.'.join(part for part in name_parts if part)
    mld = ext_result.domain
    RDN = ext_result.registered_domain

    # Keep everything after the FIRST occurrence of the host. Splitting on every
    # occurrence would cut the remainder short when the host name appears again
    # in the path or query string.
    remainder = ''
    if FQDN:
        start = url.find(FQDN)
        if start == -1:
            # Fallback in case the extractor normalised the host's letter case.
            start = url.lower().find(FQDN.lower())
        if start != -1:
            remainder = url[start + len(FQDN):]
    FreeURL = ext_result.subdomain + ',' + remainder

    # Without a '://' separator there is no protocol to report.
    scheme, separator, _ = url.partition('://')
    protocol = scheme if separator else ''

    url_components = {'protocol': protocol, 'FQDN': FQDN, 'RDN': RDN, 'mld': mld, 'FreeURL': FreeURL}
    return url_components

In [85]:
# Decompose a sample URL that has no sub-domain and show the resulting dict.
test_url = "http://college-eisk.ru/cli/"
parse_url_components(test_url)

{'protocol': 'http',
 'FQDN': 'college-eisk.ru',
 'RDN': 'college-eisk.ru',
 'mld': 'college-eisk',
 'FreeURL': ',/cli/'}

In [5]:
def get_domain_age_in_days(domain, timeout=10):
    """Query the payapi.io domain-age endpoint for ``domain``.

    Args:
        domain: the domain name appended to the endpoint URL.
        timeout: seconds to wait for the service before giving up; without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The value stored under 'result' in the JSON response (presumably the
        age in days, per the function name -- confirm against the API), or
        None when the key is absent, the request fails, or the body is not a
        JSON object.
    """
    endpoint = "https://input.payapi.io/v1/api/fraud/domain/age/" + domain
    try:
        data = requests.get(endpoint, timeout=timeout).json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report "unknown" instead of
        # aborting the whole feature extraction.
        return None
    if not isinstance(data, dict):
        return None
    return data.get('result')

#### Phishing URL and domain name obfuscation techniques tend to produce long URLs composed of many terms. [ref](https://arxiv.org/pdf/1510.06501.pdf)

In [99]:
def analyze_url_features(url, Majestic_million_list):
    """Compute the lexical and ranking features of a single URL.

    Args:
        url: the URL string to analyse.
        Majestic_million_list: DataFrame with at least the columns 'Domain'
            and 'GlobalRank', used to look up the rank of the URL's RDN.

    Returns:
        dict with the keys 'url', 'url_length', 'domain_age', 'IP_based',
        'FreeURL_dot_cnt', 'level_domain_cnt', 'FQDN_length', 'mld_length',
        'url_terms_cnt' and 'RDNRank'. The domain-based entries stay None when
        the host is an IP address.
    """
    # RDNRank defaults to 1000001 for websites that are not in the Majestic million list
    url_features = {'url': url, 'url_length': len(url),
                    'domain_age': None, 'IP_based': False,
                    'FreeURL_dot_cnt': 0, 'level_domain_cnt': None,
                    'FQDN_length': None, 'mld_length': None,
                    'url_terms_cnt': 0, 'RDNRank': 1000001}

    url_components = parse_url_components(url)
    FQDN = url_components['FQDN']
    FreeURL = url_components['FreeURL']

    url_features['IP_based'] = validate_ip(FQDN)
    # Domain-based features are only computed when the host is a name, not an IP
    if not url_features['IP_based']:
        # Store the lookup result as-is: a truthiness test here would turn an
        # age of 0 days into None, i.e. "unknown".
        url_features['domain_age'] = get_domain_age_in_days(url_components['RDN'])

        url_features['level_domain_cnt'] = FQDN.count('.') + 1
        url_features['FQDN_length'] = len(FQDN)
        url_features['mld_length'] = len(url_components['mld'])

        # url_terms_cnt, part 1: '-'-separated terms inside every label of the FQDN
        for FQDN_part in FQDN.split('.'):
            url_features['url_terms_cnt'] += len(FQDN_part.split('-'))

    url_features['FreeURL_dot_cnt'] = FreeURL.count('.')

    # url_terms_cnt, part 2: terms in the rest of the URL. FreeURL is
    # '<subdomain>,<rest>'; the sub-domain is skipped with [1:] because the
    # FQDN loop above already counted it.
    for parts in FreeURL.split(',')[1:]:
        for part in parts.split('/'):
            if part:
                url_features['url_terms_cnt'] += len(part.split('-'))

    RDN_row = Majestic_million_list[Majestic_million_list['Domain'] == url_components['RDN']]
    if len(RDN_row):
        url_features['RDNRank'] = RDN_row.iloc[0]['GlobalRank']

    return url_features

In [42]:
# Load the Majestic Million ranking, keeping only the two columns used for the rank lookup.
Majestic_million_list = pd.read_csv("majestic_million.csv")[['GlobalRank', 'Domain']]

In [43]:
# Preview the first rows of the ranking table.
Majestic_million_list.head()

Unnamed: 0,GlobalRank,Domain
0,1,facebook.com
1,2,google.com
2,3,youtube.com
3,4,twitter.com
4,5,instagram.com


In [103]:
# Extract the features of one sample URL and display the resulting dict.
test_url = "https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus"
analyze_url_features(test_url, Majestic_million_list)

{'url': 'https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus',
 'url_length': 76,
 'domain_age': 5106,
 'IP_based': False,
 'FreeURL_dot_cnt': 0,
 'level_domain_cnt': 3,
 'FQDN_length': 18,
 'mld_length': 10,
 'url_terms_cnt': 9,
 'RDNRank': 91}

In [100]:
# Caution: some of these URLs were live phishing sites as of 2019-03-21.
# More examples can be found at https://www.phishtank.com/
example_urls = [
    "https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus",
    "http://cartaobndes.gov.br.cv31792.tmweb.ru/",
    "https://paypal.co.uk.yatn.eu/m/",
    "http://college-eisk.ru/cli/",
    "https://dotpay-platnosc3.eu/dotpay/",
    "https://www.amazon.co.uk/ap/signin?encoding=UTF8",
    "http://192.168.0.1/paypal.cgi?fixaccount",
]

# One feature dict per URL, in the same order as example_urls.
urls_features = [analyze_url_features(url, Majestic_million_list) for url in example_urls]

In [101]:
# Build a table with one row per URL from the list of feature dicts.
urls_features_df = pd.DataFrame(urls_features)

In [102]:
# Show up to seven rows of the feature table.
urls_features_df.head(7)

Unnamed: 0,url,url_length,domain_age,IP_based,FreeURL_dot_cnt,level_domain_cnt,FQDN_length,mld_length,url_terms_cnt,RDNRank
0,https://www.slideshare.net/weaveworks/client-s...,76,5106.0,False,0,3.0,18.0,10.0,9,91
1,http://cartaobndes.gov.br.cv31792.tmweb.ru/,43,5020.0,False,3,6.0,35.0,5.0,6,3910
2,https://paypal.co.uk.yatn.eu/m/,31,,False,2,5.0,20.0,4.0,6,1000001
3,http://college-eisk.ru/cli/,27,3088.0,False,0,2.0,15.0,12.0,4,1000001
4,https://dotpay-platnosc3.eu/dotpay/,35,,False,0,2.0,19.0,16.0,4,1000001
5,https://www.amazon.co.uk/ap/signin?encoding=UTF8,48,8640.0,False,0,4.0,16.0,6.0,6,194
6,http://192.168.0.1/paypal.cgi?fixaccount,40,,True,1,,,,1,1000001


In [12]:
# Write the extracted features to a CSV file, with a header row and without the index column.
urls_features_df.to_csv ('url_features.csv', index = False, header=True)

The original majestic_million file is very large and contains many columns that this project does not need,
so here we keep only the necessary columns:


```Majestic_million_list = pd.read_csv("majestic_million.csv")[['GlobalRank', 'Domain']]
Majestic_million_list.to_csv ('majestic_million.csv', index = True, header=True)```

## Compute feature sets by comparing the characteristics of the input page with those of the homepage. Ref: [DeltaPhish](https://arxiv.org/pdf/1707.00317.pdf)

Install necessary libraries
```
!pip install lxml
!pip install requests
```

In [14]:
from lxml import html

In [None]:
def get_website_hyperlinks(website_url, timeout=10):
    """Fetch ``website_url`` and return the ``href`` of every anchor that has one.

    Args:
        website_url: address of the page to download.
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` can block indefinitely.

    Returns:
        list of str: href values in document order, duplicates included.
    """
    page = requests.get(website_url, timeout=timeout)
    # page.encoding is None when the response headers give no usable charset;
    # decoding with None raises TypeError, so fall back to the encoding that
    # requests detects from the body.
    encoding = page.encoding or page.apparent_encoding or 'utf-8'
    page_content = page.content.decode(encoding, errors='replace')
    tree = html.fromstring(page_content)
    # '//a/@href' only yields anchors that carry an href; indexing
    # attrib['href'] on an <a> without one raises KeyError.
    return [str(href) for href in tree.xpath('//a/@href')]

In [None]:
# Collect the hyperlinks of the homepage, used as the reference for the comparisons below.
homepage_url = 'https://www.google.com/'
# Pass homepage_url here: no variable named website_url exists at notebook
# level (it is only a parameter name inside the helper functions).
homepage_hyperlinks = get_website_hyperlinks(homepage_url)

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Duplicates are ignored because both inputs are converted to sets.

    Args:
        list1: first iterable of hashable items.
        list2: second iterable of hashable items.

    Returns:
        float between 0.0 and 1.0. Two empty inputs are treated as identical
        and give 1.0, the usual convention, instead of dividing by zero.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return 1.0
    return len(s1 & s2) / len(union)

In [None]:
# Collect the hyperlinks of the page under test, to compare against the homepage.
inputpage_url = 'https://calendar.google.com/calendar/r?tab=wc'
inputpage_hyperlinks = get_website_hyperlinks(inputpage_url)

In [None]:
# Similarity of the two pages' hyperlink sets.
jaccard_similarity(homepage_hyperlinks, inputpage_hyperlinks)

In [None]:
# Inspect the raw hyperlinks collected from the homepage.
print(homepage_hyperlinks)

In [None]:
# Inspect the raw hyperlinks collected from the input page.
print(inputpage_hyperlinks)

In [None]:
# Number of distinct hyperlinks that appear on both pages.
len(set(homepage_hyperlinks).intersection(inputpage_hyperlinks))

#### About relative url and absolute url [here](http://www.dirigodev.com/blog/seo-web-best-practices/relative-vs-absolute-urls-seo/)


In [None]:
def get_second_level_domain(url):
    """Return the last two dot-separated labels of the host in ``url``.

    Args:
        url: absolute or relative URL.

    Returns:
        str: e.g. 'google.com' for 'https://calendar.google.com/x', or '' when
        the URL has no host (a relative link).

    Note:
        This simply keeps the last two labels, so a host under a multi-label
        suffix such as 'www.amazon.co.uk' yields 'co.uk'.
    """
    # .hostname drops the 'user:pass@' prefix and ':port' suffix that .netloc
    # keeps, and is None when the URL has no network location.
    host = urlparse(url).hostname or ''
    return '.'.join(host.split('.')[-2:])

In [None]:
# 2LD (second-level domain) of every hyperlink on the input page.
# A link that yields no domain of its own is attributed to the page's own 2LD.
inputpage_2LDs = [
    get_second_level_domain(url) or get_second_level_domain(inputpage_url)
    for url in inputpage_hyperlinks
]

In [None]:
# 2LD of every hyperlink on the homepage; a relative URL yields no domain
# of its own, so it is attributed to the homepage's own 2LD.
homepage_2LDs = [
    get_second_level_domain(url) or get_second_level_domain(homepage_url)
    for url in homepage_hyperlinks
]

In [None]:
# Distinct second-level domains linked from the homepage.
print(set(homepage_2LDs))

In [None]:
# Full list (one entry per hyperlink, duplicates kept) of homepage second-level domains.
print(homepage_2LDs)

In [None]:
# Distinct second-level domains linked from the input page.
print(set(inputpage_2LDs))

In [None]:
# Similarity of the two pages' second-level-domain sets.
jaccard_similarity(homepage_2LDs, inputpage_2LDs)

### Style tags

In [None]:
def get_website_style_tags(website_url, timeout=10):
    """Fetch ``website_url`` and return the value of every inline ``style`` attribute.

    Args:
        website_url: address of the page to download.
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` can block indefinitely.

    Returns:
        list of str: the style attribute values in document order.
    """
    page = requests.get(website_url, timeout=timeout)
    # page.encoding is None when the response headers give no usable charset;
    # decoding with None raises TypeError, so fall back to the encoding that
    # requests detects from the body.
    encoding = page.encoding or page.apparent_encoding or 'utf-8'
    page_content = page.content.decode(encoding, errors='replace')
    tree = html.fromstring(page_content)
    # '//@style' selects the style attribute of any element that has one.
    return list(tree.xpath('//@style'))

In [None]:
# Inline style attributes of the homepage (homepage_url is re-assigned to the same value as before).
homepage_url = 'https://www.google.com/'
homepage_styles = get_website_style_tags(homepage_url)

In [None]:
# Inline style attributes of the input page (inputpage_url is re-assigned to the same value as before).
inputpage_url = 'https://calendar.google.com/calendar/r?tab=wc'
inputpage_styles = get_website_style_tags(inputpage_url)

In [None]:
# Inspect the inline styles found on the input page.
print(inputpage_styles)

In [None]:
# Inspect the inline styles found on the homepage.
print(homepage_styles)

In [None]:
# Similarity of the two pages' inline-style sets.
jaccard_similarity(homepage_styles, inputpage_styles)

### External style sheets

In [None]:
# SSURL: external style sheets
def get_website_SSURL(website_url, timeout=10):
    """Fetch ``website_url`` and return the URLs of its external style sheets.

    A ``<link>`` element counts as a style sheet when it has
    ``rel="stylesheet"`` or ``type="text/css"``.

    Args:
        website_url: address of the page to download.
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` can block indefinitely.

    Returns:
        list of str: the href of each matching ``<link>``, in document order.
    """
    page = requests.get(website_url, timeout=timeout)
    # page.encoding is None when the response headers give no usable charset;
    # decoding with None raises TypeError, so fall back to the encoding that
    # requests detects from the body.
    encoding = page.encoding or page.apparent_encoding or 'utf-8'
    page_content = page.content.decode(encoding, errors='replace')
    tree = html.fromstring(page_content)
    SS_URLs = []
    for linktag in tree.xpath('//link[@rel="stylesheet" or @type="text/css"]'):
        # .get() returns None for a <link> without href, where attrib['href']
        # would raise KeyError; such links are skipped.
        href = linktag.get('href')
        if href is not None:
            SS_URLs.append(href)
    return SS_URLs

In [None]:
# NOTE: despite the variable name, this fetches a w3schools page rather than homepage_url.
homepage_SSURLs = get_website_SSURL('https://www.w3schools.com/tags/tag_link.asp')

In [None]:
# Inspect the style-sheet URLs that were found.
print(homepage_SSURLs)

### Image URL

In [None]:
# linked image url
def get_website_iURLs(website_url, timeout=10):
    """Fetch ``website_url`` and return the ``src`` of every image that has one.

    Args:
        website_url: address of the page to download.
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` can block indefinitely.

    Returns:
        list of str: the src values in document order, duplicates included.
    """
    page = requests.get(website_url, timeout=timeout)
    # page.encoding is None when the response headers give no usable charset;
    # decoding with None raises TypeError, so fall back to the encoding that
    # requests detects from the body.
    encoding = page.encoding or page.apparent_encoding or 'utf-8'
    page_content = page.content.decode(encoding, errors='replace')
    tree = html.fromstring(page_content)
    iURLs = []
    for imgtag in tree.xpath('//img'):
        # .get() returns None for an <img> without src, where attrib['src']
        # would raise KeyError; such images are skipped.
        src = imgtag.get('src')
        if src is not None:
            iURLs.append(src)
    return iURLs

In [None]:
# Image URLs referenced by the homepage.
hompage_iurls = get_website_iURLs(homepage_url)

In [None]:
# Inspect the image URLs found on the homepage.
print(hompage_iurls)

In [None]:
# Image URLs referenced by the input page.
inputpage_iurls = get_website_iURLs(inputpage_url)

In [None]:
# Inspect the image URLs found on the input page.
print(inputpage_iurls)

In [None]:
# Similarity of the two pages' image-URL sets.
jaccard_similarity(hompage_iurls, inputpage_iurls)

### Title

In [None]:
def get_website_title(website_url, timeout=10):
    """Fetch ``website_url`` and return the text of its first ``<title>`` element.

    Args:
        website_url: address of the page to download.
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` can block indefinitely.

    Returns:
        str: the title text, or '' when the page has no ``<title>`` element.
    """
    page = requests.get(website_url, timeout=timeout)
    # page.encoding is None when the response headers give no usable charset;
    # decoding with None raises TypeError, so fall back to the encoding that
    # requests detects from the body.
    encoding = page.encoding or page.apparent_encoding or 'utf-8'
    page_content = page.content.decode(encoding, errors='replace')
    tree = html.fromstring(page_content)
    titles = tree.xpath('//title')
    # Indexing [0] on an empty result raises IndexError for title-less pages.
    return titles[0].text_content() if titles else ''

In [None]:
# Title of the homepage.
get_website_title(homepage_url)

In [None]:
# Title of the input page.
get_website_title(inputpage_url)

### X-links

In [None]:
# X-link feature: True when the input page links back to the homepage URL (exact string match).
x_link = homepage_url in inputpage_hyperlinks

In [None]:
# Show whether the input page links back to the homepage.
print(x_link)