In [39]:
import requests
import json
from urllib.parse import urlparse
import pandas as pd
import tldextract
import os
import xml.etree.ElementTree as ET
from pysafebrowsing import SafeBrowsing
import whois
from lxml import html
import html2text
import re
import time
import datetime

In [2]:
def validate_ip(s):
    """Return True when ``s`` is a dotted-quad IPv4 address such as ``'127.0.0.1'``.

    Parameters
    ----------
    s : str
        Candidate host string (no scheme, port or path).

    Returns
    -------
    bool
        True if ``s`` consists of exactly four dot-separated decimal numbers,
        each in the range 0-255; False otherwise.
    """
    octets = s.split('.')
    if len(octets) != 4:
        return False
    for octet in octets:
        # str.isdigit() also accepts characters such as '²' that int() cannot
        # parse, so only plain ASCII digits are allowed here.
        if not octet or any(ch not in '0123456789' for ch in octet):
            return False
        if int(octet) > 255:
            return False
    return True

In [3]:
# Quick manual checks: a hostname, a valid address and an out-of-range octet.
# Only the value of the last expression is displayed as the cell output.
validate_ip('www.slideshare.net')
validate_ip('127.0.0.1')
validate_ip('222.999.0.1')

False

A URL is made up of the following components, illustrated in this image: ![URL structure](url_structure.png)

- FQDN: Fully Qualified Domain Name
- mld: main level domain
- FreeURL
- RDN: Registered Domain Name

[ref](https://arxiv.org/pdf/1510.06501.pdf)

In [4]:
def parse_url_components(url):
    """Split ``url`` into the components used by the feature extractors.

    Parameters
    ----------
    url : str
        Absolute URL, e.g. ``"http://college-eisk.ru/cli/"``.

    Returns
    -------
    dict
        ``protocol`` - text before the first ``"://"``;
        ``FQDN``     - subdomain, domain and suffix joined with dots;
        ``RDN``      - ``registered_domain`` reported by tldextract;
        ``mld``      - ``domain`` reported by tldextract;
        ``FreeURL``  - ``"<subdomain>,<everything after the host>"``.
    """
    ext_result = tldextract.extract(url)
    # Name the three host fields explicitly instead of iterating over the
    # result object, so this does not depend on the result being iterable.
    host_parts = (ext_result.subdomain, ext_result.domain, ext_result.suffix)
    FQDN = '.'.join(part for part in host_parts if part)
    mld = ext_result.domain
    RDN = ext_result.registered_domain

    # Keep everything after the FIRST occurrence of the host. A plain
    # url.split(FQDN)[1] loses the tail whenever the host text shows up again
    # later in the URL (e.g. in a redirect parameter) and raises when the host
    # is empty or differs from the URL in letter case.
    host_match = re.search(re.escape(FQDN), url, re.IGNORECASE) if FQDN else None
    remainder = url[host_match.end():] if host_match else ''

    FreeURL = ext_result.subdomain + ',' + remainder
    protocol = url.split("://")[0]

    url_components = {'protocol': protocol,'FQDN': FQDN, 'RDN': RDN, 'mld': mld, 'FreeURL': FreeURL}
    return url_components

In [5]:
# Example without a subdomain: the part of FreeURL before the comma is empty.
test_url = "http://college-eisk.ru/cli/"
parse_url_components(test_url)

{'protocol': 'http',
 'FQDN': 'college-eisk.ru',
 'RDN': 'college-eisk.ru',
 'mld': 'college-eisk',
 'FreeURL': ',/cli/'}

In [6]:
def get_domain_age_in_days(domain):
    """Look up the age of ``domain`` through the payapi.io domain-age endpoint.

    Parameters
    ----------
    domain : str
        Registered domain name, appended verbatim to the endpoint URL.

    Returns
    -------
    The value stored under ``'result'`` in the JSON response, or None when the
    response has no such key (or is not a JSON object).

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, including when the timeout expires.
    ValueError
        When the response body is not valid JSON.
    """
    show = "https://input.payapi.io/v1/api/fraud/domain/age/" + domain
    # requests waits forever by default; bound the wait so that one
    # unresponsive lookup cannot stall a whole batch of URLs.
    data = requests.get(show, timeout=10).json()
    if isinstance(data, dict):
        return data.get('result')
    return None

### PhishTank and Google Safebrowsing blacklist

Note: the Safe Browsing API must be activated in Google Cloud before it can be used. The Google API key is exported as an environment variable in the console, which is more secure than writing it in this notebook.

In [7]:
# NOTE: this API does not seem to work reliably -- it has flagged Twitter and Google as phishing.
def is_phishtank_blacklisted(url):
    """Ask the PhishTank checkurl endpoint whether ``url`` is in its database.

    Parameters
    ----------
    url : str
        URL to look up.

    Returns
    -------
    bool
        True when the ``in_database`` field of the XML answer is ``'true'``.

    Raises
    ------
    ValueError
        When the answer does not contain ``results/url0/in_database``.
    xml.etree.ElementTree.ParseError
        When the answer is not well-formed XML.
    """
    # Pass the URL through ``params`` so that requests percent-encodes it;
    # gluing it onto the query string corrupts URLs containing '&', '#' or '?'.
    response = requests.post('https://checkurl.phishtank.com/checkurl/index.php',
                             params={'url': url})
    # Parse the raw bytes: the XML parser honours the document's own encoding
    # declaration, and response.encoding may be None.
    root = ET.fromstring(response.content)

    in_database = root.findtext('results/url0/in_database')
    if in_database is None:
        raise ValueError('Unexpected PhishTank response: missing results/url0/in_database')
    return in_database == 'true'

In [8]:
# Look up a URL that imitates a PayPal address; this sends a live request to PhishTank.
test_url = "https://paypal.co.uk.yatn.eu/m/"
is_phishtank_blacklisted(test_url)

True

In [9]:
def is_malicious_in_google_safebrowsing(url):
    """Return the ``'malicious'`` verdict that Google Safe Browsing gives for ``url``.

    The API key is read from the ``GOOGLEAPIKEY`` environment variable; a
    KeyError is raised when the variable is not set.
    """
    client = SafeBrowsing(os.environ['GOOGLEAPIKEY'])
    verdicts = client.lookup_urls([url])
    return verdicts[url]['malicious']

In [10]:
# Check a Google-hosted malware test URL; requires GOOGLEAPIKEY in the environment.
test_url = 'http://malware.testing.google.test/testing/malware/'
is_malicious_in_google_safebrowsing(test_url)

True

In [11]:
# Note: the VirusTotal API limits how many requests may be sent in a given time, so when checking
# multiple urls put a time.sleep() between calls to this function.
def is_malicious_in_virustotal(url):
    """Return True when the VirusTotal URL report lists at least one positive detection.

    Parameters
    ----------
    url : str
        URL whose report is requested.

    Returns
    -------
    bool
        True when the report's ``'positives'`` count is non-zero. False when it
        is zero or when the report carries no ``'positives'`` field at all.

    Raises
    ------
    KeyError
        When the ``VIRUSTOTALKEY`` environment variable is not set.
    ValueError
        When the response body is not valid JSON.
    """
    virustotal_apikey = os.environ['VIRUSTOTALKEY']
    params = {'apikey': virustotal_apikey, 'resource': url}
    response = requests.post('https://www.virustotal.com/vtapi/v2/url/report', data=params)
    report = response.json()
    # A report without 'positives' used to raise KeyError here; treat the
    # missing count as "no detections" instead.
    return bool(report.get('positives'))

In [12]:
# Check a SlideShare URL; requires VIRUSTOTALKEY in the environment.
test_url = "https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus"
is_malicious_in_virustotal(test_url)

False

In [None]:
example_urls = ["http://twitter.com/",
                "http://github.com",
                "http://www.yahoo.com/",
                "http://www.google.com/",
                "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
               ]
for url in example_urls:
    # Query each service once per URL and reuse the answers: calling both
    # functions a second time for the combined verdict doubles the number of
    # requests counted against the rate-limited VirusTotal API.
    virustotal_hit = is_malicious_in_virustotal(url)
    print(virustotal_hit)
    safebrowsing_hit = is_malicious_in_google_safebrowsing(url)
    print(safebrowsing_hit)
    print(virustotal_hit or safebrowsing_hit)
    # Pause between URLs to stay within the VirusTotal request limit.
    time.sleep(40)

### WHOIS features

- Name of the domain provider
- Ownership period

In [65]:
def _first_datetime(value):
    """Return ``value`` (or its first element when it is a list) if that is a datetime, else None."""
    if isinstance(value, list):
        value = value[0] if value else None
    return value if isinstance(value, datetime.datetime) else None


def get_domain_related_features(domain, url_features):
    """Add WHOIS-derived features for ``domain`` to ``url_features`` in place.

    Parameters
    ----------
    domain : str
        Domain passed to ``whois.whois``.
    url_features : dict
        Feature dictionary to update. ``'domain_provider'`` is set to the
        registrar (or None). ``'domain_valid_period'`` is set to the number of
        days between creation and expiration when both dates are available;
        otherwise that key is left untouched.

    Returns
    -------
    None
    """
    domain_info = whois.whois(domain)
    if not domain_info:
        return

    url_features['domain_provider'] = domain_info.registrar if domain_info.registrar else None

    # Each WHOIS date may be a single datetime or a list of them. Normalise the
    # two fields independently so that a list paired with a single datetime
    # still yields a validity period.
    expiration = _first_datetime(domain_info.expiration_date)
    creation = _first_datetime(domain_info.creation_date)
    if expiration is not None and creation is not None:
        url_features['domain_valid_period'] = (expiration - creation).days

#### Phishing URL and domain name obfuscation techniques tend to produce long URLs composed of many terms. [ref](https://arxiv.org/pdf/1510.06501.pdf)

In [66]:
def analyze_url_features(url, Majestic_million_list):
    """Collect the features that can be derived from the URL string and external lookups.

    Parameters
    ----------
    url : str
        URL to analyse.
    Majestic_million_list : pandas.DataFrame
        Ranking table with at least the ``'Domain'`` and ``'GlobalRank'`` columns.

    Returns
    -------
    dict
        Feature dictionary. Besides the features it also carries ``'mld'``,
        which is not meant as a feature: it is passed along so that later
        functions do not have to parse the URL again.
    """
    # RDNRank defaults to 1000001 for websites missing from the Majestic million list.
    url_features = {
        'url': url,
        'url_length': len(url),
        'is_blacklisted': False,
        'is_IPbased': False,
        'domain_age': None,
        'FreeURL_dot_cnt': 0,
        'level_domain_cnt': None,
        'FQDN_length': None,
        'mld_length': None,
        'url_terms_cnt': 0,
        'RDNRank': 1000001,
        'domain_provider': None,
        'domain_valid_period': None,
    }

    # The PhishTank API was tried as well but did not appear to work correctly.
    url_features['is_blacklisted'] = is_malicious_in_virustotal(url) or is_malicious_in_google_safebrowsing(url)

    url_components = parse_url_components(url)
    fqdn = url_components['FQDN']
    rdn = url_components['RDN']
    free_url = url_components['FreeURL']

    url_features['is_IPbased'] = validate_ip(fqdn)

    # Domain-based features only make sense when the host is not a bare IP address.
    if not url_features['is_IPbased']:
        url_features['domain_age'] = get_domain_age_in_days(rdn) or None

        url_features['level_domain_cnt'] = fqdn.count('.') + 1
        url_features['FQDN_length'] = len(fqdn)
        url_features['mld_length'] = len(url_components['mld'])

        # Fills in the WHOIS-derived entries of url_features in place.
        get_domain_related_features(rdn, url_features)

        # Terms of the host: every dot-separated label is split further on '-'.
        url_features['url_terms_cnt'] += sum(len(label.split('-')) for label in fqdn.split('.'))

    url_features['FreeURL_dot_cnt'] = free_url.count('.')

    # Terms of the rest of the URL: everything after the first comma of FreeURL,
    # split on '/' (empty segments ignored) and then on '-'.
    url_features['url_terms_cnt'] += sum(
        len(segment.split('-'))
        for chunk in free_url.split(',')[1:]
        for segment in chunk.split('/')
        if segment
    )

    rank_rows = Majestic_million_list[Majestic_million_list['Domain'] == rdn]
    if len(rank_rows) > 0:
        url_features['RDNRank'] = rank_rows.iloc[0]['GlobalRank']

    # Not a feature: carried along so later steps can reuse the parsed mld.
    url_features['mld'] = url_components['mld']

    return url_features

In [15]:
# Load only the two columns used for the rank lookup in analyze_url_features.
Majestic_million_list = pd.read_csv("majestic_million.csv")[['GlobalRank', 'Domain']]

In [16]:
# Preview the first rows of the ranking table.
Majestic_million_list.head()

Unnamed: 0,GlobalRank,Domain
0,1,facebook.com
1,2,google.com
2,3,youtube.com
3,4,twitter.com
4,5,instagram.com


In [67]:
# Single-URL run of the feature extractor; this performs live API and WHOIS lookups.
test_url = "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
analyze_url_features(test_url, Majestic_million_list)

{'url': 'https://www.amazon.co.uk/ap/signin?encoding=UTF8',
 'url_length': 48,
 'is_blacklisted': False,
 'is_IPbased': False,
 'domain_age': 8642,
 'FreeURL_dot_cnt': 0,
 'level_domain_cnt': 4,
 'FQDN_length': 16,
 'mld_length': 6,
 'url_terms_cnt': 6,
 'RDNRank': 194,
 'domain_provider': 'Amazon.com, Inc. t/a Amazon.com, Inc. [Tag = AMAZON-COM]',
 'domain_valid_period': None,
 'mld': 'amazon'}

In [68]:
# Note: some of these urls are live phishing sites (as of 2019-03-21); use with caution!
# More can be found at https://www.phishtank.com/
example_urls = ["https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus",
                "http://cartaobndes.gov.br.cv31792.tmweb.ru/",
                "https://paypal.co.uk.yatn.eu/m/",
                "http://college-eisk.ru/cli/",
                "https://dotpay-platnosc3.eu/dotpay/",
                "https://www.amazon.co.uk/ap/signin?encoding=UTF8",
                "http://192.168.0.1/paypal.cgi?fixaccount"
               ]

# Collect one feature dict per URL; the list is turned into a DataFrame in a later cell.
urls_features = []
for url in example_urls:
    urls_features.append(analyze_url_features(url, Majestic_million_list))
    # Pause to stay within the limit VirusTotal sets on API calls in a given time
    time.sleep(30)
#print(urls_features)

In [69]:
# One row per analysed URL, one column per feature key.
urls_features_df = pd.DataFrame(urls_features)

In [70]:
# Show all seven example URLs.
urls_features_df.head(7)

Unnamed: 0,url,url_length,is_blacklisted,is_IPbased,domain_age,FreeURL_dot_cnt,level_domain_cnt,FQDN_length,mld_length,url_terms_cnt,RDNRank,domain_provider,domain_valid_period,mld
0,https://www.slideshare.net/weaveworks/client-s...,76,False,False,5108.0,0,3.0,18.0,10.0,9,91,"MarkMonitor, Inc.",6209.0,slideshare
1,http://cartaobndes.gov.br.cv31792.tmweb.ru/,43,True,False,5023.0,3,6.0,35.0,5.0,6,3910,TIMEWEB-RU,5114.0,tmweb
2,https://paypal.co.uk.yatn.eu/m/,31,True,False,,2,5.0,20.0,4.0,6,1000001,,,yatn
3,http://college-eisk.ru/cli/,27,True,False,3091.0,0,2.0,15.0,12.0,4,1000001,R01-RU,3288.0,college-eisk
4,https://dotpay-platnosc3.eu/dotpay/,35,False,False,,0,2.0,19.0,16.0,4,1000001,,,dotpay-platnosc3
5,https://www.amazon.co.uk/ap/signin?encoding=UTF8,48,False,False,8642.0,0,4.0,16.0,6.0,6,194,"Amazon.com, Inc. t/a Amazon.com, Inc. [Tag = A...",,amazon
6,http://192.168.0.1/paypal.cgi?fixaccount,40,False,True,,1,,,,1,1000001,,,192.168.0.1


In [71]:
# Write the extracted features to a csv file (header row included, row index omitted)
urls_features_df.to_csv ('url_features.csv', index = False, header=True)

The original majestic_million file contains many unnecessary columns for this project and is too large
so here we keep only necessary columns


```python
Majestic_million_list = pd.read_csv("majestic_million.csv")[['GlobalRank', 'Domain']]
Majestic_million_list.to_csv ('majestic_million.csv', index = True, header=True)
```

## Features that need to be obtained by actually visiting the page; be cautious and use only legitimate websites to test the following functions

Features include:
- Number of Redirection

#### Number of Redirection

In [72]:
def get_redirect_chain(url):
    """Fetch ``url`` and list every URL visited on the way, landing URL last.

    The result therefore has ``number of redirects + 1`` entries; with no
    redirects it holds just the landing URL.
    """
    res = requests.get(url)
    # res.history holds the intermediate redirect responses; res.url is where we ended up.
    return [hop.url for hop in res.history] + [res.url]

In [73]:
test_url = 'http://www.example.org'
redirect_chain = get_redirect_chain(test_url)

# The chain always ends with the landing URL, so the redirect count is its length minus one.
redirect_number = len(redirect_chain) - 1
landing_url = redirect_chain[-1]
print(redirect_number)
print(landing_url)

0
http://www.example.org/


#### Parse webpage content

- Title
- Text
- HREF links
- Image urls


### TODO: write a function to validate URLs

In [74]:
def get_website_hyperlinks(tree):
    """Return the ``href`` value of every ``<a>`` element in ``tree`` that has one."""
    return [anchor.attrib['href'] for anchor in tree.xpath('//a[@href]')]

In [75]:
def get_website_text(page_content):
    """Convert HTML to plain text and return its alphabetic words joined by single spaces.

    Links are dropped during the conversion. Digits, punctuation and other
    special characters are discarded by the word pattern.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    plain_text = converter.handle(page_content)
    # Keep only runs of letters (case-insensitive).
    words = re.findall(r"(?i)\b[a-z]+\b", plain_text)
    return " ".join(words)

In [76]:
# iURL: linked image url
def get_website_iURLs(tree):
    """Return the ``src`` value of every ``<img>`` element in ``tree`` that has one."""
    return [image.attrib['src'] for image in tree.xpath('//img[@src]')]

In [77]:
def get_website_style_tags(tree):
    """Return the value of every inline ``style`` attribute found in ``tree``."""
    return list(tree.xpath('//@style'))

In [78]:
# SS_URL: external style sheets
def get_website_SS_URLs(tree):
    """Return the ``href`` of every ``<link>`` that is a stylesheet or has type ``text/css``."""
    stylesheet_links = tree.xpath('//link[(@rel="stylesheet" or @type="text/css") and @href]')
    return [link.attrib['href'] for link in stylesheet_links]

In [79]:
def parse_web_content(url):
    """Download ``url`` and extract the page elements used as features.

    Parameters
    ----------
    url : str
        Page to fetch. Only use this with websites that are safe to visit.

    Returns
    -------
    dict
        ``title``         - alphabetic words of the ``<title>`` text ('' when the page has no title);
        ``text``          - alphabetic words of the page text;
        ``input_number``  - number of ``<input>`` elements;
        ``iframe_number`` - number of ``<iframe>`` elements;
        ``img_urls``      - ``src`` values of images;
        ``href_links``    - ``href`` values of anchors;
        ``styles``        - inline ``style`` attribute values;
        ``SS_urls``       - ``href`` values of external style sheets.
    """
    web_content = {'title': '', 'text': '',
                   'input_number': 0, 'iframe_number': 0,
                  'img_urls': [], 'href_links': [],
                  'styles': [], 'SS_urls': []}

    page = requests.get(url)
    # page.encoding is None when the server does not declare a charset, which
    # made page.content.decode(page.encoding) raise. page.text falls back to a
    # detected encoding and replaces undecodable bytes instead of failing.
    page_content = page.text
    tree = html.fromstring(page_content)

    # Not every page has a <title>; keep the default empty title in that case.
    title_nodes = tree.xpath('//title')
    title_raw = title_nodes[0].text_content() if title_nodes else ''
    web_content['title'] = " ".join(re.findall(r"(?i)\b[a-z]+\b", title_raw))
    web_content['text'] = get_website_text(page_content)
    web_content['iframe_number'] = len(tree.xpath('//iframe'))
    web_content['input_number'] = len(tree.xpath('//input'))
    web_content['img_urls'] = get_website_iURLs(tree)
    web_content['href_links'] = get_website_hyperlinks(tree)
    web_content['styles'] = get_website_style_tags(tree)
    web_content['SS_urls'] = get_website_SS_URLs(tree)

    return web_content

In [80]:
# Fetch and parse a legitimate page; the printed output depends on what the site serves at request time.
test_url = 'https://www.amazon.co.uk'
web_content = parse_web_content(test_url)
print(web_content)

{'title': 'Robot Check', 'text': 'Enter the characters you see below Sorry we just need to make sure you re not a robot For best results please make sure your browser is accepting cookies Type the characters you see in this image https images na ssl images amazon com captcha druexhzz jpg Try different image Continue shopping Conditions of Use Sale Privacy Notice C Amazon com Inc or its affiliates https fls eu amazon co uk oc csi OP requestId js', 'input_number': 3, 'iframe_number': 0, 'img_urls': ['https://images-na.ssl-images-amazon.com/captcha/druexhzz/Captcha_epfxdkhhqi.jpg', 'https://fls-eu.amazon.co.uk/1/oc-csi/1/OP/requestId=MRA0MGHW54G0M6X5P0BN&js=0'], 'href_links': ['https://www.amazon.co.uk/gp/help/customer/display.html/ref=footer_cou?ie=UTF8&nodeId=1040616', 'https://www.amazon.co.uk/gp/help/customer/display.html/ref=footer_privacy?ie=UTF8&nodeId=502584'], 'styles': ['min-width:350px;padding:44px 0 !important', 'width: 350px; margin: 0 auto'], 'SS_urls': ['https://images-na.s

In [81]:
def get_FreeURL(url):
    """Return the FreeURL part of ``url``: ``"<subdomain>,<everything after the host>"``.

    Parameters
    ----------
    url : str
        Link taken from a page; may be absolute or relative.

    Returns
    -------
    str
        '' for references that do not start with an http(s)/ftp(s) scheme
        (relative references), otherwise the FreeURL string.
    """
    # relative reference
    regex = re.compile(r'^(?:http|ftp)s?://', re.IGNORECASE)
    if not re.match(regex, url):
        return ''

    ext_result = tldextract.extract(url)
    # Name the three host fields explicitly instead of iterating over the
    # result object, so this does not depend on the result being iterable.
    host_parts = (ext_result.subdomain, ext_result.domain, ext_result.suffix)
    FQDN = '.'.join(part for part in host_parts if part)

    # Keep everything after the FIRST occurrence of the host. A plain
    # url.split(FQDN)[1] loses the tail whenever the host text shows up again
    # later in the URL and raises when the host is empty or differs from the
    # URL in letter case.
    host_match = re.search(re.escape(FQDN), url, re.IGNORECASE) if FQDN else None
    remainder = url[host_match.end():] if host_match else ''

    FreeURL = ext_result.subdomain + ',' + remainder
    return FreeURL

In [82]:
def is_mld_in_href_FreeURL(href_links, mld):
    """Return True when ``mld`` occurs in the FreeURL part of at least one link in ``href_links``."""
    return any(mld in get_FreeURL(link) for link in href_links)

In [83]:
def get_url_homepage(url):
    """Build the homepage URL that belongs to ``url``.

    The result is ``<protocol>://[www.]<registered domain>[/]``: the ``www.``
    prefix is kept only when the subdomain is exactly ``'www'``, and a trailing
    slash is added only when ``url`` itself ends with one.
    """
    ext_result = tldextract.extract(url)
    scheme = url.split("://")[0]

    www_prefix = 'www.' if ext_result.subdomain == 'www' else ''
    trailing_slash = '/' if url[-1] == '/' else ''

    return scheme + "://" + www_prefix + ext_result.registered_domain + trailing_slash

In [84]:
example_urls = ["http://twitter.com/",
                "http://github.com",
                "http://www.yahoo.com/",
                "http://www.google.com/",
                "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
               ]
# For each URL print the derived homepage and whether the URL already is that homepage.
for url in example_urls:
    print(get_url_homepage(url))
    print(url == get_url_homepage(url))

http://twitter.com/
True
http://github.com
True
http://www.yahoo.com/
True
http://www.google.com/
True
https://www.amazon.co.uk
False


In [85]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of two collections.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        Collections to compare; duplicates are ignored.

    Returns
    -------
    int or float
        1 when both collections are empty (two empty pages count as identical),
        otherwise a float between 0.0 and 1.0.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    # Only two empty inputs give an empty union. The earlier guard tested
    # list1 twice, so an empty list1 paired with a non-empty list2 wrongly
    # scored 1 instead of 0.
    if not union:
        return 1
    return len(s1 & s2) / len(union)

In [86]:
def initialize_comparison_features():
    """Return a fresh comparison-feature dict with every entry set to 1."""
    feature_names = (
        'hyper_link_jaccard',
        'hyper_link_mld_jaccard',
        'ss_jaccard',
        'ss_url_jaccard',
        'ss_url_mld_jaccard',
        'img_url_jaccard',
        'img_url_mld_jaccard',
        'title_jaccard',
        'is_hompage_linked',
    )
    return dict.fromkeys(feature_names, 1)

In [87]:
def get_mld_of_urls(urls):
    """Return the tldextract ``domain`` of each URL in ``urls``, in the same order."""
    return [tldextract.extract(link).domain for link in urls]

In [88]:
def analyze_comparison_features(homepage_content, inputpage_content):
    """Compare the parsed homepage with the parsed input page.

    Both arguments are dicts with the keys ``'href_links'``, ``'styles'``,
    ``'SS_urls'``, ``'img_urls'`` and ``'title'``. Returns the
    comparison-feature dict with every ``*_jaccard`` entry filled in;
    ``'is_hompage_linked'`` keeps its initial value.
    """
    comparison_features = initialize_comparison_features()

    def _similarity(key):
        # Jaccard similarity of the raw values stored under ``key`` on both pages.
        return jaccard_similarity(homepage_content[key], inputpage_content[key])

    def _mld_similarity(key):
        # Same comparison, but on the main level domains of the URLs under ``key``.
        homepage_mlds = get_mld_of_urls(homepage_content[key])
        inputpage_mlds = get_mld_of_urls(inputpage_content[key])
        return jaccard_similarity(homepage_mlds, inputpage_mlds)

    # hyperlinks
    comparison_features['hyper_link_jaccard'] = _similarity('href_links')
    comparison_features['hyper_link_mld_jaccard'] = _mld_similarity('href_links')

    # inline styles
    comparison_features['ss_jaccard'] = _similarity('styles')

    # external style sheets
    comparison_features['ss_url_jaccard'] = _similarity('SS_urls')
    comparison_features['ss_url_mld_jaccard'] = _mld_similarity('SS_urls')

    # image urls
    comparison_features['img_url_jaccard'] = _similarity('img_urls')
    comparison_features['img_url_mld_jaccard'] = _mld_similarity('img_urls')

    # title, compared word by word
    homepage_title_terms = homepage_content['title'].split(' ')
    inputpage_title_terms = inputpage_content['title'].split(' ')
    comparison_features['title_jaccard'] = jaccard_similarity(homepage_title_terms,
                                                              inputpage_title_terms)

    return comparison_features

In [89]:
# All the features that need to be obtained by actually sending an http request to the page
def analyze_page_visit_features(url, mld_of_starting_url):
    """Visit ``url`` and compute the features that require the page content.

    Parameters
    ----------
    url : str
        Starting URL. It is requested by get_redirect_chain and again by
        parse_web_content; the derived homepage is requested as well when it
        differs from ``url``.
    mld_of_starting_url : str
        Main level domain of ``url``, as produced by the URL feature step.

    Returns
    -------
    dict
        Redirect, content-count and mld-presence features, merged with the
        homepage comparison features (which keep their initial values when
        ``url`` already is the homepage).
    """
    page_visit_features = {'redirect_number': 0, 'landing_url': url,
                           'title_term_cnt': 0, 'text_term_cnt': 0,
                           'iframe_number': 0, 'input_number': 0,
                           'image_number': 0, 'href_number': 0,
                           'mld_equals': False, 'starting_mld_in_title': False,
                           'starting_mld_in_text': False, 'starting_mld_in_href_FreeURL':False,
                           'landing_mld_in_title': False, 'landing_mld_in_text': False,
                           'landing_mld_in_href_FreeURL': False
                          }

    # The chain ends with the landing URL, so its length minus one is the redirect count.
    redirect_chain = get_redirect_chain(url)
    landing_url = redirect_chain[-1]

    page_visit_features['redirect_number'] = len(redirect_chain) - 1
    page_visit_features['landing_url'] = landing_url

    # Note: the content is fetched from the starting url, not from landing_url.
    web_content = parse_web_content(url)
    # split(' ') returns [''] for an empty string, so an empty title or text still counts as one term.
    page_visit_features['title_term_cnt'] = len(web_content['title'].split(' '))
    page_visit_features['text_term_cnt'] = len(web_content['text'].split(' '))
    page_visit_features['iframe_number'] = web_content['iframe_number']
    page_visit_features['input_number'] = web_content['input_number']
    page_visit_features['image_number'] = len(web_content['img_urls'])
    page_visit_features['href_number'] = len(web_content['href_links'])

    # Substring checks against the lower-cased title and text.
    page_visit_features['starting_mld_in_title'] = mld_of_starting_url in web_content['title'].lower()
    page_visit_features['starting_mld_in_text'] = mld_of_starting_url in web_content['text'].lower()
    mld_of_landing_url =  tldextract.extract(landing_url).domain  
    if mld_of_landing_url == mld_of_starting_url:
        # Same mld: the landing-mld answers are the starting-mld answers.
        page_visit_features['mld_equals'] = True
        page_visit_features['landing_mld_in_title'] = page_visit_features['starting_mld_in_title']
        page_visit_features['landing_mld_in_text'] = page_visit_features['starting_mld_in_text']
    else:
        page_visit_features['landing_mld_in_title'] = mld_of_landing_url in web_content['title'].lower()
        page_visit_features['landing_mld_in_text'] = mld_of_landing_url in web_content['text'].lower()

    page_visit_features['starting_mld_in_href_FreeURL'] = is_mld_in_href_FreeURL(web_content['href_links'],
                                                                                 mld_of_starting_url)
    page_visit_features['landing_mld_in_href_FreeURL'] = is_mld_in_href_FreeURL(web_content['href_links'],
                                                                                 mld_of_landing_url)


    # comparison features
    comparison_features = initialize_comparison_features()
    homepage_url = get_url_homepage(url)
    # Only fetch and compare the homepage when the input url is not itself the homepage.
    if homepage_url != url:
        homepage_content = parse_web_content(homepage_url)
        comparison_features = analyze_comparison_features(homepage_content, web_content)

        # Exact string match of the homepage url against the page's href values.
        comparison_features['is_hompage_linked'] = 1 if homepage_url in web_content['href_links'] else 0

    page_visit_features.update(comparison_features)
    return page_visit_features

In [90]:
# The second argument is the mld of the starting url, passed here by hand.
test_url = "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
analyze_page_visit_features(test_url, 'amazon')

{'redirect_number': 0,
 'landing_url': 'https://www.amazon.co.uk/ap/signin?encoding=UTF8',
 'title_term_cnt': 1,
 'text_term_cnt': 70,
 'iframe_number': 0,
 'input_number': 0,
 'image_number': 3,
 'href_number': 1,
 'mld_equals': True,
 'starting_mld_in_title': False,
 'starting_mld_in_text': True,
 'starting_mld_in_href_FreeURL': False,
 'landing_mld_in_title': False,
 'landing_mld_in_text': True,
 'landing_mld_in_href_FreeURL': False,
 'hyper_link_jaccard': 0.0,
 'hyper_link_mld_jaccard': 0.08333333333333333,
 'ss_jaccard': 0.0,
 'ss_url_jaccard': 0.0,
 'ss_url_mld_jaccard': 1.0,
 'img_url_jaccard': 0.0,
 'img_url_mld_jaccard': 0.5,
 'title_jaccard': 0.0,
 'is_hompage_linked': 1}

In [91]:
test_url = "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
features_dict = analyze_url_features(test_url, Majestic_million_list)
# Reuse the mld stored by analyze_url_features so the url is not parsed again.
features_dict.update(analyze_page_visit_features(test_url, features_dict['mld']))
print(features_dict)

{'url': 'https://www.amazon.co.uk/ap/signin?encoding=UTF8', 'url_length': 48, 'is_blacklisted': False, 'is_IPbased': False, 'domain_age': 8642, 'FreeURL_dot_cnt': 0, 'level_domain_cnt': 4, 'FQDN_length': 16, 'mld_length': 6, 'url_terms_cnt': 6, 'RDNRank': 194, 'domain_provider': 'Amazon.com, Inc. t/a Amazon.com, Inc. [Tag = AMAZON-COM]', 'domain_valid_period': None, 'mld': 'amazon', 'redirect_number': 0, 'landing_url': 'https://www.amazon.co.uk/ap/signin?encoding=UTF8', 'title_term_cnt': 1, 'text_term_cnt': 70, 'iframe_number': 0, 'input_number': 0, 'image_number': 3, 'href_number': 1, 'mld_equals': True, 'starting_mld_in_title': False, 'starting_mld_in_text': True, 'starting_mld_in_href_FreeURL': False, 'landing_mld_in_title': False, 'landing_mld_in_text': True, 'landing_mld_in_href_FreeURL': False, 'hyper_link_jaccard': 0.0, 'hyper_link_mld_jaccard': 1.0, 'ss_jaccard': 0.0, 'ss_url_jaccard': 0.0, 'ss_url_mld_jaccard': 1.0, 'img_url_jaccard': 0.0, 'img_url_mld_jaccard': 0.5, 'title_ja

In [92]:
example_urls = ["http://twitter.com/",
                "http://github.com",
                "http://www.yahoo.com/",
                "http://www.google.com/",
                "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
               ]

all_features = []
for url in example_urls:
    features_dict = analyze_url_features(url, Majestic_million_list)
    features_dict.update(analyze_page_visit_features(url, features_dict['mld']))
    # 'mld' was only carried along for the page-visit step; it is not a feature.
    del features_dict["mld"]
    all_features.append(features_dict)
    # Pause between urls because analyze_url_features calls the rate-limited VirusTotal API.
    time.sleep(30)

In [None]:
# One row per url, combining the url-based and page-visit features.
all_features_df = pd.DataFrame(all_features)
all_features_df.head()

## Compute feature sets by comparing the characteristics of the input page with those of the homepage. Ref from [DeltaPhish](https://arxiv.org/pdf/1707.00317.pdf)

#### About relative url and absolute url [here](http://www.dirigodev.com/blog/seo-web-best-practices/relative-vs-absolute-urls-seo/)
