```
! pip install tldextract
! pip install pysafebrowsing
! pip install whois
! pip install html2text
```

In [51]:
import requests
import json
from urllib.parse import urlparse
import pandas as pd
import tldextract
import os
import xml.etree.ElementTree as ET
from pysafebrowsing import SafeBrowsing
import whois
from lxml import html
import html2text
import re
import time

In [2]:
def validate_ip(s):
    """Return True when *s* is a dotted-quad IPv4 address such as '127.0.0.1'.

    The string must consist of exactly four '.'-separated fields, each made
    only of ASCII digits and with a numeric value in the range 0-255.

    Args:
        s: Candidate host string (for example the FQDN part of a URL).

    Returns:
        bool: True for a valid dotted-quad IPv4 address, False otherwise.
    """
    fields = s.split('.')
    if len(fields) != 4:
        return False
    for field in fields:
        # str.isdigit() also accepts characters such as superscript digits
        # that int() cannot parse, so test for ASCII digits explicitly to
        # avoid a ValueError on unusual input.
        if not field or any(ch not in '0123456789' for ch in field):
            return False
        if int(field) > 255:
            return False
    return True

In [3]:
validate_ip('www.slideshare.net')
validate_ip('127.0.0.1')
validate_ip('222.999.0.1')

False

A URL consists of the following components, as shown in this image: ![URL structure](url_structure.png)

- FQDN: Fully Qualified Domain Name
- mld: main level domain
- FreeURL
- RDN: Registered Domain Name

[ref](https://arxiv.org/pdf/1510.06501.pdf)

In [4]:
def parse_url_components(url):
    """Split *url* into the components used by the feature extractors.

    Args:
        url: Absolute URL string, e.g. "http://college-eisk.ru/cli/".

    Returns:
        dict with the keys
            'protocol': text before the first "://" (the whole string when
                the URL contains no "://"),
            'FQDN': sub-domain, domain and suffix joined with '.',
            'RDN': registered domain as reported by tldextract,
            'mld': main level domain (tldextract's ``domain`` field),
            'FreeURL': "<subdomain>,<everything after the FQDN>".
    """
    ext_result = tldextract.extract(url)
    # Join the three host fields explicitly instead of iterating the result
    # object: some tldextract releases expose extra, non-string fields on it
    # (or are not iterable at all), which would break the join.
    host_parts = (ext_result.subdomain, ext_result.domain, ext_result.suffix)
    FQDN = '.'.join(part for part in host_parts if part)
    mld = ext_result.domain
    RDN = ext_result.registered_domain

    # partition() cuts at the first occurrence of the host only, so a URL
    # that repeats its own host name later (e.g. in a query string) keeps its
    # full remainder; it also yields '' instead of raising IndexError when the
    # host text cannot be found verbatim in the URL.
    remainder = url.partition(FQDN)[2]
    FreeURL = ext_result.subdomain + ',' + remainder
    protocol = url.split("://")[0]

    url_components = {'protocol': protocol, 'FQDN': FQDN, 'RDN': RDN, 'mld': mld, 'FreeURL': FreeURL}
    return url_components

In [5]:
test_url = "http://college-eisk.ru/cli/"
parse_url_components(test_url)

{'protocol': 'http',
 'FQDN': 'college-eisk.ru',
 'RDN': 'college-eisk.ru',
 'mld': 'college-eisk',
 'FreeURL': ',/cli/'}

In [6]:
def get_domain_age_in_days(domain):
    """Look up the age of *domain* through the PayAPI fraud endpoint.

    Args:
        domain: Domain name appended verbatim to the endpoint URL.

    Returns:
        The value stored under 'result' in the JSON reply (presumably the age
        in days, as the function name suggests), or None when the reply has
        no such key.
    """
    endpoint = "https://input.payapi.io/v1/api/fraud/domain/age/" + domain
    payload = requests.get(endpoint).json()
    if 'result' in payload:
        return payload['result']
    return None

### PhishTank and Google Safebrowsing blacklist

Note: The Safe Browsing API must be activated in Google Cloud before it can be used. The Google API key is exported as an environment variable in the console, since that is more secure than writing it here.

In [7]:
# The ``in_database`` flag alone only says that the URL was submitted to
# PhishTank at some point (popular sites such as Twitter and Google are in
# there too), so it has to be combined with the ``valid`` verdict.
def is_phishtank_blacklisted(url):
    """Return True when PhishTank lists *url* as a confirmed phishing page.

    Args:
        url: URL to look up.

    Returns:
        bool: True only when the reply reports the URL as both in the
        database and valid (confirmed phish); False otherwise.

    Raises:
        ValueError: when the reply does not contain a result entry for the
            URL (for instance an error reply from the service).
    """
    # Let requests build the query string so that '&', '?' or '#' inside the
    # looked-up URL are percent-encoded instead of corrupting the request.
    response = requests.post('https://checkurl.phishtank.com/checkurl/index.php',
                             params={'url': url})
    # print(response.text)
    # Parse the raw bytes: the XML parser honours the document's own encoding
    # declaration, and this avoids decoding with response.encoding, which is
    # None when the server does not announce a charset.
    root = ET.fromstring(response.content)

    result = root.find('results/url0')
    if result is None:
        raise ValueError('PhishTank reply contains no result for ' + url)

    in_database = (result.findtext('in_database') or '').strip()
    valid = (result.findtext('valid') or '').strip()
    return in_database == 'true' and valid == 'true'

In [59]:
test_url = "https://paypal.co.uk.yatn.eu/m/"
is_phishtank_blacklisted(test_url)

True

In [10]:
def is_malicious_in_google_safebrowsing(url):
    """Ask Google Safe Browsing whether *url* is flagged as malicious.

    The API key is read from the GOOGLEAPIKEY environment variable; a
    KeyError is raised when that variable is not set.

    Returns:
        The 'malicious' entry of the lookup result for *url*.
    """
    client = SafeBrowsing(os.environ['GOOGLEAPIKEY'])
    verdicts = client.lookup_urls([url])
    # print(verdicts)
    return verdicts[url]['malicious']

In [11]:
test_url = 'http://malware.testing.google.test/testing/malware/'
is_malicious_in_google_safebrowsing(test_url)

True

In [72]:
# Note: the VirusTotal API limits how many requests can be sent in a given
# time, so when checking multiple URLs put a time.sleep() between calls to
# this function.
def is_malicious_in_virustotal(url):
    """Return True when the VirusTotal URL report for *url* has detections.

    The API key is read from the VIRUSTOTALKEY environment variable; a
    KeyError is raised when that variable is not set.

    Args:
        url: URL whose report is requested.

    Returns:
        bool: True when the report's 'positives' value is truthy, False when
        it is zero or when the report carries no 'positives' field.
    """
    virustotal_apikey = os.environ['VIRUSTOTALKEY']
    params = {'apikey': virustotal_apikey, 'resource': url}
    response = requests.post('https://www.virustotal.com/vtapi/v2/url/report', data=params)
    report = response.json()
    # A report without a 'positives' field (e.g. for a URL the service has no
    # data on) means "not flagged" rather than a KeyError.
    return bool(report.get('positives', 0))

In [82]:
test_url = "https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus"
is_malicious_in_virustotal(test_url)

False

In [73]:
example_urls = ["http://twitter.com/",
                "http://github.com",
                "http://www.yahoo.com/",
                "http://www.google.com/",
                "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
               ]
for url in example_urls:
    # Query each service once per URL and reuse the answers for the combined
    # verdict, so the rate-limited APIs are not called a second time.
    flagged_by_virustotal = is_malicious_in_virustotal(url)
    flagged_by_safebrowsing = is_malicious_in_google_safebrowsing(url)
    print(flagged_by_virustotal)
    print(flagged_by_safebrowsing)
    print(flagged_by_virustotal or flagged_by_safebrowsing)
    # Pause between URLs to stay within the VirusTotal request limit
    time.sleep(30)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


### WHOIS features

- Name of the domain provider
- Ownership period

In [60]:
# Ad-hoc WHOIS lookup: extract the registrar and the ownership period
domain_info = whois.query('test')
if domain_info:
    # print(domain_info.__dict__)
    # Registrar name, or None when the record has no (truthy) registrar
    registrar = domain_info.registrar if domain_info.registrar else None

    # Both dates are required; ownership_period stays undefined otherwise
    if domain_info.expiration_date and domain_info.creation_date:
        ownership_period = (domain_info.expiration_date - domain_info.creation_date).days

#### Phishing  URL  and  domain  name  obfuscation  techniques tend  to  produce  long  URLs  composed  of  many  terms. [ref](https://arxiv.org/pdf/1510.06501.pdf)

In [102]:
def analyze_url_features(url, Majestic_million_list):
    """Collect the blacklist, lexical, WHOIS and ranking features of *url*.

    Args:
        url: URL to analyse.
        Majestic_million_list: DataFrame with 'Domain' and 'GlobalRank'
            columns used to look up the rank of the registered domain.

    Returns:
        dict of features. Besides the feature values it carries an extra
        'mld' entry (the main level domain) so that later steps do not have
        to parse the URL again.
    """
    # RDNRank defaults to 1000001 for websites that are not in the Majestic million list
    features = {
        'url': url, 'url_length': len(url),
        'is_blacklisted': False, 'is_IPbased': False,
        'domain_age': None, 'FreeURL_dot_cnt': 0,
        'level_domain_cnt': None, 'FQDN_length': None,
        'mld_length': None, 'url_terms_cnt': 0,
        'RDNRank': 1000001, 'domain_provider': None,
        'domain_valid_period': None,
    }

    # The PhishTank check is left out here; only VirusTotal and Safe Browsing are consulted
    features['is_blacklisted'] = (is_malicious_in_virustotal(url)
                                  or is_malicious_in_google_safebrowsing(url))

    components = parse_url_components(url)
    fqdn = components['FQDN']
    rdn = components['RDN']
    free_url = components['FreeURL']

    ip_based = validate_ip(fqdn)
    features['is_IPbased'] = ip_based

    # url_terms_cnt = number of terms in the FQDN + number of terms in the rest of the url
    term_total = 0

    # Domain-specific features only make sense when the host is a name
    if not ip_based:
        features['domain_age'] = get_domain_age_in_days(rdn) or None

        features['level_domain_cnt'] = fqdn.count('.') + 1
        features['FQDN_length'] = len(fqdn)
        features['mld_length'] = len(components['mld'])

        # query the domain information
        record = whois.query(rdn)
        if record:
            # print(record.__dict__)
            features['domain_provider'] = record.registrar or None

            if record.expiration_date and record.creation_date:
                features['domain_valid_period'] = (record.expiration_date - record.creation_date).days

        # Terms in the FQDN: every '.'-separated label is split on '-'
        term_total += sum(len(label.split('-')) for label in fqdn.split('.'))

    features['FreeURL_dot_cnt'] = free_url.count('.')

    # Terms in the part after the host: skip the sub-domain field that
    # precedes the first ',', then split on '/' and '-'
    for chunk in free_url.split(',')[1:]:
        term_total += sum(len(segment.split('-'))
                          for segment in chunk.split('/') if segment)
    features['url_terms_cnt'] = term_total

    ranked = Majestic_million_list[Majestic_million_list['Domain'] == rdn]
    if len(ranked):
        features['RDNRank'] = ranked.iloc[0]['GlobalRank']

    # Note: mld is not meant as a feature; it is handed to later functions
    # so they can derive mld-related features without parsing the url again
    features['mld'] = components['mld']

    #print(features)
    return features

In [62]:
Majestic_million_list = pd.read_csv("majestic_million.csv")[['GlobalRank', 'Domain']]

In [63]:
Majestic_million_list.head()

Unnamed: 0,GlobalRank,Domain
0,1,facebook.com
1,2,google.com
2,3,youtube.com
3,4,twitter.com
4,5,instagram.com


In [103]:
test_url = 'https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus'
analyze_url_features(test_url, Majestic_million_list)

{'url': 'https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus',
 'url_length': 76,
 'is_blacklisted': False,
 'is_IPbased': False,
 'domain_age': 5107,
 'FreeURL_dot_cnt': 0,
 'level_domain_cnt': 3,
 'FQDN_length': 18,
 'mld_length': 10,
 'url_terms_cnt': 9,
 'RDNRank': 91,
 'domain_provider': 'MarkMonitor Inc.',
 'domain_valid_period': 6209,
 'mld': 'slideshare'}

In [84]:
# Note: some of these URLs are live phishing sites (as of 2019-03-21); use with caution!
# More can be found at https://www.phishtank.com/
example_urls = ["https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus",
                "http://cartaobndes.gov.br.cv31792.tmweb.ru/",
                "https://paypal.co.uk.yatn.eu/m/",
                "http://college-eisk.ru/cli/",
                "https://dotpay-platnosc3.eu/dotpay/",
                "https://www.amazon.co.uk/ap/signin?encoding=UTF8",
                "http://192.168.0.1/paypal.cgi?fixaccount"
               ]

# One feature dict per URL, in the same order as example_urls
urls_features = []
for url in example_urls:
    urls_features.append(analyze_url_features(url, Majestic_million_list))
    # To circumvent the limit VirusTotal sets on API calls within a given time
    time.sleep(30)
#print(urls_features)

In [85]:
urls_features_df = pd.DataFrame(urls_features)

In [86]:
urls_features_df.head(7)

Unnamed: 0,url,url_length,is_blacklisted,is_IPbased,domain_age,FreeURL_dot_cnt,level_domain_cnt,FQDN_length,mld_length,url_terms_cnt,RDNRank,domain_provider,domain_valid_period
0,https://www.slideshare.net/weaveworks/client-s...,76,False,False,5107.0,0,3.0,18.0,10.0,9,91,MarkMonitor Inc.,6209.0
1,http://cartaobndes.gov.br.cv31792.tmweb.ru/,43,True,False,5021.0,3,6.0,35.0,5.0,6,3910,TIMEWEB-RU,5114.0
2,https://paypal.co.uk.yatn.eu/m/,31,True,False,,2,5.0,20.0,4.0,6,1000001,Tucows.com Co.,
3,http://college-eisk.ru/cli/,27,True,False,3090.0,0,2.0,15.0,12.0,4,1000001,R01-RU,3288.0
4,https://dotpay-platnosc3.eu/dotpay/,35,False,False,,0,2.0,19.0,16.0,4,1000001,PDR Ltd.,
5,https://www.amazon.co.uk/ap/signin?encoding=UTF8,48,False,False,8641.0,0,4.0,16.0,6.0,6,194,,8892.0
6,http://192.168.0.1/paypal.cgi?fixaccount,40,False,True,,1,,,,1,1000001,,


In [87]:
# write the extracted features to csv file
urls_features_df.to_csv ('url_features.csv', index = False, header=True)

The original majestic_million file contains many columns that are unnecessary for this project and is too large,
so here we keep only the necessary columns:


```
Majestic_million_list = pd.read_csv("majestic_million.csv")[['GlobalRank', 'Domain']]
Majestic_million_list.to_csv ('majestic_million.csv', index = True, header=True)
```

## Features that need to be obtained by actually visiting the page; be cautious and use only legitimate websites to test the following functions

Features include:
- Number of Redirection

#### Number of Redirection

In [88]:
def get_redirect_chain(url):
    """Fetch *url* and return the list of URLs visited on the way.

    Returns:
        list of str: the URL of every response recorded in the request
        history, followed by the URL of the final (landing) response. Its
        length minus one is therefore the number of redirects.
    """
    response = requests.get(url)
    visited = [hop.url for hop in response.history]
    # The landing URL always closes the chain
    visited.append(response.url)
    return visited

In [89]:
test_url = 'http://www.example.org'
redirect_chain = get_redirect_chain(test_url)

redirect_number = len(redirect_chain) - 1
landing_url = redirect_chain[-1]
print(redirect_number)
print(landing_url)

0
http://www.example.org/


#### Parse webpage content

- Title
- Text
- HREF links
- Image urls


### to do: write a function to validate url

In [90]:
def get_website_hyperlinks(tree):
    """Return the ``href`` value of every ``<a>`` element in *tree* that has one.

    Args:
        tree: Parsed document offering an ``xpath`` method.
    """
    return [anchor.attrib['href'] for anchor in tree.xpath('//a[@href]')]

In [91]:
def get_website_text(page_content):
    """Reduce HTML markup to a space-separated string of alphabetic words.

    Args:
        page_content: HTML source of the page as text.

    Returns:
        str: the words made only of the letters a-z (either case) found in
        the converted text, joined by single spaces.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    plain_text = converter.handle(page_content)
    # Keep only letter-only words; this drops digits, punctuation and spacing
    words = re.findall(r"(?i)\b[a-z]+\b", plain_text)
    return " ".join(words)

In [92]:
# URLs of the images linked from the page
def get_website_iURLs(tree):
    """Return the ``src`` value of every ``<img>`` element in *tree* that has one.

    Args:
        tree: Parsed document offering an ``xpath`` method.
    """
    return [image.attrib['src'] for image in tree.xpath('//img[@src]')]

In [93]:
def parse_web_content(url):
    """Download *url* and extract the page elements used as features.

    Args:
        url: Address of the page to fetch.

    Returns:
        dict with the keys 'title' (text of the first <title>, '' when the
        page has none), 'text' (letter-only words of the page), 'input_number',
        'iframe_number', 'img_urls' and 'href_links'.
    """
    page = requests.get(url)
    # page.text decodes with the declared encoding and falls back to a
    # detected one; decoding by hand with page.encoding raises TypeError when
    # the server announces no charset (encoding is None).
    page_content = page.text
    tree = html.fromstring(page_content)

    # Pages without a <title> element get an empty title instead of IndexError
    title_nodes = tree.xpath('//title')
    title = title_nodes[0].text_content() if title_nodes else ''

    web_content = {'title': title,
                   'text': get_website_text(page_content),
                   'input_number': len(tree.xpath('//input')),
                   'iframe_number': len(tree.xpath('//iframe')),
                   'img_urls': get_website_iURLs(tree),
                   'href_links': get_website_hyperlinks(tree)}

    return web_content

In [94]:
test_url = 'http://twitter.com/'
web_content = parse_web_content(test_url)
print(web_content)

{'title': 'Twitter. Se tapahtuu nyt.', 'text': 'Havaitsimme ettei JavaScript ole selaimessasi Haluaisitko Twitterin vanhalle alustalle Siirry Unohditko salasanasi Kirjaudu Katso maailmalla tapahtuu juuri nyt Liity Twitteriin Kirjaudu bird Created with Sketch Seuraa mielenkiintosi kohteita Kuule puhuttaa Liity keskusteluun Twitterin palveluita Toimimme kanssa ja muun muassa tilastoihin mukauttamiseen ja mainoksiin Sulje Twitter com runsaasti Jos et saa selaimesi asetuksista voit saada paremman kokemuksen mobiilisivustollamme Twitter runsaasti Ota selaimesi asetuksissa ennen Tietoja Ohjekeskus Blogi Tila Ehdot Mainosten tiedot Sovellukset Mainosta Markkinointi Yritykset Hakemisto Asetukset C Twitter Sulje Edellinen Seuraava Sulje Siirry profiiliin Tallennetut haut Poista keskustelussa Varmennettu tiliSuojatut twiitit Ehdotetut Varmennettu tiliSuojatut twiitit Varmennettu tiliSuojatut twiitit Sulje Mainosta Sulje Peruuta Twiittaa sijainti ilmoittaen Voit twiitteihisi sijainnin esimerkiksi

In [150]:
def get_FreeURL(url):
    """Return the FreeURL part ("<subdomain>,<text after the host>") of *url*.

    Args:
        url: Link target, absolute or relative.

    Returns:
        str: '' for anything that does not start with an http(s)/ftp(s)
        scheme (relative references, mailto links, ...); otherwise the
        sub-domain, a ',' and everything that follows the host name.
    """
    # relative reference
    if not re.match(r'^(?:http|ftp)s?://', url, re.IGNORECASE):
        return ''

    ext_result = tldextract.extract(url)
    # Join the three host fields explicitly instead of iterating the result
    # object: some tldextract releases expose extra, non-string fields on it
    # (or are not iterable at all), which would break the join.
    host_parts = (ext_result.subdomain, ext_result.domain, ext_result.suffix)
    FQDN = '.'.join(part for part in host_parts if part)

    # partition() cuts at the first occurrence of the host only, so a link
    # that repeats the host name later keeps its full remainder; it also
    # yields '' instead of raising IndexError when the host is not found.
    FreeURL = ext_result.subdomain + ',' + url.partition(FQDN)[2]
    return FreeURL

In [157]:
def is_mld_in_href_FreeURL(href_links, mld):
    """Tell whether *mld* occurs in the FreeURL part of any link.

    Args:
        href_links: Iterable of link targets.
        mld: Main level domain to search for.

    Returns:
        bool: True as soon as one link's FreeURL contains *mld*, else False.
    """
    return any(mld in get_FreeURL(link) for link in href_links)

In [161]:
# all the features that need to be obtained by actually sending an http request to the page
def analyze_page_visit_features(url, mld_of_starting_url):
    """Visit *url* and derive the content- and redirect-based features.

    Args:
        url: Starting URL to visit.
        mld_of_starting_url: Main level domain of *url* (as produced by
            analyze_url_features under the 'mld' key).

    Returns:
        dict with the redirect count, landing URL, term/element counts of the
        fetched page, and flags telling whether the starting and landing mld
        occur in the page title, text and link FreeURLs.
    """
    page_visit_features = {'redirect_number': 0, 'landing_url': url,
                           'title_term_cnt': 0, 'text_term_cnt': 0,
                           'iframe_number': 0, 'input_number': 0,
                           'image_number': 0, 'href_number': 0,
                           'mld_equals': False, 'starting_mld_in_title': False,
                           'starting_mld_in_text': False, 'starting_mld_in_href_FreeURL': False,
                           'landing_mld_in_title': False, 'landing_mld_in_text': False,
                           'landing_mld_in_href_FreeURL': False
                          }

    redirect_chain = get_redirect_chain(url)
    landing_url = redirect_chain[-1]

    page_visit_features['redirect_number'] = len(redirect_chain) - 1
    page_visit_features['landing_url'] = landing_url

    web_content = parse_web_content(url)
    # split() without an argument ignores repeated whitespace and returns []
    # for an empty string, so a page without title/text counts 0 terms
    # (split(' ') would report 1 for an empty string).
    page_visit_features['title_term_cnt'] = len(web_content['title'].split())
    page_visit_features['text_term_cnt'] = len(web_content['text'].split())
    page_visit_features['iframe_number'] = web_content['iframe_number']
    page_visit_features['input_number'] = web_content['input_number']
    page_visit_features['image_number'] = len(web_content['img_urls'])
    page_visit_features['href_number'] = len(web_content['href_links'])

    # Lower-case once; the mld values are compared against these
    title_lower = web_content['title'].lower()
    text_lower = web_content['text'].lower()
    mld_of_landing_url = tldextract.extract(landing_url).domain

    page_visit_features['mld_equals'] = mld_of_landing_url == mld_of_starting_url
    page_visit_features['starting_mld_in_title'] = mld_of_starting_url in title_lower
    page_visit_features['starting_mld_in_text'] = mld_of_starting_url in text_lower
    # When both mlds are equal these give the same answers as the starting
    # flags, so no special case is needed.
    page_visit_features['landing_mld_in_title'] = mld_of_landing_url in title_lower
    page_visit_features['landing_mld_in_text'] = mld_of_landing_url in text_lower

    page_visit_features['starting_mld_in_href_FreeURL'] = is_mld_in_href_FreeURL(web_content['href_links'],
                                                                                 mld_of_starting_url)
    page_visit_features['landing_mld_in_href_FreeURL'] = is_mld_in_href_FreeURL(web_content['href_links'],
                                                                                mld_of_landing_url)

    return page_visit_features

In [159]:
test_url = 'http://twitter.com/'
analyze_page_visit_features(test_url, 'twitter')

{'redirect_number': 1,
 'landing_url': 'https://twitter.com/',
 'title_term_cnt': 4,
 'text_term_cnt': 365,
 'iframe_number': 6,
 'input_number': 28,
 'image_number': 0,
 'href_number': 39,
 'mld_equals': True,
 'starting_mld_in_title': True,
 'starting_mld_in_text': True,
 'starting_mld_in_href_FreeURL': True,
 'landing_mld_in_title': True,
 'landing_mld_in_text': True,
 'landing_mld_in_href_FreeURL': True}

In [166]:
test_url = "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
features_dict = analyze_url_features(test_url, Majestic_million_list)
features_dict.update(analyze_page_visit_features(test_url, features_dict['mld']))
print(features_dict)

{'url': 'https://www.amazon.co.uk/ap/signin?encoding=UTF8', 'url_length': 48, 'is_blacklisted': False, 'is_IPbased': False, 'domain_age': 8641, 'FreeURL_dot_cnt': 0, 'level_domain_cnt': 4, 'FQDN_length': 16, 'mld_length': 6, 'url_terms_cnt': 6, 'RDNRank': 194, 'domain_provider': None, 'domain_valid_period': 8892, 'mld': 'amazon', 'redirect_number': 0, 'landing_url': 'https://www.amazon.co.uk/ap/signin?encoding=UTF8', 'title_term_cnt': 6, 'text_term_cnt': 64, 'iframe_number': 0, 'input_number': 0, 'image_number': 3, 'href_number': 1, 'mld_equals': True, 'starting_mld_in_title': False, 'starting_mld_in_text': True, 'starting_mld_in_href_FreeURL': False, 'landing_mld_in_title': False, 'landing_mld_in_text': True, 'landing_mld_in_href_FreeURL': False}


In [167]:
# Legitimate sites only: the page-visit features require fetching each page
example_urls = ["http://twitter.com/",
                "http://github.com",
                "http://www.yahoo.com/",
                "http://www.google.com/",
                "https://www.amazon.co.uk/ap/signin?encoding=UTF8"
               ]

# One merged feature dict (URL features + page-visit features) per URL
all_features = []
for url in example_urls:
    features_dict = analyze_url_features(url, Majestic_million_list)
    features_dict.update(analyze_page_visit_features(url, features_dict['mld']))
    # 'mld' was only carried along as input for the page-visit step
    del features_dict["mld"]
    all_features.append(features_dict)
    # Pause between URLs before the next round of API calls
    time.sleep(30)

In [98]:
all_features_df = pd.DataFrame(all_features)
all_features_df.head()

Unnamed: 0,url,url_length,is_blacklisted,is_IPbased,domain_age,FreeURL_dot_cnt,level_domain_cnt,FQDN_length,mld_length,url_terms_cnt,...,domain_provider,domain_valid_period,redirect_number,landing_url,title_term_cnt,text_term_cnt,iframe_number,input_number,image_number,href_number
0,http://twitter.com/,19,False,False,7373,0,2,11,7,2,...,"CSC Corporate Domains, Inc.",7671,1,https://twitter.com/,4,365,6,28,0,39
1,http://github.com,17,False,False,4554,0,2,10,6,2,...,MarkMonitor Inc.,4749,1,https://github.com/,8,1335,0,38,31,106
2,http://www.yahoo.com/,21,False,False,9202,0,3,13,5,3,...,MarkMonitor Inc.,10228,1,https://www.yahoo.com/,1,1717,0,9,21,90
3,http://www.google.com/,22,False,False,8231,0,3,14,6,3,...,MarkMonitor Inc.,11322,0,http://www.google.com/,1,30,0,10,2,19
4,https://www.amazon.co.uk/ap/signin?encoding=UTF8,48,False,False,8641,0,4,16,6,6,...,,8892,0,https://www.amazon.co.uk/ap/signin?encoding=UTF8,6,70,0,0,3,1


### Compute feature sets by comparing the characteristics of the input page with those of the homepage. Reference: [DeltaPhish](https://arxiv.org/pdf/1707.00317.pdf)

Install necessary libraries
```
!pip install lxml
!pip install requests
```

In [None]:
homepage_url = 'https://www.google.com/'
# get_website_hyperlinks() works on a parsed document, not on a URL, so the
# page is fetched and parsed first (the former argument `website_url` was
# never defined).
homepage_tree = html.fromstring(requests.get(homepage_url).content)
homepage_hyperlinks = get_website_hyperlinks(homepage_tree)

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Duplicates are ignored because both inputs are converted to sets.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        float in [0, 1]. Two empty inputs are treated as identical and give
        1.0 instead of raising ZeroDivisionError.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        # Both inputs are empty: nothing distinguishes them
        return 1.0
    return len(s1 & s2) / len(union)

In [None]:
inputpage_url = 'https://calendar.google.com/calendar/r?tab=wc'
# get_website_hyperlinks() works on a parsed document, not on a URL, so the
# page is fetched and parsed first.
inputpage_tree = html.fromstring(requests.get(inputpage_url).content)
inputpage_hyperlinks = get_website_hyperlinks(inputpage_tree)

In [None]:
jaccard_similarity(homepage_hyperlinks, inputpage_hyperlinks)

In [None]:
print(homepage_hyperlinks)

In [None]:
print(inputpage_hyperlinks)

In [None]:
len(set(homepage_hyperlinks).intersection(inputpage_hyperlinks))

#### About relative url and absolute url [here](http://www.dirigodev.com/blog/seo-web-best-practices/relative-vs-absolute-urls-seo/)


In [None]:
def get_second_level_domain(url):
    """Return the last two '.'-separated labels of the URL's network location.

    The network location is taken as-is from ``urlparse``; for a relative URL
    it is empty and '' is returned. Because exactly the last two labels are
    kept, a host under a two-label suffix such as 'www.amazon.co.uk' yields
    'co.uk'.
    """
    host = urlparse(url).netloc
    trailing_labels = host.rsplit('.', 2)[-2:]
    return '.'.join(trailing_labels)

In [None]:
# 2LD (second-level domain) of every hyperlink found on the input page
inputpage_2LDs = []
for url in inputpage_hyperlinks:
    sencond_level_domain = get_second_level_domain(url)
    if sencond_level_domain:
        inputpage_2LDs.append(sencond_level_domain)
    # empty result (e.g. a relative url): fall back to the input page's own 2LD
    else:
        inputpage_2LDs.append(get_second_level_domain(inputpage_url))

In [None]:
# 2LD (second-level domain) of every hyperlink found on the homepage
homepage_2LDs = []
for url in homepage_hyperlinks:
    sencond_level_domain = get_second_level_domain(url)
    if sencond_level_domain:
        homepage_2LDs.append(sencond_level_domain)
    # empty result (e.g. a relative url): fall back to the homepage's own 2LD
    else:
        homepage_2LDs.append(get_second_level_domain(homepage_url))

In [None]:
print(set(homepage_2LDs))

In [None]:
print(homepage_2LDs)

In [None]:
print(set(inputpage_2LDs))

In [None]:
jaccard_similarity(homepage_2LDs, inputpage_2LDs)

### Style tags

In [None]:
def get_website_style_tags(website_url):
    """Download *website_url* and return the value of every inline ``style`` attribute.

    Args:
        website_url: Address of the page to fetch.

    Returns:
        list of the ``style`` attribute values, in document order as
        delivered by the XPath query.
    """
    page = requests.get(website_url)
    # page.text falls back to a detected encoding; decoding by hand with
    # page.encoding raises TypeError when the server announces no charset.
    tree = html.fromstring(page.text)
    return list(tree.xpath('//@style'))

In [None]:
homepage_url = 'https://www.google.com/'
homepage_styles = get_website_style_tags(homepage_url)

In [None]:
inputpage_url = 'https://calendar.google.com/calendar/r?tab=wc'
inputpage_styles = get_website_style_tags(inputpage_url)

In [None]:
print(inputpage_styles)

In [None]:
print(homepage_styles)

In [None]:
jaccard_similarity(homepage_styles, inputpage_styles)

### external  style  sheets

In [None]:
# SSURL: URLs of the external style sheets
def get_website_SSURL(website_url):
    """Download *website_url* and return the URLs of its external style sheets.

    Args:
        website_url: Address of the page to fetch.

    Returns:
        list of str: ``href`` of every ``<link>`` whose ``rel`` is
        "stylesheet" or whose ``type`` is "text/css" and that has an ``href``.
    """
    page = requests.get(website_url)
    # page.text falls back to a detected encoding; decoding by hand with
    # page.encoding raises TypeError when the server announces no charset.
    tree = html.fromstring(page.text)
    # Requiring @href in the query skips <link> elements without a target,
    # which would otherwise raise KeyError.
    stylesheet_links = tree.xpath('//link[(@rel="stylesheet" or @type="text/css") and @href]')
    return [linktag.attrib['href'] for linktag in stylesheet_links]

In [None]:
homepage_SSURLs = get_website_SSURL('https://www.w3schools.com/tags/tag_link.asp')

In [None]:
print(homepage_SSURLs)

### Image URL

In [None]:
# URLs of the images linked from the page
# NOTE(review): this definition takes a URL and replaces any earlier
# get_website_iURLs in the session that takes a parsed tree - confirm which
# one is intended before running cells that rely on the other signature.
def get_website_iURLs(website_url):
    """Download *website_url* and return the ``src`` of every image that has one.

    Args:
        website_url: Address of the page to fetch.

    Returns:
        list of str: ``src`` attribute values of the ``<img>`` elements.
    """
    page = requests.get(website_url)
    # page.text falls back to a detected encoding; decoding by hand with
    # page.encoding raises TypeError when the server announces no charset.
    tree = html.fromstring(page.text)
    # Requiring @src in the query skips <img> elements without a source,
    # which would otherwise raise KeyError.
    return [imgtag.attrib['src'] for imgtag in tree.xpath('//img[@src]')]

In [None]:
hompage_iurls = get_website_iURLs(homepage_url)

In [None]:
print(hompage_iurls)

In [None]:
inputpage_iurls = get_website_iURLs(inputpage_url)

In [None]:
print(inputpage_iurls)

In [None]:
jaccard_similarity(hompage_iurls, inputpage_iurls)

### Title

In [None]:
def get_website_title(website_url):
    """Download *website_url* and return the text of its first ``<title>``.

    Args:
        website_url: Address of the page to fetch.

    Returns:
        str: the title text, or '' when the page has no ``<title>`` element.
    """
    page = requests.get(website_url)
    # page.text falls back to a detected encoding; decoding by hand with
    # page.encoding raises TypeError when the server announces no charset.
    tree = html.fromstring(page.text)
    title_nodes = tree.xpath('//title')
    # A page without <title> yields '' instead of an IndexError
    return title_nodes[0].text_content() if title_nodes else ''

In [None]:
get_website_title(homepage_url)

In [None]:
get_website_title(inputpage_url)

### X-links

In [None]:
# X-link: does the input page link back to the homepage URL?
x_link = homepage_url in inputpage_hyperlinks

In [None]:
print(x_link)