In [196]:
import json
from haralyzer import HarParser, HarPage
from http.cookies import SimpleCookie
from tld import get_fld
import re
import datetime
import datetime
from dateutil import parser
from datetime import datetime
import maxminddb 
import sys

In [197]:
# Parse both captures once: the plain crawl and the crawl recorded with an ad blocker.
# `file` and `filename` are aliases for the two parsers; the report cells pass them around.
file = har_parser = HarParser.from_file("amazon.nl.har")
filename = har_parser_2 = HarParser.from_file("amazon.nl_adblocker.har")

Extract the total count of requests.

In [198]:
def find_num_requests(har_parser):
    """Return the total number of entries summed over every page of the HAR."""
    return sum(len(page.entries) for page in har_parser.pages)

Number of requests containing cookies

In [199]:
def num_requests_with_cookies(har_parser):
    """Count the entries whose request has a non-empty ``cookies`` list."""
    return sum(
        1
        for page in har_parser.pages
        for entry in page.entries
        if entry.request.cookies
    )

Number of responses containing cookies

In [200]:
def num_responses_w_cookies(har_parser):
    """Count the entries whose response carries at least one Set-Cookie header.

    Header names are compared case-insensitively: HTTP header names are
    case-insensitive, so a HAR may record the name as ``Set-Cookie`` or
    ``set-cookie`` and both must be counted.

    Args:
        har_parser: parsed HAR exposing ``pages`` -> ``entries`` ->
            ``response.headers`` (a list of ``{'name': ..., 'value': ...}``).

    Returns:
        Number of entries with a Set-Cookie response header.
    """
    count = 0
    for page in har_parser.pages:
        for entry in page.entries:
            if any(header['name'].lower() == 'set-cookie'
                   for header in entry.response.headers):
                count += 1
    return count

● third_party_domains: list of distinct third-party domains (eTLD+1)

In [201]:
def third_party_domain(har_parser, first_party_domains=('amazon.nl',)):
    """Return the distinct third-party registrable domains (eTLD+1) in the HAR.

    Args:
        har_parser: parsed HAR exposing ``pages`` -> ``entries`` -> ``request.url``.
        first_party_domains: registrable domains treated as first party.
            Defaults to ``('amazon.nl',)``, the value this function previously
            hard-coded.

    Returns:
        Sorted list of unique request domains that are not first party.
    """
    first_party = set(first_party_domains)
    third_party = set()
    for page in har_parser.pages:
        for entry in page.entries:
            # fail_silently makes get_fld return None instead of raising when
            # the URL has no registrable domain; such requests are skipped.
            req_domain = get_fld(entry.request.url, fail_silently=True)
            if req_domain and req_domain not in first_party:
                third_party.add(req_domain)
    # Sorted so the exported JSON is identical from run to run.
    return sorted(third_party)


● cookie_domains: list of distinct cookie domain attributes (using the cookies field)


In [202]:
def domains_w_cookies(har_parser):
    """Return the distinct ``domain`` attributes of all request cookies.

    Domains are accumulated across every entry of every page. (Previously the
    per-entry list was overwritten on each iteration, so only the last entry
    contributed and an empty HAR raised NameError.)

    Args:
        har_parser: parsed HAR exposing ``pages`` -> ``entries`` ->
            ``request.cookies`` (a list of cookie dicts).

    Returns:
        Sorted list of unique cookie domains. Cookies without a ``domain``
        key, or with an empty one, are skipped.
    """
    cookie_domains = set()
    for page in har_parser.pages:
        for entry in page.entries:
            for cookie in entry.request.cookies:
                domain = cookie.get('domain')
                if domain:
                    cookie_domains.add(domain)
    # Sorted so the exported JSON is identical from run to run.
    return sorted(cookie_domains)

● xorigin_cookie_domains: list of cookie domains set via HTTP response headers, with
SameSite=None, and lifespan >= 90 days


In [203]:
def _is_long_lived_samesite_none(cookie, now, min_days=90):
    """Return True if one Set-Cookie string has SameSite=None and lives >= min_days."""
    # Attributes always follow a ';', which also keeps the patterns from
    # matching inside the cookie's own name=value pair. The value runs up to
    # the next ';' or the end of the string, so a last attribute is matched too.
    same_site = re.search(r";\s*samesite\s*=\s*([^;]*)", cookie, re.IGNORECASE)
    if not same_site or same_site.group(1).strip().lower() != "none":
        return False

    # Max-Age takes precedence over Expires when both are present (RFC 6265).
    max_age = re.search(r";\s*max-age\s*=\s*(-?\d+)", cookie, re.IGNORECASE)
    if max_age:
        return int(max_age.group(1)) >= min_days * 86400

    expires = re.search(r";\s*expires\s*=\s*([^;]*)", cookie, re.IGNORECASE)
    if not expires:
        # Neither Max-Age nor Expires: a session cookie, never long-lived.
        return False
    try:
        expire_date = parser.parse(expires.group(1).strip())
    except (ValueError, OverflowError):
        # Unparseable date: treat the cookie as not qualifying.
        return False
    if expire_date.tzinfo is None:
        # No zone in the date string; interpret it in the same zone as `now`.
        expire_date = expire_date.replace(tzinfo=now.tzinfo)
    return (expire_date - now).days >= min_days


def xorigin_cookie_domains(har_parser):
    """Return domains that set long-lived SameSite=None cookies via HTTP headers.

    A domain qualifies when any Set-Cookie response header of one of its
    requests carries ``SameSite=None`` and a lifespan of at least 90 days
    (taken from ``Max-Age``, else ``Expires``). The domain reported is the
    registrable domain (eTLD+1) of the request URL.

    Note: lifespans from ``Expires`` are measured against the time this
    function runs, not the time the HAR was captured.

    Args:
        har_parser: parsed HAR exposing ``pages`` -> ``entries`` with
            ``request.url`` and ``response.headers``.

    Returns:
        Sorted list of unique qualifying domains.
    """
    # Local import: the notebook's top-level imports bind only the datetime class.
    from datetime import timezone

    now = datetime.now(timezone.utc)
    domains = set()

    for page in har_parser.pages:
        for entry in page.entries:
            for header in entry.response.headers:
                # HTTP header names are case-insensitive.
                if header['name'].lower() != 'set-cookie':
                    continue
                # One header value may hold several cookies separated by
                # newlines; evaluate each cookie on its own.
                for cookie in header['value'].splitlines():
                    if _is_long_lived_samesite_none(cookie, now):
                        # fail_silently: None instead of an exception for URLs
                        # without a registrable domain.
                        domain = get_fld(entry.request.url, fail_silently=True)
                        if domain:
                            domains.add(domain)

    return sorted(domains)

● server_countries: list of distinct server countries (using the serverIPAddress field and the 
geolocation databases linked below)


In [204]:
#Get the country name of specified IP
def getCountryName(ipAddress):
    """Look up the English country name of an IP address in the local database.

    The database file ``dbip-country-lite.mmdb`` is opened from the working
    directory; a missing file still raises, as before.

    Args:
        ipAddress: IPv4 or IPv6 address as a string. Surrounding square
            brackets and whitespace are stripped before the lookup.

    Returns:
        The country name, or ``"unknown"`` when the address is empty, is not a
        valid IP, or has no country record in the database.
    """
    ip = (ipAddress or "").strip().strip("[]")
    if not ip:
        return "unknown"

    with maxminddb.open_database('dbip-country-lite.mmdb') as reader:
        try:
            # Reader.get returns the raw record (a dict) or None when the
            # address is not in the database. The reader has no .country()
            # method, which is why the old code always fell into its except
            # branch and answered "unknown".
            record = reader.get(ip)
        except ValueError:
            # Raised for strings that are not valid IP addresses.
            return "unknown"

    if not isinstance(record, dict):
        return "unknown"
    # Assumes the GeoLite2-style record layout: country -> names -> en,
    # with country -> iso_code as a fallback.
    country = record.get('country') or {}
    names = country.get('names') or {}
    return names.get('en') or country.get('iso_code') or "unknown"

def server_countries(file):
    """Return the distinct countries of the servers contacted in a HAR.

    Iterates the HAR passed in as ``file``. (Previously the parameter was
    ignored and the global ``har_parser`` was read instead, so every report
    listed the countries of the same capture.)

    Args:
        file: parsed HAR exposing ``pages`` -> ``entries`` -> ``serverAddress``.

    Returns:
        Sorted list of unique country names; entries without a server address
        or whose lookup yields "unknown" are left out.
    """
    countries = set()
    for page in file.pages:
        for entry in page.entries:
            # Strip the square brackets that may wrap an IPv6 address.
            ip_address = (entry.serverAddress or "").strip("[]")
            if not ip_address:
                continue
            country_name = getCountryName(ip_address)
            if country_name != "unknown":
                countries.add(country_name)
    # Sorted so the exported JSON is identical from run to run.
    return sorted(countries)

In [205]:
# Blocklists already read from disk, keyed by file path, so each file is
# parsed once instead of once per request.
_BLOCKLIST_CACHE = {}


def _load_blocklist(path):
    """Read a "just domains" blocklist file into a set of domains (cached per path)."""
    if path not in _BLOCKLIST_CACHE:
        with open(path, encoding='utf-8') as blocklist:
            stripped = (line.strip() for line in blocklist)
            # Skip blank lines and '#' comment lines.
            _BLOCKLIST_CACHE[path] = {
                entry.lower() for entry in stripped
                if entry and not entry.startswith('#')
            }
    return _BLOCKLIST_CACHE[path]


#Check whether the domain is in either one of the domain list
def is_domain_listed(request_domain,
                     easylist_path='easylist-justdomains.txt',
                     easyprivacy_path='easyprivacy-justdomains.txt'):
    """Return True if ``request_domain`` is an entry of EasyList or EasyPrivacy.

    The match is an exact, case-insensitive comparison against every line of
    the lists. (Previously only the last line of EasyList was kept and the
    check was a substring test against that single line.)

    Args:
        request_domain: domain or hostname to look up.
        easylist_path: path of the EasyList "just domains" file.
        easyprivacy_path: path of the EasyPrivacy "just domains" file.

    Returns:
        True when the domain is listed in either file, otherwise False.
        An empty or None domain is never listed.
    """
    if not request_domain:
        return False
    domain = request_domain.lower()
    # any() stops at the first hit, so EasyPrivacy is only read when needed.
    return any(domain in _load_blocklist(path)
               for path in (easylist_path, easyprivacy_path))

In [206]:
#Get the first 128 char of given URL
def get_url_first_128_char(url):
    """Return ``url`` cut down to at most its first 128 characters."""
    # Slicing already returns the whole string when it is shorter than the bound.
    return url[:128]

● requests: a list of dictionaries, where each dictionary contains the following 
request/response details:
○ request_domain: String; e.g. example.com
○ server_country: String; e.g. Germany; “unknown” if server IP is unavailable
○ num_request_cookies: Integer
○ num_response_cookies: Integer
○ is_tracker: Boolean; whether the request hostname or domain is listed in EasyList 
or EasyPrivacy “just domains” blocklists
○ url_first_128_char: String; the first 128 characters of the URL; e.g. 
https://example.com/pixel.gif

In [207]:
 def requests(har_parser):
     """Build one summary record per request/response pair in the HAR.

     Each record is a dict with the keys ``request_domain``,
     ``server_country``, ``num_request_cookies``, ``num_response_cookies``,
     ``is_tracker`` and ``url_first_128_char``.

     The cookie counts are per entry. (Previously the counters were created
     once outside the loops, so each record held a running total, and they
     counted cookie headers rather than cookies.)

     Args:
         har_parser: parsed HAR exposing ``pages`` -> ``entries`` with
             ``request``, ``response`` and ``serverAddress``.

     Returns:
         List of record dicts, in HAR order.
     """
     # Local import: urlparse is not among the notebook's top-level imports.
     from urllib.parse import urlparse

     records = []
     for page in har_parser.pages:
         for entry in page.entries:
             url = entry.request.url
             hostname = urlparse(url).hostname or ""
             # eTLD+1 of the request; fall back to the bare hostname when the
             # URL has no registrable domain (get_fld then returns None).
             request_domain = get_fld(url, fail_silently=True) or hostname

             # Drop any brackets around the address before the lookup.
             server_ip = re.sub(r"[\([{})\]]", "", entry.serverAddress or "")
             server_country = getCountryName(server_ip) if server_ip else 'unknown'

             num_request_cookies = len(entry.request.cookies or [])
             # Header names are case-insensitive, and one Set-Cookie value may
             # hold several cookies separated by newlines.
             num_response_cookies = sum(
                 1
                 for header in entry.response.headers
                 if header['name'].lower() == 'set-cookie'
                 for cookie_line in header['value'].splitlines()
                 if cookie_line.strip()
             )

             # A request is a tracker when either its registrable domain or
             # its full hostname appears in the blocklists.
             candidates = [request_domain]
             if hostname != request_domain:
                 candidates.append(hostname)
             is_tracker = any(bool(is_domain_listed(name)) for name in candidates if name)

             records.append({
                 'request_domain': request_domain,
                 'server_country': server_country,
                 'num_request_cookies': num_request_cookies,
                 'num_response_cookies': num_response_cookies,
                 'is_tracker': is_tracker,
                 'url_first_128_char': get_url_first_128_char(url),
             })

     return records

In [208]:
# Report for the plain capture (`file`): every key is paired with the metric
# function that produces it, and the functions run in the order listed.
# NOTE(review): the caption for the cookie-domain metric calls the field
# `cookie_domains`, while the key written here is `domains_w_cookies`;
# confirm which name the consumer of the JSON expects.
appDict = {
    key: metric(file)
    for key, metric in (
        ("num_reqs", find_num_requests),
        ("num_requests_w_cookies", num_requests_with_cookies),
        ("num_responses_w_cookies", num_responses_w_cookies),
        ("third_party_domains", third_party_domain),
        ("domains_w_cookies", domains_w_cookies),
        ("server_countries", server_countries),
        ("xorigin_cookie_domains", xorigin_cookie_domains),
        ("requests", requests),
    )
}

In [209]:
# Write the current report (`appDict`) to amazon.nl.json in the working directory.
with open('amazon.nl.json', 'w') as fp:
    fp.write(json.dumps(appDict))

In [210]:
# Report for the ad-blocker capture (`filename`): every key is paired with the
# metric function that produces it, and the functions run in the order listed.
# This rebinds `appDict`, replacing the report built for the plain capture.
appDict = {
    key: metric(filename)
    for key, metric in (
        ("num_reqs", find_num_requests),
        ("num_requests_w_cookies", num_requests_with_cookies),
        ("num_responses_w_cookies", num_responses_w_cookies),
        ("third_party_domains", third_party_domain),
        ("domains_w_cookies", domains_w_cookies),
        ("server_countries", server_countries),
        ("xorigin_cookie_domains", xorigin_cookie_domains),
        ("requests", requests),
    )
}

In [211]:
# Write the current report (`appDict`) to amazon.nl_adblocker.json in the working directory.
with open('amazon.nl_adblocker.json', 'w') as fp:
    fp.write(json.dumps(appDict))