In [2]:
import os
import json
from urllib.parse import urlparse
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from tld import get_fld
import pandas as pd
from IPython.display import display
from collections import Counter, defaultdict
import dns.resolver
import tldextract

# PREP

In [None]:
# 3 - table for most prevalent domains + isTracker
def get_tracking_domains_list():
    """Collect every tracker domain listed in the Disconnect services file.

    Reads ``disconnect-tracking-services.json`` from the working directory and
    walks ``categories -> organisations -> url entries -> domain collections``,
    keeping each domain once.

    Returns:
        list: the distinct domains, in arbitrary order.
    """
    with open('disconnect-tracking-services.json', 'r') as services_file:
        services = json.load(services_file)

    unique_domains = {
        domain
        for organisations in services['categories'].values()
        for organisation in organisations
        for url_entries in organisation.values()
        for domain_group in url_entries.values()
        for domain in domain_group
    }
    return list(unique_domains)

# Flat list of tracker domains, loaded once from the Disconnect services file.
TRACKING_DOMAINS = get_tracking_domains_list()
# Relative paths of the government and news crawl-data directories.
GOV_DIR = '../crawl_data_gov'
NEWS_DIR = '../crawl_data_news'
# Filename suffix used to pick the HAR files out of those directories.
EXT = 'har'

# EXTRACT ALL NECESSARY DATA FROM HAR FILES IN ONE ITERATION

## Helpers

In [None]:
# HTTP status codes that denote a redirect response.
RESPONSE_CODES = [301,308,302,303,307]

def get_all_info_from_entries(entries, gov_domain, filename):
    """Collect request statistics from the entries of a single HAR file.

    Args:
        entries: iterable of HAR entry dicts; each must provide
            ``entry['request']['url']`` and ``entry['request']['method']``.
        gov_domain: first-party label of the crawled site, forwarded to
            ``get_third_party_domain_counts`` to separate first- from
            third-party requests.
        filename: name of the HAR file being processed. Currently unused; it
            is only needed by the disabled redirection step (8) below.

    Returns:
        tuple: ``(num_third_party_requests, distinct_third_party_domains,
        num_get_reqs, num_post_reqs, num_other_reqs)``.
    """
    # for 1, 3
    distinct_third_party_domains = {}
    num_third_party_requests = 0

    # for 4
    num_get_reqs = 0
    num_post_reqs = 0
    num_other_reqs = 0

    for entry in entries:
        # for 1, 3
        num_third_party_requests, distinct_third_party_domains = get_third_party_domain_counts(
            entry, gov_domain, distinct_third_party_domains, num_third_party_requests
        )

        # for 4
        method = entry['request']['method']
        if method == "GET":
            num_get_reqs += 1
        elif method == "POST":
            num_post_reqs += 1
        else:
            num_other_reqs += 1

        # TODO: steps 7-9 are not enabled yet. When enabling one, create its
        # accumulator before the loop (noted per step) and add it to the
        # returned tuple.

        # # 7 - ch header
        # # needs: accept_ch_hints = {} ; visited_hints_in_website = []
        # accept_ch_hints, visited_hints_in_website = get_accept_ch_header_requests(entry, accept_ch_hints, visited_hints_in_website)

        # # 8 - cross-domain redirections
        # # needs: redirection_pairs = defaultdict(set)
        # if entry['response']['status'] in RESPONSE_CODES:
        #     source = get_fld(entry['request']['url'])
        #     target = get_fld(entry['response']['redirectURL'])
        #
        #     if is_cross_domain(source, target):
        #         redirection_pairs[(source, target)].add(filename)

        # # 9 - CNAME
        # # needs: first_party_subdomains = set()
        # main_domain = "ad.nl"  # TODO  why ? for gov ad.nl,  different for news
        # request_url = entry["request"]["url"]
        # hostname = urlparse(request_url).hostname
        # if hostname:
        #     domain = get_fld(request_url)
        #     if domain == main_domain:
        #         first_party_subdomains.add(hostname)

    return num_third_party_requests, distinct_third_party_domains, num_get_reqs, num_post_reqs, num_other_reqs


def is_cross_domain(url1, url2):
    """Report whether the two given values differ.

    The check is a plain inequality; no parsing or normalisation happens here,
    so callers should pass values that are already comparable (the disabled
    redirection step passes ``get_fld`` results).
    """
    differs = url1 != url2
    return differs
	
	
def get_third_party_domain_counts(entry, gov_domain, distinct_third_party_domains, num_third_party_requests):
    """Update the third-party tallies with a single HAR entry.

    A request counts as third-party when its URL has a hostname and
    ``gov_domain`` does not occur in that hostname (substring match).

    Args:
        entry: HAR entry dict providing ``entry['request']['url']``.
        gov_domain: first-party label of the crawled site.
        distinct_third_party_domains: dict mapping a third-party domain to the
            number of requests seen for it; updated in place.
        num_third_party_requests: running total of third-party requests.

    Returns:
        tuple: ``(num_third_party_requests, distinct_third_party_domains)``
        after accounting for ``entry``.
    """
    url = entry['request']['url']
    hostname = urlparse(url).hostname
    # URLs without a host (e.g. data: URIs) and first-party hosts leave the tallies untouched.
    if not hostname or gov_domain in hostname:
        return num_third_party_requests, distinct_third_party_domains

    num_third_party_requests += 1
    # get_fld raises for hosts without a registrable domain (IP addresses,
    # localhost, ...); fall back to the raw hostname so one odd request does
    # not abort the whole crawl analysis.
    domain = get_fld(url, fail_silently=True) or hostname
    # Count from 1 on first sight (previously the first request was stored as 0).
    distinct_third_party_domains[domain] = distinct_third_party_domains.get(domain, 0) + 1
    return num_third_party_requests, distinct_third_party_domains


def get_accept_ch_header_requests(entry, accept_ch_hints, visited_hints_in_website):
    """Count the client hints advertised by one HAR entry's ``Accept-CH`` headers.

    Each hint is counted at most once for as long as the same
    ``visited_hints_in_website`` list is passed in, so repeated responses
    listing the same hint do not inflate its count.

    Args:
        entry: HAR entry dict providing ``entry['response']['headers']``, a
            list of ``{'name': ..., 'value': ...}`` dicts.
        accept_ch_hints: dict mapping hint -> count; updated in place.
        visited_hints_in_website: list of hints already counted; updated in
            place.

    Returns:
        tuple: ``(accept_ch_hints, visited_hints_in_website)``.
    """
    for header in entry['response']['headers']:
        # HTTP header names are case-insensitive; the recorded name may be "Accept-CH".
        if header['name'].lower() != 'accept-ch':
            continue
        for raw_hint in header['value'].split(','):
            hint = raw_hint.strip()
            # Skip empty tokens (e.g. from a trailing comma) and hints already counted.
            if not hint or hint in visited_hints_in_website:
                continue
            accept_ch_hints[hint] = accept_ch_hints.get(hint, 0) + 1
            visited_hints_in_website.append(hint)
    return accept_ch_hints, visited_hints_in_website

In [None]:
# ONE BIG FUNC FOR EVERYTHING
# gov_dir = '..\crawl_data_gov'
# news_dir = '..\crawl_data_news'
GOV_DIR = '../crawl_data_gov'
GIV_DIR = GOV_DIR  # misspelled name from an earlier revision, kept as an alias
NEWS_DIR = '../crawl_data_news'
EXT = 'har'

# Accumulators, one entry per processed HAR file (gov / news kept apart).
num_requests_gov = []
num_requests_news = []
num_third_party_requests_gov = []
num_third_party_requests_news = []
num_third_party_domains_gov = []
num_third_party_domains_news = []
client_hints_gov = []
client_hints_news = []

# Request-method totals.
num_get_reqs_gov = 0
num_get_reqs_news = 0
num_post_reqs_gov = 0
num_post_reqs_news = 0
num_other_reqs_gov = 0
num_other_reqs_news = 0

# Redirection pairs -> set of values (the disabled step 8 adds HAR filenames).
gov_redirection_pairs = defaultdict(set)
news_redirection_pairs = defaultdict(set)

# First-party hostnames (filled by the disabled step 9).
gov_first_party_subdomains = set()
news_first_party_subdomains = set()


def get_all_info_form_hars(dir):
    """Process every HAR file in ``dir`` and record the request count per file.

    For each file ending in ``EXT`` the number of entries is appended to
    ``num_requests_news`` when ``dir`` is ``NEWS_DIR`` and to
    ``num_requests_gov`` otherwise (previously news counts also landed in the
    government list). The part of the filename before the first ``.`` is
    passed on as the site's first-party label.

    Args:
        dir: directory containing the HAR files. The name shadows the builtin
            but is kept so keyword callers continue to work.

    Returns:
        tuple: ``(num_third_party_requests, distinct_third_party_domains,
        num_get_reqs, num_post_reqs, num_other_reqs)`` as returned by
        ``get_all_info_from_entries`` for the LAST HAR file processed, or
        ``(0, {}, 0, 0, 0)`` when the directory holds no HAR file.

    NOTE(review): returning only the last file's values looks like work in
    progress — confirm whether these should be aggregated per site instead.
    """
    # Route per-file totals to the list that matches the crawled corpus.
    is_news = os.path.normpath(dir) == os.path.normpath(NEWS_DIR)
    num_requests_per_site = num_requests_news if is_news else num_requests_gov

    # Defaults so a directory without HAR files does not raise UnboundLocalError.
    num_third_party_requests = 0
    distinct_third_party_domains = {}
    num_get_reqs = num_post_reqs = num_other_reqs = 0

    for file in os.listdir(dir):
        if not file.endswith(EXT):
            continue

        with open(os.path.join(dir, file), 'r', encoding='utf-8') as har_file:
            har_data = json.load(har_file)
        entries = har_data['log']['entries']

        num_requests_per_site.append(len(entries))

        (
            num_third_party_requests,
            distinct_third_party_domains,
            num_get_reqs,
            num_post_reqs,
            num_other_reqs,
        ) = get_all_info_from_entries(entries, file.split('.')[0], file)

        # TODO: append the per-site third-party request/domain counts and the
        # client hints to their module-level lists, and merge
        # distinct_third_party_domains across sites for table 3
        # (most prevalent third-party domains).

    return num_third_party_requests, distinct_third_party_domains, num_get_reqs, num_post_reqs, num_other_reqs


# STATS AND VISUALIZATIONS

## 1 - BOXPLOTS WITH SOME METRICS

In [None]:
def create_boxplot(data, title, x_label, y_label, x_ticks=["Government", "News"], filename="boxplot.png"):
    """Draw a boxplot, save it under ``boxplots/`` and show it.

    Args:
        data: data passed straight to ``seaborn.boxplot`` (the notebook passes
            a list with one list of values per box).
        title: figure title.
        x_label: label of the x axis.
        y_label: label of the y axis.
        x_ticks: tick labels, one per box. The default list is shared between
            calls and must not be mutated.
        filename: name of the image file written to ``boxplots/``.
    """
    sns.set(rc={'figure.figsize': (10, 10)})
    sns.set(style="whitegrid")
    ax = sns.boxplot(data=data)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_xticklabels(x_ticks)
    ax.set_ylabel(y_label)

    output_path = f'boxplots/{filename}'
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path)
    plt.show()

In [None]:
# Crawl timing files map each site to its measured loading time.
with open('crawl_data_gov_times') as f:
    times_gov_dict = json.load(f)

with open('crawl_data_news_times') as f:
    times_news_dict = json.load(f)

# Only the values are needed for the plots and summary statistics.
times_gov_list = list(times_gov_dict.values())
times_news_list = list(times_news_dict.values())


In [None]:
# Total number of requests per site, government vs. news.
create_boxplot(
    data=[num_requests_gov, num_requests_news],
    title="Number of Requests of Government and News Websites",
    x_label="Website Type",
    y_label="Number of Requests",
    filename="gov_news_requests.png",
)

In [None]:
# Page loading time per site, government vs. news.
create_boxplot(
    data=[times_gov_list, times_news_list],
    title="Loading Time of Government and News Websites",
    x_label="Website Type",
    y_label="Loading Time (s)",
    filename="gov_news_loading_time.png",
)


In [None]:
# Number of third-party requests per site, government vs. news.
create_boxplot(
    data=[num_third_party_requests_gov, num_third_party_requests_news],
    title="Number of Third Party Requests of Government and News Websites",
    x_label="Website Type",
    y_label="Number of Third Party Requests",
    filename="gov_news_third_party_requests.png",
)

In [None]:
# Number of distinct third-party domains per site, government vs. news.
create_boxplot(
    data=[num_third_party_domains_gov, num_third_party_domains_news],
    title="Number of Third Party Domains of Government and News Websites",
    x_label="Website Type",
    y_label="Number of Third Party Domains",
    filename="gov_news_third_party_domains.png",
)

## 2 - TABLE WITH MIN MEDIAN MAX FOR THE METRICS

In [None]:
# Each metric maps to [government values, news values].
metrics = {
    "loading_time": [times_gov_list, times_news_list],
    "num_requests": [num_requests_gov, num_requests_news],
    "num_third_party_requests": [num_third_party_requests_gov, num_third_party_requests_news],
    "num_third_party_domains": [num_third_party_domains_gov, num_third_party_domains_news],
}
for metric, values in metrics.items():
    print(f"Metric: {metric}")
    # News is reported before government, matching the original output order.
    for label, site_values in (('News:', values[1]), ('Government:', values[0])):
        print(label)
        if not site_values:
            # min()/max() raise on empty input; report the gap instead of crashing.
            print("No data")
            continue
        print(f"Min: {min(site_values)}")
        # A true median; the previous sum/len expression was the arithmetic mean.
        print(f"Median: {pd.Series(site_values).median()}")
        print(f"Max: {max(site_values)}")


    # TODO PLEASE MAKE A TABLE IN CODE AS WELL!!!

## 3 - Table with prevalent third-party domains and indication whether they are classified as a tracker

In [None]:
def prevalent_tracker_domains_table(third_party_entries):
    """Build a table of the ten most prevalent third-party domains.

    Args:
        third_party_entries: dict mapping a third-party domain to its count.
            The count is shown under "Number of distinct websites"; whether it
            really counts websites depends on what the caller passes in.

    Returns:
        pandas.DataFrame: at most ten rows sorted by count (descending) with
        the columns ``Third-party domain``, ``Number of distinct websites``
        and the boolean ``isTracker`` (domain is listed in
        ``TRACKING_DOMAINS``).
    """
    table = pd.DataFrame({
        'Third-party domain': list(third_party_entries.keys()),
        'Number of distinct websites': list(third_party_entries.values()),
    })
    table = table.sort_values(by=['Number of distinct websites'], ascending=False, ignore_index=True)
    # Vectorised membership test instead of scanning the tracker list once per row.
    table['isTracker'] = table['Third-party domain'].isin(TRACKING_DOMAINS)
    return table.head(10)

In [None]:
# Most prevalent third-party domains on news sites, rendered as LaTeX source.
# NOTE(review): `all_third_party_entries_news` is not assigned in any visible
# cell — confirm it is defined before this cell runs.
tracker_table_news = prevalent_tracker_domains_table(all_third_party_entries_news)
tracker_table_news.to_latex()

In [None]:
# Most prevalent third-party domains on government sites, rendered as LaTeX source.
# NOTE(review): `all_third_party_entries_gov` is not assigned in any visible
# cell — confirm it is defined before this cell runs.
tracker_table_gov = prevalent_tracker_domains_table(all_third_party_entries_gov)
tracker_table_gov.to_latex()

## 4 - 

## 7 ACCEPT-CH ANALYSIS TABLE

In [None]:
def _sort_hints_by_count(hint_counts):
    """Return the hints as a dict ordered by descending count (ties keep input order)."""
    return dict(sorted(hint_counts.items(), key=lambda item: item[1], reverse=True))


def _sum_hint_counts(hint_count_dicts):
    """Add up several hint -> count mappings into a single dict."""
    totals = {}
    for hints in hint_count_dicts:
        for hint, count in hints.items():
            totals[hint] = totals.get(hint, 0) + count
    return totals


# Hints of the first list element only; guarded so an empty list does not raise IndexError.
sorted_client_hints_news = _sort_hints_by_count(client_hints_news[0]) if client_hints_news else {}
sorted_client_hints_gov = _sort_hints_by_count(client_hints_gov[0]) if client_hints_gov else {}

# Combine client hints from all websites
all_client_hints_gov = _sum_hint_counts(client_hints_gov)

# Sort client hints by count
sorted_all_client_hints_gov = _sort_hints_by_count(all_client_hints_gov)

all_client_hints_news = _sum_hint_counts(client_hints_news)

sorted_all_client_hints_news = _sort_hints_by_count(all_client_hints_news)

# Gov and news websites client hints combined (gov first, so ties keep gov order)
all_client_hints = _sum_hint_counts([sorted_all_client_hints_gov, sorted_all_client_hints_news])

sorted_all_client_hints = _sort_hints_by_count(all_client_hints)

# Get 3 most common client hints
top_3_client_hints = list(sorted_all_client_hints.keys())[:3]
print(top_3_client_hints)

# Get counts for top 3 client hints
top_3_client_hints_counts = []  # not used in the visible cells; kept in case later cells read it
counts_gov = {hint: sorted_all_client_hints_gov.get(hint, 0) for hint in top_3_client_hints}
counts_news = {hint: sorted_all_client_hints_news.get(hint, 0) for hint in top_3_client_hints}
print(counts_gov)
print(counts_news)