In [1]:
import json
import os
import pandas as pd
from IPython.display import display
from tld import get_fld
from collections import Counter, defaultdict

In [2]:
def is_cross_domain(url1, url2):
    """Return True when the two values differ, False when they are equal.

    Despite the parameter names, the callers in this notebook pass the
    first-level domains produced by ``get_fld`` rather than full URLs, so a
    True result means the redirect left the source's first-level domain.
    """
    same_domain = url1 == url2
    return not same_domain

In [3]:
# --- Configuration ---------------------------------------------------------
# Paths are built with os.path.join so there is no reliance on the invalid
# '\c' escape sequence and the notebook is not tied to Windows separators.
gov_dir = os.path.join('..', 'crawl_data_gov')
news_dir = os.path.join('..', 'crawl_data_news')
ext = 'har'
# HTTP status codes that are treated as redirections.
response_codes = [301, 308, 302, 303, 307]


def _collect_crawl_stats(crawl_dir):
    """Aggregate request-method counts and cross-domain redirects for one crawl.

    Every file in ``crawl_dir`` whose name ends with ``ext`` is parsed as a
    HAR JSON document and its ``log.entries`` list is scanned.

    Args:
        crawl_dir: Directory containing the HAR files of a single crawl.

    Returns:
        A ``(method_counts, redirection_pairs)`` tuple where
        ``method_counts`` is a Counter with the keys 'GET', 'POST' and
        'other', and ``redirection_pairs`` maps each
        ``(source_domain, target_domain)`` pair to the set of HAR file names
        in which that cross-domain redirect was observed.
    """
    method_counts = Counter()
    redirection_pairs = defaultdict(set)

    for file in os.listdir(crawl_dir):
        if not file.endswith(ext):
            continue

        with open(os.path.join(crawl_dir, file), 'r', encoding='utf-8') as har_file:
            entries = json.load(har_file)['log']['entries']

        for entry in entries:
            method = entry['request']['method']
            method_counts[method if method in ('GET', 'POST') else 'other'] += 1

            if entry['response']['status'] not in response_codes:
                continue

            # fail_silently=True makes get_fld return None instead of raising
            # when a URL cannot be parsed (possibly an empty or relative
            # redirectURL), so one odd entry does not abort the whole crawl.
            source = get_fld(entry['request']['url'], fail_silently=True)
            target = get_fld(entry['response']['redirectURL'], fail_silently=True)
            if source is None or target is None:
                continue

            if is_cross_domain(source, target):
                redirection_pairs[(source, target)].add(file)

    return method_counts, redirection_pairs


gov_method_counts, gov_redirection_pairs = _collect_crawl_stats(gov_dir)
news_method_counts, news_redirection_pairs = _collect_crawl_stats(news_dir)

# Keep the original per-crawl counter names so later cells keep working.
num_get_reqs_gov = gov_method_counts['GET']
num_post_reqs_gov = gov_method_counts['POST']
num_other_reqs_gov = gov_method_counts['other']
num_get_reqs_news = news_method_counts['GET']
num_post_reqs_news = news_method_counts['POST']
num_other_reqs_news = news_method_counts['other']

In [4]:
# For every (source, target) redirection pair, count the distinct HAR file
# names it was seen in (presumably one file per crawled website), then keep
# the three pairs with the highest counts for each crawl.
gov_pair_counts = dict(
    (redirect_pair, len(har_files))
    for redirect_pair, har_files in gov_redirection_pairs.items()
)
gov_most_common_elements = Counter(gov_pair_counts).most_common(3)

news_pair_counts = dict(
    (redirect_pair, len(har_files))
    for redirect_pair, har_files in news_redirection_pairs.items()
)
news_most_common_elements = Counter(news_pair_counts).most_common(3)

In [5]:
# Request counts per HTTP method, one column per crawl.
data = {
    'HTTP Method': ['GET', 'POST', 'other'],
    'Crawl-news': [num_get_reqs_news, num_post_reqs_news, num_other_reqs_news],
    'Crawl-gov': [num_get_reqs_gov, num_post_reqs_gov, num_other_reqs_gov],
}

df = pd.DataFrame(data)
# Column totals are taken once, while both count columns are still numeric.
totals = df[['Crawl-gov', 'Crawl-news']].sum()

# Rewrite each count as "<count> (<share>%)", with the share rounded to one
# decimal place, e.g. "1087 (93.5%)".
for crawl in ('Crawl-gov', 'Crawl-news'):
    share = (df[crawl] / totals[crawl]) * 100
    df[crawl] = df[crawl].astype(str) + " (" + share.round(1).astype(str) + "%)"

display(df.style.hide(axis='index'))

HTTP Method,Crawl-news,Crawl-gov
GET,14364 (89.5%),1087 (93.5%)
POST,1677 (10.5%),75 (6.5%)
other,5 (0.0%),0 (0.0%)


In [6]:
# Only the news crawl is tabulated: per the original analysis, the government
# websites do not have any cross-domain redirections.
#
# Each element of news_most_common_elements is ((source, target), count).
# Rows are built by iterating over the list instead of indexing [0], [1], [2],
# so the cell still works (showing fewer rows) when fewer than three
# cross-domain pairs were found.
cross_domain_data = {
    'Source domain': [pair[0] for pair, _count in news_most_common_elements],
    'Target domain': [pair[1] for pair, _count in news_most_common_elements],
    'Number of distinct websites': [count for _pair, count in news_most_common_elements],
}
cross_domain_df = pd.DataFrame(cross_domain_data)
display(cross_domain_df.style.hide(axis='index'))

Source domain,Target domain,Number of distinct websites
casalemedia.com,doubleclick.net,16
doubleclick.net,casalemedia.com,16
adnxs.com,doubleclick.net,12
