In [20]:
import json
import os
import pandas as pd
from IPython.display import display
from tld import get_fld
from collections import Counter, defaultdict
from urllib.parse import urlparse
import dns.resolver
import tldextract

In [21]:
def is_cross_domain(url1, url2):
    return url1 != url2

In [22]:
gov_dir = '..\crawl_data_gov'
news_dir = '..\crawl_data_news'
ext = 'har'
num_get_reqs_gov = 0
num_get_reqs_news = 0
num_post_reqs_gov = 0
num_post_reqs_news = 0
num_other_reqs_gov = 0
num_other_reqs_news = 0
response_codes = [301,308,302,303,307]
gov_redirection_pairs = defaultdict(set)
news_redirection_pairs = defaultdict(set)
gov_first_party_subdomains = set()
news_first_party_subdomains = set()

for file in os.listdir(gov_dir):
    if file.endswith(ext):
        with open('..\crawl_data_gov\\' + file, 'r', encoding='utf-8') as har_file:
            har_data = json.load(har_file)
            entries = har_data['log']['entries']

            for entry in entries:
                if entry['request']['method'] == "GET":
                    num_get_reqs_gov += 1
                elif entry['request']['method'] == "POST":
                    num_post_reqs_gov += 1
                else:
                    num_other_reqs_gov += 1
                
                if entry['response']['status'] in response_codes:
                    source = get_fld(entry['request']['url'])
                    target = get_fld(entry['response']['redirectURL'])
                    
                    if is_cross_domain(source, target):
                        gov_redirection_pairs[(source, target)].add(file)
                
                main_domain = "ad.nl"
                request_url = entry["request"]["url"]
                hostname = urlparse(request_url).hostname
                if hostname:
                    domain = get_fld(request_url)
                    if domain == main_domain:
                        gov_first_party_subdomains.add(hostname)
    else:
        continue

for file in os.listdir(news_dir):
    if file.endswith(ext):
        with open('..\crawl_data_news\\' + file, 'r', encoding='utf-8') as har_file:
            har_data = json.load(har_file)
            entries = har_data['log']['entries']
            main_domain = file.split('_')[0]

            for entry in entries:
                if entry['request']['method'] == "GET":
                    num_get_reqs_news += 1
                elif entry['request']['method'] == "POST":
                    num_post_reqs_news += 1
                else:
                    num_other_reqs_news += 1

                if entry['response']['status'] in response_codes:
                    source = get_fld(entry['request']['url'])
                    target = get_fld(entry['response']['redirectURL'])
                    
                    if is_cross_domain(source, target):
                        news_redirection_pairs[(source, target)].add(file)
                    
                request_url = entry["request"]["url"]
                hostname = urlparse(request_url).hostname
                if hostname and hostname != main_domain:
                    domain = get_fld(request_url, fail_silently=True)
                    if domain == main_domain:
                        news_first_party_subdomains.add(hostname)
    else:
        continue

In [27]:
with open('..\domain_map.json', 'r', encoding='utf-8') as domain_map:
    domain_map_data = json.load(domain_map)

gov_cname_records = []
news_cname_records = []
for gov_first_party_subdomain in gov_first_party_subdomains:
    try:
        answers = dns.resolver.resolve(gov_first_party_subdomain, 'CNAME')
        for rdata in answers:
            cname_record = rdata.target.to_text().rstrip(".")
            try:
                extracted = tldextract.extract(cname_record)
                main_domain = f"{extracted.domain}.{extracted.suffix}"
                cname_entity = domain_map_data[main_domain]['entityName']
            except:
                print("domain not found: " + cname_record)
                cname_entity = ""
            gov_cname_records.append((gov_first_party_subdomain, cname_record, cname_entity))
    except Exception as e:
        print(f"DNS error: {e}")

for news_first_party_subdomain in news_first_party_subdomains:
    try:
        answers = dns.resolver.resolve(news_first_party_subdomain, 'CNAME')
        for rdata in answers:
            cname_record = rdata.target.to_text().rstrip(".")
            try:
                extracted = tldextract.extract(cname_record)
                main_domain = f"{extracted.domain}.{extracted.suffix}"
                cname_entity = domain_map_data[main_domain]['entityName']
            except:
                print("domain not found: " + cname_record)
                cname_entity = ""
            news_cname_records.append((news_first_party_subdomain, cname_record, cname_entity))
    except Exception as e:
        print(f"DNS error: {e}")

domain not found: ff53ed04-b5fc-4ccf-b357-c1c4a2dd582a.customer.scalia.network
domain not found: cs964.wpc.9465e.mucdn.net
domain not found: 598061811971.gigya-api.com
domain not found: www-msn-com.a-0003.a-msedge.net
domain not found: sportdaten.n-tv.de.cname.weltsport.org
domain not found: a80a9201ee101bbc3.awsglobalaccelerator.com
DNS error: The DNS response does not contain an answer to the question: edition.cnn.com. IN CNAME
domain not found: frontend.prod.utiq-aws.net
domain not found: umto.adfuel.turnerapps.com
domain not found: samizdat-graphql.prd.map.nytimes.xovr.nyt.net
domain not found: gedi-repubblica-www-produzione.cdn.zephr.com
domain not found: ff53ed04-b5fc-4ccf-b357-c1c4a2dd582a.customer.scalia.network
DNS error: The DNS response does not contain an answer to the question: emc2.lefigaro.fr. IN CNAME
domain not found: ccm.npo.nl
DNS error: The DNS response does not contain an answer to the question: www.telegraaf.nl. IN CNAME


In [28]:
gov_pair_counts = {pair: len(websites) for pair, websites in gov_redirection_pairs.items()}
gov_most_common_elements = Counter(gov_pair_counts).most_common(3)

news_pair_counts = {pair: len(websites) for pair, websites in news_redirection_pairs.items()}
news_most_common_elements = Counter(news_pair_counts).most_common(3)

In [29]:
data = {
    'HTTP Method': ['GET', 'POST', 'other'],
    'Crawl-news': [num_get_reqs_news, num_post_reqs_news, num_other_reqs_news],
    'Crawl-gov': [num_get_reqs_gov, num_post_reqs_gov, num_other_reqs_gov]
}

df = pd.DataFrame(data)
totals = df[['Crawl-gov', 'Crawl-news']].sum()

df['Crawl-gov %'] = (df['Crawl-gov'] / totals['Crawl-gov']) * 100
df['Crawl-news %'] = (df['Crawl-news'] / totals['Crawl-news']) * 100

df['Crawl-gov'] = df['Crawl-gov'].astype(str) + " (" + df['Crawl-gov %'].round(1).astype(str) + "%)"
df['Crawl-news'] = df['Crawl-news'].astype(str) + " (" + df['Crawl-news %'].round(1).astype(str) + "%)"
df = df.drop(columns=['Crawl-gov %', 'Crawl-news %'])

display(df.style.hide(axis='index'))

HTTP Method,Crawl-news,Crawl-gov
GET,14364 (89.5%),1087 (93.5%)
POST,1677 (10.5%),75 (6.5%)
other,5 (0.0%),0 (0.0%)


In [30]:
# Data is only available for news websites, since government websites don't have any cross-domain redirections
cross_domain_data = {
    'Source domain': [news_most_common_elements[0][0][0], news_most_common_elements[1][0][0], news_most_common_elements[2][0][0]],
    'Target domain': [news_most_common_elements[0][0][1], news_most_common_elements[1][0][1], news_most_common_elements[2][0][1]],
    'Number of distinct websites': [news_most_common_elements[0][1], news_most_common_elements[1][1], news_most_common_elements[2][1]]
}
cross_domain_df = pd.DataFrame(cross_domain_data)
display(cross_domain_df.style.hide(axis='index'))

Source domain,Target domain,Number of distinct websites
casalemedia.com,doubleclick.net,16
doubleclick.net,casalemedia.com,16
adnxs.com,doubleclick.net,12


In [33]:
gov_cname_data = {
    'First-party subdomain': [cname_record[0] for cname_record in gov_cname_records],
    'CNAME record': [cname_record[1] for cname_record in gov_cname_records],
    'CNAME entity': [cname_record[2] for cname_record in gov_cname_records]
}
gov_cname_df = pd.DataFrame(gov_cname_data)
display(gov_cname_df.style.hide(axis='index'))

news_cname_data = {
    'First-party subdomain': [cname_record[0] for cname_record in news_cname_records],
    'CNAME record': [cname_record[1] for cname_record in news_cname_records],
    'CNAME entity': [cname_record[2] for cname_record in news_cname_records]
}
news_cname_df = pd.DataFrame(news_cname_data)
display(news_cname_df.style.hide(axis='index'))

First-party subdomain,CNAME record,CNAME entity


First-party subdomain,CNAME record,CNAME entity
browser.events.data.msn.com,global.asimov.events.data.trafficmanager.net,Microsoft Corporation
temptation.ad.nl,dpp-ad-wc.edgekey.net,Akamai Technologies
bilder.n-tv.de,duxqe05z34l4c.cloudfront.net,"Amazon Technologies, Inc."
confiant.msn.com,confiant.msn.com.edgekey.net,Akamai Technologies
aamt.nbcnews.com,nbcnews.com.ssl.sc.omtrdc.net,Adobe Inc.
guce.yahoo.com,real.rotation.guce.aws.oath.cloud,Verizon Media
federated-id.live.api.bbc.co.uk,federated-id.live.d918634911cdcd24.xhst.bbci.co.uk,British Broadcasting Corporation
www.lefigaro.fr,www.lefigaro.fr.edgekey.net,Akamai Technologies
pbs.www.aol.com,edge.gycpi.b.yahoodns.net,Verizon Media
www.ouest-france.fr,ouest-france.edgekey.net,Akamai Technologies
