In [6]:
import pandas as pd
from tld import get_fld
import pickle
import sys
from collections import defaultdict, Counter
import os
import random
from leak_common import add_adblocked_status, get_domain, add_rank_col, get_initiators, add_sniffer_domain_col, find_prominence, is_req_off_site_direction, get_entity, check_sniff_initiators, belong_to_same_entity, get_sniffs
import networkx as nx
import matplotlib.pyplot as plt

In [7]:
# Load the pickled object holding extra tracker domains. It is later passed to
# `Series.isin` against `request_url_domain` when deriving `is_blocked`
# (presumably a set or list of domain strings — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('extra_tracker_domains.pkl', 'rb') as handle:
    extra_tracker_domains = pickle.load(handle)

# Read request data from CSV

In [38]:
# Load the request records into `df`. Despite the .csv extension the file is
# read as tab-separated, hence the explicit separator.
df = pd.read_csv(
    'nyc_desktop_reject_all.csv',
    sep='\t',
    encoding='utf-8',
)

# Additional Columns

In [39]:
# A request is flagged as blocked when any of the per-blocklist flag columns
# is set, or when its request domain appears in `extra_tracker_domains`.
df['is_blocked'] = (
    df['easy_list_blocked']
    | df['easy_privacy_blocked']
    | df['disconnect_blocked']
    | df['whotracksme_blocked']
    | df['tds_blocked']
    | df['ublock_blocked']
    | df['request_url_domain'].isin(extra_tracker_domains)
)

# Tracker-related: a third-party request that is also flagged as blocked.
df['tracker_related'] = df['third_party_req'] & df['is_blocked']

# All Requests

In [14]:
# Columns that together identify one distinct leak record for deduplication.
dedup_keys = ['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type']

# `all_reqs` is an alias of `df` (not a copy); the email subset keeps only
# rows whose search_type is 'email'.
all_reqs = df
all_reqs_email = all_reqs.loc[all_reqs['search_type'] == 'email']

# Deduplicated views of both frames (first occurrence of each key is kept).
unique_all_reqs = all_reqs.drop_duplicates(subset=dedup_keys)
unique_all_reqs_email = all_reqs_email.drop_duplicates(subset=dedup_keys)

In [15]:
# Report the row count of each "all requests" frame, one line per frame.
for count_label, counted_frame in (
    ('all_reqs: ', all_reqs),
    ('unique_all_reqs: ', unique_all_reqs),
    ('all_reqs_email: ', all_reqs_email),
    ('unique_all_reqs_email: ', unique_all_reqs_email),
):
    print(count_label, len(counted_frame))

all_reqs:  22340
unique_all_reqs:  7887
all_reqs_email:  19379
unique_all_reqs_email:  6551


# Third Party Requests

In [16]:
# Columns that together identify one distinct leak record for deduplication.
dedup_keys = ['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type']

# Keep third-party requests whose `is_req_off_site_direction` flag is not set.
not_off_site = ~df['is_req_off_site_direction']
third_party_reqs = df.loc[df['third_party_req'] & not_off_site]
third_party_reqs_email = third_party_reqs.loc[third_party_reqs['search_type'] == 'email']

# Deduplicated views of both frames (first occurrence of each key is kept).
unique_third_party_reqs = third_party_reqs.drop_duplicates(subset=dedup_keys)
unique_third_party_reqs_email = third_party_reqs_email.drop_duplicates(subset=dedup_keys)

In [17]:
# Report the row count of each third-party frame, one line per frame.
for count_label, counted_frame in (
    ('third_party_reqs: ', third_party_reqs),
    ('unique_third_party_reqs: ', unique_third_party_reqs),
    ('third_party_reqs_email: ', third_party_reqs_email),
    ('unique_third_party_reqs_email: ', unique_third_party_reqs_email),
):
    print(count_label, len(counted_frame))

third_party_reqs:  12967
unique_third_party_reqs:  4647
third_party_reqs_email:  12415
unique_third_party_reqs_email:  4442


# Tracking Related Requests

In [18]:
# Columns that together identify one distinct leak record for deduplication.
dedup_keys = ['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type']

# Row filters, combined with AND below:
#   - the `is_req_off_site_direction` flag is not set
#     (drop `not_off_site` from the mask to include those requests as well)
#   - leak_type is neither of the two response-side leak types
#   - the request is tracker related
#   - last_page_domain equals final_url_domain or initial_url
not_off_site = ~df['is_req_off_site_direction']
not_response_leak = ~df['leak_type'].isin(['response_cookie_leaks', 'response_location_leaks'])
page_domain_matches = (
    (df['final_url_domain'] == df['last_page_domain'])
    | (df['initial_url'] == df['last_page_domain'])
)
tracker_related_reqs_df = df[
    not_off_site & not_response_leak & df['tracker_related'] & page_domain_matches
]

# NOTE(review): this is the only dedup in the visible cells that keeps the
# last duplicate instead of the first — confirm that is intentional.
unique_tracking_related_df = tracker_related_reqs_df.drop_duplicates(subset=dedup_keys, keep='last')

email_leaks = tracker_related_reqs_df.loc[tracker_related_reqs_df['search_type'] == 'email']
unique_email_leaks = email_leaks.drop_duplicates(subset=dedup_keys)

In [19]:
# Report the row count of each tracker-related frame, one line per frame.
for count_label, counted_frame in (
    ('tracker_related_reqs_df: ', tracker_related_reqs_df),
    ('unique_tracking_related_df: ', unique_tracking_related_df),
    ('email_leaks: ', email_leaks),
    ('unique_email_leaks: ', unique_email_leaks),
):
    print(count_label, len(counted_frame))

tracker_related_reqs_df:  8088
unique_tracking_related_df:  3366
email_leaks:  7713
unique_email_leaks:  3242


# High Level Statistics

In [41]:
# One row per distinct initial_hostname in each email frame: all requests,
# third-party requests, and tracker-related requests respectively.
sites_all_email = all_reqs_email.drop_duplicates('initial_hostname')
sites_third_party_email = unique_third_party_reqs_email.drop_duplicates('initial_hostname')
sites_tracker_email = unique_email_leaks.drop_duplicates('initial_hostname')

print(
    'Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains: ',
    len(sites_all_email),
)
print(
    'Distinct websites where emails are leaked to both 3rd parties and tracker domains: ',
    len(sites_third_party_email),
)
print(
    'Distinct websites where emails are leaked to only tracker domains: ',
    len(sites_tracker_email),
)

Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains:  4395
Distinct websites where emails are leaked to both 3rd parties and tracker domains:  2633
Distinct websites where emails are leaked to only tracker domains:  1844


# Prominence

In [46]:
# Run find_prominence (imported from leak_common) on the email rows of the
# deduplicated tracker-related requests.
unique_tracking_email_rows = unique_tracking_related_df[
    unique_tracking_related_df['search_type'] == 'email'
]
prominence_list = find_prominence(unique_tracking_email_rows)

In [47]:
# Wrap the find_prominence result in a DataFrame with named columns
# (presumably one record per domain — confirm against find_prominence).
prominence_columns = ['domain', 'prominence', 'number_of_sites']
prominence_df = pd.DataFrame(prominence_list, columns=prominence_columns)

In [48]:
# Display the 20 rows with the highest prominence, highest first.
(
    prominence_df
    .sort_values(by='prominence', ascending=False)
    .head(n=20)
)

Unnamed: 0,domain,prominence,number_of_sites
17,taboola.com,0.030292,327
22,bizible.com,0.017296,160
11,fullstory.com,0.007558,182
3,zenaps.com,0.004873,113
4,awin1.com,0.00485,112
5,yandex.com,0.004194,121
20,adroll.com,0.003962,117
62,glassboxdigital.io,0.003188,6
1,listrakbi.com,0.002485,91
0,bronto.com,0.002458,90


## Leak types

In [52]:
# Count deduplicated tracker-related rows per leak type.
unique_tracking_related_df['leak_type'].value_counts()

url_leaks     2362
post_leaks    1004
Name: leak_type, dtype: int64

## Email leaks

In [53]:
# Among unique email leaks on sites with rank_of_site below 2000, show one row
# per initial_url, ordered by rank, limited to the first 12.
top_ranked_leaks = unique_email_leaks[unique_email_leaks['rank_of_site'] < 2000]
(
    top_ranked_leaks
    .drop_duplicates('initial_url')
    .sort_values('rank_of_site')
    .loc[:, ['initial_url', 'rank_of_site']]
    .head(12)
)

Unnamed: 0,initial_url,rank_of_site
20907,usatoday.com,154
12723,trello.com,242
6835,independent.co.uk,243
3357,shopify.com,300
2780,marriott.com,328
13998,newsweek.com,567
6211,prezi.com,705
18033,branch.io,754
15334,prothomalo.com,1153
8002,codecademy.com,1311
