In [36]:
import pandas as pd
from leak_common import find_prominence
import pickle

In [37]:
# Load the extra tracker-domain collection used further down when flagging
# blocked requests.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this against a pickle file produced by a trusted source.
with open('../pkls_csvs/extra_tracker_domains.pkl', 'rb') as handle:
    extra_tracker_domains = pickle.load(handle)

In [38]:
# Read the tab-separated crawl export into the working frame `df`
# (the path is relative to the notebook's working directory).
df = pd.read_csv(
    '../pkls_csvs/follow_up_crawl_desktop_fra.csv',
    sep='\t',
)

## Additional Columns

In [39]:
# A request is marked blocked when at least one of the following holds:
#   * any of the four boolean blocklist columns is set,
#   * the tds / ublock verdict columns equal the string 'block',
#   * the request domain is contained in `extra_tracker_domains`.
df['is_blocked'] = (
    df.easy_list_blocked
    | df.easy_privacy_blocked
    | df.disconnect_blocked
    | df.whotracksme_blocked
    | (df.tds_blocked == 'block')
    | (df.ublock_blocked == 'block')
    | (df.request_url_domain.isin(extra_tracker_domains))
)

# Tracker-related: flagged as a third-party request AND blocked (see above).
df['tracker_related'] = df.third_party_req & df['is_blocked']

In [40]:
# Columns used as the de-duplication key for every "unique_*" frame in this cell.
DEDUP_KEYS = ['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type']


def _print_lengths(**frames):
    """Print one '<name>:  <row count>' line per keyword argument, in call order."""
    for label, frame in frames.items():
        print(label + ': ', len(frame))


# All Requests

all_reqs = df  # alias of the working frame, not a copy
unique_all_reqs = all_reqs.drop_duplicates(DEDUP_KEYS)
all_reqs_email = all_reqs[all_reqs.search_type == 'email']
unique_all_reqs_email = all_reqs_email.drop_duplicates(DEDUP_KEYS)

_print_lengths(
    all_reqs=all_reqs,
    unique_all_reqs=unique_all_reqs,
    all_reqs_email=all_reqs_email,
    unique_all_reqs_email=unique_all_reqs_email,
)

# Third Party Requests

# Rows flagged as third-party requests that are not marked same-party.
third_party_reqs = df[df.third_party_req & ~df.is_same_party]
unique_third_party_reqs = third_party_reqs.drop_duplicates(DEDUP_KEYS)
third_party_reqs_email = third_party_reqs[third_party_reqs.search_type == 'email']
unique_third_party_reqs_email = third_party_reqs_email.drop_duplicates(DEDUP_KEYS)

_print_lengths(
    third_party_reqs=third_party_reqs,
    unique_third_party_reqs=unique_third_party_reqs,
    third_party_reqs_email=third_party_reqs_email,
    unique_third_party_reqs_email=unique_third_party_reqs_email,
)

# Tracking Related Requests

# Keep rows that are: not same-party, not one of the two response_* leak types,
# tracker-related, and whose last page domain matches the final URL domain or
# the initial URL column. (An earlier variant of this filter, without the
# same-party exclusion, used to sit here as commented-out code.)
# NOTE(review): `initial_url` is compared against a *domain* column; confirm
# that this column really holds a domain and not a full URL.
tracker_related_reqs_df = df[
    ~df.is_same_party
    & (df.leak_type != 'response_cookie_leaks')
    & (df.leak_type != 'response_location_leaks')
    & (df.tracker_related)
    & (
        (df.final_url_domain == df.last_page_domain)
        | (df.initial_url == df.last_page_domain)
    )
]
# keep='last' only changes which duplicate row survives, not how many rows remain.
unique_tracking_related_df = tracker_related_reqs_df.drop_duplicates(DEDUP_KEYS, keep='last')
email_leaks = tracker_related_reqs_df[tracker_related_reqs_df.search_type == 'email']
unique_email_leaks = email_leaks.drop_duplicates(DEDUP_KEYS)
pwd_leaks = tracker_related_reqs_df[tracker_related_reqs_df.search_type == 'pwd']

_print_lengths(
    tracker_related_reqs_df=tracker_related_reqs_df,
    unique_tracking_related_df=unique_tracking_related_df,
    email_leaks=email_leaks,
    pwd_leaks=pwd_leaks,
    unique_email_leaks=unique_email_leaks,
)

# High Level Statistics

# Each figure is the number of distinct `initial_hostname` values in the
# corresponding e-mail subset built above.
# NOTE(review): the last label says "only tracker domains", but the count is
# simply the sites present in `unique_email_leaks`; confirm the wording.
sites_any_party = all_reqs_email.drop_duplicates('initial_hostname')
sites_third_party = unique_third_party_reqs_email.drop_duplicates('initial_hostname')
sites_tracker = unique_email_leaks.drop_duplicates('initial_hostname')

print('Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains: ', len(sites_any_party))
print('Distinct websites where emails are leaked to both 3rd parties and tracker domains: ', len(sites_third_party))
print('Distinct websites where emails are leaked to only tracker domains: ', len(sites_tracker))

all_reqs:  16330
unique_all_reqs:  4810
all_reqs_email:  15595
unique_all_reqs_email:  4495
third_party_reqs:  10153
unique_third_party_reqs:  2929
third_party_reqs_email:  10130
unique_third_party_reqs_email:  2918
tracker_related_reqs_df:  6232
unique_tracking_related_df:  2047
email_leaks:  6220
pwd_leaks:  12
unique_email_leaks:  2041
Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains:  3238
Distinct websites where emails are leaked to both 3rd parties and tracker domains:  1885
Distinct websites where emails are leaked to only tracker domains:  1291


In [41]:
# Feed only the e-mail rows of the tracker-related requests to
# `find_prominence` (imported from leak_common), then tabulate its result.
# NOTE(review): assumes each returned item has three fields matching the
# column names below; verify against leak_common.
is_email_row = tracker_related_reqs_df.search_type == 'email'
prominence_list = find_prominence(tracker_related_reqs_df[is_email_row])

prominence_df = pd.DataFrame(
    prominence_list,
    columns=['domain', 'prominence', 'number_of_sites'],
)

# Show the 20 rows with the highest prominence value.
prominence_df.sort_values('prominence', ascending=False).head(20)

Unnamed: 0,domain,prominence,number_of_sites
6,bizible.com,0.012544,149
5,fullstory.com,0.006351,163
30,gravatar.com,0.00555,57
53,glassboxdigital.io,0.0032,5
20,zenaps.com,0.003133,85
21,awin1.com,0.003064,83
35,taboola.com,0.002703,34
22,listrakbi.com,0.002639,94
16,yandex.com,0.002,62
12,facebook.com,0.00177,24
