In [33]:
import pandas as pd
import pickle
from leak_common import find_prominence

In [34]:
# Load the pickled extra tracker domains; they are consulted when the
# `is_blocked` column is derived.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles produced by a trusted pipeline.
extra_domains_pkl = '../pkls_csvs/extra_tracker_domains.pkl'
with open(extra_domains_pkl, mode='rb') as handle:
    extra_tracker_domains = pickle.load(handle)

In [35]:
# Read the crawl export; the file is tab-separated despite its .csv extension.
crawl_csv_path = '../pkls_csvs/follow_up_crawl_desktop_nyc.csv'
df = pd.read_csv(crawl_csv_path, sep='\t')

In [36]:
## Additional Columns
# Per-source verdicts that are not already stored as plain flag columns.
tds_says_block = df.tds_blocked == 'block'
ublock_says_block = df.ublock_blocked == 'block'
in_extra_tracker_domains = df.request_url_domain.isin(extra_tracker_domains)

# A request is "blocked" when any one of the sources flags it. The operands
# are OR-ed strictly left to right, in the same order as before.
df['is_blocked'] = (
    df.easy_list_blocked
    | df.easy_privacy_blocked
    | df.disconnect_blocked
    | df.whotracksme_blocked
    | tds_says_block
    | ublock_says_block
    | in_extra_tracker_domains
)

# Tracker-related = a third-party request that is also blocked.
df['tracker_related'] = df['third_party_req'] & df['is_blocked']

In [37]:
# Column subset shared by every "unique_*" de-duplication in this cell.
LEAK_DEDUP_KEYS = ['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type']


def _print_sizes(labelled_frames):
    """Print one ``label len(frame)`` line per (label, frame) pair, in order."""
    for label, frame in labelled_frames:
        print(label, len(frame))


# All Requests

all_reqs = df
unique_all_reqs = all_reqs.drop_duplicates(LEAK_DEDUP_KEYS)
all_reqs_email = all_reqs[all_reqs.search_type == 'email']
unique_all_reqs_email = all_reqs_email.drop_duplicates(LEAK_DEDUP_KEYS)

_print_sizes([
    ('all_reqs: ', all_reqs),
    ('unique_all_reqs: ', unique_all_reqs),
    ('all_reqs_email: ', all_reqs_email),
    ('unique_all_reqs_email: ', unique_all_reqs_email),
])

# Third Party Requests

is_third_party = df.third_party_req & ~df.is_same_party
third_party_reqs = df[is_third_party]
unique_third_party_reqs = third_party_reqs.drop_duplicates(LEAK_DEDUP_KEYS)
third_party_reqs_email = third_party_reqs[third_party_reqs.search_type == 'email']
unique_third_party_reqs_email = third_party_reqs_email.drop_duplicates(LEAK_DEDUP_KEYS)

_print_sizes([
    ('third_party_reqs: ', third_party_reqs),
    ('unique_third_party_reqs: ', unique_third_party_reqs),
    ('third_party_reqs_email: ', third_party_reqs_email),
    ('unique_third_party_reqs_email: ', unique_third_party_reqs_email),
])

# Tracking Related Requests

# Rows with these two response-side leak types are left out of the tracker subset.
not_cookie_leak = df.leak_type != 'response_cookie_leaks'
not_location_leak = df.leak_type != 'response_location_leaks'
# NOTE(review): the second comparison matches the `initial_url` column against
# a *domain* column -- confirm that `initial_url` is the intended column here.
on_last_page_domain = (
    (df.final_url_domain == df.last_page_domain)
    | (df.initial_url == df.last_page_domain)
)
tracker_related_reqs_df = df[
    ~df.is_same_party
    & not_cookie_leak
    & not_location_leak
    & df.tracker_related
    & on_last_page_domain
]
# NOTE(review): this is the only de-duplication in the cell that keeps the
# *last* duplicate; the row count is unaffected, but the retained rows differ.
unique_tracking_related_df = tracker_related_reqs_df.drop_duplicates(LEAK_DEDUP_KEYS, keep='last')
email_leaks = tracker_related_reqs_df[tracker_related_reqs_df.search_type == 'email']
unique_email_leaks = email_leaks.drop_duplicates(LEAK_DEDUP_KEYS)
pwd_leaks = tracker_related_reqs_df[tracker_related_reqs_df.search_type == 'pwd']

_print_sizes([
    ('tracker_related_reqs_df: ', tracker_related_reqs_df),
    ('unique_tracking_related_df: ', unique_tracking_related_df),
    ('email_leaks: ', email_leaks),
    ('unique_email_leaks: ', unique_email_leaks),
])

# High Level Statistics

# Each count is the number of distinct `initial_hostname` values in the subset.
_print_sizes([
    ('Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains: ',
     all_reqs_email.drop_duplicates('initial_hostname')),
    ('Distinct websites where emails are leaked to both 3rd parties and tracker domains: ',
     unique_third_party_reqs_email.drop_duplicates('initial_hostname')),
    ('Distinct websites where emails are leaked to only tracker domains: ',
     unique_email_leaks.drop_duplicates('initial_hostname')),
])

all_reqs:  22322
unique_all_reqs:  9619
all_reqs_email:  21589
unique_all_reqs_email:  9294
third_party_reqs:  16130
unique_third_party_reqs:  7724
third_party_reqs_email:  16093
unique_third_party_reqs_email:  7708
tracker_related_reqs_df:  11797
unique_tracking_related_df:  6557
email_leaks:  11777
unique_email_leaks:  6548
Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains:  4399
Distinct websites where emails are leaked to both 3rd parties and tracker domains:  3067
Distinct websites where emails are leaked to only tracker domains:  2496


In [38]:
# One row per distinct `initial_hostname` among the unique email leaks.
distinct_websites_email_leaks = unique_email_leaks.drop_duplicates(subset='initial_hostname')

# find_prominence is imported from leak_common; its result is loaded into a
# frame with the three columns named below.
prominence_list_email = find_prominence(unique_email_leaks)
prominence_columns = ['domain', 'prominence', 'number_of_sites']
prominence_email_df = pd.DataFrame(prominence_list_email, columns=prominence_columns)

# Display the table ordered from highest to lowest prominence.
prominence_email_df.sort_values('prominence', ascending=False)

Unnamed: 0,domain,prominence,number_of_sites
2,rlcdn.com,0.081446,616
4,taboola.com,0.018976,257
8,bizible.com,0.018976,187
17,bouncex.net,0.016517,185
3,yahoo.com,0.012791,286
...,...,...,...
105,d1lu3pmaz2ilpx.cloudfront.net,0.000010,1
153,brontops.com,0.000010,1
125,netcoresmartech.com,0.000010,1
149,glassboxcloud.com,0.000010,1
