In [6]:
import pandas as pd
import pickle
from collections import defaultdict
from leak_common import get_domain, find_prominence
import matplotlib.pyplot as plt

In [7]:
# Load the extra tracker domains that are OR-ed into `is_blocked` further down.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
with open('../pkls_csvs/extra_tracker_domains.pkl', 'rb') as handle:
    extra_tracker_domains = pickle.load(handle)

# Read from pickles and CSV

In [38]:
# Pickled frame whose `initial_hostname` column is used in the 'Categories' section.
# NOTE(review): read_pickle unpickles the file, so it must come from a trusted source.
log_details = pd.read_pickle("../pkls_csvs/fra_log_details.pkl")
# Tab-separated, UTF-8 CSV; every request-level analysis below starts from `df`.
df = pd.read_csv("../pkls_csvs/desktop_no_action_fra_crawl.csv", sep='\t', encoding='utf-8')
## Due to copyright restrictions, we can't share categories info
# NOTE: the 'Categories' cells iterate over `sites_categories`, so they raise
# NameError on a fresh kernel unless the load below is restored with a local
# copy of the pickle.
# with open('../pkls_csvs/categories.pkl', 'rb') as handle:
#     sites_categories = pickle.load(handle)

# Additional Columns

In [39]:
# A request is blocked when any of the `*_blocked` columns is true or when its
# request domain is listed in `extra_tracker_domains`.
df['is_blocked'] = (
    df['easy_list_blocked']
    | df['easy_privacy_blocked']
    | df['disconnect_blocked']
    | df['whotracksme_blocked']
    | df['tds_blocked']
    | df['ublock_blocked']
    | df['request_url_domain'].isin(extra_tracker_domains)
)

In [40]:
# Tracker-related: a third-party request that is also flagged in `is_blocked`.
df['tracker_related'] = df['third_party_req'] & df['is_blocked']

# All Requests

In [14]:
# Columns whose combination identifies one distinct leak record.
dedup_keys = ['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type']

all_reqs = df  # same object as `df`, not a copy
unique_all_reqs = all_reqs.drop_duplicates(subset=dedup_keys)

# Same two views restricted to rows whose search type is 'email'.
all_reqs_email = all_reqs.loc[all_reqs['search_type'] == 'email']
unique_all_reqs_email = all_reqs_email.drop_duplicates(subset=dedup_keys)

In [15]:
# Row counts for the four "all requests" frames.
for label, frame in [
    ('all_reqs: ', all_reqs),
    ('unique_all_reqs: ', unique_all_reqs),
    ('all_reqs_email: ', all_reqs_email),
    ('unique_all_reqs_email: ', unique_all_reqs_email),
]:
    print(label, len(frame))

all_reqs:  22340
unique_all_reqs:  7887
all_reqs_email:  19379
unique_all_reqs_email:  6551


# Third Party Requests

In [16]:
# Columns whose combination identifies one distinct leak record.
dedup_keys = ['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type']

# Third-party requests, excluding rows flagged `is_req_off_site_direction`.
third_party_mask = df['third_party_req'] & ~df['is_req_off_site_direction']
third_party_reqs = df.loc[third_party_mask]
unique_third_party_reqs = third_party_reqs.drop_duplicates(subset=dedup_keys)

# Same two views restricted to rows whose search type is 'email'.
third_party_reqs_email = third_party_reqs.loc[third_party_reqs['search_type'] == 'email']
unique_third_party_reqs_email = third_party_reqs_email.drop_duplicates(subset=dedup_keys)

In [17]:
# Row counts for the four third-party frames.
for label, frame in [
    ('third_party_reqs: ', third_party_reqs),
    ('unique_third_party_reqs: ', unique_third_party_reqs),
    ('third_party_reqs_email: ', third_party_reqs_email),
    ('unique_third_party_reqs_email: ', unique_third_party_reqs_email),
]:
    print(label, len(frame))

third_party_reqs:  12967
unique_third_party_reqs:  4647
third_party_reqs_email:  12415
unique_third_party_reqs_email:  4442


# Tracking Related Requests

In [18]:
# Columns whose combination identifies one distinct leak record.
dedup_keys = ['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type']

# Individual conditions of the tracker-related filter.
not_off_site = ~df['is_req_off_site_direction']
not_response_leak = (
    (df['leak_type'] != 'response_cookie_leaks')
    & (df['leak_type'] != 'response_location_leaks')
)
on_last_page_domain = (
    (df['final_url_domain'] == df['last_page_domain'])
    | (df['initial_url'] == df['last_page_domain'])
)

# Variant without the `is_req_off_site_direction` filter: drop `not_off_site`
# from the mask below.
tracker_related_reqs_df = df.loc[
    not_off_site & not_response_leak & df['tracker_related'] & on_last_page_domain
]

# NOTE(review): keep='last' here, while `unique_email_leaks` below keeps the
# first duplicate (the default) - confirm the difference is intentional.
unique_tracking_related_df = tracker_related_reqs_df.drop_duplicates(subset=dedup_keys, keep='last')

email_leaks = tracker_related_reqs_df.loc[tracker_related_reqs_df['search_type'] == 'email']
unique_email_leaks = email_leaks.drop_duplicates(subset=dedup_keys)

In [19]:
# Row counts for the four tracker-related frames.
for label, frame in [
    ('tracker_related_reqs_df: ', tracker_related_reqs_df),
    ('unique_tracking_related_df: ', unique_tracking_related_df),
    ('email_leaks: ', email_leaks),
    ('unique_email_leaks: ', unique_email_leaks),
]:
    print(label, len(frame))

tracker_related_reqs_df:  8088
unique_tracking_related_df:  3366
email_leaks:  7713
unique_email_leaks:  3242


# High Level Statistics

In [41]:
# Number of distinct `initial_hostname` values in each email frame, from the
# broadest (all requests) to the narrowest (tracker-related only).
for label, frame in [
    ('Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains: ',
     all_reqs_email),
    ('Distinct websites where emails are leaked to both 3rd parties and tracker domains: ',
     unique_third_party_reqs_email),
    ('Distinct websites where emails are leaked to only tracker domains: ',
     unique_email_leaks),
]:
    print(label, len(frame.drop_duplicates(subset='initial_hostname')))

Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains:  4395
Distinct websites where emails are leaked to both 3rd parties and tracker domains:  2633
Distinct websites where emails are leaked to only tracker domains:  1844


# Prominence

In [46]:
# Feed the de-duplicated tracker-related rows with search type 'email' to the
# project helper `find_prominence`.
email_tracking_rows = unique_tracking_related_df.loc[
    unique_tracking_related_df['search_type'] == 'email'
]
prominence_list = find_prominence(email_tracking_rows)

In [47]:
# Tabulate the helper's output with one named column per field.
prominence_columns = ['domain', 'prominence', 'number_of_sites']
prominence_df = pd.DataFrame(data=prominence_list, columns=prominence_columns)

In [48]:
# Show the 20 rows with the highest prominence, largest first.
(
    prominence_df
    .sort_values(by='prominence', ascending=False)
    .head(20)
)

Unnamed: 0,domain,prominence,number_of_sites
17,taboola.com,0.030292,327
22,bizible.com,0.017296,160
11,fullstory.com,0.007558,182
3,zenaps.com,0.004873,113
4,awin1.com,0.00485,112
5,yandex.com,0.004194,121
20,adroll.com,0.003962,117
62,glassboxdigital.io,0.003188,6
1,listrakbi.com,0.002485,91
0,bronto.com,0.002458,90


## Leak types

In [52]:
# Number of de-duplicated tracker-related rows per leak type.
unique_tracking_related_df['leak_type'].value_counts()

url_leaks     2362
post_leaks    1004
Name: leak_type, dtype: int64

## Email leaks

In [53]:
# Email-leak rows with rank below 2000, one row per `initial_url`, ordered by
# ascending rank; show the first 12.
top_ranked_leaks = unique_email_leaks.loc[unique_email_leaks['rank_of_site'] < 2000]
(
    top_ranked_leaks
    .drop_duplicates(subset='initial_url')
    .sort_values(by='rank_of_site')
    .loc[:, ['initial_url', 'rank_of_site']]
    .head(12)
)

Unnamed: 0,initial_url,rank_of_site
20907,usatoday.com,154
12723,trello.com,242
6835,independent.co.uk,243
3357,shopify.com,300
2780,marriott.com,328
13998,newsweek.com,567
6211,prezi.com,705
18033,branch.io,754
15334,prothomalo.com,1153
8002,codecademy.com,1311


In [35]:
# One row per hostname among the de-duplicated email leaks.
distinct_websites_email_leaks = unique_email_leaks.drop_duplicates(subset='initial_hostname')

In [525]:
# Number of de-duplicated email-leak rows per encoding.
unique_email_leaks['encoding'].value_counts()

unencoded                 673
urlencode                 562
sha256                    390
urlencode-sha256          360
urlencode-sha_salted_1    225
sha_salted_1              225
md5                       210
urlencode-urlencode       170
urlencode-md5             149
base64                    112
urlencode-base64           70
sha1                       52
urlencode-sha1             38
base64-urlencode            2
base64-md5                  1
base64-base64               1
urlencode-sha512            1
sha512                      1
Name: encoding, dtype: int64

## Tracked But NOT Sniffed

In [533]:
# Email-leak rows whose `email_sniffed` flag is False, one row per hostname.
not_sniffed = email_leaks['email_sniffed'].eq(False)
email_leaks.loc[not_sniffed].drop_duplicates(subset='initial_hostname')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,is_req_off_site_direction,req_domain_entity,sniff_initiators,is_req_domain_among_sniff_initiators,req_type,ublock_blocked,tds_blocked,req_domain_category,ends_with_c,ends_with_co
13,cosicadam0+tirebuyer.com@gmail.com,email,urlencode,url_leaks,https://www.tirebuyer.com/,tirebuyer.com,tirebuyer.com,www.tirebuyer.com,tirebuyer.com,https://sca1.listrakbi.com/hWEdiEkwhcns/cart/f...,...,False,Listrak,set(),False,Image,True,True,"Ad Motivated Tracking, Advertising",False,False
79,cosicadam0+ru-mi.com@gmail.c,email,urlencode,url_leaks,https://ru-mi.com/,ru-mi.com,ru-mi.com,ru-mi.com,ru-mi.com,https://cdn.caltat.com/RegisterEvent.ashx?code...,...,False,,set(),False,Script,True,False,"Analytics and Tracking, Cart Abandonment",True,False
203,cosicadam0+academy.com@gmail.co,email,urlencode,url_leaks,https://www.academy.com/,academy.com,academy.com,www.academy.com,academy.com,https://sca1.listrakbi.com/c6dmlqeqKI30/cart/f...,...,False,Listrak,set(),False,Image,True,True,"Ad Motivated Tracking, Advertising",False,True
257,cosicadam0+rincondelvago.com@gmail.com,email,sha256,url_leaks,https://www.rincondelvago.com/,rincondelvago.com,rincondelvago.com,www.rincondelvago.com,rincondelvago.com,https://trc.taboola.com/sg/tfa-eid/1/um/?uils=...,...,False,"Taboola, Inc.",set(),False,Image,True,True,"Ad Motivated Tracking, Advertising",False,False
322,cosicadam0+caratlane.com@gmail.com,email,sha256,post_leaks,https://www.caratlane.com/,caratlane.com,caratlane.com,www.caratlane.com,caratlane.com,https://www.facebook.com/tr/,...,False,"Facebook, Inc.",set(),False,Other,True,True,"Ad Motivated Tracking, Advertising",False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21933,cosicadam0+megabonus.com@gmail.co,email,unencoded,post_leaks,https://megabonus.com/,megabonus.com,megabonus.com,megabonus.com,megabonus.com,https://mc.yandex.ru/webvisor/37456880?wmode=0...,...,False,Yandex LLC,set(),False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
21985,cosicadam0+bruna.nl@gmail.com,email,urlencode,url_leaks,https://www.bruna.nl/,bruna.nl,bruna.nl,www.bruna.nl,bruna.nl,https://tr.datatrics.com/?action_name=Online%2...,...,False,,set(),False,Image,True,False,"Analytics and Tracking, Personalization",False,False
22038,cosicadam0+protective.com@gmail.c,email,unencoded,post_leaks,https://www.protective.com/,protective.com,protective.com,www.protective.com,protective.com,https://rs.fullstory.com/rec/bundle?OrgId=TSE6...,...,False,FullStory,set(),False,XHR,True,True,"Session Replay, Analytics",True,False
22293,cosicadam0+courierpostonline.com@gmail.com,email,sha256,url_leaks,https://www.courierpostonline.com/,courierpostonline.com,courierpostonline.com,www.courierpostonline.com,courierpostonline.com,https://trc.taboola.com/sg/tfa-eid/1/um/?uils=...,...,False,"Taboola, Inc.",set(),False,Image,True,True,"Ad Motivated Tracking, Advertising",False,False


# Categories

In [537]:
# Count how many entries of `sites_categories` carry each category label.
# A value such as 'A, B' contributes one count to 'A' and one to 'B'.
categories_100K_final_dict = defaultdict(int)
for category_field in sites_categories.values():
    for category_name in category_field.strip().split(', '):
        categories_100K_final_dict[category_name] += 1

In [538]:
# Per-category count restricted to the sites present in `log_details`.
filled_sites = set(log_details['initial_hostname'])

# Derive each site's domain from its hostname with the project helper
# `get_domain` (presumably the registrable domain - confirm in leak_common).
# Series.map replaces the former row-wise DataFrame.apply: same values and
# index, no per-row Series construction, and it still yields a Series when
# `log_details` is empty.
log_details['initial_url'] = log_details['initial_hostname'].map(
    lambda hostname: get_domain('http://' + hostname)
)
filled_sites_domain_set = set(log_details['initial_url'])

# Same counting scheme as for the full category list, but only for sites
# whose key appears among the domains derived above.
categories_filled_final_dict = defaultdict(int)
for site_url, category in sites_categories.items():
    if site_url not in filled_sites_domain_set:
        continue
    for each_category in category.strip().split(', '):
        categories_filled_final_dict[each_category] += 1

In [539]:
# One row per hostname among the de-duplicated tracker-related email rows.
email_tracking_rows = unique_tracking_related_df.loc[
    unique_tracking_related_df['search_type'] == 'email'
]
unique_email_sites_df = email_tracking_rows.drop_duplicates(subset='initial_hostname')

# Count those sites per category label; a 'category' value such as 'A, B'
# contributes one count to each label.
leaky_categories_dict = defaultdict(int)
for category_field in unique_email_sites_df['category']:
    for category_name in category_field.strip().split(', '):
        leaky_categories_dict[category_name] += 1

In [540]:
category_dict = {}

# One (category, total, filled, leaky) tuple per category in the full list.
# Categories missing from the filled or leaky counts get 0; `.get` is used so
# the lookup never inserts a key into the defaultdicts.
tuples = [
    (
        category,
        total_num,
        categories_filled_final_dict.get(category, 0),
        leaky_categories_dict.get(category, 0),
    )
    for category, total_num in categories_100K_final_dict.items()
]

In [541]:
# Per-category table: total sites, filled sites, leaky sites and the share of
# filled sites that leak.
categories_final = pd.DataFrame(tuples, columns=['category', 'total', 'filled', 'leaky'])

# Percentage of filled sites that leak. A category with filled == 0 yields
# NaN (0/0) or inf here; `categories_final` keeps the unrounded values.
categories_final['percentage'] = categories_final['leaky'] * 100 / categories_final['filled']

# Keep categories with more than 1000 sites in total, order by the unrounded
# percentage, then round to one decimal for display. The former standalone
# `.astype(float).round(1)` statement discarded its result and was removed.
categories_filtered = (
    categories_final[categories_final['total'] > 1000]
    .sort_values(by='percentage', ascending=False)
    .assign(percentage=lambda frame: frame['percentage'].astype(float).round(1))
)
len(categories_filtered), categories_filtered

(23,
                      category  total  filled  leaky  percentage
 53             Fashion/Beauty   1669    1176    131        11.1
 7             Online Shopping   5395    3658    345         9.4
 18               General News   7390    3579    235         6.6
 5           Software/Hardware   4933    2834    138         4.9
 4                    Business  13462    7805    377         4.8
 22    Marketing/Merchandising   4964    3167    119         3.8
 16          Internet Services   7974    4627    171         3.7
 39                     Travel   2519    1355     46         3.4
 26                     Health   2516    1389     44         3.2
 23            Finance/Banking   3699    1505     41         2.7
 38                     Sports   1910    1044     28         2.7
 8                Portal Sites   1544     682     17         2.5
 10        Education/Reference  10190    4185     87         2.1
 12              Entertainment   5297    2610     47         1.8
 64         Recreati

## Leaks on sites where CMP detected

In [54]:
# Number of distinct hostnames among tracker-related rows with `was_cmp_detected` set.
cmp_detected_rows = tracker_related_reqs_df.loc[tracker_related_reqs_df['was_cmp_detected']]
len(cmp_detected_rows.drop_duplicates(subset='initial_hostname'))

202

## Is req domain among the sniff initiators?

In [55]:
# Split the email-leak hostnames by whether the request domain is among the
# sniff initiators. A hostname can fall into both sets if it has rows of each kind.
among_initiators = unique_email_leaks['is_req_domain_among_sniff_initiators']
not_in_sniff_inits = set(
    unique_email_leaks.loc[~among_initiators]
    .drop_duplicates(subset='initial_hostname')['initial_hostname']
)
in_sniff_inits = set(
    unique_email_leaks.loc[among_initiators]
    .drop_duplicates(subset='initial_hostname')['initial_hostname']
)

# (only-not-in count, not-in count, in count, all distinct hostnames)
(
    len(not_in_sniff_inits - in_sniff_inits),
    len(not_in_sniff_inits),
    len(in_sniff_inits),
    len(unique_email_leaks.drop_duplicates(subset='initial_hostname')),
)

(565, 635, 1279, 1844)

In [56]:
# Number of de-duplicated tracker-related rows per leak type.
unique_tracking_related_df['leak_type'].value_counts()

url_leaks     2362
post_leaks    1004
Name: leak_type, dtype: int64

# HTTP and WebSocket

In [642]:
# Flag rows whose request URL starts with 'wss'.
# `.assign` builds a new frame instead of writing a column into a frame that
# was derived by filtering and de-duplicating another one (the pattern behind
# SettingWithCopyWarning). `na=False` makes a missing or non-string URL count
# as "no match" rather than raising.
unique_email_leaks = unique_email_leaks.assign(
    starts_with_wss=unique_email_leaks['request_url'].str.startswith('wss', na=False)
)

In [643]:
# Rows flagged `starts_with_wss`, one row per request domain.
wss_rows = unique_email_leaks.loc[unique_email_leaks['starts_with_wss']]
wss_rows.drop_duplicates(subset='request_url_domain')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,req_domain_entity,sniff_initiators,is_req_domain_among_sniff_initiators,req_type,ublock_blocked,tds_blocked,req_domain_category,ends_with_c,ends_with_co,starts_with_wss
67,cosicadam0+chopard.com@gmail.co,email,unencoded,post_leaks,https://www.chopard.com/intl/,chopard.com,chopard.com,www.chopard.com,chopard.com,wss://am.freshrelevance.com/,...,,"{'chopard.com', 'dkpklk99llpj0.cloudfront.net'}",False,WebSocket,False,False,"Analytics and Tracking, Conversion Optimization",False,True,True
379,cosicadam0+bettercloud.com@gmail.c,email,urlencode,post_leaks,https://www.bettercloud.com/,bettercloud.com,bettercloud.com,www.bettercloud.com,bettercloud.com,wss://ws11.hotjar.com/api/v2/client/ws,...,,"{'bizible.com', 'marketo.com', 'hotjar.com'}",False,WebSocket,True,True,"Session Replay, Analytics",True,False,True
11162,cosicadam0+cityfurniture.com@gmail.co,email,unencoded,post_leaks,https://www.cityfurniture.com/,cityfurniture.com,cityfurniture.com,www.cityfurniture.com,cityfurniture.com,wss://input.noibu.com/pv_part,...,,set(),False,WebSocket,True,False,"Widgets, Error Tracking",False,True,True


In [380]:
# Flag rows whose request URL starts with 'http:' (so 'https:' is not matched).
# `.assign` builds a new frame instead of writing a column into a frame that
# was derived by filtering and de-duplicating another one (the pattern behind
# SettingWithCopyWarning). `na=False` makes a missing or non-string URL count
# as "no match" rather than raising.
unique_email_leaks = unique_email_leaks.assign(
    starts_with_http=unique_email_leaks['request_url'].str.startswith('http:', na=False)
)

# Flagged rows, one row per hostname.
unique_email_leaks[unique_email_leaks['starts_with_http']].drop_duplicates('initial_hostname')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,sniff_initiators,is_req_domain_among_sniff_initiators,req_type,ublock_blocked,tds_blocked,req_domain_category,ends_with_c,ends_with_co,starts_with_wss,starts_with_http
697,cosicadam0+ashland.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ashland.com/,ashland.com,ashland.com,www.ashland.com,ashland.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,"{'ashland.com', 'gstatic.com'}",False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
2260,cosicadam0+onfido.com@gmail.com,email,urlencode-urlencode,url_leaks,https://onfido.com/,onfido.com,onfido.com,onfido.com,onfido.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'onfido.com'},False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
2739,cosicadam0+pressgazette.co.uk@gmail.com,email,urlencode-urlencode,url_leaks,https://pressgazette.co.uk/,pressgazette.co.uk,pressgazette.co.uk,pressgazette.co.uk,pressgazette.co.uk,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'pressgazette.co.uk'},False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
4138,cosicadam0+ntrepidcorp.com@gmail.com,email,urlencode-urlencode,url_leaks,https://ntrepidcorp.com/,ntrepidcorp.com,ntrepidcorp.com,ntrepidcorp.com,ntrepidcorp.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'ntrepidcorp.com'},False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
6948,cosicadam0+advertisers.contobox.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.advertisers.contobox.com/,contobox.com,contobox.com,www.advertisers.contobox.com,contobox.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'contobox.com'},False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
7040,cosicadam0+ctbuh.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ctbuh.org/,ctbuh.org,ctbuh.org,www.ctbuh.org,ctbuh.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'ctbuh.org'},False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
11555,cosicadam0+eastern.edu@gmail.com,email,urlencode-urlencode,url_leaks,https://www.eastern.edu/,eastern.edu,eastern.edu,www.eastern.edu,eastern.edu,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'eastern.edu'},False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
13186,cosicadam0+affiliatesummit.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.affiliatesummit.com/,affiliatesummit.com,affiliatesummit.com,www.affiliatesummit.com,affiliatesummit.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'affiliatesummit.com'},False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
13503,cosicadam0+iiba.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.iiba.org/,iiba.org,iiba.org,www.iiba.org,iiba.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,set(),False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
16225,cosicadam0+skyscrapercenter.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ctbuh.org/?redirect=true,ctbuh.org,skyscrapercenter.com,www.skyscrapercenter.com,ctbuh.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'ctbuh.org'},False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,False,False,True
