In [1]:
import pandas as pd
import pickle
from collections import defaultdict
from leak_common import get_domain, find_prominence
import matplotlib.pyplot as plt

In [2]:
with open('../pkls_csvs/extra_tracker_domains.pkl', 'rb') as handle:
    extra_tracker_domains = pickle.load(handle)

# Read from pickles

In [49]:
log_details = pd.read_pickle("../pkls_csvs/fra_mobile_log_details.pkl")
df = pd.read_csv("../pkls_csvs/mobile_no_action_fra_crawl.csv", sep='\t', encoding='utf-8')
## Due to copyright restrictions, we can't share categories info
# with open('../pkls_csvs/categories.pkl', 'rb') as handle:
#     sites_categories = pickle.load(handle)

# Additional Columns

In [50]:
df['is_blocked'] = df.easy_list_blocked | df.easy_privacy_blocked | df.disconnect_blocked | df.whotracksme_blocked | (df.tds_blocked) | (df.ublock_blocked) | (df.request_url_domain.isin(extra_tracker_domains))

In [51]:
df['tracker_related'] = df.third_party_req & df.is_blocked

# All Requests

In [52]:
all_reqs = df
unique_all_reqs = all_reqs.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])
all_reqs_email = all_reqs[all_reqs.search_type == 'email']
unique_all_reqs_email = all_reqs_email.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [53]:
print('all_reqs: ', len(all_reqs))
print('unique_all_reqs: ', len(unique_all_reqs))
print('all_reqs_email: ', len(all_reqs_email))
print('unique_all_reqs_email: ', len(unique_all_reqs_email))

all_reqs:  20706
unique_all_reqs:  7392
all_reqs_email:  17861
unique_all_reqs_email:  6039


# Third Party Requests

In [54]:
third_party_reqs = df[df.third_party_req & ~df.is_req_off_site_direction]
unique_third_party_reqs = third_party_reqs.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])
third_party_reqs_email = third_party_reqs[third_party_reqs.search_type == 'email']
unique_third_party_reqs_email = third_party_reqs_email.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [55]:
print('third_party_reqs: ', len(third_party_reqs))
print('unique_third_party_reqs: ', len(unique_third_party_reqs))
print('third_party_reqs_email: ', len(third_party_reqs_email))
print('unique_third_party_reqs_email: ', len(unique_third_party_reqs_email))

third_party_reqs:  11950
unique_third_party_reqs:  4327
third_party_reqs_email:  11512
unique_third_party_reqs_email:  4162


# Tracking Related Requests

In [56]:
tracker_related_reqs_df = df[~df.is_req_off_site_direction & (df.leak_type!='response_cookie_leaks') & (df.leak_type!='response_location_leaks') &(df.tracker_related) & ((df.final_url_domain == df.last_page_domain) | (df.initial_url == df.last_page_domain))]
unique_tracking_related_df = tracker_related_reqs_df.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'],keep='last')
email_leaks = tracker_related_reqs_df[tracker_related_reqs_df.search_type == 'email']
unique_email_leaks = email_leaks.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [57]:
print('tracker_related_reqs_df: ', len(tracker_related_reqs_df))
print('unique_tracking_related_df: ', len(unique_tracking_related_df))
print('email_leaks: ', len(email_leaks))
print('unique_email_leaks: ', len(unique_email_leaks))

tracker_related_reqs_df:  7509
unique_tracking_related_df:  3180
email_leaks:  7201
unique_email_leaks:  3080


# High Level Statistics

In [58]:
print('Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains: ',len(all_reqs_email.drop_duplicates('initial_hostname')))
print('Distinct websites where emails are leaked to both 3rd parties and tracker domains: ',len(unique_third_party_reqs_email.drop_duplicates('initial_hostname')))
print('Distinct websites where emails are leaked to only tracker domains: ',len(unique_email_leaks.drop_duplicates('initial_hostname')))

Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains:  4049
Distinct websites where emails are leaked to both 3rd parties and tracker domains:  2465
Distinct websites where emails are leaked to only tracker domains:  1745


# Prominence

In [59]:
prominence_list = find_prominence(unique_tracking_related_df[unique_tracking_related_df.search_type=='email'])

In [60]:
prominence_df = pd.DataFrame(prominence_list, columns=['domain','prominence', 'number_of_sites']) 

In [61]:
prominence_df.sort_values('prominence',ascending=False).head(20)

Unnamed: 0,domain,prominence,number_of_sites
2,taboola.com,0.028498,304
14,bizible.com,0.013878,143
8,fullstory.com,0.006247,170
1,yandex.com,0.004896,103
3,zenaps.com,0.004797,109
4,awin1.com,0.004774,108
24,adroll.com,0.00416,120
20,zoominfo.com,0.003584,18
56,glassboxdigital.io,0.003502,6
0,listrakbi.com,0.00236,91


## Leak types

In [62]:
unique_tracking_related_df.leak_type.value_counts()

url_leaks     2257
post_leaks     923
Name: leak_type, dtype: int64

## Email leaks

In [63]:
unique_email_leaks[unique_email_leaks['rank_of_site']<2000].drop_duplicates('initial_url').sort_values('rank_of_site')[['initial_url', 'rank_of_site']].head(12)

Unnamed: 0,initial_url,rank_of_site
19326,usatoday.com,154
6385,independent.co.uk,243
3244,shopify.com,300
2687,marriott.com,328
6163,prnewswire.com,341
13170,newsweek.com,567
5790,prezi.com,705
16698,branch.io,754
8484,libsyn.com,887
9951,myanimelist.net,1089


In [64]:
distinct_websites_email_leaks = unique_email_leaks.drop_duplicates('initial_hostname')

In [65]:
unique_email_leaks.encoding.value_counts()

unencoded                 616
urlencode                 524
sha256                    388
urlencode-sha256          343
urlencode-sha_salted_1    217
sha_salted_1              217
md5                       216
urlencode-md5             159
urlencode-urlencode       152
base64                    102
urlencode-base64           66
sha1                       43
urlencode-sha1             32
urlencode-sha512            2
sha512                      2
base64-base64               1
Name: encoding, dtype: int64

## Tracked But NOT Sniffed

In [66]:
email_leaks[email_leaks.email_sniffed==False].drop_duplicates('initial_hostname')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,tracker_related,xpath,id,latest_url,last_page_domain_iframe,is_req_off_site_direction,req_type,ublock_blocked,tds_blocked,req_domain_category
3,cosiceve0+tirebuyer.com@gmail.c,email,urlencode,url_leaks,https://www.tirebuyer.com/,tirebuyer.com,tirebuyer.com,www.tirebuyer.com,tirebuyer.com,https://sca1.listrakbi.com/hWEdiEkwhcns/cart/f...,...,True,"$x(""//*[@id='ltkpopup-email']"")",ltkpopup-email,https://www.tirebuyer.com/,tirebuyer.com,False,Image,True,True,"['Audience Measurement', 'Ad Motivated Trackin..."
204,cosiceve0+academy.com@gmail.com,email,urlencode,url_leaks,https://www.academy.com/,academy.com,academy.com,www.academy.com,academy.com,https://sca1.listrakbi.com/c6dmlqeqKI30/cart/f...,...,True,"$x(""//*[@id='ltkpopup-email']"")",ltkpopup-email,https://www.academy.com/,academy.com,False,Image,True,True,"['Audience Measurement', 'Ad Motivated Trackin..."
570,cosiceve0+deliverr.com@gmail.co,email,unencoded,post_leaks,https://deliverr.com/,deliverr.com,deliverr.com,deliverr.com,deliverr.com,https://rs.fullstory.com/rec/bundle?OrgId=C8DB...,...,True,"$x(""//*[@id='login_email_header']"")",login_email_header,https://deliverr.com/,deliverr.com,False,XHR,True,True,"['Analytics', 'Session Replay']"
661,cosiceve0+omaze.com@gmail.com,email,sha_salted_1,url_leaks,https://www.omaze.com/,omaze.com,omaze.com,www.omaze.com,omaze.com,https://www.zenaps.com/a/b.php?merchantId=1944...,...,True,"$x(""//*[@id='ozEmailOptinCustomerEmail']"")",ozEmailOptinCustomerEmail,https://www.omaze.com/,omaze.com,False,Image,False,False,[]
839,cosiceve0+pro-essay-writer.com@gmail.co,email,unencoded,post_leaks,https://pro-essay-writer.com/,pro-essay-writer.com,pro-essay-writer.com,pro-essay-writer.com,pro-essay-writer.com,https://mc.yandex.ru/webvisor/25694771?wmode=0...,...,True,"$x(""//*[@id='modal-email']"")",modal-email,https://pro-essay-writer.com/,pro-essay-writer.com,False,XHR,True,True,"['Audience Measurement', 'Ad Motivated Trackin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20334,cosiceve0+megabonus.com@gmail.com,email,unencoded,post_leaks,https://megabonus.com/,megabonus.com,megabonus.com,megabonus.com,megabonus.com,https://mc.yandex.ru/webvisor/37456880?wmode=0...,...,True,"$x(""//*[@id='auth-form-password']"")",auth-form-password,https://auth.megabonus.com/?redirect_url=https...,megabonus.com,False,XHR,True,True,"['Audience Measurement', 'Ad Motivated Trackin..."
20377,cosiceve0+bruna.nl@gmail.com,email,urlencode,url_leaks,https://www.bruna.nl/,bruna.nl,bruna.nl,www.bruna.nl,bruna.nl,https://tr.datatrics.com/?action_name=Online%2...,...,True,"$x(""//*[@id='newsletter']"")",newsletter,https://www.bruna.nl/,bruna.nl,False,Image,True,False,[]
20437,cosiceve0+protective.com@gmail.com,email,unencoded,post_leaks,https://www.protective.com/,protective.com,protective.com,www.protective.com,protective.com,https://rs.fullstory.com/rec/bundle?OrgId=TSE6...,...,True,"$x(""//*[@id='password']"")",password,https://myaccount.protective.com/login,protective.com,False,XHR,True,True,"['Analytics', 'Session Replay']"
20550,cosiceve0+ripl.com@gmail.co,email,urlencode,url_leaks,https://www.ripl.com/,ripl.com,ripl.com,www.ripl.com,ripl.com,https://www.upsellit.com/hound/saveData.jsp?si...,...,True,"$x(""//*[@id='input_comp-k9zv53du']"")",input_comp-k9zv53du,https://www.ripl.com/,ripl.com,False,Script,True,False,[]


# Categories

In [67]:
categories_100K_final_dict = defaultdict(int)
for site_url, category in sites_categories.items():
    splitted_category = category.strip().split(', ')
    for each_category in splitted_category:
        categories_100K_final_dict[each_category] += 1

In [68]:
log_details['initial_hostname']
filled_sites = set(log_details['initial_hostname'])
log_details['initial_url'] = log_details.apply (lambda row: get_domain('http://' + row['initial_hostname']), axis=1)
categories_filled_final_dict = defaultdict(int)
filled_sites_domain_set = set(log_details.initial_url)
for site_url, category in sites_categories.items():
    if site_url in filled_sites_domain_set:
        splitted_category = category.strip().split(', ')
        for each_category in splitted_category:
            categories_filled_final_dict[each_category] += 1

In [71]:
unique_email_sites_df = unique_tracking_related_df[unique_tracking_related_df.search_type=='email'].drop_duplicates('initial_hostname')
leaky_categories_dict = defaultdict(int)
for index, row in unique_email_sites_df.iterrows():
    splitted_category = sites_categories[row['initial_url']].strip().split(', ')
#     print(row)
    for each_category in splitted_category:
        leaky_categories_dict[each_category] += 1

In [75]:
category_dict = dict()
tuples = []
for category, value in categories_100K_final_dict.items():
    filled_num = 0
    leaky_num = 0
    total_num = value
    if category in categories_filled_final_dict.keys():
        filled_num = categories_filled_final_dict[category]
    if category in leaky_categories_dict.keys():
        leaky_num = leaky_categories_dict[category]
    tuple_el = (category, total_num, filled_num, leaky_num)
    tuples.append(tuple_el)

In [76]:
categories_final = pd.DataFrame(tuples, columns=['category', 'total', 'filled', 'leaky']) 
categories_final['percentage'] = categories_final['leaky'] *100/ categories_final['filled']
categories_final['percentage'].astype(float).round(1)
categories_filtered = categories_final[categories_final.total>1000].sort_values(by='percentage', ascending=False)
categories_filtered['percentage'] = categories_filtered['percentage'].astype(float).round(1)
len(categories_filtered),categories_filtered

(23,
                      category  total  filled  leaky  percentage
 53             Fashion/Beauty   1669    1159    127        11.0
 7             Online Shopping   5395    3656    333         9.1
 18               General News   7390    3526    233         6.6
 4                    Business  13462    8099    347         4.3
 5           Software/Hardware   4933    2884    121         4.2
 22    Marketing/Merchandising   4964    3143    111         3.5
 39                     Travel   2519    1369     44         3.2
 16          Internet Services   7974    4973    156         3.1
 38                     Sports   1910    1070     26         2.4
 26                     Health   2516    1491     35         2.3
 23            Finance/Banking   3699    1978     44         2.2
 64         Recreation/Hobbies   1098     740     14         1.9
 12              Entertainment   5297    2587     45         1.7
 8                Portal Sites   1544     749     13         1.7
 17  Technical/Busin

## Leaks on sites where CMP detected

In [77]:
len(tracker_related_reqs_df[tracker_related_reqs_df.was_cmp_detected].drop_duplicates('initial_hostname'))

188

## Leak types

In [80]:
unique_tracking_related_df.leak_type.value_counts()

url_leaks     2257
post_leaks     923
Name: leak_type, dtype: int64

# HTTP WEB SOCKET

In [81]:
unique_email_leaks['starts_with_wss'] = list(
    map(lambda x: x.startswith('wss'), unique_email_leaks['request_url'])) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_email_leaks['starts_with_wss'] = list(


In [82]:
unique_email_leaks[unique_email_leaks.starts_with_wss].drop_duplicates('request_url_domain')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,xpath,id,latest_url,last_page_domain_iframe,is_req_off_site_direction,req_type,ublock_blocked,tds_blocked,req_domain_category,starts_with_wss
753,cosiceve0+fastly.com@gmail.c,email,urlencode,post_leaks,https://www.fastly.com/,fastly.com,fastly.com,www.fastly.com,fastly.com,wss://ws3.hotjar.com/api/v2/client/ws,...,"$x(""//*[@id='gatsby-focus-wrapper']/main/div/d...",,https://www.fastly.com/,fastly.com,False,WebSocket,True,True,"['Audience Measurement', 'Analytics', 'Session...",True
913,cosiceve0+sunwing.ca@gmail.c,email,unencoded,post_leaks,https://www.sunwing.ca/en/,sunwing.ca,sunwing.ca,www.sunwing.ca,sunwing.ca,wss://am.freshrelevance.com/,...,"$x(""//*[@id='okta-signin-password']"")",okta-signin-password,https://www.sunwing.ca/en/user-account/login,sunwing.ca,False,WebSocket,False,False,[],True
10588,cosiceve0+cityfurniture.com@gmail.c,email,unencoded,post_leaks,https://www.cityfurniture.com/,cityfurniture.com,cityfurniture.com,www.cityfurniture.com,cityfurniture.com,wss://input.noibu.com/pv_part,...,"$x(""//*[@id='ds-wrap']/div[1]/div/div/div/div[...",,https://www.cityfurniture.com/,cityfurniture.com,False,WebSocket,True,False,[],True


In [83]:
unique_email_leaks['starts_with_http'] = list(
    map(lambda x: x.startswith('http:'), unique_email_leaks['request_url'])) 

unique_email_leaks[unique_email_leaks.starts_with_http].drop_duplicates('initial_hostname')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_email_leaks['starts_with_http'] = list(


Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,id,latest_url,last_page_domain_iframe,is_req_off_site_direction,req_type,ublock_blocked,tds_blocked,req_domain_category,starts_with_wss,starts_with_http
639,cosiceve0+ashland.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ashland.com/,ashland.com,ashland.com,www.ashland.com,ashland.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,843993_107244pi_843993_107244,https://solving.ashland.com/l/843993/2020-08-2...,ashland.com,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
2194,cosiceve0+onfido.com@gmail.com,email,urlencode-urlencode,url_leaks,https://onfido.com/,onfido.com,onfido.com,onfido.com,onfido.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,885253_8614pi_885253_8614,https://go.onfido.com/l/885253/2020-09-16/29g1,onfido.com,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
2660,cosiceve0+pressgazette.co.uk@gmail.com,email,urlencode-urlencode,url_leaks,https://pressgazette.co.uk/,pressgazette.co.uk,pressgazette.co.uk,pressgazette.co.uk,pressgazette.co.uk,http://go.pardot.com/form/checkEmailAjax/accou...,...,375012_527938pi_375012_527938,https://go.pressgazette.co.uk/l/375012/2021-01...,pressgazette.co.uk,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
3298,cosiceve0+rating-widget.com@gmail.com,email,urlencode-base64,url_leaks,http://rating-widget.com/,rating-widget.com,rating-widget.com,rating-widget.com,rating-widget.com,"http://hn.inspectlet.com/pdata?d=cinmi,18273,3...",...,,http://rating-widget.com/,rating-widget.com,False,Image,True,False,"['Analytics', 'Session Replay', 'Third-Party A...",False,True
6508,cosiceve0+advertisers.contobox.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.advertisers.contobox.com/,contobox.com,contobox.com,www.advertisers.contobox.com,contobox.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,764333_14369pi_764333_14369,https://www2.contobox.com/l/764333/2019-10-03/...,contobox.com,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
6608,cosiceve0+ctbuh.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ctbuh.org/,ctbuh.org,ctbuh.org,www.ctbuh.org,ctbuh.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,878592_4768pi_878592_4768,https://marketing.ctbuh.org/l/878592/2020-08-2...,ctbuh.org,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
8838,cosiceve0+vendini.com@gmail.com,email,urlencode-urlencode,url_leaks,https://vendini.com/,vendini.com,vendini.com,vendini.com,vendini.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,814053_979pi_814053_979,https://www2.vendini.com/l/814053/2019-11-01/2...,vendini.com,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
12448,cosiceve0+affiliatesummit.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.affiliatesummit.com/,affiliatesummit.com,affiliatesummit.com,www.affiliatesummit.com,affiliatesummit.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,528152_120597pi_528152_120597,https://info.affiliatesummit.com/l/528152/2020...,affiliatesummit.com,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
12736,cosiceve0+iiba.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.iiba.org/,iiba.org,iiba.org,www.iiba.org,iiba.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,Emailpi_Email,https://go.iiba.org/email-signup?utm_source=We...,iiba.org,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
15088,cosiceve0+skyscrapercenter.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ctbuh.org/?redirect=true,ctbuh.org,skyscrapercenter.com,www.skyscrapercenter.com,ctbuh.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,878592_4768pi_878592_4768,https://marketing.ctbuh.org/l/878592/2020-08-2...,ctbuh.org,False,XHR,True,True,"['Embedded Content', 'Ad Motivated Tracking', ...",False,True
