In [1]:
import pandas as pd
import pickle
from collections import defaultdict
from leak_common import get_domain, find_prominence
import matplotlib.pyplot as plt

In [2]:
with open('extra_tracker_domains.pkl', 'rb') as handle:
    extra_tracker_domains = pickle.load(handle)

# Read from pickles

In [3]:
log_details = pd.read_pickle("../../nyc_mobile_log_details.pkl")
df = pd.read_csv("mobile_no_action_nyc_crawl.csv", sep='\t', encoding='utf-8')
# Due to copyright restrictions, we can't share categories info
# with open('../pkls_csvs/categories.pkl', 'rb') as handle:
#     sites_categories = pickle.load(handle)

# Additional Columns

In [4]:
df['is_blocked'] = df.easy_list_blocked | df.easy_privacy_blocked | df.disconnect_blocked | df.whotracksme_blocked | (df.tds_blocked) | (df.ublock_blocked) | (df.request_url_domain.isin(extra_tracker_domains))

In [5]:
df['tracker_related'] = df.third_party_req & df.is_blocked

# All Requests

In [6]:
all_reqs = df
unique_all_reqs = all_reqs.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])
all_reqs_email = all_reqs[all_reqs.search_type == 'email']
unique_all_reqs_email = all_reqs_email.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [7]:
print('all_reqs: ', len(all_reqs))
print('unique_all_reqs: ', len(unique_all_reqs))
print('all_reqs_email: ', len(all_reqs_email))
print('unique_all_reqs_email: ', len(unique_all_reqs_email))

all_reqs:  26073
unique_all_reqs:  11155
all_reqs_email:  23242
unique_all_reqs_email:  9792


# Third Party Requests

In [8]:
third_party_reqs = df[df.third_party_req & ~df.is_req_off_site_direction]
unique_third_party_reqs = third_party_reqs.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])
third_party_reqs_email = third_party_reqs[third_party_reqs.search_type == 'email']
unique_third_party_reqs_email = third_party_reqs_email.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [9]:
print('third_party_reqs: ', len(third_party_reqs))
print('unique_third_party_reqs: ', len(unique_third_party_reqs))
print('third_party_reqs_email: ', len(third_party_reqs_email))
print('unique_third_party_reqs_email: ', len(unique_third_party_reqs_email))

third_party_reqs:  17112
unique_third_party_reqs:  8021
third_party_reqs_email:  16678
unique_third_party_reqs_email:  7854


# Tracking Related Requests

In [10]:
tracker_related_reqs_df = df[~df.is_req_off_site_direction & (df.leak_type!='response_cookie_leaks') & (df.leak_type!='response_location_leaks') &(df.tracker_related) & ((df.final_url_domain == df.last_page_domain) | (df.initial_url == df.last_page_domain))]
unique_tracking_related_df = tracker_related_reqs_df.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'],keep='last')
email_leaks = tracker_related_reqs_df[tracker_related_reqs_df.search_type == 'email']
unique_email_leaks = email_leaks.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [11]:
print('tracker_related_reqs_df: ', len(tracker_related_reqs_df))
print('unique_tracking_related_df: ', len(unique_tracking_related_df))
print('email_leaks: ', len(email_leaks))
print('unique_email_leaks: ', len(unique_email_leaks))

tracker_related_reqs_df:  12115
unique_tracking_related_df:  6569
email_leaks:  11804
unique_email_leaks:  6466


# High Level Statistics

In [12]:
print('Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains: ',len(all_reqs_email.drop_duplicates('initial_hostname')))
print('Distinct websites where emails are leaked to both 3rd parties and tracker domains: ',len(unique_third_party_reqs_email.drop_duplicates('initial_hostname')))
print('Distinct websites where emails are leaked to only tracker domains: ',len(unique_email_leaks.drop_duplicates('initial_hostname')))

Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains:  5126
Distinct websites where emails are leaked to both 3rd parties and tracker domains:  3513
Distinct websites where emails are leaked to only tracker domains:  2744


# Prominence

In [13]:
prominence_list = find_prominence(unique_tracking_related_df[unique_tracking_related_df.search_type=='email'])

In [14]:
prominence_df = pd.DataFrame(prominence_list, columns=['domain','prominence', 'number_of_sites']) 

In [15]:
prominence_df.sort_values('prominence',ascending=False).head(20)

Unnamed: 0,domain,prominence,number_of_sites
3,rlcdn.com,0.044991,404
11,taboola.com,0.039607,335
18,bizible.com,0.020878,172
5,bouncex.net,0.017625,182
1,fullstory.com,0.01159,235
6,zenaps.com,0.00982,115
7,awin1.com,0.009797,114
4,yahoo.com,0.007354,266
2,yandex.ru,0.005454,120
33,adroll.com,0.004921,122


## Leak types

In [16]:
unique_tracking_related_df.leak_type.value_counts()

url_leaks     5339
post_leaks    1230
Name: leak_type, dtype: int64

## Email leaks

In [17]:
unique_email_leaks[unique_email_leaks['rank_of_site']<2000].drop_duplicates('initial_url').sort_values('rank_of_site')[['initial_url', 'rank_of_site']].head(12)

Unnamed: 0,initial_url,rank_of_site
647,issuu.com,95
24210,usatoday.com,154
4595,time.com,191
2343,udemy.com,196
17538,healthline.com,217
10398,foxnews.com,234
3645,zendesk.com,239
9804,webmd.com,288
4087,shopify.com,300
3300,marriott.com,328


In [18]:
distinct_websites_email_leaks = unique_email_leaks.drop_duplicates('initial_hostname')

In [19]:
unique_email_leaks.encoding.value_counts()

sha256                          1110
urlencode-sha256                1065
unencoded                        824
urlencode                        597
md5                              468
urlencode-md5                    401
sha1                             260
urlencode-sha1                   247
base64                           235
sha_salted_1                     229
urlencode-sha_salted_1           229
urlencode-base64                 192
lzstring-urlencode               187
urlencode-lzstring-urlencode     186
urlencode-urlencode              151
urlencode-custom_map_1            78
urlencode-sha512                   2
sha512                             2
base64-base64                      1
base64-sha256                      1
base64-sha1                        1
Name: encoding, dtype: int64

## Tracked But NOT Sniffed

In [20]:
email_leaks[email_leaks.email_sniffed==False].drop_duplicates('initial_hostname')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,tracker_related,xpath,id,latest_url,last_page_domain_iframe,is_req_off_site_direction,req_type,ublock_blocked,tds_blocked,req_domain_category
3,cosiceve0+tirebuyer.com@gmail.co,email,urlencode,url_leaks,https://www.tirebuyer.com/,tirebuyer.com,tirebuyer.com,www.tirebuyer.com,tirebuyer.com,https://sca1.listrakbi.com/hWEdiEkwhcns/cart/f...,...,True,"$x(""//*[@id='ltkpopup-email']"")",ltkpopup-email,https://www.tirebuyer.com/,tirebuyer.com,False,Image,True,True,"Ad Motivated Tracking, Advertising"
31,cosiceve0+allaboutjazz.com@gmail.com,email,urlencode-sha256,url_leaks,https://www.allaboutjazz.com/index.php?&width=412,allaboutjazz.com,allaboutjazz.com,www.allaboutjazz.com,allaboutjazz.com,https://api.rlcdn.com/api/identity/envelope?pi...,...,True,"$x(""//*[@id='section-subscribe']/div/div/div/d...",,https://www.allaboutjazz.com/index.php?&width=412,allaboutjazz.com,False,Fetch,True,True,"Ad Motivated Tracking, Advertising"
172,cosiceve0+vivaterra.com@gmail.co,email,urlencode-custom_map_1,url_leaks,https://www.vivaterra.com/,vivaterra.com,vivaterra.com,www.vivaterra.com,vivaterra.com,https://track.securedvisit.com/citecapture/?cc...,...,True,"$x(""//*[@id='footer']/div[2]/div[1]/div/div[2]...",,https://www.vivaterra.com/,vivaterra.com,False,Script,True,True,"Ad Motivated Tracking, Advertising"
252,cosiceve0+aurea.com@gmail.com,email,urlencode,url_leaks,https://www.aurea.com/,aurea.com,aurea.com,www.aurea.com,aurea.com,https://cdn.bizible.com/m/blr?e=OMFbqazoSXRsTs...,...,True,"$x(""//*[@id='Email']"")",Email,https://content.aurea.com/worx-registration,aurea.com,False,Image,True,False,"Analytics and Tracking, Marketing Automation"
275,cosiceve0+on-running.com@gmail.com,email,lzstring-urlencode,url_leaks,https://www.on-running.com/en-us/,on-running.com,on-running.com,www.on-running.com,on-running.com,https://events.bouncex.net/track.gif/user?wklz...,...,True,"$x(""//*[@id='email']"")",email,https://www.on-running.com/en-us/,on-running.com,False,Image,True,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25530,cosiceve0+megabonus.com@gmail.co,email,unencoded,post_leaks,https://megabonus.com/,megabonus.com,megabonus.com,megabonus.com,megabonus.com,https://mc.yandex.ru/webvisor/37456880?wmode=0...,...,True,"$x(""//*[@id='auth-form-password']"")",auth-form-password,https://auth.megabonus.com/?redirect_url=https...,megabonus.com,False,XHR,True,True,"Ad Motivated Tracking, Advertising"
25575,cosiceve0+bruna.nl@gmail.c,email,urlencode,url_leaks,https://www.bruna.nl/,bruna.nl,bruna.nl,www.bruna.nl,bruna.nl,https://tr.datatrics.com/?action_name=Online%2...,...,True,"$x(""//*[@id='newsletter']"")",newsletter,https://www.bruna.nl/,bruna.nl,False,Image,True,False,"Analytics and Tracking, Personalization"
25633,cosiceve0+protective.com@gmail.co,email,unencoded,post_leaks,https://www.protective.com/,protective.com,protective.com,www.protective.com,protective.com,https://rs.fullstory.com/rec/bundle?OrgId=TSE6...,...,True,"$x(""//*[@id='password']"")",password,https://myaccount.protective.com/login,protective.com,False,XHR,True,True,"Session Replay, Analytics"
25780,cosiceve0+ripl.com@gmail.c,email,urlencode,url_leaks,https://www.ripl.com/,ripl.com,ripl.com,www.ripl.com,ripl.com,https://www.upsellit.com/hound/saveData.jsp?si...,...,True,"$x(""//*[@id='input_comp-k9zv53du']"")",input_comp-k9zv53du,https://www.ripl.com/,ripl.com,False,Script,True,False,"Analytics and Tracking, Cart Abandonment"


# Categories

In [21]:
categories_100K_final_dict = defaultdict(int)
for site_url, category in sites_categories.items():
    splitted_category = category.strip().split(', ')
    for each_category in splitted_category:
        categories_100K_final_dict[each_category] += 1

In [22]:
log_details['initial_hostname']
filled_sites = set(log_details['initial_hostname'])
log_details['initial_url'] = log_details.apply (lambda row: get_domain('http://' + row['initial_hostname']), axis=1)
categories_filled_final_dict = defaultdict(int)
filled_sites_domain_set = set(log_details.initial_url)
for site_url, category in sites_categories.items():
    if site_url in filled_sites_domain_set:
        splitted_category = category.strip().split(', ')
        for each_category in splitted_category:
            categories_filled_final_dict[each_category] += 1

In [23]:
unique_email_sites_df = unique_tracking_related_df[unique_tracking_related_df.search_type=='email'].drop_duplicates('initial_hostname')
leaky_categories_dict = defaultdict(int)
for index, row in unique_email_sites_df.iterrows():
    splitted_category = sites_categories[row['initial_url']].strip().split(', ')
#     print(row)
    for each_category in splitted_category:
        leaky_categories_dict[each_category] += 1

In [24]:
category_dict = dict()
tuples = []
for category, value in categories_100K_final_dict.items():
    filled_num = 0
    leaky_num = 0
    total_num = value
    if category in categories_filled_final_dict.keys():
        filled_num = categories_filled_final_dict[category]
    if category in leaky_categories_dict.keys():
        leaky_num = leaky_categories_dict[category]
    tuple_el = (category, total_num, filled_num, leaky_num)
    tuples.append(tuple_el)

In [25]:
categories_final = pd.DataFrame(tuples, columns=['category', 'total', 'filled', 'leaky']) 
categories_final['percentage'] = categories_final['leaky'] *100/ categories_final['filled']
categories_final['percentage'].astype(float).round(1)
categories_filtered = categories_final[categories_final.total>1000].sort_values(by='percentage', ascending=False)
categories_filtered['percentage'] = categories_filtered['percentage'].astype(float).round(1)
len(categories_filtered),categories_filtered

(23,
                      category  total  filled  leaky  percentage
 53             Fashion/Beauty   1669    1305    219        16.8
 7             Online Shopping   5395    3977    529        13.3
 64         Recreation/Hobbies   1098     753     97        12.9
 18               General News   7390    3832    378         9.9
 6                  Blogs/Wiki   5415    3140    236         7.5
 22    Marketing/Merchandising   4964    3275    180         5.5
 4                    Business  13462    8295    434         5.2
 5           Software/Hardware   4933    2932    148         5.0
 39                     Travel   2519    1464     69         4.7
 38                     Sports   1910    1094     50         4.6
 16          Internet Services   7974    5063    182         3.6
 12              Entertainment   5297    2686     95         3.5
 26                     Health   2516    1553     52         3.3
 23            Finance/Banking   3699    2056     56         2.7
 40         Public I

## Leaks on sites where CMP detected

In [26]:
len(tracker_related_reqs_df[tracker_related_reqs_df.was_cmp_detected].drop_duplicates('initial_hostname'))

209

## Leak types

In [27]:
unique_tracking_related_df.leak_type.value_counts()

url_leaks     5339
post_leaks    1230
Name: leak_type, dtype: int64

# HTTP WEB SOCKET

In [28]:
unique_email_leaks['starts_with_wss'] = list(
    map(lambda x: x.startswith('wss'), unique_email_leaks['request_url'])) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_email_leaks['starts_with_wss'] = list(


In [29]:
unique_email_leaks[unique_email_leaks.starts_with_wss].drop_duplicates('request_url_domain')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,xpath,id,latest_url,last_page_domain_iframe,is_req_off_site_direction,req_type,ublock_blocked,tds_blocked,req_domain_category,starts_with_wss
77,cosiceve0+chopard.com@gmail.com,email,unencoded,post_leaks,https://www.chopard.com/intl/,chopard.com,chopard.com,www.chopard.com,chopard.com,wss://am.freshrelevance.com/,...,"$x(""//*[@id='email']"")",email,https://www.chopard.com/intl/chopard_newslette...,chopard.com,False,WebSocket,False,False,"Analytics and Tracking, Conversion Optimization",True
444,cosiceve0+bettercloud.com@gmail.c,email,urlencode,post_leaks,https://www.bettercloud.com/,bettercloud.com,bettercloud.com,www.bettercloud.com,bettercloud.com,wss://ws2.hotjar.com/api/v2/client/ws,...,"$x(""//*[@id='Email']"")",Email,https://www.bettercloud.com/,bettercloud.com,False,WebSocket,True,True,"Session Replay, Analytics",True
1822,cosiceve0+verabradley.com@gmail.co,email,urlencode-custom_map_1,post_leaks,https://verabradley.com/,verabradley.com,verabradley.com,verabradley.com,verabradley.com,wss://input.noibu.com/pv_part,...,"$x(""//*[@id='Email']"")",Email,https://verabradley.com/,verabradley.com,False,WebSocket,True,False,"Widgets, Error Tracking",True
5528,cosiceve0+mercuryinsurance.com@gmail.co,email,unencoded,post_leaks,https://www.mercuryinsurance.com/,mercuryinsurance.com,mercuryinsurance.com,www.mercuryinsurance.com,mercuryinsurance.com,wss://collection.decibelinsight.net/i/13502/ws/,...,"$x(""//*[@id='login-password']"")",login-password,https://cp.mercuryinsurance.com/cas/login?serv...,mercuryinsurance.com,False,WebSocket,True,False,,True


In [30]:
unique_email_leaks['starts_with_http'] = list(
    map(lambda x: x.startswith('http:'), unique_email_leaks['request_url'])) 

unique_email_leaks[unique_email_leaks.starts_with_http].drop_duplicates('initial_hostname')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_email_leaks['starts_with_http'] = list(


Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,id,latest_url,last_page_domain_iframe,is_req_off_site_direction,req_type,ublock_blocked,tds_blocked,req_domain_category,starts_with_wss,starts_with_http
2697,cosiceve0+onfido.com@gmail.com,email,urlencode-urlencode,url_leaks,https://onfido.com/,onfido.com,onfido.com,onfido.com,onfido.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,885253_8614pi_885253_8614,https://go.onfido.com/l/885253/2020-09-16/29g1,onfido.com,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
3245,cosiceve0+pressgazette.co.uk@gmail.com,email,urlencode-urlencode,url_leaks,https://pressgazette.co.uk/,pressgazette.co.uk,pressgazette.co.uk,pressgazette.co.uk,pressgazette.co.uk,http://go.pardot.com/form/checkEmailAjax/accou...,...,375012_527938pi_375012_527938,https://go.pressgazette.co.uk/l/375012/2021-01...,pressgazette.co.uk,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
4180,cosiceve0+rating-widget.com@gmail.com,email,base64,url_leaks,http://rating-widget.com/,rating-widget.com,rating-widget.com,rating-widget.com,rating-widget.com,"http://hn.inspectlet.com/pdata?d=cinmi,18206,3...",...,,http://rating-widget.com/,rating-widget.com,False,Image,True,False,"Third-Party Analytics Marketing, Session Replay",False,True
8043,cosiceve0+advertisers.contobox.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.advertisers.contobox.com/,contobox.com,contobox.com,www.advertisers.contobox.com,contobox.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,764333_14369pi_764333_14369,https://www2.contobox.com/l/764333/2019-10-03/...,contobox.com,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
8153,cosiceve0+ctbuh.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ctbuh.org/,ctbuh.org,ctbuh.org,www.ctbuh.org,ctbuh.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,878592_4768pi_878592_4768,https://marketing.ctbuh.org/l/878592/2020-08-2...,ctbuh.org,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
11107,cosiceve0+vendini.com@gmail.com,email,urlencode-urlencode,url_leaks,https://vendini.com/,vendini.com,vendini.com,vendini.com,vendini.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,814053_979pi_814053_979,https://www2.vendini.com/l/814053/2019-11-01/2...,vendini.com,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
15757,cosiceve0+affiliatesummit.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.affiliatesummit.com/,affiliatesummit.com,affiliatesummit.com,www.affiliatesummit.com,affiliatesummit.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,528152_120597pi_528152_120597,https://info.affiliatesummit.com/l/528152/2020...,affiliatesummit.com,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
16036,cosiceve0+iiba.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.iiba.org/,iiba.org,iiba.org,www.iiba.org,iiba.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,Emailpi_Email,https://go.iiba.org/email-signup?utm_source=We...,iiba.org,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
19192,cosiceve0+skyscrapercenter.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ctbuh.org/?redirect=true,ctbuh.org,skyscrapercenter.com,www.skyscrapercenter.com,ctbuh.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,878592_4768pi_878592_4768,https://marketing.ctbuh.org/l/878592/2020-08-2...,ctbuh.org,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
19781,cosiceve0+avoxi.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.avoxi.com/,avoxi.com,avoxi.com,www.avoxi.com,avoxi.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,673993_73068pi_673993_73068,https://marketing.avoxi.com/l/673993/2020-08-1...,avoxi.com,False,XHR,True,True,"Ad Motivated Tracking, Advertising",False,True
