In [4]:
import pandas as pd
import pickle
from collections import defaultdict
from leak_common import get_domain, find_prominence
import matplotlib.pyplot as plt

In [5]:
with open('../pkls_csvs/extra_tracker_domains.pkl', 'rb') as handle:
    extra_tracker_domains = pickle.load(handle)

# Read from pickles

In [6]:
log_details = pd.read_pickle("../pkls_csvs/nyc_log_details.pkl")
df = pd.read_csv("../pkls_csvs/desktop_no_action_nyc_crawl.csv", sep='\t', encoding='utf-8')
## Due to copyright restrictions, we can't share categories info
# with open('../pkls_csvs/categories.pkl', 'rb') as handle:
#     sites_categories = pickle.load(handle)

# Additional Columns

In [7]:
df['is_blocked'] = df.easy_list_blocked | df.easy_privacy_blocked | df.disconnect_blocked | df.whotracksme_blocked | (df.tds_blocked) | (df.ublock_blocked) | (df.request_url_domain.isin(extra_tracker_domains))

In [8]:
df['tracker_related'] = df.third_party_req & df.is_blocked

# All Requests

In [9]:
all_reqs = df
unique_all_reqs = all_reqs.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])
all_reqs_email = all_reqs[all_reqs.search_type == 'email']
unique_all_reqs_email = all_reqs_email.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [10]:
print('all_reqs: ', len(all_reqs))
print('unique_all_reqs: ', len(unique_all_reqs))
print('all_reqs_email: ', len(all_reqs_email))
print('unique_all_reqs_email: ', len(unique_all_reqs_email))

all_reqs:  41936
unique_all_reqs:  21156
all_reqs_email:  38924
unique_all_reqs_email:  19791


# Third Party Requests

In [11]:
third_party_reqs = df[df.third_party_req & ~df.is_req_off_site_direction]
unique_third_party_reqs = third_party_reqs.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])
third_party_reqs_email = third_party_reqs[third_party_reqs.search_type == 'email']
unique_third_party_reqs_email = third_party_reqs_email.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [12]:
print('third_party_reqs: ', len(third_party_reqs))
print('unique_third_party_reqs: ', len(unique_third_party_reqs))
print('third_party_reqs_email: ', len(third_party_reqs_email))
print('unique_third_party_reqs_email: ', len(unique_third_party_reqs_email))

third_party_reqs:  32583
unique_third_party_reqs:  17898
third_party_reqs_email:  32004
unique_third_party_reqs_email:  17680


# Tracking Related Requests

In [13]:
tracker_related_reqs_df = df[~df.is_req_off_site_direction & (df.leak_type!='response_cookie_leaks') & (df.leak_type!='response_location_leaks') &(df.tracker_related) & ((df.final_url_domain == df.last_page_domain) | (df.initial_url == df.last_page_domain))]
unique_tracking_related_df = tracker_related_reqs_df.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'],keep='last')
email_leaks = tracker_related_reqs_df[tracker_related_reqs_df.search_type == 'email']
unique_email_leaks = email_leaks.drop_duplicates(['initial_hostname', 'search_type', 'request_url_domain', 'encoding', 'leak_type'])

In [14]:
print('tracker_related_reqs_df: ', len(tracker_related_reqs_df))
print('unique_tracking_related_df: ', len(unique_tracking_related_df))
print('email_leaks: ', len(email_leaks))
print('unique_email_leaks: ', len(unique_email_leaks))

tracker_related_reqs_df:  22295
unique_tracking_related_df:  12747
email_leaks:  21898
unique_email_leaks:  12621


# High Level Statistics

In [15]:
print('Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains: ',len(all_reqs_email.drop_duplicates('initial_hostname')))
print('Distinct websites where emails are leaked to both 3rd parties and tracker domains: ',len(unique_third_party_reqs_email.drop_duplicates('initial_hostname')))
print('Distinct websites where emails are leaked to only tracker domains: ',len(unique_email_leaks.drop_duplicates('initial_hostname')))

Distinct websites where emails are leaked to both 1st, 3rd parties and tracker domains:  5518
Distinct websites where emails are leaked to both 3rd parties and tracker domains:  3790
Distinct websites where emails are leaked to only tracker domains:  2950


# Prominence

In [16]:
prominence_list = find_prominence(unique_tracking_related_df[unique_tracking_related_df.search_type=='email'])

In [17]:
prominence_df = pd.DataFrame(prominence_list, columns=['domain','prominence', 'number_of_sites']) 

In [18]:
prominence_df.sort_values('prominence',ascending=False).head(20)

Unnamed: 0,domain,prominence,number_of_sites
6,rlcdn.com,0.055384,524
25,taboola.com,0.0499,383
42,bouncex.net,0.022474,189
10,bizible.com,0.021203,191
39,zenaps.com,0.011115,119
40,awin1.com,0.011092,118
5,fullstory.com,0.01056,230
21,listrakbi.com,0.006596,226
13,pippio.com,0.006513,138
43,smarterhq.io,0.006382,32


## Leak types

In [19]:
unique_tracking_related_df.leak_type.value_counts()

url_leaks     11389
post_leaks     1358
Name: leak_type, dtype: int64

## Email leaks

In [20]:
unique_email_leaks[unique_email_leaks['rank_of_site']<2000].drop_duplicates('initial_url').sort_values('rank_of_site')[['initial_url', 'rank_of_site']].head(12)

Unnamed: 0,initial_url,rank_of_site
7754,issuu.com,95
15237,businessinsider.com,128
12214,usatoday.com,154
30632,time.com,191
18747,udemy.com,196
36995,healthline.com,217
26711,foxnews.com,234
37767,trello.com,242
19331,theverge.com,278
12788,webmd.com,288


In [21]:
distinct_websites_email_leaks = unique_email_leaks.drop_duplicates('initial_hostname')

In [22]:
unique_email_leaks.encoding.value_counts()

sha256                          2173
urlencode-sha256                1857
md5                             1752
urlencode-md5                   1666
sha1                            1062
urlencode-sha1                  1022
unencoded                        889
urlencode                        628
urlencode-sha_salted_1           237
sha_salted_1                     237
base64                           229
lzstring-urlencode               195
urlencode-lzstring-urlencode     194
urlencode-base64                 186
urlencode-urlencode              168
urlencode-custom_map_1            74
base64-sha1                       34
base64-sha256                      8
base64-md5                         3
sha512                             2
urlencode-sha512                   2
base64-urlencode                   2
base64-base64                      1
Name: encoding, dtype: int64

## Tracked But NOT Sniffed

In [23]:
email_leaks[email_leaks.email_sniffed==False].drop_duplicates('initial_hostname')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,is_req_off_site_direction,req_domain_entity,sniff_initiators,is_req_domain_among_sniff_initiators,ublock_blocked,tds_blocked,ends_with_c,ends_with_co,cname,req_domain_category
0,cosicadam0+houston.org@gmail.com,email,unencoded,post_leaks,https://www.houston.org/,houston.org,houston.org,www.houston.org,houston.org,https://r.lr-ingest.io/i?a=hou67o%2Fghp-client...,...,False,,set(),False,True,False,False,False,False,
478,cosicadam0+bruna.nl@gmail.c,email,urlencode,url_leaks,https://www.bruna.nl/,bruna.nl,bruna.nl,www.bruna.nl,bruna.nl,https://tr.datatrics.com/?action_name=Online%2...,...,False,,set(),False,True,False,True,False,False,
495,cosicadam0+democratandchronicle.com@gmail.com,email,sha256,url_leaks,https://www.democratandchronicle.com/,democratandchronicle.com,democratandchronicle.com,www.democratandchronicle.com,democratandchronicle.com,https://trc.taboola.com/sg/tfa-eid/1/um/?uils=...,...,False,"Taboola, Inc.",set(),False,True,True,False,False,False,"Third-Party Analytics Marketing, Audience Meas..."
618,cosicadam0+burlingtonfreepress.com@gmail.com,email,sha256,url_leaks,https://www.burlingtonfreepress.com/,burlingtonfreepress.com,burlingtonfreepress.com,www.burlingtonfreepress.com,burlingtonfreepress.com,https://trc.taboola.com/sg/tfa-eid/1/um/?uils=...,...,False,"Taboola, Inc.",{'taboola.com'},True,True,True,False,False,False,"Third-Party Analytics Marketing, Audience Meas..."
703,cosicadam0+tennisexpress.com@gmail.co,email,urlencode,url_leaks,https://www.tennisexpress.com/,tennisexpress.com,tennisexpress.com,www.tennisexpress.com,tennisexpress.com,https://sca1.listrakbi.com/2wZX5FvW1Fkg/cart/f...,...,False,Listrak,set(),False,True,True,False,True,False,"Analytics, Third-Party Analytics Marketing"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40973,cosicadam0+rapyd.net@gmail.com,email,urlencode-urlencode,url_leaks,https://www.rapyd.net/,rapyd.net,rapyd.net,www.rapyd.net,rapyd.net,https://d.adroll.com/emailc/5NQSP5JB7ZH2XGXJDA...,...,False,"AdRoll, Inc.",set(),False,True,True,False,False,False,"Advertising, Ad Motivated Tracking"
41006,cosicadam0+mediavoice.com@gmail.co,email,base64,url_leaks,https://mediavoice.com/login/?next=/,mediavoice.com,mediavoice.com,mediavoice.com,mediavoice.com,"https://hn.inspectlet.com/pdata?d=cinmi,12136,...",...,False,Inspectlet,{'inspectlet.com'},True,True,False,False,True,False,"Third-Party Analytics Marketing, Session Replay"
41010,cosicadam0+codeavengers.com@gmail.co,email,urlencode,post_leaks,https://www.codeavengers.com/,codeavengers.com,codeavengers.com,www.codeavengers.com,codeavengers.com,https://track-v2.funnelytics.io/set-attributes,...,False,,set(),False,False,False,False,True,False,
41090,cosicadam0+atlantisbahamas.com@gmail.com,email,base64,url_leaks,https://www.atlantisbahamas.com/,atlantisbahamas.com,atlantisbahamas.com,www.atlantisbahamas.com,atlantisbahamas.com,https://shopper.shop.pe/pixel.png?info=eyJlbWF...,...,False,Add Shoppers,set(),False,True,False,False,False,False,


# Categories

In [24]:
categories_100K_final_dict = defaultdict(int)
for site_url, category in sites_categories.items():
    splitted_category = category.strip().split(', ')
    for each_category in splitted_category:
        categories_100K_final_dict[each_category] += 1

In [25]:
log_details['initial_hostname']
filled_sites = set(log_details['initial_hostname'])
log_details['initial_url'] = log_details.apply (lambda row: get_domain('http://' + row['initial_hostname']), axis=1)
categories_filled_final_dict = defaultdict(int)
filled_sites_domain_set = set(log_details.initial_url)
for site_url, category in sites_categories.items():
    if site_url in filled_sites_domain_set:
        splitted_category = category.strip().split(', ')
        for each_category in splitted_category:
            categories_filled_final_dict[each_category] += 1

In [26]:
unique_email_sites_df = unique_tracking_related_df[unique_tracking_related_df.search_type=='email'].drop_duplicates('initial_hostname')
leaky_categories_dict = defaultdict(int)
for index, row in unique_email_sites_df.iterrows():
    splitted_category = row['category'].strip().split(', ')
#     print(row)
    for each_category in splitted_category:
        leaky_categories_dict[each_category] += 1

In [27]:
category_dict = dict()
tuples = []
for category, value in categories_100K_final_dict.items():
    filled_num = 0
    leaky_num = 0
    total_num = value
    if category in categories_filled_final_dict.keys():
        filled_num = categories_filled_final_dict[category]
    if category in leaky_categories_dict.keys():
        leaky_num = leaky_categories_dict[category]
    tuple_el = (category, total_num, filled_num, leaky_num)
    tuples.append(tuple_el)

In [28]:
categories_final = pd.DataFrame(tuples, columns=['category', 'total', 'filled', 'leaky']) 
categories_final['percentage'] = categories_final['leaky'] *100/ categories_final['filled']
categories_final['percentage'].astype(float).round(1)
categories_filtered = categories_final[categories_final.total>1000].sort_values(by='percentage', ascending=False)
categories_filtered['percentage'] = categories_filtered['percentage'].astype(float).round(1)
len(categories_filtered),categories_filtered

(23,
                      category  total  filled  leaky  percentage
 53             Fashion/Beauty   1669    1179    224        19.0
 7             Online Shopping   5395    3744    567        15.1
 64         Recreation/Hobbies   1098     760     95        12.5
 18               General News   7390    3848    392        10.2
 6                  Blogs/Wiki   5415    3055    237         7.8
 4                    Business  13462    7924    484         6.1
 22    Marketing/Merchandising   4964    3218    192         6.0
 39                     Travel   2519    1379     82         5.9
 5           Software/Hardware   4933    2855    162         5.7
 38                     Sports   1910    1002     56         5.6
 26                     Health   2516    1439     69         4.8
 16          Internet Services   7974    4671    199         4.3
 12              Entertainment   5297    2619     98         3.7
 23            Finance/Banking   3699    1518     49         3.2
 10        Education

## Leaks on sites where CMP detected

In [29]:
len(tracker_related_reqs_df[tracker_related_reqs_df.was_cmp_detected].drop_duplicates('initial_hostname'))

228

## Is req domain among the sniff initiators?

In [30]:
not_in_sniff_inits = set(unique_email_leaks[~unique_email_leaks.is_req_domain_among_sniff_initiators].drop_duplicates('initial_hostname').initial_hostname)
in_sniff_inits = set(unique_email_leaks[unique_email_leaks.is_req_domain_among_sniff_initiators].drop_duplicates('initial_hostname').initial_hostname)
len(not_in_sniff_inits.difference(in_sniff_inits)), len(not_in_sniff_inits), len(in_sniff_inits), len(unique_email_leaks.drop_duplicates('initial_hostname'))

(1639, 1762, 1311, 2950)

In [31]:
unique_tracking_related_df.leak_type.value_counts()

url_leaks     11389
post_leaks     1358
Name: leak_type, dtype: int64

# HTTP WEB SOCKET

In [32]:
unique_email_leaks['starts_with_wss'] = list(
    map(lambda x: x.startswith('wss'), unique_email_leaks['request_url'])) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_email_leaks['starts_with_wss'] = list(


In [33]:
unique_email_leaks[unique_email_leaks.starts_with_wss].drop_duplicates('request_url_domain')

Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,req_domain_entity,sniff_initiators,is_req_domain_among_sniff_initiators,ublock_blocked,tds_blocked,ends_with_c,ends_with_co,cname,req_domain_category,starts_with_wss
66,cosicadam0+omaha.com@gmail.com,email,unencoded,post_leaks,https://omaha.com/,omaha.com,omaha.com,omaha.com,omaha.com,wss://am.freshrelevance.com/,...,,set(),False,False,False,False,False,False,,True
2222,cosicadam0+menlosecurity.com@gmail.c,email,urlencode,post_leaks,https://www.menlosecurity.com/,menlosecurity.com,menlosecurity.com,www.menlosecurity.com,menlosecurity.com,wss://ws10.hotjar.com/api/v2/client/ws,...,,"{'marketo.com', 'hotjar.com'}",False,True,True,True,False,False,"Audience Measurement, Session Replay",True
2302,cosicadam0+fully.com@gmail.com,email,sha256,post_leaks,https://www.fully.com/,fully.com,fully.com,www.fully.com,fully.com,wss://input.noibu.com/pv_part,...,,"{'noibu.com', 'rejoiner.com', 'fully.com', 'js...",False,True,False,False,False,False,,True
9907,cosicadam0+trivago.co.uk@gmail.c,email,unencoded,post_leaks,https://www.trivago.co.uk/,trivago.co.uk,trivago.co.uk,www.trivago.co.uk,trivago.co.uk,wss://collection.decibelinsight.net/i/13786/ws/,...,,set(),False,True,False,True,False,False,,True


In [34]:
unique_email_leaks['starts_with_http'] = list(
    map(lambda x: x.startswith('http:'), unique_email_leaks['request_url'])) 

unique_email_leaks[unique_email_leaks.starts_with_http].drop_duplicates('initial_hostname')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_email_leaks['starts_with_http'] = list(


Unnamed: 0,search,search_type,encoding,leak_type,final_url,final_url_domain,initial_url,initial_hostname,last_page_domain,request_url,...,sniff_initiators,is_req_domain_among_sniff_initiators,ublock_blocked,tds_blocked,ends_with_c,ends_with_co,cname,req_domain_category,starts_with_wss,starts_with_http
753,cosicadam0+onfido.com@gmail.com,email,urlencode-urlencode,url_leaks,https://onfido.com/,onfido.com,onfido.com,onfido.com,onfido.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'onfido.com'},False,True,True,False,False,False,"Analytics, Embedded Content",False,True
1704,cosicadam0+actionforhealthykids.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.actionforhealthykids.org/,actionforhealthykids.org,actionforhealthykids.org,www.actionforhealthykids.org,actionforhealthykids.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'actionforhealthykids.org'},False,True,True,False,False,False,"Analytics, Embedded Content",False,True
4372,cosicadam0+pressgazette.co.uk@gmail.com,email,urlencode-urlencode,url_leaks,https://pressgazette.co.uk/,pressgazette.co.uk,pressgazette.co.uk,pressgazette.co.uk,pressgazette.co.uk,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'pressgazette.co.uk'},False,True,True,False,False,False,"Analytics, Embedded Content",False,True
9214,cosicadam0+iiba.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.iiba.org/,iiba.org,iiba.org,www.iiba.org,iiba.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,set(),False,True,True,False,False,False,"Analytics, Embedded Content",False,True
9420,cosicadam0+ctbuh.org@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ctbuh.org/,ctbuh.org,ctbuh.org,www.ctbuh.org,ctbuh.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'ctbuh.org'},False,True,True,False,False,False,"Analytics, Embedded Content",False,True
10033,cosicadam0+skyscrapercenter.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ctbuh.org/?redirect=true,ctbuh.org,skyscrapercenter.com,www.skyscrapercenter.com,ctbuh.org,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'ctbuh.org'},False,True,True,False,False,False,"Analytics, Embedded Content",False,True
17923,cosicadam0+avoxi.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.avoxi.com/,avoxi.com,avoxi.com,www.avoxi.com,avoxi.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,"{'avoxi.com', 'bizible.com'}",False,True,True,False,False,False,"Analytics, Embedded Content",False,True
18774,cosicadam0+ashland.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.ashland.com/,ashland.com,ashland.com,www.ashland.com,ashland.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,"{'ashland.com', 'gstatic.com'}",False,True,True,False,False,False,"Analytics, Embedded Content",False,True
21373,cosicadam0+adcolony.com@gmail.com,email,urlencode-urlencode,url_leaks,https://www.adcolony.com/,adcolony.com,adcolony.com,www.adcolony.com,adcolony.com,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'adcolony.com'},False,True,True,False,False,False,"Analytics, Embedded Content",False,True
26054,cosicadam0+eastern.edu@gmail.com,email,urlencode-urlencode,url_leaks,https://www.eastern.edu/,eastern.edu,eastern.edu,www.eastern.edu,eastern.edu,http://go.pardot.com/form/checkEmailAjax/accou...,...,{'eastern.edu'},False,True,True,False,False,False,"Analytics, Embedded Content",False,True
