## Analysis for ciphersuite list with distance == 4

In [1]:
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm
import sqlite3
import time
import hashlib
import re
import datetime
import matplotlib.pyplot as plt
import matplotlib
import editdistance
import functools
from IPython.display import clear_output
import seaborn as sns
import random
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
from __future__ import division
import math
from bisect import bisect_left
from collections import namedtuple

### Readin dataset

In [3]:
df = pd.read_csv("../datasets/ciphersuites_withpki.csv")

In [4]:
df_dist4 = df.loc[df['distance_2lib_sim'] == 4]

In [5]:
df_dist4.shape

(8818, 21)

In [6]:
df_dist4['cipher_suites'].value_counts().shape

(107,)

### Helper functions

In [10]:
def dom_category(domain):
    if "google" in domain or "gstatic" in domain or "doubleclick" in domain:
        return "google"
    elif "amazon" in domain or "aws" in domain:
        return "amazon"
    elif "nintendo" in domain:
        return "nintendo"
    elif "roku" in domain:
        return "roku"
    elif "samsung" in domain:
        return "samsung"
    elif "nflx" in domain:
        return "netflix"
    else:
        return domain

### Get domain and domain owner

In [11]:
df_dist4['domain_name'] = df_dist4['domain'].apply(lambda x: x.split(".")[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dist4['domain_name'] = df_dist4['domain'].apply(lambda x: x.split(".")[0])


In [12]:
df_dist4['domain_runby'] = df_dist4['domain_name'].apply(lambda x: dom_category(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dist4['domain_runby'] = df_dist4['domain_name'].apply(lambda x: dom_category(x))


In [13]:
df_dist4.sample(2)

Unnamed: 0,device_id,device_vendor,device_name,cipher_suites,extension_types,tls_version,distance_2lib_sim,likely_libver,sni,if_in_crtsh,server_tls_version,if_tls_fallback,cert_chain_len,chain_issuer_O_parse,chain_issuer_CN,chain_subject_O,chain_subject_CN,cert_validity_days,domain,chain_issuer,chain_category,domain_name,domain_runby
10265,s78c624d016,Google,chromecast,2570+4867+4865+4866+52393+52392+49195+49199+49196+49200+49171+49172+156+157+47+53+10,23+65281+10+11+35+16+5+13+18+51+45+43+27+41,771,4,curl-7.60.076_openssl-1.1.1-pre2,clients4.google.com,True,TLS 1.2,False,3,"['Google Trust Services LLC', 'Google Trust Services LLC', 'GlobalSign nv-sa']",['GTS CA 1C3' 'GTS Root R1' 'GlobalSign Root CA'],['nan' 'Google Trust Services LLC' 'Google Trust Services LLC'],['*.google.com' 'GTS CA 1C3' 'GTS Root R1'],83 days 23:59:59,google.com,public+public+public,Public trust leaf and root certificates,google,google
9137,s692a0a7478,Insignia,roku,49200+49196+49202+49198+49199+49195+49201+49197+165+161+159+164+160+158+157+156+49191+49187+49171+49161+49193+49189+49166+49156+107+105+104+57+55+54+103+63+62+51+49+48+49192+49188+49172+49162+49194+49190+49167+49157+49170+49160+49165+49155+22+16+13+61+53+60+47+10+255,11+10+13+15+13172+16+21,771,4,curl-7.71.068_openssl-1.0.2u,codex.nflxext.com,True,TLS 1.2,False,2,"['DigiCert Inc', 'DigiCert Inc']",['DigiCert TLS RSA SHA256 2020 CA1' 'DigiCert Global Root CA'],['Netflix' 'DigiCert Inc'],['*.1.nflxso.net' 'DigiCert TLS RSA SHA256 2020 CA1'],33 days 06:07:12,nflxext.com,public+public,Public trust leaf and root certificates,nflxext,netflix


### Drop records with unknown devices

In [14]:
dist4_withvendor = df_dist4.loc[df_dist4['device_vendor'] != 'Unknown']

In [15]:
dist4_withvendor.sample(2)

Unnamed: 0,device_id,device_vendor,device_name,cipher_suites,extension_types,tls_version,distance_2lib_sim,likely_libver,sni,if_in_crtsh,server_tls_version,if_tls_fallback,cert_chain_len,chain_issuer_O_parse,chain_issuer_CN,chain_subject_O,chain_subject_CN,cert_validity_days,domain,chain_issuer,chain_category,domain_name,domain_runby
6581,s48778529bc,Amazon,fire,49195+49196+49199+49200+158+159+49161+49162+49171+49172+51+57+50+56+49159+49169+156+157+47+53+5+255,11+10+35+13,771,4,curl-7.71.073_openssl-1.1.0l,msh.amazon.com,True,TLS 1.2,False,4,"['Amazon', 'Amazon', 'Starfield Technologies', 'Starfield Technologies']",['Amazon' 'Amazon Root CA 1'\n 'Starfield Services Root Certificate Authority ' 'nan'],['nan' 'Amazon' 'Amazon' 'Starfield Technologies'],['msh.amazon.com' 'Amazon' 'Amazon Root CA 1'\n 'Starfield Services Root Certificate Authority '],361 days 23:59:59,amazon.com,public+public+public+public,Public trust leaf and root certificates,amazon,amazon
18927,sf9e3047e2a,Google,google-home,2570+4867+4865+4866+52393+52392+49195+49199+49196+49200+49171+49172+156+157+47+53+10,23+65281+10+11+35+16+5+13+18+51+45+43+27+41,771,4,curl-7.60.076_openssl-1.1.1-pre2,fcm.googleapis.com,True,TLS 1.2,False,3,"['Google Trust Services LLC', 'Google Trust Services LLC', 'GlobalSign nv-sa']",['GTS CA 1C3' 'GTS Root R1' 'GlobalSign Root CA'],['nan' 'Google Trust Services LLC' 'Google Trust Services LLC'],['edgecert.googleapis.com' 'GTS CA 1C3' 'GTS Root R1'],83 days 23:59:59,googleapis.com,public+public+public,Public trust leaf and root certificates,googleapis,google


In [16]:
df_cnt = dist4_withvendor.groupby(['cipher_suites', 'domain_runby'])['device_id'].agg('count').reset_index(name='count')

In [17]:
df_cnt.shape

(493, 3)

In [18]:
df_cnt['domain_runby'].value_counts().shape

(177,)

### Assign shorter ciphersuite names

In [19]:
cipher_cluster_dict = {}
for cipher in df_cnt['cipher_suites'].drop_duplicates():
    cipher_cluster_dict[cipher] = 'C' + str(len(cipher_cluster_dict))

In [20]:
df_test = df_cnt.copy()
df_test['cipher_suites'] = df_test['cipher_suites'].apply(
    lambda v: cipher_cluster_dict[v]
)

### Exclude tracking/ads

In [24]:
import tldextract
blocked_domains = set()

def parse_blocklist():
    with open('../datasets/hosts-blocklists.txt') as fp:
        for line in fp:
            if '::' in line:
                domain = line.split('/')[1].lower().strip()
                if domain:
                    blocked_domains.add(domain)

parse_blocklist()

In [26]:
def is_hostname_blocked(hostname):

    reg_domain = tldextract.extract(hostname).registered_domain.lower().strip()
    if reg_domain:
        return reg_domain in blocked_domains

    return False

In [28]:
df_dist4['if_tracking'] = df_dist4['sni'].apply(lambda x: is_hostname_blocked(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dist4['if_tracking'] = df_dist4['sni'].apply(lambda x: is_hostname_blocked(x))


In [29]:
df_dist4['if_tracking'].value_counts()

False    8389
True      429
Name: if_tracking, dtype: int64

In [30]:
df_filter = df_dist4.loc[(df_dist4['if_tracking'] == False) & (df_dist4['device_vendor'] != 'Unknown')]

In [31]:
df_filter.sample(1)

Unnamed: 0,device_id,device_vendor,device_name,cipher_suites,extension_types,tls_version,distance_2lib_sim,likely_libver,sni,if_in_crtsh,server_tls_version,if_tls_fallback,cert_chain_len,chain_issuer_O_parse,chain_issuer_CN,chain_subject_O,chain_subject_CN,cert_validity_days,domain,chain_issuer,chain_category,domain_name,domain_runby,if_tracking
1353,s10cf4a5a46,Google,google-home,4867+4865+4866+52393+52392+49195+49199+49196+49200+49171+49172+156+157+47+53+10,23+65281+10+11+35+16+5+13+18+51+45+43+27+41,771,4,curl-7.60.076_openssl-1.1.1-pre2,www.youtube.com,True,TLS 1.2,False,3,"['Google Trust Services LLC', 'Google Trust Services LLC', 'GlobalSign nv-sa']",['GTS CA 1C3' 'GTS Root R1' 'GlobalSign Root CA'],['nan' 'Google Trust Services LLC' 'Google Trust Services LLC'],['*.google.com' 'GTS CA 1C3' 'GTS Root R1'],83 days 23:59:59,youtube.com,public+public+public,Public trust leaf and root certificates,youtube,youtube,False


In [32]:
df_filter.drop_duplicates('domain').shape

(162, 24)

In [33]:
df_cnt = df_filter.groupby(['domain'])['cipher_suites'].agg(lambda x: x.nunique()).reset_index(name='unique_cnt')

In [34]:
df_cnt.shape

(162, 2)

In [35]:
domain_lis = df_cnt.loc[df_cnt['unique_cnt'] == 1]['domain'].tolist()

In [36]:
len(domain_lis)

71

### Exclude tracking not int the block list

In [37]:
domain_lis.remove('ispot.tv')
domain_lis.remove('localytics.com')
domain_lis.remove('trakt.tv')

In [38]:
len(domain_lis)

68

In [39]:
df_inlis = df_filter.loc[df_filter['domain'].isin(domain_lis)]

In [40]:
cipher_cluster_dict = {}
for cipher in df_inlis['cipher_suites'].drop_duplicates():
    cipher_cluster_dict[cipher] = 'C' + str(len(cipher_cluster_dict))

In [41]:
df_test = df_inlis.copy()
df_test['cipher_suites'] = df_test['cipher_suites'].apply(
    lambda v: cipher_cluster_dict[v]
)

In [42]:
df_test.sample(1)

Unnamed: 0,device_id,device_vendor,device_name,cipher_suites,extension_types,tls_version,distance_2lib_sim,likely_libver,sni,if_in_crtsh,server_tls_version,if_tls_fallback,cert_chain_len,chain_issuer_O_parse,chain_issuer_CN,chain_subject_O,chain_subject_CN,cert_validity_days,domain,chain_issuer,chain_category,domain_name,domain_runby,if_tracking
14111,sb1f06dc311,Sonos,speaker,C2,13+10+11+22+23+35,771,4,curl-7.71.073_openssl-1.1.0l,t1-1.p-cdn.us,True,TLS 1.2,False,2,"['DigiCert Inc', 'DigiCert Inc']",['GeoTrust RSA CA 2018' 'DigiCert Global Root CA'],['Pandora Media' 'DigiCert Inc'],['*.p-cdn.us' 'GeoTrust RSA CA 2018'],760 days 12:00:00,p-cdn.us,public+public,Public trust leaf and root certificates,p-cdn,p-cdn,False


### Analysis

In [43]:
df_gb = df_test.groupby(['domain'])['device_vendor'].agg(lambda x: x.nunique()).reset_index(name='num_vendor')

In [44]:
df_gb.loc[df_gb['num_vendor'] > 1]

Unnamed: 0,domain,num_vendor
2,aiv-cdn.net,2
6,amazonmusic.com,3
7,arlo.com,2
32,netgear.com,2
38,p-cdn.us,2
58,truste.com,2
66,windows.com,3


In [45]:
df_gb.sample(5)

Unnamed: 0,domain,num_vendor
42,qnap.com,1
48,sesupdate.com,1
41,pndsn.com,1
22,irobotapi.com,1
67,windows.net,1


In [46]:
df_test.loc[df_test['domain'] == "amazonmusic.com"]['device_id'].value_counts().shape

(3,)

In [47]:
df_dist4.loc[df_dist4['domain'] == "amazonmusic.com"].shape

(3, 24)

In [48]:
df_test.loc[df_test['domain'] == 'amazonmusic.com']

Unnamed: 0,device_id,device_vendor,device_name,cipher_suites,extension_types,tls_version,distance_2lib_sim,likely_libver,sni,if_in_crtsh,server_tls_version,if_tls_fallback,cert_chain_len,chain_issuer_O_parse,chain_issuer_CN,chain_subject_O,chain_subject_CN,cert_validity_days,domain,chain_issuer,chain_category,domain_name,domain_runby,if_tracking
10109,s76189a29af,Google,google-home,C2,13+10+11+22+23+35,771,4,curl-7.71.073_openssl-1.1.0l,sonos.amazonmusic.com,True,TLS 1.2,False,4,"['Amazon', 'Amazon', 'Starfield Technologies', 'Starfield Technologies']",['Amazon' 'Amazon Root CA 1'\n 'Starfield Services Root Certificate Authority ' 'nan'],['nan' 'Amazon' 'Amazon' 'Starfield Technologies'],['sonos-na.amazon.com' 'Amazon' 'Amazon Root CA 1'\n 'Starfield Services Root Certificate Authority '],340 days 23:59:59,amazonmusic.com,public+public+public+public,Public trust leaf and root certificates,amazonmusic,amazon,False
12755,s9c504cf9ea,Amazon,fire,C2,13+10+11+22+23+35,771,4,curl-7.71.073_openssl-1.1.0l,sonos.amazonmusic.com,True,TLS 1.2,False,4,"['Amazon', 'Amazon', 'Starfield Technologies', 'Starfield Technologies']",['Amazon' 'Amazon Root CA 1'\n 'Starfield Services Root Certificate Authority ' 'nan'],['nan' 'Amazon' 'Amazon' 'Starfield Technologies'],['sonos-na.amazon.com' 'Amazon' 'Amazon Root CA 1'\n 'Starfield Services Root Certificate Authority '],340 days 23:59:59,amazonmusic.com,public+public+public+public,Public trust leaf and root certificates,amazonmusic,amazon,False
13671,sabd59711bc,Sonos,play,C2,13+10+11+22+23+35,771,4,curl-7.71.073_openssl-1.1.0l,sonos.amazonmusic.com,True,TLS 1.2,False,4,"['Amazon', 'Amazon', 'Starfield Technologies', 'Starfield Technologies']",['Amazon' 'Amazon Root CA 1'\n 'Starfield Services Root Certificate Authority ' 'nan'],['nan' 'Amazon' 'Amazon' 'Starfield Technologies'],['sonos-na.amazon.com' 'Amazon' 'Amazon Root CA 1'\n 'Starfield Services Root Certificate Authority '],340 days 23:59:59,amazonmusic.com,public+public+public+public,Public trust leaf and root certificates,amazonmusic,amazon,False


In [49]:
for cipher, shortname in cipher_cluster_dict.items():  # for name, age in dictionary.iteritems():  (for Python 2.x)
    if shortname == 'C2':
        print(cipher)

49200+49199+157+156+61+53+60+47+255


In [50]:
df_gb.shape

(68, 2)

In [51]:
df_gb

Unnamed: 0,domain,num_vendor
0,1337x.to,1
1,accuweather.com,1
2,aiv-cdn.net,2
3,amazon-dss.com,1
4,amazon.de,1
...,...,...
63,ueiwsp.com,1
64,vizio.com,1
65,vudu.com,1
66,windows.com,3


In [52]:
df_gb2 = df_test.groupby(['domain'])['device_id'].agg(lambda x: x.nunique()).reset_index(name='num_dev')

In [53]:
df_gb3 = df_test.groupby(['domain'])['sni'].agg('count').reset_index(name='count')

In [54]:
df_tmp = pd.merge(df_gb, df_gb2, how='inner', on='domain')

In [55]:
df_merge = pd.merge(df_tmp, df_gb3, how='inner', on='domain')

In [56]:
df_merge.loc[df_merge['num_dev'] > 1].shape

(32, 4)

In [57]:
pd.set_option('display.max_rows', None)
df_merge.sort_values(by='count', ascending=False)[['domain', 'count', 'num_dev', 'num_vendor']][:70]

Unnamed: 0,domain,count,num_dev,num_vendor
25,meethue.com,62,31,1
35,nintendo.com,39,19,1
3,amazon-dss.com,36,36,1
38,p-cdn.us,26,6,2
7,arlo.com,20,13,2
42,qnap.com,16,10,1
22,irobotapi.com,8,4,1
14,ecobee.com,8,4,1
26,movetv.com,7,4,1
53,snapcraft.io,6,6,1
