## GREASE usage

### Import libraries

In [1]:
import os
import pandas as pd 
import numpy as np
from lxml import etree 
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 200
import os
import glob
import time
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import matplotlib.patches as mpatches
import shutil
from IPython.display import clear_output
pd.set_option('display.max_rows', 500)
from manuf import manuf
from user_agents import parse
import dataframe_image as dfi
import tldextract

### Read in the dataset

In [2]:
# Device metadata table; the next cell takes device_id, device_name and device_vendor from it.
dev_name = pd.read_csv(filepath_or_buffer="../datasets/2014dev_with_dist2lib.csv")

In [3]:
# Keep only the identifying columns; this is the lookup table joined on device_id below.
names = dev_name.loc[:, ['device_id', 'device_name', 'device_vendor']]

### Read in the original IoTInspector dataset

#### Access to `tls_handshake1.csv.gz` can only be granted with IRB approval

#### The basic idea is to measure cipher-suite GREASE and extension GREASE usage by inspecting the original records

In [None]:
# Load only the first 10 rows of the handshake dump (nrows=10). `data` is not
# used again in the visible cells -- presumably a quick schema preview.
# on_bad_lines='skip' replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
data = pd.read_csv('../../tls_handshake1.csv.gz', nrows=10, compression='gzip', on_bad_lines='skip')

In [9]:
# Rows per chunk when streaming the gzipped handshake CSV.
chunk_size = 200_000

In [11]:
# Build the distinct (device_id, cipher_suite_uses_grease) pairs from the full
# handshake dump, streaming it `chunk_size` rows at a time.
#
# Per-chunk results are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every chunk. The final frame is the same set of first-seen rows.
cipher_grease_parts = []

cipher_chunks = pd.read_csv(
    '../../tls_handshake1.csv.gz',
    chunksize=chunk_size,
    compression='gzip',
    on_bad_lines='skip',  # replaces error_bad_lines=False (removed in pandas 2.0; needs pandas >= 1.3)
    usecols=['device_id', 'cipher_suite_uses_grease'],
)

for batch_no, chunk in enumerate(cipher_chunks, start=1):
    print ("processing ", batch_no, " ...")

    # NOTE(review): this keeps every row whose flag is non-null; if the column
    # can hold False, those devices are kept too -- confirm the column is only
    # populated when GREASE is actually present.
    # .copy() so the column assignment below does not write into a slice of the
    # original chunk (SettingWithCopyWarning).
    chunk = chunk.loc[chunk['cipher_suite_uses_grease'].notna()].copy()
    # Normalise ids by stringifying and removing spaces.
    chunk['device_id'] = chunk['device_id'].apply(lambda x: str(x).replace(' ', ''))
    cipher_grease_parts.append(chunk.drop_duplicates())

    clear_output(wait=True)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# file yields no chunks.
cipher_grease = (
    pd.concat(cipher_grease_parts).drop_duplicates()
    if cipher_grease_parts
    else pd.DataFrame()
)

processing  128  ...


In [12]:
# (rows, columns) of the de-duplicated device_id / cipher_suite_uses_grease frame.
cipher_grease.shape

(6267, 2)

In [13]:
# value_counts() has one entry per distinct device_id, so the shape is the
# number of distinct devices in cipher_grease.
cipher_grease['device_id'].value_counts().shape

(6267,)

In [17]:
# Inner join on device_id: only devices that also appear in `names` are kept,
# and each row gains that table's name and vendor columns.
cipher_grease_merge = cipher_grease.merge(names, on='device_id', how='inner')

In [18]:
# Number of distinct device_id values that survived the inner join with `names`.
cipher_grease_merge['device_id'].value_counts().shape

(501,)

In [21]:
# Rank vendors by the number of distinct device_ids they have in cipher_grease_merge.
(
    cipher_grease_merge
    .groupby('device_vendor')['device_id']
    .nunique()
    .reset_index(name='num_unique_dev')
    .sort_values(by='num_unique_dev', ascending=False)
)

Unnamed: 0,device_vendor,num_unique_dev
3,google,379
0,amazon,22
14,samsung,20
10,nvidia,17
15,sony,16
22,xiaomi,7
4,hp,6
19,vizio,6
6,insignia,5
11,onkyo,5


In [22]:
# Build the distinct (device_id, extension_uses_grease) pairs from the full
# handshake dump, streaming it `chunk_size` rows at a time.
#
# Per-chunk results are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every chunk. The final frame is the same set of first-seen rows.
exten_grease_parts = []

exten_chunks = pd.read_csv(
    '../../tls_handshake1.csv.gz',
    chunksize=chunk_size,
    compression='gzip',
    on_bad_lines='skip',  # replaces error_bad_lines=False (removed in pandas 2.0; needs pandas >= 1.3)
    usecols=['device_id', 'extension_uses_grease'],
)

for batch_no, chunk in enumerate(exten_chunks, start=1):
    print ("processing ", batch_no, " ...")

    # NOTE(review): this keeps every row whose flag is non-null; if the column
    # can hold False, those devices are kept too -- confirm the column is only
    # populated when GREASE is actually present.
    # .copy() so the column assignment below does not write into a slice of the
    # original chunk (SettingWithCopyWarning).
    chunk = chunk.loc[chunk['extension_uses_grease'].notna()].copy()
    # Normalise ids by stringifying and removing spaces.
    chunk['device_id'] = chunk['device_id'].apply(lambda x: str(x).replace(' ', ''))
    exten_grease_parts.append(chunk.drop_duplicates())

    clear_output(wait=True)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# file yields no chunks.
exten_grease = (
    pd.concat(exten_grease_parts).drop_duplicates()
    if exten_grease_parts
    else pd.DataFrame()
)

processing  128  ...


In [23]:
# (rows, columns) of the de-duplicated device_id / extension_uses_grease frame.
exten_grease.shape

(6306, 2)

In [24]:
# value_counts() has one entry per distinct device_id, so the shape is the
# number of distinct devices in exten_grease.
exten_grease['device_id'].value_counts().shape

(6306,)

In [25]:
# Inner join on device_id: only devices that also appear in `names` are kept,
# and each row gains that table's name and vendor columns.
exten_grease_merge = exten_grease.merge(names, on='device_id', how='inner')

In [26]:
# Number of distinct device_id values that survived the inner join with `names`.
exten_grease_merge['device_id'].value_counts().shape

(503,)

In [28]:
# Rank vendors by the number of distinct device_ids they have in exten_grease_merge.
(
    exten_grease_merge
    .groupby('device_vendor')['device_id']
    .nunique()
    .reset_index(name='num_unique_dev')
    .sort_values(by='num_unique_dev', ascending=False)
)

Unnamed: 0,device_vendor,num_unique_dev
3,google,380
0,amazon,23
14,samsung,20
10,nvidia,17
15,sony,16
22,xiaomi,7
4,hp,6
19,vizio,6
6,insignia,5
11,onkyo,5


In [29]:
# device_ids present in exten_grease_merge but absent from cipher_grease_merge.
set(exten_grease_merge['device_id'].tolist()).difference(cipher_grease_merge['device_id'].tolist())

{'s3f2ffcb5e9', 'se207ea9120'}

In [31]:
# Look up the name and vendor recorded for one of the device_ids found by the set difference.
names[names['device_id'] == 'se207ea9120']

Unnamed: 0,device_id,device_name,device_vendor
5195,se207ea9120,chromecast,google


### Get number of users

In [33]:
# Rows per chunk for the user_key pass over the handshake CSV.
chunk_size = 200_000

In [34]:
# Build the distinct (device_id, user_key) pairs from the full handshake dump,
# streaming it `chunk_size` rows at a time.
#
# Per-chunk results are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every chunk. The final frame is the same set of first-seen rows.
user_key_parts = []

user_key_chunks = pd.read_csv(
    '../../tls_handshake1.csv.gz',
    chunksize=chunk_size,
    compression='gzip',
    on_bad_lines='skip',  # replaces error_bad_lines=False (removed in pandas 2.0; needs pandas >= 1.3)
    usecols=['device_id', 'user_key'],
)

for batch_no, chunk in enumerate(user_key_chunks, start=1):
    print ("processing ", batch_no, " ...")

    # Drop rows without a user_key. .copy() so the column assignment below does
    # not write into a slice of the original chunk (SettingWithCopyWarning).
    chunk = chunk.loc[chunk['user_key'].notna()].copy()
    # Normalise ids by stringifying and removing spaces.
    chunk['device_id'] = chunk['device_id'].apply(lambda x: str(x).replace(' ', ''))
    user_key_parts.append(chunk.drop_duplicates())

    clear_output(wait=True)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# file yields no chunks.
user_keys = (
    pd.concat(user_key_parts).drop_duplicates()
    if user_key_parts
    else pd.DataFrame()
)

processing  128  ...


In [35]:
# NOTE(review): this rebinds `names`, which earlier held the device_id /
# device_name / device_vendor columns of dev_name. Re-running the merge cells
# after this one would join against this parquet table instead -- consider a
# distinct variable name.
names = pd.read_parquet(path="../datasets/sni_used_to_generate_pcaps.parquet")

In [37]:
# (rows, columns) of the de-duplicated device_id / user_key pairs.
user_keys.shape

(22154, 2)

In [38]:
# Distinct device_id values from the current `names` table, as a list (order is arbitrary).
ids = [*set(names['device_id'].tolist())]

In [39]:
# Restrict the (device_id, user_key) pairs to devices listed in `ids`.
user_keys_2014 = user_keys[user_keys['device_id'].isin(ids)]

In [40]:
# (rows, columns) of the pairs whose device_id is in `ids`.
user_keys_2014.shape

(2028, 2)

In [41]:
# value_counts() has one entry per distinct user_key, so the shape is the
# number of distinct users behind the selected devices.
user_keys_2014['user_key'].value_counts().shape

(721,)