In [None]:
# todo NOTE: this script does not reproduce the original results since we removed device IDs
# todo NOTE: this script will also hit runtime/space errors, since device IDs are used as the key when merging dataframes

import pandas as pd
import numpy as np

file_cleaned_flow = '../../../Endpoint Mapping Data/Cleaned Flow/cleaned_flow_stat.csv'
cleaned_flow = pd.read_csv(file_cleaned_flow)

# Vendor used as the mapping key: the lower-cased `vendor_name` when it is
# present, otherwise the `gpt_clean_vendor` value (taken as-is, not lower-cased).
# Vectorised replacement for a row-wise DataFrame.apply; `astype(object)` keeps
# the `.str` accessor usable even if the column was parsed as all-NaN floats.
cleaned_flow['super_vendor'] = (
    cleaned_flow['vendor_name'].astype(object).str.lower()
    .where(cleaned_flow['vendor_name'].notna(), cleaned_flow['gpt_clean_vendor'])
)


# Device category: prefer `man_generic_category`, fall back to
# `gpt_generic_category` where the former is missing.
cleaned_flow['generic_category'] = cleaned_flow['man_generic_category'].fillna(
    cleaned_flow['gpt_generic_category'])


# Read the party mapping file
file_all_party_mapping = '../../../Endpoint Mapping Data/Domain Data/all_party_mapping.csv'
all_party_mapping = pd.read_csv(file_all_party_mapping)
# Keep only the merge keys and the label, and drop repeated rows
all_party_mapping = all_party_mapping[['super_vendor', 'domain', 'party_labels']].drop_duplicates()

# Merge with the cleaned flows (left join keeps every flow, labelled or not)
# NOTE(review): if one (super_vendor, domain) pair carries several party_labels,
# this join still multiplies flow rows — consider validate='many_to_one'.
clean_flow_party_label = pd.merge(cleaned_flow,
                                  all_party_mapping,
                                  on=['super_vendor', 'domain'],
                                  how='left'
                                  )


# Read the user information file
file_user_device_timezone = '../../../Inspector Dataset/New data/user_device_timezone.csv'
user_device_timezone = pd.read_csv(file_user_device_timezone)
# Drop exact duplicate rows so that a device listed more than once with the
# same attributes does not duplicate every one of its flows in the join below.
user_device_timezone = user_device_timezone[['device_id', 'user_key', 'user_country', 'timezone']].drop_duplicates()

# Merge with the timezone file
clean_flow_party_label = pd.merge(clean_flow_party_label,
                                  user_device_timezone,
                                  on=['device_id'],
                                  how='left'
                                  )

# Device categories iterated over by the analysis cells below.
# NOTE(review): 'Home Appliance ' carries a trailing space — presumably it
# mirrors the raw category value; confirm before normalising.
unique_categories = [
    'Media/TV',
    'Home Automation',
    'Voice Assistant',
    'Surveillance',
    'Game Console',
    'Work Appliance',
    'Home Appliance ',
    'Generic IoT',
]
# 'Vehicle']


# Average upstream rate of each flow: total outgoing bytes divided by the flow duration.
threshold = 1e6
upstream_rate = clean_flow_party_label['total_out_byte'] / clean_flow_party_label['flow_duration']
# Rates above the threshold are replaced with NaN; this also covers the +inf
# produced by a zero duration with a positive byte count.
clean_flow_party_label['average_out_byte_per_sec'] = upstream_rate.mask(upstream_rate > threshold)

In [None]:
#original result
# distribution of different types of contacted endpoints vary across various categories of IoT devices

def analyze_rq_1(device_df, device_categories=None):
    """Summarise contacted endpoints per device category and party type.

    Parameters
    ----------
    device_df : pandas.DataFrame
        Flow table with the columns 'generic_category', 'party_labels',
        'domain', 'device_id' and 'average_out_byte_per_sec'. It is only
        read, never modified.
    device_categories : iterable of str, optional
        Categories to report, in output order. When omitted, the
        notebook-level ``unique_categories`` list is used (the original
        behaviour of this function).

    Returns
    -------
    pandas.DataFrame
        One row per category. For each party type (labels 1/2/3 reported as
        first_party / support_party / third_party) it holds:
        the number of distinct domains, the mean and standard deviation of
        the per-device distinct-domain count (devices without any domain of
        that type count as 0), and the mean and standard deviation of
        'average_out_byte_per_sec' over the matching flows.
    """
    if device_categories is None:
        # Fall back to the module-level list so existing calls keep working.
        device_categories = unique_categories

    party_labels = (('first_party', 1), ('support_party', 2), ('third_party', 3))
    party_names = [name for name, _ in party_labels]
    aggregated_data = []

    for category in device_categories:
        category_flows = device_df[device_df['generic_category'] == category]

        # Start from every device of the category so that devices which never
        # contact a given party type still contribute a zero to the mean/std.
        per_device = category_flows[['device_id']].drop_duplicates()

        domain_totals = {}
        upstream_mean = {}
        upstream_std = {}
        for name, label in party_labels:
            party_flows = category_flows[category_flows['party_labels'] == label]

            # Distinct domains of this party type within the whole category
            domain_totals[name] = len(party_flows['domain'].unique())

            # Distinct domains of this party type per device
            domains_per_device = (party_flows.groupby('device_id')['domain']
                                  .nunique()
                                  .reset_index()
                                  .rename(columns={'domain': name}))
            per_device = per_device.merge(domains_per_device, on='device_id', how='left')

            upstream_mean[name] = party_flows['average_out_byte_per_sec'].mean()
            upstream_std[name] = party_flows['average_out_byte_per_sec'].std()

        per_device = per_device.fillna(0)
        device_mean = per_device[party_names].mean()
        device_std = per_device[party_names].std()

        # Key order defines the column order of the returned frame.
        data = {'Category': category}
        for name in party_names:
            data[name] = domain_totals[name]
        for name in party_names:
            data[name + '_mean'] = device_mean[name]
        for name in party_names:
            data[name + '_std'] = device_std[name]
        for name in party_names:
            data[name + '_up_stream'] = upstream_mean[name]
        for name in party_names:
            data[name + '_up_stream_std'] = upstream_std[name]

        aggregated_data.append(data)
    return pd.DataFrame(aggregated_data)

# analyze_rq_1 only reads the frame, so the full flow table is passed directly
# instead of a deep copy (avoids doubling memory for the call).
rq_1_df = analyze_rq_1(clean_flow_party_label)

In [None]:
def sample_devices(devices_data, seed, sample_size, device_categories):
    """Return all flows of a random per-category sample of devices.

    Parameters
    ----------
    devices_data : pandas.DataFrame
        Flow table containing at least 'device_id' and 'generic_category'.
        It is only read, never modified.
    seed : int
        Seed applied to NumPy's global random state before sampling, so the
        same seed yields the same sample.
    sample_size : int
        Number of devices drawn (without replacement) from each category.
        pandas raises ValueError when a category has fewer devices than this.
    device_categories : list of str
        Categories to sample from.

    Returns
    -------
    pandas.DataFrame
        Every row of ``devices_data`` whose 'device_id' was sampled.
    """
    np.random.seed(seed)

    # One row per (device, category) pair within the requested categories
    candidates = devices_data.loc[
        devices_data['generic_category'].isin(device_categories),
        ['device_id', 'generic_category'],
    ].drop_duplicates()

    sampled = candidates.groupby('generic_category').apply(
        lambda group: group.sample(n=sample_size)).reset_index(drop=True)

    # A device listed under two sampled categories would otherwise appear
    # twice here and have all of its flows duplicated by the join.
    sampled_ids = sampled[['device_id']].drop_duplicates()

    # Join against the frame that was passed in, not a notebook-level global.
    return pd.merge(sampled_ids, devices_data,
                    on=['device_id'],
                    how='inner'
                    )

# sampled result
# distribution of different types of contacted endpoints vary across various categories of IoT devices
# On sampled data

def sample_analysis(device_flow_data, device_categories):
    """Build the per-category endpoint summary for a (sampled) flow table.

    For every category in ``device_categories`` and every party type
    (party_labels 1/2/3, reported as first_party / support_party /
    third_party) the result holds the number of distinct domains, the mean
    and standard deviation of the per-device distinct-domain count (devices
    with no domain of that type count as 0), and the mean and standard
    deviation of 'average_out_byte_per_sec'.

    Returns a DataFrame with one row per category, in the given order.
    """
    parties = {'first_party': 1, 'support_party': 2, 'third_party': 3}
    party_names = list(parties)
    records = []

    for category in device_categories:
        in_category = device_flow_data[device_flow_data['generic_category'] == category]

        # Flows of the category split by party type
        by_party = {
            name: in_category[in_category['party_labels'] == code]
            for name, code in parties.items()
        }

        # Per-device distinct-domain counts, one column per party type;
        # devices missing from a party's counts end up with 0.
        device_counts = in_category[['device_id']].drop_duplicates()
        for name, subset in by_party.items():
            counts = subset.groupby('device_id')['domain'].nunique().reset_index()
            device_counts = device_counts.merge(counts.rename(columns={'domain': name}),
                                                how='left')
        device_counts = device_counts.fillna(0)
        count_mean = device_counts[party_names].mean()
        count_std = device_counts[party_names].std()

        # Insertion order below fixes the column order of the output frame.
        record = {'Category': category}
        record.update((name, len(by_party[name]['domain'].unique()))
                      for name in party_names)
        record.update((name + '_mean', count_mean[name]) for name in party_names)
        record.update((name + '_std', count_std[name]) for name in party_names)
        record.update((name + '_up_stream',
                       by_party[name]['average_out_byte_per_sec'].mean())
                      for name in party_names)
        record.update((name + '_up_stream_std',
                       by_party[name]['average_out_byte_per_sec'].std())
                      for name in party_names)

        records.append(record)

    return pd.DataFrame(records)

In [None]:
# Categories compared in the sampled-vs-all figure
sampled_device_categories = ['Media/TV', 'Home Automation', 'Voice Assistant']

# Three samples of 3000 devices per category, one per seed.
# sample_devices only reads the flow table, so it is passed without a deep copy.
sampled_devices_df_40, sampled_devices_df_42, sampled_devices_df_44 = [
    sample_devices(devices_data=clean_flow_party_label,
                   seed=seed,
                   sample_size=3000,
                   device_categories=sampled_device_categories)
    for seed in (40, 42, 44)
]

# Per-category summary of each sample
rq_1_df_sampled_40, rq_1_df_sampled_42, rq_1_df_sampled_44 = [
    sample_analysis(device_flow_data=sampled_flows,
                    device_categories=sampled_device_categories)
    for sampled_flows in (sampled_devices_df_40, sampled_devices_df_42, sampled_devices_df_44)
]


# Average every summary column over the three samples, keeping the category order
rq_1_df_sampled_mean = (
    pd.concat([rq_1_df_sampled_40, rq_1_df_sampled_42, rq_1_df_sampled_44])
    .groupby('Category')
    .mean()
    .reindex(sampled_device_categories)
    .reset_index()
)

# Matching rows of the full-data summary, for the side-by-side comparison
rq_1_df_all = rq_1_df[rq_1_df['Category'].isin(sampled_device_categories)]

In [None]:
import matplotlib
matplotlib.use('QtAgg')  # Choose the appropriate backend

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('default')  # Set the default style

# Figure layout: one row per data set (top: mean over the sampled devices,
# bottom: all devices), one column per metric (total domains, per-device
# average domains, upstream volume).
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(9, 4), sharex=True)
colors = ['blue', 'green', 'red']

# Bar styling shared by the hand-drawn panels
bar_width = 0.3
capsize = 2
log_scale = True
alpha = 0.5

# Summary-column prefixes and the legend text of each party type (same order)
party_columns = ['first_party', 'support_party', 'third_party']
legend_names = ['First-party', 'Support-party', 'Third-party']

figure_rows = [
    (axes[0], rq_1_df_sampled_mean, 'Sampled devices'),
    (axes[1], rq_1_df_all, 'All devices'),
]

for (totals_ax, mean_ax, upstream_ax), stats, row_label in figure_rows:
    df = stats.copy()
    df['short_category'] = df['Category'].astype(str).str[0:8]

    # Column 1: total number of distinct domains (long format for seaborn)
    df_melted = pd.melt(df[['short_category'] + party_columns],
                        id_vars='short_category', var_name='party_type', value_name='total_domain')
    sns.barplot(x='short_category', y='total_domain', data=df_melted, hue='party_type', ax=totals_ax,
                width=0.8, log=True, palette=colors, alpha=0.6, edgecolor='black', linewidth=0.5)
    totals_ax.legend().set_visible(False)  # one legend for the whole figure is drawn on the top-right panel
    totals_ax.set_xlabel('')               # hide the x-axis label
    totals_ax.set_ylabel(row_label)        # the y-axis label names the data set shown in this row

    # Columns 2 and 3: three bars per category, centred on the category tick.
    # Offsets are derived from the number of rows instead of a hard-coded range.
    group_start = np.arange(len(df)) - bar_width
    for series_index, (column, legend_name) in enumerate(zip(party_columns, legend_names)):
        positions = group_start + series_index * bar_width
        bar_style = dict(width=bar_width, label=legend_name, log=log_scale,
                         color=colors[series_index], alpha=alpha,
                         edgecolor='black', linewidth=0.5)

        # Mean number of domains per device, with the standard deviation as error bar
        mean_ax.bar(positions, df[column + '_mean'],
                    yerr=df[column + '_std'], capsize=capsize, **bar_style)

        # Mean upstream rate
        upstream_ax.bar(positions, df[column + '_up_stream'], **bar_style)

# Titles and the legend are only drawn on the top row
axes[0][0].set_title('1. Total # of domains')
axes[0][1].set_title('2. Average # of Domain')
axes[0][2].set_title('3. Volume of up-stream data')
axes[0][2].legend(legend_names, loc='center left', bbox_to_anchor=(.58, .81), prop={'size': 6}, )

# Tilt the category names on the bottom row
for ax in axes[1]:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='center')

plt.tight_layout()

# todo save and show the plot
# plt.savefig('../../Statistical Data/Appendix/sample-plot.pdf')
plt.show()


In [None]:
# Sample all device categories

# Three samples of 50 devices from every category, one per seed.
# sample_devices only reads the flow table, so it is passed without a deep copy.
sampled_devices_all_40, sampled_devices_all_42, sampled_devices_all_44 = [
    sample_devices(devices_data=clean_flow_party_label,
                   seed=seed,
                   sample_size=50,
                   device_categories=unique_categories)
    for seed in (40, 42, 44)
]


# Per-category summary of each sample
rq_1_df_sampled_all_40, rq_1_df_sampled_all_42, rq_1_df_sampled_all_44 = [
    sample_analysis(device_flow_data=sampled_flows,
                    device_categories=unique_categories)
    for sampled_flows in (sampled_devices_all_40, sampled_devices_all_42, sampled_devices_all_44)
]

# Average every summary column over the three samples, keeping the category order
rq_1_df_sampled_all_mean = (
    pd.concat([rq_1_df_sampled_all_40, rq_1_df_sampled_all_42, rq_1_df_sampled_all_44])
    .groupby('Category')
    .mean()
    .reindex(unique_categories)
    .reset_index()
)

In [None]:
import matplotlib
matplotlib.use('QtAgg')  # Choose the appropriate backend

import seaborn as sns
import matplotlib.pyplot as plt

# Figure layout: one row per data set (top: mean over the sampled devices,
# bottom: all devices), one column per metric (total domains, per-device
# average domains, upstream volume).
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(9, 4), sharex=True)
colors = ['blue', 'green', 'red']

# Bar styling shared by the hand-drawn panels
bar_width = 0.3
capsize = 2
log_scale = True
alpha = 0.5

# Summary-column prefixes and the legend text of each party type (same order)
party_columns = ['first_party', 'support_party', 'third_party']
legend_names = ['First-party', 'Support-party', 'Third-party']

figure_rows = [
    (axes[0], rq_1_df_sampled_all_mean, 'Sampled devices'),
    (axes[1], rq_1_df, 'All devices'),
]

for (totals_ax, mean_ax, upstream_ax), stats, row_label in figure_rows:
    df = stats.copy()
    df['short_category'] = df['Category'].astype(str).str[0:8]

    # Column 1: total number of distinct domains (long format for seaborn)
    df_melted = pd.melt(df[['short_category'] + party_columns],
                        id_vars='short_category', var_name='party_type', value_name='total_domain')
    sns.barplot(x='short_category', y='total_domain', data=df_melted, hue='party_type', ax=totals_ax,
                width=0.8, log=True, palette=colors, alpha=0.6, edgecolor='black', linewidth=0.5)
    totals_ax.legend().set_visible(False)  # one legend for the whole figure is drawn on the top-right panel
    totals_ax.set_xlabel('')               # hide the x-axis label
    totals_ax.set_ylabel(row_label)        # the y-axis label names the data set shown in this row

    # Columns 2 and 3: three bars per category, centred on the category tick.
    # Offsets are derived from the number of rows instead of a hard-coded range.
    group_start = np.arange(len(df)) - bar_width
    for series_index, (column, legend_name) in enumerate(zip(party_columns, legend_names)):
        positions = group_start + series_index * bar_width
        bar_style = dict(width=bar_width, label=legend_name, log=log_scale,
                         color=colors[series_index], alpha=alpha,
                         edgecolor='black', linewidth=0.5)

        # Mean number of domains per device, with the standard deviation as error bar
        mean_ax.bar(positions, df[column + '_mean'],
                    yerr=df[column + '_std'], capsize=capsize, **bar_style)

        # Mean upstream rate
        upstream_ax.bar(positions, df[column + '_up_stream'], **bar_style)

# Titles and the legend are only drawn on the top row
axes[0][0].set_title('1. Total # of domains')
axes[0][1].set_title('2. Average # of Domain')
axes[0][2].set_title('3. Volume of up-stream data')
axes[0][2].legend(legend_names, loc='center left', bbox_to_anchor=(.58, .81), prop={'size': 6}, )

# Rotate the category names vertically on the bottom row
for ax in axes[1]:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='center')

plt.tight_layout()

# todo save and show the plot
plt.savefig('../../Statistical Data/Appendix/sampled-plot-all-cat.pdf')
plt.show()