In [None]:
# TODO/NOTE: this script does not reproduce the original results, because we removed the device IDs.
# TODO/NOTE: the script may also hit runtime or memory (space) errors, because device IDs are used as the key when merging the DataFrames.


import pandas as pd
import numpy as np

# --- Load the cleaned flow statistics ---------------------------------------
file_cleaned_flow = '../../../Endpoint Mapping Data/Cleaned Flow/cleaned_flow_stat.csv'
cleaned_flow = pd.read_csv(file_cleaned_flow)

# super_vendor: the lower-cased vendor_name when present, otherwise the
# gpt_clean_vendor value (taken as-is, not lower-cased).
# Vectorised replacement for a row-wise DataFrame.apply; na_action='ignore'
# leaves missing vendor names as NaN so fillna can substitute the fallback.
cleaned_flow['super_vendor'] = (
    cleaned_flow['vendor_name']
    .map(lambda name: name.lower(), na_action='ignore')
    .fillna(cleaned_flow['gpt_clean_vendor'])
)

# generic_category: the man_generic_category value when present, otherwise
# the gpt_generic_category value.
cleaned_flow['generic_category'] = (
    cleaned_flow['man_generic_category']
    .fillna(cleaned_flow['gpt_generic_category'])
)


# --- Attach party labels -----------------------------------------------------
# read the all-party mapping file
file_all_party_mapping = '../../../Endpoint Mapping Data/Domain Data/all_party_mapping.csv'
all_party_mapping = pd.read_csv(file_all_party_mapping)
# keep only the merge keys and the label; drop repeated rows so the left merge
# below does not multiply flow rows
all_party_mapping = all_party_mapping[['super_vendor', 'domain', 'party_labels']].drop_duplicates()

# merge with the clean flows
clean_flow_party_label = pd.merge(cleaned_flow,
                                  all_party_mapping,
                                  on=['super_vendor', 'domain'],
                                  how='left'
                                  )


# --- Attach user / timezone information --------------------------------------
# read the user information file
file_user_device_timezone = '../../../Inspector Dataset/New data/user_device_timezone.csv'
user_device_timezone = pd.read_csv(file_user_device_timezone)
# De-duplicate the lookup rows, as is done for all_party_mapping above: an exact
# repeat of a device row would otherwise duplicate every flow of that device.
# NOTE(review): a device listed with two *different* timezones/users still
# yields duplicated flows after the merge — confirm whether that can occur.
user_device_timezone = (
    user_device_timezone[['device_id', 'user_key', 'user_country', 'timezone']]
    .drop_duplicates()
)

# merge with the timezone file
clean_flow_party_label = pd.merge(clean_flow_party_label,
                                  user_device_timezone,
                                  on=['device_id'],
                                  how='left'
                                  )

# Sanity output: number of distinct device IDs (NaN counts as one value) and
# the category labels present in the merged data.
print(len(clean_flow_party_label['device_id'].unique()))
unique_categories = clean_flow_party_label['generic_category'].unique()
print(list(unique_categories))

### RQ 1
#### How does the distribution of different types of contacted endpoints vary across various categories of IoT devices?

In [None]:
# Device categories analysed below, in the order they are reported/plotted.
# NOTE(review): 'Home Appliance ' has a trailing space — presumably it mirrors
# the label used in the data; confirm before normalising it.
unique_categories = [
    'Media/TV',
    'Home Automation',
    'Voice Assistant',
    'Surveillance',
    'Game Console',
    'Work Appliance',
    'Home Appliance ',
    'Generic IoT',
]

# Upper bound for a plausible per-flow average upstream rate.
threshold = 1e6

# Average upstream rate of each flow (total_out_byte / flow_duration); rates
# above the threshold — including +inf from a zero duration — become NaN.
clean_flow_party_label['average_out_byte_per_sec'] = (
    clean_flow_party_label['total_out_byte']
    .div(clean_flow_party_label['flow_duration'])
    .pipe(lambda rate: rate.mask(rate > threshold))
)

# original result
# distribution of different types of contacted endpoints vary across various categories of IoT devices

def analyze_rq_1(device_df, categories=None):
    """Summarise contacted endpoints per device category and party type.

    For every category and for each party label (1 = first party,
    2 = support party, 3 = third party) the summary contains:

    * ``<party>``                 - number of distinct domains in the category
    * ``<party>_mean`` / ``_std`` - mean / standard deviation of the number of
      distinct domains per device (devices without such a domain count as 0)
    * ``<party>_up_stream`` / ``_up_stream_std`` - mean / standard deviation of
      ``average_out_byte_per_sec`` over the matching flows

    Parameters
    ----------
    device_df : pandas.DataFrame
        Flow table with the columns ``generic_category``, ``party_labels``,
        ``domain``, ``device_id`` and ``average_out_byte_per_sec``.
    categories : iterable of str, optional
        Categories to report, in output order. Defaults to the notebook-level
        ``unique_categories`` list.

    Returns
    -------
    pandas.DataFrame
        One row per category; the first column is ``Category``.
    """
    if categories is None:
        # Original behaviour: use the category list defined in the notebook.
        categories = unique_categories

    party_labels = (('first_party', 1), ('support_party', 2), ('third_party', 3))
    aggregated_data = []

    for category in categories:
        df = device_df[device_df['generic_category'] == category]
        # Every device of the category, so devices that never contact a given
        # party type still enter the per-device statistics with a count of 0.
        devices = df[['device_id']].drop_duplicates()

        # Collected separately so the output columns keep their grouped order.
        totals, means, stds, up_means, up_stds = {}, {}, {}, {}, {}

        for name, label in party_labels:
            party_df = df[df['party_labels'] == label]

            # nunique() ignores missing domains, matching the per-device
            # counts below (len(unique()) would count NaN as a domain).
            totals[name] = party_df['domain'].nunique()

            per_device = (party_df.groupby('device_id')['domain']
                          .nunique()
                          .rename('n_domains')
                          .reset_index())
            counts = (devices.merge(per_device, on='device_id', how='left')['n_domains']
                      .fillna(0))
            means[f'{name}_mean'] = counts.mean()
            stds[f'{name}_std'] = counts.std()

            rates = party_df['average_out_byte_per_sec']
            up_means[f'{name}_up_stream'] = rates.mean()
            up_stds[f'{name}_up_stream_std'] = rates.std()

        aggregated_data.append({'Category': category,
                                **totals, **means, **stds, **up_means, **up_stds})

    return pd.DataFrame(aggregated_data)

# Build the RQ1 summary table (one row per device category). The function is
# given a copy of the merged flow table rather than the table itself.
rq_1_df = analyze_rq_1(clean_flow_party_label.copy())

In [None]:
import matplotlib
matplotlib.use('QtAgg')  # Choose the appropriate backend

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('default')  # Set the default style


# Figure 3 (a): total number of distinct domains per device category, one bar
# per party type, drawn on a logarithmic y-axis.
df = rq_1_df.copy()
# The tick labels use the first 8 characters of each category name.
df['short_category'] = df['Category'].astype(str).str[0:8]

# Single-panel figure
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 3), sharex=True)
colors = ['blue', 'green', 'red']

# Reshape to long format: one row per (category, party type) pair, which is
# the layout seaborn needs for a grouped bar plot.
df_melted = df[['short_category', 'first_party', 'support_party', 'third_party']].melt(
    id_vars='short_category',
    var_name='party_type',
    value_name='total_domain',
)

sns.barplot(
    data=df_melted,
    x='short_category',
    y='total_domain',
    hue='party_type',
    ax=axes,
    width=0.8,
    log=True,
    palette=colors,
    alpha=0.6,
    edgecolor='black',
    linewidth=1,
)

# axes.set_title('1. Total # of domains')
axes.legend(loc='center left', bbox_to_anchor=(0.575, 0.8), prop={'size': 8.5}, )
axes.set_xlabel('')  # no x-axis label; the tick labels name the categories
axes.set_ylabel('Number of domains')

# Rotate the category names so they do not overlap
axes.set_xticklabels(axes.get_xticklabels(), rotation=65, ha='center')

# Adjust layout
plt.tight_layout()

# todo save and show the plot
# plt.savefig('../../Statistical Data/RQ1/figure-3-plot-a.pdf')
plt.show()


In [None]:
import matplotlib
matplotlib.use('QtAgg')  # Choose the appropriate backend

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('default')  # Set the default style


# Figure 3 (b): mean number of distinct domains per device for each category
# and party type, with the standard deviation as error bars (log y-axis).
df = rq_1_df.copy()
# The tick labels use the first 8 characters of each category name.
df['short_category'] = df['Category'].astype(str).str[0:8]

# Single-panel figure
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
colors = ['blue', 'green', 'red']

# Bar appearance
bar_width = 0.3
capsize = 2
log_scale = True
alpha = 0.5

# One group of three bars per category, centred on 0, 1, 2, ...
# Derived from the table length so the cell keeps working if the number of
# categories changes.
group_centers = np.arange(len(df))
party_series = [('first_party', 'First party'),
                ('support_party', 'Support party'),
                ('third_party', 'Third party')]

# The three bars of a group sit one bar width left of, on, and right of the centre.
for offset, (party, label), color in zip((-1, 0, 1), party_series, colors):
    axes.bar(group_centers + offset * bar_width, df[f'{party}_mean'],
             width=bar_width, label=label, yerr=df[f'{party}_std'],
             capsize=capsize, log=log_scale, color=color, alpha=alpha,
             edgecolor='black', linewidth=1)

# axes.set_title('2. Average # of Domain')
axes.set_ylabel('Average # of domain')

# One tick per category under the middle bar, with rotated labels
axes.set_xticks(group_centers)
axes.set_xticklabels(df['short_category'], rotation=65, ha='center')

# Adjust layout
plt.tight_layout()

# todo save and show the plot
# plt.savefig('../../Statistical Data/RQ1/figure-3-plot-b.pdf')
plt.show()


In [None]:
import matplotlib
matplotlib.use('QtAgg')  # Choose the appropriate backend

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('default')  # Set the default style


# Figure 3 (c): mean of average_out_byte_per_sec for each category and party
# type, with the standard deviation as error bars (log y-axis).
df = rq_1_df.copy()
# The tick labels use the first 8 characters of each category name.
df['short_category'] = df['Category'].astype(str).str[0:8]

# Single-panel figure
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
colors = ['blue', 'green', 'red']

# Bar appearance
bar_width = 0.3
capsize = 2
log_scale = True
alpha = 0.5

# One group of three bars per category, centred on 0, 1, 2, ...
# Derived from the table length so the cell keeps working if the number of
# categories changes.
group_centers = np.arange(len(df))
party_series = [('first_party', 'First party'),
                ('support_party', 'Support party'),
                ('third_party', 'Third party')]

# plot upstream data: the three bars of a group sit one bar width left of,
# on, and right of the centre.
for offset, (party, label), color in zip((-1, 0, 1), party_series, colors):
    axes.bar(group_centers + offset * bar_width, df[f'{party}_up_stream'],
             width=bar_width, label=label, yerr=df[f'{party}_up_stream_std'],
             capsize=capsize, log=log_scale, color=color, alpha=alpha,
             edgecolor='black', linewidth=1)

# axes.set_title('3. Volume of up-stream data')
axes.set_ylabel('Up-stream data volume')

# One tick per category under the middle bar, with rotated labels
axes.set_xticks(group_centers)
axes.set_xticklabels(df['short_category'], rotation=65, ha='center')

# Adjust layout
plt.tight_layout()

# todo save and show the plot
# plt.savefig('../../Statistical Data/RQ1/figure-3-plot-c.pdf')
plt.show()


### RQ 2

### Do the contacted endpoints significantly change based on the user's location?

In [None]:
# Helper that maps a user's numeric timezone value (presumably a UTC offset in hours) to a coarse region code

def timezone_continent(timezone):
    """Map a numeric timezone value to a coarse region code.

    Parameters
    ----------
    timezone : object
        Anything ``float()`` accepts; presumably a UTC offset in hours —
        TODO confirm against the user/timezone file.

    Returns
    -------
    str
        ``'AA'``  for ``-11 <= timezone < -1`` (labelled "America" in the RQ2 plot),
        ``'EA'``  for ``0 <= timezone <= 3``,
        ``'APA'`` for ``timezone >= 4``,
        ``'UN'``  for everything else: values that cannot be converted to a
        number, NaN, and the ranges not covered above (below -11, from -1 up
        to but excluding 0, and strictly between 3 and 4).
    """
    try:
        offset = float(timezone)
    except (TypeError, ValueError, OverflowError):
        # Missing or non-numeric timezone: the region is unknown.
        return 'UN'

    if -11 <= offset < -1:
        return 'AA'
    if 0 <= offset <= 3:
        return 'EA'
    if offset >= 4:
        return 'APA'
    # NaN and the uncovered ranges fall through to "unknown".
    return 'UN'


# Tag every flow with the user's region code.
devices_df = clean_flow_party_label.copy()
# Work on the distinct (device, timezone) pairs so the region is computed once
# per pair instead of once per flow, then join the result back onto the flows.
timezone = clean_flow_party_label[['device_id', 'timezone']].drop_duplicates()
timezone['user_loc'] = timezone['timezone'].map(timezone_continent)
devices_df = pd.merge(devices_df, timezone, on=['device_id', 'timezone'], how='inner')



result_dictionary = {}
# Regions reported for RQ2; flows tagged 'UN' are left out.
locations = ['AA', 'EA', 'APA']
# Output column prefix and the party_labels value it stands for.
party_labels = (('first_party', 1), ('support_party', 2), ('third_party', 3))

aggregated_data = []

for category in unique_categories:
    df = devices_df[(devices_df['generic_category']==category)]

    for loc in locations:
        loc_df = df[df['user_loc'] == loc]
        # Every device of this category and region, so devices without a
        # domain of a given party type still count with 0 in the statistics.
        devices = loc_df[['device_id']].drop_duplicates()

        # Collected separately so the mean columns precede the std columns.
        means, stds = {}, {}
        for name, label in party_labels:
            per_device = (loc_df[loc_df['party_labels'] == label]
                          .groupby('device_id')['domain']
                          .nunique()
                          .rename('n_domains')
                          .reset_index())
            counts = (devices.merge(per_device, on='device_id', how='left')['n_domains']
                      .fillna(0))
            means[f'{name}_mean'] = counts.mean()
            stds[f'{name}_std'] = counts.std()

        aggregated_data.append({'Category': category, 'loc': loc, **means, **stds})


# One row per (category, region) pair.
rq_2_df = pd.DataFrame(aggregated_data)

In [None]:
# Table 6 matplotlib without SD
# Mean number of contacted domains per device category, split by party type
# (colour) and user region (opacity); no error bars are drawn.

# matplotlib itself is imported here so the cell does not rely on an earlier
# plotting cell having imported it.
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
# ['GTK3Agg', 'GTK3Cairo', 'GTK4Agg', 'GTK4Cairo', 'MacOSX', 'nbAgg', 'QtAgg', 'QtCairo', 'Qt5Agg', 'Qt5Cairo', 'TkAgg', 'TkCairo', 'WebAgg', 'WX', 'WXAgg', 'WXCairo', 'agg', 'cairo', 'pdf', 'pgf', 'ps', 'svg', 'template']
matplotlib.use('QtAgg')  # Choose the appropriate backend


df = rq_2_df.copy()

# Set up the plot
fig, ax = plt.subplots(figsize=(8, 3))

colors = ['blue', 'green', 'red']

# Bar appearance
bar_width = 0.3
log_scale = False
# Distance between the first bars of two neighbouring category groups
group_spacing = 3

# (column prefix, legend text, colour) for each party type
parties = [('first_party', 'First-party', colors[0]),
           ('support_party', 'Support-party', colors[1]),
           ('third_party', 'Third-party', colors[2])]
# (region code, legend text, bar opacity) for each user region
regions = [('AA', 'America', 0.8),
           ('EA', 'EA', 0.6),
           ('APA', 'APA', 0.5)]

# rq_2_df holds one row per (category, region), so the rows of a single
# region list every category once, in plotting order.
categories = df.loc[df['loc'] == 'AA', 'Category']
# Position of the first bar of every category group; derived from the data so
# the cell keeps working if the number of categories changes.
group_starts = np.arange(len(categories)) * group_spacing - bar_width

# Nine bars per group: party types side by side, regions within each party.
bar_index = 0
for party, party_name, color in parties:
    for region, region_name, alpha in regions:
        ax.bar(group_starts + bar_index * bar_width,
               df.loc[df['loc'] == region, f'{party}_mean'],
               width=bar_width, label=f'{party_name}: {region_name}',
               log=log_scale, color=color, alpha=alpha,
               edgecolor='black', linewidth=0.5)
        bar_index += 1


# ax.set_title('Average number of domains')
ax.set_ylabel('Average #of Domains')

# One tick per category, under the middle (fifth) bar of its group
ax.set_xticks(group_starts + 4 * bar_width)
ax.set_xticklabels(categories, rotation=15, ha='center', fontsize=9)

# The legend entries come from the label given to each bar series above
ax.legend(loc='center left', bbox_to_anchor=(.615, .68), prop={'size': 7.5}, )

# Adjust layout
plt.tight_layout()

# todo save and show the plot
# plt.savefig('../../Statistical Data/RQ2/figure-4-plot.pdf')
plt.show()