In [None]:
%%writefile requirements.txt

pandas==2.2.2
matplotlib==3.8.3
seaborn==0.13.2
numpy==1.26.4
scipy==1.13.1
os

%pip install -r requirements.txt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
import os

# Turn on the pandas "future.no_silent_downcasting" option for the whole notebook.
pd.set_option("future.no_silent_downcasting", True)

In [None]:
def load_csv(file_path):
    """Read the CSV file at ``file_path`` and return its contents as a DataFrame."""
    return pd.read_csv(file_path)

def normalize_log_times(data):
    """Add a 'Normalized Time' column holding each row's offset from its group's first log time.

    Rows are grouped by 'Batching Period', 'Subscription Count' and
    'Subscribers Count'. Within each group, the smallest 'Log Time (UTC)' is
    subtracted from every row's 'Log Time (UTC)'.

    The new column is written onto ``data`` itself, so the caller's frame is
    modified, and that same object is returned.
    """
    group_columns = ['Batching Period', 'Subscription Count', 'Subscribers Count']

    def offset_from_earliest(log_times):
        # Elapsed time since the earliest entry of this group.
        return log_times - log_times.min()

    log_times_by_group = data.groupby(group_columns)['Log Time (UTC)']
    data['Normalized Time'] = log_times_by_group.transform(offset_from_earliest)
    return data

def preprocess_data(df, subscription_count, batching_period, subscribers_count):
    """Parse the timestamp columns of ``df`` and tag every row with its scenario parameters.

    'Publish Time (UTC)' and 'Log Time (UTC)' are converted with
    ``pd.to_datetime``. The three scenario values are then stored in the
    'Subscription Count', 'Batching Period' and 'Subscribers Count' columns.

    ``df`` is modified in place and the same object is returned.
    """
    for time_column in ('Publish Time (UTC)', 'Log Time (UTC)'):
        df[time_column] = pd.to_datetime(df[time_column])

    scenario_labels = {
        'Subscription Count': subscription_count,
        'Batching Period': batching_period,
        'Subscribers Count': subscribers_count,
    }
    for label, value in scenario_labels.items():
        df[label] = value

    return df

def load_and_preprocess_all(files):
    """Load every result file and stack the preprocessed frames into one DataFrame.

    ``files`` is an iterable of
    ``(file_path, subscription_count, batching_period, subscribers_count)``
    tuples. Each file is read with ``load_csv`` and passed through
    ``preprocess_data`` with its scenario values. The frames are concatenated
    in input order with a fresh integer index.
    """
    frames = [
        preprocess_data(load_csv(path), subscriptions, period, subscribers)
        for path, subscriptions, period, subscribers in files
    ]
    return pd.concat(frames, ignore_index=True)

def _plot_metric_by_batching_period(data, metric):
    """Draw, save and show one box plot of ``metric`` grouped by 'Batching Period'."""
    # 'Travel Elapsed (ms)' -> 'Travel Elapsed'; used for the title and file name.
    metric_label = metric.split(' (')[0]
    title = f'Impact of Batching Period on {metric_label} Time'

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.boxplot(data=data, x='Batching Period', y=metric, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Batching Period (ms)')
    ax.set_ylabel(metric)
    fig.savefig(f'{title}.png')
    plt.show()
    # Release the figure explicitly so repeated runs do not accumulate open figures.
    plt.close(fig)


def plot_impact_of_batching(data, metrics=('Travel Elapsed (ms)', 'E2E Elapsed (ms)', 'MDS Elapsed (ms)')):
    """Plot how the batching period affects each elapsed-time metric.

    For every column name in ``metrics`` a box plot of that column against
    'Batching Period' is drawn, saved to
    'Impact of Batching Period on <metric without unit> Time.png' in the
    current working directory, and shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Batching Period' and every column listed in ``metrics``.
    metrics : iterable of str, optional
        Columns to plot. Defaults to the travel, end-to-end and MDS
        elapsed-time columns, in that order.
    """
    for metric in metrics:
        _plot_metric_by_batching_period(data, metric)

def _plot_e2e_distribution(samples, suffix, bin_edges):
    """Draw, save and show the histogram of ``samples`` with a fitted normal curve."""
    fig, ax = plt.subplots(figsize=(12, 6))
    samples.plot(kind='hist', bins=bin_edges, alpha=.4, legend=True, density=True, ax=ax)

    # Fit a normal distribution and draw its density across the visible x range.
    mu, std = norm.fit(samples)
    xmin, xmax = ax.get_xlim()
    x = np.linspace(xmin, xmax, 100)
    ax.plot(x, norm.pdf(x, mu, std), 'k', linewidth=2)

    ax.set_title(f"Fit Values: {mu:.2f} and {std:.2f}")
    fig.suptitle(f'Normal Distribution - {suffix}')
    ax.set_xlabel('E2E Elapsed (ms)')
    # NOTE(review): with density=True the y axis is a probability density, not a
    # percentage; the label text is kept as-is to leave the figure output unchanged.
    ax.set_ylabel('Density (%)')
    fig.savefig(f'Normal Distribution - {suffix}.png')

    plt.show()
    # One figure is created per scenario; close each so they do not pile up in memory.
    plt.close(fig)


def plot_distribution_per_batching_period(data, max_scale=1000, bin_width=100):
    """Plot the 'E2E Elapsed (ms)' distribution for every scenario present in ``data``.

    A scenario is one combination of 'Batching Period', 'Subscribers Count'
    and 'Subscription Count'. For each combination that has rows, a density
    histogram with a fitted normal curve is drawn, saved to
    'Normal Distribution - <scenario>.png' in the current working directory,
    and shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the three scenario columns and 'E2E Elapsed (ms)'.
    max_scale : int, optional
        Upper end of the histogram range, in the units of 'E2E Elapsed (ms)'.
    bin_width : int, optional
        Width of each histogram bin.
    """
    # np.arange excludes its stop value, so extend it by one bin width; without
    # this the last bin (900-1000 with the defaults) would be missing.
    bin_edges = np.arange(start=0, stop=max_scale + bin_width, step=bin_width)

    for batching_period in data['Batching Period'].unique():
        for subscriber_count in data['Subscribers Count'].unique():
            for subscription_count in data['Subscription Count'].unique():
                in_scenario = (
                    (data['Batching Period'] == batching_period)
                    & (data['Subscribers Count'] == subscriber_count)
                    & (data['Subscription Count'] == subscription_count)
                )
                # Missing values cannot be fitted, so drop them up front.
                samples = data.loc[in_scenario, 'E2E Elapsed (ms)'].dropna()
                if samples.empty:
                    continue

                suffix = f'{batching_period}ms Batching - {subscriber_count} Subscribers - {subscription_count} Subscriptions'
                _plot_e2e_distribution(samples, suffix, bin_edges)


def plot_throughput_over_time_by_group(data):
    """Plot per-minute throughput for each (subscription count, subscribers count) pair.

    Log times are first normalized with ``normalize_log_times``. For every
    combination of 'Subscription Count' and 'Subscribers Count' that has rows,
    'Size (Bytes)' is then summed into one-minute buckets of 'Normalized Time'
    and drawn as a line plot.

    The work is done on a copy, so the caller's ``data`` is left untouched and
    the function can be re-run safely.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Size (Bytes)', 'Subscription Count', 'Subscribers Count'
        and the columns ``normalize_log_times`` reads.
    """
    # Copy before normalizing: normalize_log_times writes a column onto the
    # frame it is given, and the index is replaced below.
    timeline = normalize_log_times(data.copy()).set_index('Normalized Time', drop=False)

    for subscription_count in timeline['Subscription Count'].unique():
        for subscriber_count in timeline['Subscribers Count'].unique():
            in_group = (
                (timeline['Subscription Count'] == subscription_count)
                & (timeline['Subscribers Count'] == subscriber_count)
            )
            subset = timeline[in_group]
            if subset.empty:
                continue

            # Resample the size column directly instead of assigning a new column
            # on a filtered slice; 'min' replaces the deprecated 'T' alias.
            throughput_per_minute = subset['Size (Bytes)'].resample('min').sum()

            fig, ax = plt.subplots(figsize=(14, 8))
            throughput_per_minute.plot(ax=ax)
            ax.set_title(f'Throughput Over Time Per Minute - {subscriber_count} Subscribers, {subscription_count} Subscriptions')
            ax.set_xlabel('Normalized Time')
            ax.set_ylabel('Throughput (Bytes)')
            plt.show()
            # Close each figure so the loop does not accumulate open figures.
            plt.close(fig)




In [None]:
# (subscription count, subscribers count) pairs, in the order their result files are loaded.
_SCENARIOS = [(1500, 8), (11500, 8), (11500, 32)]
# Batching periods recorded for every scenario above.
_BATCHING_PERIODS = [100, 250, 1000]

# Each entry is (CSV path, subscription count, batching period, subscribers count).
# The "1200_1" segment is copied verbatim from the result file names.
files = [
    (
        f'results/{subscription_count}_{subscribers_count}_1200_1_{batching_period}.csv',
        subscription_count,
        batching_period,
        subscribers_count,
    )
    for subscription_count, subscribers_count in _SCENARIOS
    for batching_period in _BATCHING_PERIODS
]

all_data = load_and_preprocess_all(files)

In [None]:

# Render each report in turn from the combined results frame.
for _render in (
    plot_impact_of_batching,
    plot_distribution_per_batching_period,
    plot_throughput_over_time_by_group,
):
    _render(all_data)
