
In this analysis, we concatenated three performance dataframes (good, ok, and bad performances) row-wise into a single merged dataframe 
and compared it with a random performance dataframe. The comparison was visualized using box plots and overlaid histograms, and the 
percentage difference in means for each column, relative to the random dataframe's mean, was also calculated.


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the four performance tables from disk, in the order good, ok, bad, random.
good, ok, bad, random_perf = map(
    pd.read_csv,
    [
        '/mnt/data/performance_good.csv',
        '/mnt/data/performance_ok.csv',
        '/mnt/data/performance_bad.csv',
        '/mnt/data/performance_random.csv',
    ],
)

# Stack the good/ok/bad tables row-wise into one frame. ignore_index=True
# discards each file's own row labels and renumbers the result from 0.
merged_df = pd.concat([good, ok, bad], ignore_index=True)


In [None]:

# Box plots comparing the merged ("Improved") data with the random data,
# one panel per compared column.

# Set aesthetic parameters
sns.set_style("whitegrid")

# List of columns to compare: every column of merged_df except the two
# non-metric ones. "Unnamed: 0" is presumably a row index saved into the CSV
# -- TODO confirm. Index.drop raises KeyError if either label is missing.
columns_to_compare = merged_df.columns.drop(["Unnamed: 0", "ID"])

# Size the subplot grid from the number of columns instead of hard-coding 2x2,
# which made plt.subplot raise ValueError as soon as there were more than four
# columns. Keeping at least two rows and 5 inches of height per row reproduces
# the original 2x2 / (15, 10) layout exactly when there are four or fewer.
grid_cols = 2
grid_rows = max(2, (len(columns_to_compare) + grid_cols - 1) // grid_cols)
plt.figure(figsize=(15, 5 * grid_rows))

# Plotting box plots for each column
for idx, column in enumerate(columns_to_compare, 1):
    plt.subplot(grid_rows, grid_cols, idx)
    # Two horizontal boxes per panel: position 0 is merged_df, position 1 is random_perf.
    sns.boxplot(data=[merged_df[column], random_perf[column]], orient="h", palette="pastel")
    plt.title(f"Boxplot of {column}")
    # Relabel the two box positions so the panel reads without a legend.
    plt.yticks([0, 1], ["Improved", "Random"])
    plt.xlabel(column)

plt.tight_layout()
plt.show()


In [None]:

# Percentage difference in means, per compared column, measured relative to
# the random dataframe's mean: (merged - random) / random * 100.
def _mean_shift_pct(metric):
    """Return the percentage by which merged_df's mean of `metric` differs from random_perf's."""
    baseline = random_perf[metric].mean()
    improved = merged_df[metric].mean()
    return ((improved - baseline) / baseline) * 100


# Maps each column name to its percentage difference.
percentage_differences = {metric: _mean_shift_pct(metric) for metric in columns_to_compare}

# Bare last expression so the notebook displays the resulting dict.
percentage_differences


In [None]:

# Overlaid histograms (with KDE curves) of the merged ("Improved") data and the
# random data, one panel per compared column.

# Size the subplot grid from the number of columns instead of hard-coding 2x2,
# which made plt.subplot raise ValueError as soon as there were more than four
# columns. Keeping at least two rows and 5 inches of height per row reproduces
# the original 2x2 / (15, 10) layout exactly when there are four or fewer.
grid_cols = 2
grid_rows = max(2, (len(columns_to_compare) + grid_cols - 1) // grid_cols)
plt.figure(figsize=(15, 5 * grid_rows))

# Plotting histograms for each column
for idx, column in enumerate(columns_to_compare, 1):
    plt.subplot(grid_rows, grid_cols, idx)
    # Both series are drawn on the same axes; alpha=0.5 keeps the overlap visible.
    # NOTE(review): the bars are raw counts and each call picks its own bins, so
    # heights are only comparable if both frames have similar row counts -- confirm.
    sns.histplot(merged_df[column], color="skyblue", kde=True, label="Improved", alpha=0.5)
    sns.histplot(random_perf[column], color="salmon", kde=True, label="Random", alpha=0.5)
    plt.title(f"Histogram of {column}")
    plt.legend()

plt.tight_layout()
plt.show()
