# aggregated results

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Folder containing the aggregated result CSV files
RESULTS_FOLDER = '/kristof.meszaros/pract/aggregated_results'

# One list of per-file DataFrames for each type of result file
mice_best_overall_files = []
mice_best_for_each_files = []
categorical_best_for_each_files = []
categorical_all_files = []

# File-name substring -> list that collects the matching DataFrames.
# Patterns are tried in this order and the first match wins.
FRAMES_BY_PATTERN = {
    'mice_best_overall': mice_best_overall_files,
    'mice_best_for_each': mice_best_for_each_files,
    'categorical_best_for_each': categorical_best_for_each_files,
    'categorical_all': categorical_all_files,
}


def _concat_frames(frames):
    """Stack per-file DataFrames into one frame with a fresh index.

    pd.concat raises ValueError on an empty list, so a result type without
    any files yields an empty frame that still has the two identifying
    columns instead of aborting the whole cell.
    """
    if not frames:
        return pd.DataFrame(columns=['category_number', 'threshold'])
    return pd.concat(frames, ignore_index=True)


# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the combined frames reproducible between runs.
for file_name in sorted(os.listdir(RESULTS_FOLDER)):
    if not file_name.endswith('.csv'):
        continue

    # The category number follows a 3-character prefix in the first
    # '_'-separated token, the threshold a 9-character prefix in the second.
    # NOTE(review): presumably names look like 'cat1531_threshold100_...';
    # confirm against the files on disk.
    name_parts = file_name.split('_')
    category_number = int(name_parts[0][3:])
    threshold = int(name_parts[1][9:])

    # Read the file and prepend the two identifying columns
    df = pd.read_csv(os.path.join(RESULTS_FOLDER, file_name))
    df.insert(0, 'category_number', category_number)
    df.insert(1, 'threshold', threshold)

    # File the DataFrame under the first pattern found in its name
    for pattern, frames in FRAMES_BY_PATTERN.items():
        if pattern in file_name:
            frames.append(df)
            break

# Combine the DataFrames of each result type into a single DataFrame
mice_best_overall_df = _concat_frames(mice_best_overall_files)
mice_best_for_each_df = _concat_frames(mice_best_for_each_files)
categorical_best_for_each_df = _concat_frames(categorical_best_for_each_files)
categorical_all_df = _concat_frames(categorical_all_files)

##  mice_best_overall_df
For each category-threshold combination, the performance of the overall best MICE estimator on each numerical column.

In [None]:
# Display the combined frame built from the 'mice_best_overall' files
mice_best_overall_df

In [None]:
# Keep only the threshold-100 rows, then count the non-null entries of every
# column for each (category_number, threshold) pair
is_threshold_100 = mice_best_overall_df.threshold == 100
sumdf = (
    mice_best_overall_df[is_threshold_100]
    .groupby(["category_number", "threshold"])
    .count()
)

# Show the pairs with the largest 'column' count first
sumdf.sort_values(by='column', ascending=False)

In [None]:
# Work on the frame this section is about. Without this assignment `df` still
# refers to the last CSV read by the loading loop, not the combined results.
df = mice_best_overall_df

# Number of rows for every (threshold, best_overall_model) pair
# NOTE(review): the frame appears to hold one row per column, so every dataset
# is weighted by its number of columns; de-duplicate on
# (category_number, threshold) first if a share of datasets is wanted.
counts = df.groupby(['threshold', 'best_overall_model'])['category_number'].count()

# Total number of rows for each threshold
totals = counts.groupby(level=0).sum()

# Share of each model within its threshold (a fraction between 0 and 1)
ratios = counts / totals

# Reshape so that each threshold is a row and each best_overall_model a column
stacked = ratios.unstack()

# Stacked bar chart; the legend is added separately below
ax = stacked.plot(kind='bar', stacked=True, legend=None)

# Set the chart title and axes labels
plt.title('Best Overall Models by Threshold')
plt.xlabel('Threshold')
plt.ylabel('Best MICE estimator for percentage of datasets')

# Put the colour legend on the figure, outside of the plotting area
handles, labels = ax.get_legend_handles_labels()
ax.figure.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left')

# Display the chart
plt.show()

In [None]:
# Work on the frame this section is about. Without this assignment `df` still
# refers to the last CSV read by the loading loop, not the combined results.
df = mice_best_overall_df

# Category to inspect (a fixed example; change as needed)
category_number = 1531

# Thresholds compared in the chart and the bar colour used for each of them
thresholds = [30, 50, 100]
colors = ['tab:blue', 'tab:orange', 'tab:green']

# Rows belonging to the chosen category
subset = df[df['category_number'] == category_number]

# Up to 10 columns with the highest mean correlation across all thresholds.
# 'correlation' is selected before aggregating so that non-numeric columns are
# never averaged (which raises on pandas >= 2.0).
top_columns = (
    subset.groupby('column')['correlation'].mean().nlargest(10).index.tolist()
)

# Mean RMSE per (column, threshold). Rows follow top_columns and columns
# follow thresholds; combinations without data become NaN. Building the table
# by label keeps every value attached to its own column/threshold pair.
rmse_table = (
    subset.groupby(['column', 'threshold'])['rmse']
    .mean()
    .unstack('threshold')
    .reindex(index=top_columns, columns=thresholds)
)
avg_values = rmse_table.to_numpy()

# Bar width and x tick positions; sized by the number of columns actually
# found, which may be fewer than 10
bar_width = 0.2
x_pos = np.arange(len(top_columns))

# One group of bars per column, one bar per threshold, centred on the tick
for i, (threshold, color) in enumerate(zip(thresholds, colors)):
    plt.bar(x_pos + (i - 1) * bar_width, avg_values[:, i], width=bar_width,
            label=f'Threshold {threshold}', color=color)

# Set the chart title and axes labels
plt.title(f'Average RMSE for columns in Category {category_number}')
plt.xlabel('Column')
plt.ylabel('RMSE')
plt.xticks(x_pos, top_columns, rotation=45, ha='right')
plt.legend()

# Display the chart
plt.show()

##  mice_best_for_each_df
For each category-threshold-column combination, the best-performing estimator in MICE (we chose the one above for fitting, but this
might be interesting to see).

In [None]:
# Bind the shared name `df` to the combined 'mice_best_for_each' frame;
# the following cells read `df`
df=mice_best_for_each_df
# Display the frame
df

In [None]:
# Display only the rows whose category_number is 1531
df[df.category_number==1531]

In [None]:
# Number of rows for every (threshold, best_model) pair
model_counts = df.groupby(['threshold', 'best_model'])['category_number'].count()

# Number of rows per threshold, used as the denominator
threshold_totals = model_counts.groupby(level='threshold').sum()

# Share of each model within its threshold
model_shares = model_counts / threshold_totals

# One row per threshold, one column per best_model
share_table = model_shares.unstack()

# Stacked bar chart without the default legend
ax = share_table.plot(kind='bar', stacked=True, legend=None)

# Title and axis labels
ax.set_title('MICE best performing models column level,  by threshold')
ax.set_xlabel('Threshold')
ax.set_ylabel('Best performing model for percentage of columns')

# Figure-level legend placed outside of the plotting area
ax.figure.legend(
    *ax.get_legend_handles_labels(),
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
)

# Display the chart
plt.show()

##  categorical_best_for_each_df
for each cat-threshold-column, showing the performance of each classifier

In [None]:
# Display the combined frame built from the 'categorical_best_for_each' files
categorical_best_for_each_df

In [None]:
# Work on the frame this section is about. Without this assignment `df` still
# refers to mice_best_for_each_df from the previous section.
df = categorical_best_for_each_df

# Keep only the 'threshold' and 'best_model' columns
subset = df[['threshold', 'best_model']]

# For each threshold, the share of rows won by each value of 'best_model',
# reshaped so that every model becomes a column
grouped = subset.groupby('threshold')['best_model'].value_counts(normalize=True).unstack()

# Plot the stacked bar chart
ax = grouped.plot(kind='bar', stacked=True, figsize=(10, 6))

# Set the chart title and axes labels
ax.set_title('Distribution of Best Model by Threshold')
ax.set_xlabel('Threshold')
ax.set_ylabel('Proportion of Models')

# Move the legend outside of the chart
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Show the chart
plt.show()


In [None]:
df = categorical_best_for_each_df

# Average 'mean_fit_time' and 'std_fit_time' for every value of 'best_model'
fit_times = (
    df.groupby('best_model')[['mean_fit_time', 'std_fit_time']]
    .mean()
    .reset_index()
)

# Bar width and one x position per model
bar_width = 0.35
x_pos = np.arange(len(fit_times))

# Two bars per model, side by side: average mean fit time, then average std
bar_series = [
    ('mean_fit_time', 'Mean Fit Time'),
    ('std_fit_time', 'Std Fit Time'),
]
for slot, (time_column, bar_label) in enumerate(bar_series):
    plt.bar(x_pos + slot * bar_width, fit_times[time_column],
            width=bar_width, label=bar_label)

# Title, axis labels and tick labels centred between the two bars
plt.title('Average Mean Fit Time and Std Fit Time by Best Model')
plt.xlabel('Best Model')
plt.ylabel('Time (s)')
plt.xticks(x_pos + bar_width / 2, fit_times['best_model'], rotation=45, ha='right')
plt.legend()

# Show the chart
plt.show()

##  categorical_all_df
For each category-threshold-column combination, the performance of the best classifier, i.e. the one that was used to fit.
This should be just a subset of the dataframe above.

In [None]:
# Display the combined frame built from the 'categorical_all' files
categorical_all_df

In [None]:
df = categorical_all_df

# For each threshold, the normalised frequency of every value in 'model',
# pivoted so that each model becomes a column
model_shares = (
    df[['threshold', 'model']]
    .groupby('threshold')['model']
    .value_counts(normalize=True)
    .unstack()
)

# Stacked bar chart of the shares
ax = model_shares.plot(kind='bar', stacked=True, figsize=(10, 6))

# Title and axis labels
ax.set(
    title='Distribution of Best Model by Threshold',
    xlabel='Threshold',
    ylabel='Proportion of Models',
)

# Legend to the right of the plotting area
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Show the chart
plt.show()
