# aggregated results

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.plotting import scatter_matrix

# Folder containing the aggregated result CSV files.
RESULTS_FOLDER = '/kristof.meszaros/pract/aggregated_results'

# One accumulator list per result type; each collects the DataFrames read
# from the files whose name contains the matching marker.
mice_best_overall_files = []
mice_best_for_each_files = []
categorical_best_for_each_files = []
categorical_all_files = []

# File-name marker -> accumulator. Markers are tested in this order and the
# first one found in the file name wins.
files_by_marker = {
    'mice_best_overall': mice_best_overall_files,
    'mice_best_for_each': mice_best_for_each_files,
    'categorical_best_for_each': categorical_best_for_each_files,
    'categorical_all': categorical_all_files,
}

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the concatenated frames reproducible between runs.
for file_name in sorted(os.listdir(RESULTS_FOLDER)):
    if not file_name.endswith('.csv'):
        continue

    # Decide where the file belongs before parsing or reading it, so that an
    # unrelated CSV in the folder is skipped instead of breaking the int()
    # parsing below.
    target_files = next(
        (files for marker, files in files_by_marker.items() if marker in file_name),
        None,
    )
    if target_files is None:
        continue

    # The first '_'-separated token minus its first 3 characters is the
    # category number; the second token minus its first 9 characters is the
    # threshold (presumably "cat<N>_threshold<M>_..." - confirm naming).
    name_parts = file_name.split('_')
    category_number = int(name_parts[0][3:])
    threshold = int(name_parts[1][9:])

    # Read the CSV and prepend the two identifying columns.
    df = pd.read_csv(os.path.join(RESULTS_FOLDER, file_name))
    df.insert(0, 'category_number', category_number)
    df.insert(1, 'threshold', threshold)
    target_files.append(df)

# Concatenate the DataFrames for each type of file into a single DataFrame.
# pd.concat raises ValueError if a list is empty (no file of that type found).
mice_best_overall_df = pd.concat(mice_best_overall_files, ignore_index=True)
mice_best_for_each_df = pd.concat(mice_best_for_each_files, ignore_index=True)
categorical_best_for_each_df = pd.concat(categorical_best_for_each_files, ignore_index=True)
categorical_all_df = pd.concat(categorical_all_files, ignore_index=True)

##  mice_best_overall_df
For each category-threshold pair, the performance of the overall best MICE estimator on each numerical column.

In [None]:
# Notebook display: show the concatenated 'mice_best_overall' results.
mice_best_overall_df

In [None]:
# 3D scatter of RMSE / MAE / correlation for the overall best MICE models.
# Colour encodes the threshold, marker shape encodes the winning model.
# A threshold or model missing from these maps raises KeyError below.
color_map = {30: 'red',
             50: 'green',
             100: 'blue'}
marker_map = {'Support Vector Regression': 'o',
              'KNN': 's',
              'Elastic Net': '^',
              'Random Forest': '*',
              'Bayesian Ridge': '+',
              'Mean': 'x',
              'Median': 'd'}

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for threshold in mice_best_overall_df['threshold'].unique():
    threshold_rows = mice_best_overall_df[mice_best_overall_df['threshold'] == threshold]
    for model in threshold_rows['best_overall_model'].unique():
        # Filter once and reuse the result for all three coordinates.
        model_rows = threshold_rows[threshold_rows['best_overall_model'] == model]
        ax.scatter(
            model_rows['rmse'],
            model_rows['mae'],
            model_rows['correlation'],
            c=color_map[threshold],
            marker=marker_map[model],
            # Include the model in the label: labelling by threshold alone
            # produced repeated, indistinguishable legend entries.
            label=f'{threshold} - {model}',
        )
ax.set_xlabel('RMSE')
ax.set_ylabel('MAE')
ax.set_zlabel('Correlation')
ax.legend()

In [None]:
# Mean RMSE, MAE and correlation per (threshold, winning model) pair.
metric_aggregations = {'rmse': 'mean', 'mae': 'mean', 'correlation': 'mean'}
group_model = (
    mice_best_overall_df
    .groupby(['threshold', 'best_overall_model'])
    .agg(metric_aggregations)
    .reset_index()
)

# RMSE against MAE: hue = threshold, marker style = model,
# marker size scaled by the mean correlation.
fig, ax = plt.subplots()
sns.scatterplot(
    data=group_model,
    x='rmse',
    y='mae',
    hue='threshold',
    style='best_overall_model',
    size='correlation',
    sizes=(10, 200),
    ax=ax,
)
ax.set(xlabel='RMSE', ylabel='MAE')
# bbox_to_anchor x > 1 places the legend to the right of the axes.
ax.legend(title=None, fontsize=8, loc='lower right', bbox_to_anchor=(1.4, 0))
plt.show()

In [None]:
# Mean RMSE and MAE per (threshold, winning model) pair.
error_aggregations = {'rmse': 'mean', 'mae': 'mean'}
group_model = (
    mice_best_overall_df
    .groupby(['threshold', 'best_overall_model'])
    .agg(error_aggregations)
    .reset_index()
)

# Same scatter as the correlation-sized variant, but with a constant size.
# NOTE(review): `size` is seaborn's semantic-mapping argument; a fixed marker
# area would normally be passed as `s=` - confirm the intent.
fig, ax = plt.subplots()
sns.scatterplot(
    data=group_model,
    x='rmse',
    y='mae',
    hue='threshold',
    style='best_overall_model',
    size=10,
    sizes=(10, 200),
    ax=ax,
)
ax.set(xlabel='RMSE', ylabel='MAE')
# bbox_to_anchor x > 1 places the legend to the right of the axes.
ax.legend(title=None, fontsize=8, loc='lower right', bbox_to_anchor=(1.4, 0))
plt.show()

In [None]:
# Add 'number_of_columns': for every row, the number of non-null 'column'
# entries sharing its (category_number, threshold) pair.
#
# A single groupby/transform replaces the per-category loop, which regrouped
# the whole frame on every iteration and grew the result with pd.concat.
# The stable sort reproduces the row order of the loop version: categories in
# ascending order, original order (and original index labels) within each.
new_mice_best_overall_df = mice_best_overall_df.sort_values('category_number', kind='mergesort')
new_mice_best_overall_df['number_of_columns'] = (
    new_mice_best_overall_df
    .groupby(['category_number', 'threshold'])['column']
    .transform('count')
)
new_mice_best_overall_df

In [None]:
# Per-threshold view of one column ('Depth.1650') of one category (1531):
# bars show how many columns the category has at each threshold, lines show
# the RMSE / MAE / correlation for the selected column.
# get_group raises KeyError if the category or the column is absent.
groups = mice_best_overall_df.groupby('category_number')
group_1531 = groups.get_group(1531).copy()
number_of_columns = group_1531.groupby('threshold')['column'].count().to_dict()
group_1531['number_of_columns'] = group_1531['threshold'].map(number_of_columns)
groups = group_1531.groupby('column')
group_depth = groups.get_group('Depth.1650')

x = np.arange(len(group_depth))
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()  # second y-axis sharing the x-axis, used for the metrics

# plot bar chart (bars are centred half a bar width left of x)
bar_width = 0.4
bar_positions = x - bar_width / 2
ax1.bar(bar_positions, group_depth['number_of_columns'], width=bar_width, alpha=0.8, label='number of columns')

# plot line chart
ax2.plot(x, group_depth['rmse'], 'go-', alpha=0.8, label='RMSE')
ax2.plot(x, group_depth['mae'], 'ro-', alpha=0.8, label='MAE')
ax2.plot(x, group_depth['correlation'], 'yo-', alpha=0.8, label='correlation')

# set x-axis ticks and labels: ticks sit under the bar centres, derived from
# bar_width so the two cannot drift apart
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(group_depth['threshold'])

# set y-axis labels and title
ax1.set_ylabel('Number of Columns')
ax2.set_ylabel('Error Values')
# The ticks are threshold values, so the axis is labelled 'Threshold'
# (it was previously labelled 'Model').
ax1.set_xlabel('Threshold')
# ax1.set_title('RMSE and MAE for different threshold types')

# add legend: merge the handles of both y-axes into one legend box
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(handles1 + handles2, labels1 + labels2, loc='upper right')

# show plot
plt.show()
group_depth

In [None]:
# Per winning model: how many rows it won, and its mean RMSE / MAE.
group_by_model = mice_best_overall_df.groupby('best_overall_model').agg(
    {'best_overall_model': 'count', 'rmse': 'mean', 'mae': 'mean'}
)
group_by_model.columns = ['count', 'rmse', 'mae']
group_by_model = group_by_model.reset_index()
x = np.arange(len(group_by_model))

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()  # second y-axis for the error lines

# Bars: win count per model, centred half a bar width left of x.
bar_width = 0.4
ax1.bar(x - bar_width/2, group_by_model['count'], width=bar_width, alpha=0.8, label='count')

# Lines: mean error metrics on the secondary axis.
for metric, line_format, line_label in (('rmse', 'go-', 'RMSE'), ('mae', 'ro-', 'MAE')):
    ax2.plot(x, group_by_model[metric], line_format, alpha=0.8, label=line_label)

# Tick positions and the (slightly rotated) model names.
ax1.set_xticks(x-0.5)
ax1.set_xticklabels(group_by_model['best_overall_model'], rotation=15, fontsize=8)

# Axis labels.
ax1.set_ylabel('Number of Columns')
ax2.set_ylabel('Error Values')
ax1.set_xlabel('Model')
# ax1.set_title('RMSE and MAE for different threshold types')

# One legend covering the artists of both y-axes.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax1.legend([*handles1, *handles2], [*labels1, *labels2], loc='upper left')

plt.show()

##  mice_best_for_each_df
For each category-threshold-column combination, the best-performing estimator in MICE (the overall best estimator
above was the one used for fitting, but this might be interesting to see).

In [None]:
# Notebook display: rows of mice_best_for_each_df whose category_number is 1531.
mice_best_for_each_df[mice_best_for_each_df['category_number']==1531]

In [None]:
# Count, per threshold, how many rows each model won, pivoted so that every
# model becomes a column (NaN where a model never won at that threshold).
group_model = (
    mice_best_for_each_df
    .groupby(['threshold', 'best_model'])['category_number']
    .count()
    .unstack(level=-1)
    .reset_index()
)

# Plot every model column that is present instead of a hard-coded list: the
# fixed list raised KeyError when a model never won and silently ignored any
# new model. With 'threshold' as the index, pandas labels the bars with the
# threshold values itself.
ax = group_model.set_index('threshold').plot.barh(stacked=True)
ax.set_xlabel('Number of columns')
ax.set_ylabel('Threshold')
plt.legend(loc='upper right', fontsize=8)
plt.show()
# group_model

In [None]:
# Mean RMSE / MAE / correlation per (category, threshold, model), keeping only
# groups with more than one row. Computed in a single pass instead of
# regrouping the frame and concatenating inside a loop.
group_keys = ['category_number', 'threshold', 'best_model']
grouped = mice_best_for_each_df.groupby(group_keys)
group_performance = grouped.agg({'rmse': 'mean', 'mae': 'mean', 'correlation': 'mean'})
group_performance = group_performance[grouped.size() > 1].reset_index()

# For every category keep the row with the highest mean RMSE.
# NOTE(review): idxmax selects the largest error; if the best model per
# category was intended this should be idxmin - confirm.
# Selecting with .loc keeps the numeric dtypes (the previous Series-concat and
# transpose produced an all-object frame) and also works when
# group_performance is empty.
group = group_performance.groupby('category_number')
result_df = group_performance.loc[group['rmse'].idxmax()]

# One panel per threshold value.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 9))
for panel, threshold in zip((ax1, ax2, ax3), (30, 50, 100)):
    sns.scatterplot(
        data=result_df[result_df['threshold'] == threshold],
        x='category_number',
        y='rmse',
        hue='best_model',
        style='best_model',
        ax=panel,
    )
    panel.set_title(f'Threshold: {threshold}')
    panel.legend(title=None)
plt.show()

In [None]:
# Find a (threshold, column) group with exactly 49 rows among the columns
# that occur in more than one category, and plot RMSE against correlation.
# Every matching key is printed; the last match is the one that is plotted.
# NOTE(review): 49 is a hard-coded group size - presumably the number of
# categories sharing the column; confirm.
exp_df = None  # stays None when nothing matches, so a stale value is never reused
for column_name, column_rows in mice_best_for_each_df.groupby('column'):
    if column_rows['category_number'].nunique() <= 1:
        continue
    for group_key, group_rows in column_rows.groupby(['threshold', 'column']):
        if group_rows.shape[0] == 49:
            exp_df = group_rows
            print(group_key)

if exp_df is None:
    raise ValueError('no (threshold, column) group with exactly 49 rows was found')

# Drop rows with RMSE of 120 or more before plotting.
exp_df = exp_df[exp_df.rmse<120]
sns.scatterplot(data=exp_df, x='rmse', y='correlation', hue='best_model', style='best_model')
plt.xlabel('RMSE', fontsize=10)
plt.ylabel('Correlation', fontsize=10)
plt.legend(title=None, fontsize=8)

In [None]:
# Notebook display: show exp_df as left by the most recently run cell that assigned it.
exp_df

In [None]:
# Mean RMSE and MAE per threshold, drawn as two side-by-side bars.
group_by_threshold = (
    mice_best_for_each_df
    .groupby('threshold')
    .agg({'rmse': 'mean', 'mae': 'mean'})
    .reset_index()
)

fig, ax = plt.subplots()
index = np.arange(len(group_by_threshold))
bar_width = 0.35
opacity = 0.8

# RMSE bars sit at `index`, MAE bars one bar width to the right.
rmse_heights = group_by_threshold['rmse']
mae_heights = group_by_threshold['mae']
rects1 = ax.bar(index, rmse_heights, bar_width, alpha=opacity, label='RMSE')
rects2 = ax.bar(index + bar_width, mae_heights, bar_width, alpha=opacity, label='MAE')

ax.set(xlabel='Threshold', ylabel='Error Values')
# Put each tick midway between the two bars of its threshold.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(group_by_threshold['threshold'])
ax.legend()

plt.show()
# sns.barplot(x = 'threshold', y = 'rmse', data = mice_best_overall_df)

##  categorical_all_df
For each category-threshold-column combination, the performance of each classifier.

In [None]:
# Notebook display: show the concatenated 'categorical_all' results.
categorical_all_df

In [None]:
# Per (threshold, model): number of rows, mean fit time and mean test score.
group_threshold = categorical_all_df.groupby(['threshold', 'model']).agg({'column': 'count', 'mean_fit_time':'mean','mean_test_score':'mean'}).reset_index()

# Bars: mean fit time per model, one bar per threshold.
fig, ax1 = plt.subplots()
sns.barplot(data=group_threshold, x='model', y='mean_fit_time', hue='threshold', ax=ax1)

# Lines: mean test score per model on a second y-axis sharing the x-axis.
ax2 = ax1.twinx()
sns.lineplot(data=group_threshold, x='model', y='mean_test_score', hue='threshold', marker='o', ax=ax2)

# Restyle the tick labels of this figure's own axes. The original read them
# from `ax`, a name this cell never defines (a leftover from an earlier cell).
ax1.set_xticklabels(ax1.get_xticklabels(), fontsize=8, rotation=15)
ax1.legend(fontsize=8, title='threshold', title_fontsize=8)
ax2.legend(fontsize=8, title='threshold', title_fontsize=8)

##  categorical_best_for_each_df
For each category-threshold-column combination, the performance of the best classifier, which was the one used for fitting.
This should be a subset of the dataframe above (`categorical_all_df`).

In [None]:
# Notebook display: show the concatenated 'categorical_best_for_each' results.
categorical_best_for_each_df

In [None]:
# Count, per threshold, how many rows each classifier won, pivoted so that
# every classifier becomes a column (NaN where it never won at that threshold).
group_model = (
    categorical_best_for_each_df
    .groupby(['threshold', 'best_model'])['category_number']
    .count()
    .unstack(level=-1)
    .reset_index()
)

# Plot every classifier column that is present instead of a hard-coded list:
# the fixed list raised KeyError when a classifier never won and silently
# ignored any new one. With 'threshold' as the index, pandas labels the bars
# with the threshold values itself.
ax = group_model.set_index('threshold').plot.barh(stacked=True)
ax.set_xlabel('Number of columns')
ax.set_ylabel('Threshold')
plt.legend(loc='upper right', fontsize=8)
plt.show()

In [None]:
# Mean accuracy per (category, threshold, classifier), keeping only groups
# with more than one row. Computed in a single pass instead of regrouping the
# frame and concatenating inside a loop.
group_keys = ['category_number', 'threshold', 'best_model']
grouped = categorical_best_for_each_df.groupby(group_keys)
group_performance = grouped['accuracy'].mean()
group_performance = group_performance[grouped.size() > 1].reset_index()

# For every category keep the row with the highest mean accuracy.
# Selecting with .loc keeps the numeric dtypes (the previous Series-concat and
# transpose produced an all-object frame) and also works when
# group_performance is empty.
group = group_performance.groupby('category_number')
result_df = group_performance.loc[group['accuracy'].idxmax()]

# One panel per threshold value.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 9))
for panel, threshold in zip((ax1, ax2, ax3), (30, 50, 100)):
    sns.scatterplot(
        data=result_df[result_df['threshold'] == threshold],
        x='category_number',
        y='accuracy',
        hue='best_model',
        style='best_model',
        ax=panel,
    )
    panel.set_title(f'Threshold: {threshold}')
    panel.legend(title=None)
plt.show()

# plt.figure(figsize=(20, 10))
# sns.scatterplot(data=result_df, x='category_number', y='accuracy', hue='best_model', style='best_model', s=200)
# plt.xlabel('Category number', fontsize=14)
# plt.ylabel('Accuracy', fontsize=14)
# plt.legend(title=None, fontsize=14)

In [None]:
# Find a (threshold, column) group with exactly 26 rows among the columns
# that occur in more than one category, and plot F1 against accuracy.
# Every matching key is printed; the last match is the one that is plotted.
# NOTE(review): 26 is a hard-coded group size - presumably the number of
# categories sharing the column; confirm.
exp_df = None  # stays None when nothing matches, so a stale value is never plotted
for column_name, column_rows in categorical_best_for_each_df.groupby('column'):
    if column_rows['category_number'].nunique() <= 1:
        continue
    for group_key, group_rows in column_rows.groupby(['threshold', 'column']):
        if group_rows.shape[0] == 26:
            exp_df = group_rows
            print(group_key)

if exp_df is None:
    raise ValueError('no (threshold, column) group with exactly 26 rows was found')

sns.scatterplot(data=exp_df, x='f1', y='accuracy', hue='best_model', style='best_model')
plt.xlabel('f1', fontsize=10)
plt.ylabel('Accuracy', fontsize=10)
plt.legend(title=None, fontsize=8)