In [15]:
import os
import json
import pandas as pd
from glob import glob

In [16]:
# Folder holding the experiment metadata JSON files. The location can be
# overridden through the BOX_OFFICE_METADATA_DIR environment variable so the
# notebook is not tied to one machine; the previous hard-coded path remains
# the default when the variable is not set.
folder_path = os.environ.get(
    'BOX_OFFICE_METADATA_DIR',
    '/Users/iliasx/Documents/GitHub/box-office-prediction/metadata/',
)

# Collect every JSON file in the folder. glob() returns paths in arbitrary
# order, so sort them to keep the row order of downstream tables reproducible.
json_files = sorted(glob(os.path.join(folder_path, '*.json')))

print(f'Found {len(json_files)} JSON files in the folder.')

Found 816 JSON files in the folder.


In [17]:
# Create one independent, empty accumulator list per kind of record.
experiment_data, metrics_data, conf_matrix_data, feature_importance_data = (
    [] for _ in range(4)
)

In [18]:
# Rebuild the record list from scratch so that re-running this cell does not
# append every experiment a second time.
experiment_data = []

for file in json_files:
    # Read with an explicit encoding instead of the platform default, and
    # release the file handle as soon as the document has been parsed.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Experiment-level data: one flat record per JSON file. Keys missing from
    # the file become None.
    experiment_info = {
        'timestamp': data.get('timestamp', None),
        'model_type': data.get('model_type', None),
        'problem_type': data.get('problem_type', None),
        'dataset_name': data.get('dataset_name', None),
        'grid_type': data.get('grid_type', None),
        'has_outliers_removed': data.get('has_outliers_removed', None),
        'feature_engineering': data.get('feature_engineering', None),

        # Spread the metrics dict so each metric becomes its own column.
        # `or {}` also covers a file that stores "metrics": null, which
        # would otherwise make the ** unpacking raise TypeError.
        **(data.get('metrics') or {}),
    }
    experiment_data.append(experiment_info)

# One row per experiment; metrics absent from a given file are NaN in its row.
experiment_df = pd.DataFrame(experiment_data)


In [20]:
# Keep only the rows whose grid_type is 'grid_search'.
experiment_df = experiment_df.loc[experiment_df['grid_type'].eq('grid_search')]

In [39]:
# Show the experiment table as this cell's output.
experiment_df

Unnamed: 0,timestamp,model_type,problem_type,dataset_name,grid_type,has_outliers_removed,feature_engineering,ROC AUC Score,Accuracy,Precision,Recall,F1 Score,MSE,MAPE,MAE,RMSE,RMSLE,MSLE,R2
0,20240624_191120,logistic_regression,binary_classification,full,grid_search,True,simple,0.651271,0.708513,0.587771,0.473016,0.524186,,,,,,,
2,20240624_191732,linear_regression,regression,medium_productions,grid_search,True,simple,,,,,,9.826687e+32,1.015810e+10,1.710149e+15,3.134755e+16,2.863727,8.200935,-8.722063e+16
4,20240624_183738,lightgbm_classifier,multi_class_classification,medium_productions,grid_search,False,none,0.759370,0.663249,0.591676,0.663249,0.614808,,,,,,,
7,20240624_180754,logistic_regression,binary_classification,full,grid_search,True,complex,0.686192,0.732220,0.620690,0.542857,0.579170,,,,,,,
8,20240624_191727,linear_regression,regression,full,grid_search,False,none,,,,,,9.756889e+30,2.233181e+09,8.863294e+13,3.123602e+15,3.514968,12.355000,-2.505327e+14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,20240624_190644,decision_tree_classifier,multi_class_classification,large_productions,grid_search,True,none,0.607256,0.443869,0.445550,0.443869,0.444537,,,,,,,
812,20240624_190855,xgboost_regressor,regression,small_productions,grid_search,True,complex,,,,,,1.223979e+15,2.384603e+01,8.018370e+06,3.498540e+07,2.240157,5.018303,-1.143313e+00
813,20240624_190634,support_vector_machine_classifier,multi_class_classification,large_productions,grid_search,False,simple,0.726790,0.610544,0.476274,0.610544,0.527483,,,,,,,
814,20240624_190820,linear_regression,regression,small_productions,grid_search,False,simple,,,,,,3.146904e+27,2.377310e+08,3.163238e+12,5.609727e+13,4.023345,16.187307,-2.619969e+13


In [None]:
# For every regression configuration (dataset, outlier handling, feature
# engineering) pick the experiment with the lowest MAPE.
# Rows without a MAPE value are dropped before grouping: a group whose MAPE is
# entirely NaN has no minimum, and its missing index label would break the
# .loc lookup below.
grouped = (
    experiment_df[experiment_df['problem_type'] == 'regression']
    .dropna(subset=['MAPE'])
    .groupby(['problem_type', 'dataset_name', 'has_outliers_removed', 'feature_engineering'])
)

# Index label of the minimum-MAPE row in each group
idx = grouped['MAPE'].idxmin()

# Full rows of those minimum-MAPE experiments
best_models = experiment_df.loc[idx]

# Keep the grouping keys, the winning model and its error metrics
result = best_models[['problem_type', 'dataset_name', 'has_outliers_removed', 'feature_engineering','model_type', 'MAPE', 'MAE']]

In [43]:
# Print the result table as plain text, leaving out the index column.
print(result.to_string(index=False))

problem_type       dataset_name  has_outliers_removed feature_engineering              model_type       MAPE          MAE
  regression               full                 False             complex random_forest_regressor 182.735221 4.417663e+07
  regression               full                 False                none decision_tree_regressor 102.208126 6.932268e+07
  regression               full                 False              simple decision_tree_regressor 117.981241 6.736046e+07
  regression               full                  True             complex decision_tree_regressor  15.457374 7.471967e+07
  regression               full                  True                none decision_tree_regressor  22.564863 8.978859e+07
  regression               full                  True              simple random_forest_regressor  24.643574 6.791664e+07
  regression  large_productions                 False             complex random_forest_regressor   5.080257 1.419944e+08
  regression  large_prod

In [37]:
# NOTE(review): `best_model` is not assigned in any cell visible here (an
# earlier cell defines `best_models`, plural). Presumably it is left over from
# a previous kernel session — confirm; if so, this cell raises NameError on
# Restart & Run All.
best_model

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,timestamp,model_type,problem_type,dataset_name,grid_type,has_outliers_removed,feature_engineering,ROC AUC Score,Accuracy,Precision,Recall,F1 Score,MSE,MAPE,MAE,RMSE,RMSLE,MSLE,R2
problem_type,dataset_name,has_outliers_removed,feature_engineering,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
regression,full,False,complex,20240624_192427,linear_regression,regression,full,grid_search,False,complex,,,,,,4.502572999999999e+35,479731700000.0,1.904012e+16,6.710122e+17,3.798304,14.427113,-1.156149e+19
regression,full,False,none,20240624_191727,linear_regression,regression,full,grid_search,False,none,,,,,,9.756889e+30,2233181000.0,88632940000000.0,3123602000000000.0,3.514968,12.355,-250532700000000.0
regression,full,False,simple,20240624_180416,linear_regression,regression,full,grid_search,False,simple,,,,,,1.634646e+31,2890546000.0,114723200000000.0,4043075000000000.0,3.599736,12.958096,-419736500000000.0
regression,full,True,complex,20240624_192616,linear_regression,regression,full,grid_search,True,complex,,,,,,1.1399939999999999e+34,29205680000.0,4241077000000000.0,1.067705e+17,3.266562,10.670427,-2.610682e+17
regression,full,True,none,20240624_192120,linear_regression,regression,full,grid_search,True,none,,,,,,3.636336e+32,17154920000.0,625976600000000.0,1.906918e+16,2.946125,8.679653,-8327515000000000.0
regression,full,True,simple,20240624_190834,linear_regression,regression,full,grid_search,True,simple,,,,,,1.179062e+34,97684370000.0,3564466000000000.0,1.085846e+17,2.968483,8.811893,-2.700152e+17
regression,large_productions,False,complex,20240624_180652,linear_regression,regression,large_productions,grid_search,False,complex,,,,,,8.749968e+36,16584360000.0,2.61378e+17,2.958035e+18,6.191699,38.337141,-7.018299e+19
regression,large_productions,False,none,20240624_192716,linear_regression,regression,large_productions,grid_search,False,none,,,,,,5.6460319999999995e+34,935454800.0,1.697241e+16,2.376138e+17,2.484483,6.172656,-4.52865e+17
regression,large_productions,False,simple,20240624_184902,random_forest_regressor,regression,large_productions,grid_search,False,simple,,,,,,5.24194e+16,5.097021,148544300.0,228952800.0,1.192569,1.42222,0.579547
regression,large_productions,True,complex,20240624_191050,linear_regression,regression,large_productions,grid_search,True,complex,,,,,,1.5106270000000001e+38,303109800000.0,1.610126e+18,1.229076e+19,5.535722,30.644223,-1.228359e+21


In [22]:
# Write the experiment table to experiment_data.csv without the row index.
experiment_df.to_csv(path_or_buf='experiment_data.csv', index=False)

In [25]:
import plotly.express as px

def plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(experiment_df, problem_type='binary_classification', metric='F1 Score', metric_agg='max', text_format='.2s'):
    """Show one grouped bar chart per dataset for a single metric.

    The frame is restricted to ``problem_type``, then ``metric`` is aggregated
    with ``metric_agg`` for every (dataset_name, has_outliers_removed,
    feature_engineering) combination. For each dataset a bar chart is shown
    with feature engineering on the x axis and one bar colour per
    ``has_outliers_removed`` value.

    Parameters
    ----------
    experiment_df : pandas.DataFrame
        Must contain the columns ``problem_type``, ``dataset_name``,
        ``has_outliers_removed``, ``feature_engineering`` and ``metric``.
    problem_type : str
        Value of the ``problem_type`` column to keep.
    metric : str
        Name of the metric column to plot.
    metric_agg : str or callable
        Aggregation applied to ``metric`` within each group (e.g. ``'max'``).
    text_format : str
        Format specifier for the value labels drawn on the bars. The default
        ``'.2s'`` reproduces the previous fixed format (two significant
        digits with an SI prefix), which suits large magnitudes; pass e.g.
        ``'.3f'`` for scores between 0 and 1.

    Returns
    -------
    None
        Figures are displayed with ``fig.show()``. Nothing is displayed when
        no row matches ``problem_type``.

    Raises
    ------
    KeyError
        If ``metric`` is not a column of ``experiment_df``.
    """
    # Fail early with a readable message instead of an error from inside agg().
    if metric not in experiment_df.columns:
        raise KeyError(f'Metric column {metric!r} not found in experiment_df')

    # Keep only the rows of the requested problem type.
    selected = experiment_df[experiment_df['problem_type'] == problem_type]

    # Fixed category orders so every chart uses the same bar/legend layout.
    feature_engineering_order = ['none', 'simple', 'complex']
    has_outliers_removed_order = [False, True]

    # One aggregated metric value per dataset / outlier setting / feature set.
    grouped_data = (
        selected
        .groupby(['dataset_name', 'has_outliers_removed', 'feature_engineering'])
        .agg({metric: metric_agg})
        .reset_index()
    )

    # One figure per dataset.
    for dataset_name, data in grouped_data.groupby('dataset_name'):
        fig = px.bar(
            data,
            x='feature_engineering',
            y=metric,
            color='has_outliers_removed',
            text=metric,  # value label drawn on each bar
            category_orders={
                'feature_engineering': feature_engineering_order,
                'has_outliers_removed': has_outliers_removed_order
            },
            title=f'{metric} for {dataset_name}',
            labels={
                'feature_engineering': 'Feature Engineering Type',
                'has_outliers_removed': 'Outliers Removed',
                metric: f'{metric} Value'
            },
            color_discrete_sequence=px.colors.qualitative.Set2,
            barmode='group'
        )

        # Format the on-bar labels and title the legend.
        fig.update_traces(texttemplate=f'%{{text:{text_format}}}', textposition='inside')
        fig.update_layout(
            showlegend=True,
            legend_title_text='Outliers Removed'
        )

        fig.show()


In [26]:
# Highest F1 Score per configuration for the binary classification runs.
plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(
    experiment_df,
    problem_type='binary_classification',
    metric='F1 Score',
    metric_agg='max',
)


In [27]:
# Highest F1 Score per configuration for the multi-class classification runs.
plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(
    experiment_df,
    problem_type='multi_class_classification',
    metric='F1 Score',
    metric_agg='max',
)

In [28]:
# Lowest MAPE per configuration for the regression runs.
plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(
    experiment_df,
    problem_type='regression',
    metric='MAPE',
    metric_agg='min',
)

In [33]:
# R2 per configuration for the regression runs.
# NOTE(review): metric_agg='min' selects the lowest R2 in each group; if the
# intent is to show the best model per group this likely should be 'max' —
# confirm.
plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(
    experiment_df,
    problem_type='regression',
    metric='R2',
    metric_agg='min',
)