In [19]:
import os
import json
import pandas as pd
from glob import glob

In [20]:
# Folder containing the JSON files. Set BOX_OFFICE_METADATA_DIR to read from
# another location; when it is unset the original hard-coded path is used.
folder_path = os.environ.get(
    'BOX_OFFICE_METADATA_DIR',
    '/Users/iliasx/Documents/GitHub/box-office-prediction/metadata/',
)

# Get all JSON files in the folder. glob() returns them in arbitrary order, and
# that order becomes the row index of every DataFrame built from this list, so
# sort to make the results reproducible across machines and re-runs.
json_files = sorted(glob(os.path.join(folder_path, '*.json')))

print(f'Found {len(json_files)} JSON files in the folder.')

Found 1166 JSON files in the folder.


In [21]:
# Five fresh, independent accumulators for the records parsed from the JSON files
(
    experiment_data,
    metrics_data,
    conf_matrix_data,
    feature_importance_data,
    parameters_data,
) = ([] for _ in range(5))

In [22]:
# Start from an empty list so re-running this cell cannot append duplicate rows
parameters_data = []

# Experiment-level fields copied from each JSON file (a missing key becomes None)
experiment_keys = (
    'run_id', 'timestamp', 'model_type', 'problem_type', 'dataset_name', 'grid_type',
    'has_outliers_removed', 'feature_engineering', 'scaler', 'variance_threshold',
    'duration', 'number_of_combinations',
)

for file in json_files:
    # Keep the file open only while parsing; read as UTF-8 rather than the platform default
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Experiment-level data
    experiment_info = {key: data.get(key) for key in experiment_keys}
    # Spread the model_parameters dict so each parameter becomes its own column;
    # `or {}` also covers an explicit null, which could not be unpacked
    experiment_info.update(data.get('model_parameters') or {})
    parameters_data.append(experiment_info)

parameters_df = pd.DataFrame(parameters_data)

In [23]:
# Start from an empty list so re-running this cell cannot append duplicate rows
experiment_data = []

# Experiment-level fields copied from each JSON file (a missing key becomes None)
experiment_keys = (
    'run_id', 'timestamp', 'model_type', 'problem_type', 'dataset_name', 'grid_type',
    'has_outliers_removed', 'feature_engineering', 'scaler', 'variance_threshold',
    'duration', 'number_of_combinations',
)

for file in json_files:
    # Keep the file open only while parsing; read as UTF-8 rather than the platform default
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Experiment-level data
    experiment_info = {key: data.get(key) for key in experiment_keys}
    # Spread the metrics dict so each metric becomes its own column;
    # `or {}` also covers an explicit null, which could not be unpacked
    experiment_info.update(data.get('metrics') or {})
    experiment_data.append(experiment_info)

experiment_df = pd.DataFrame(experiment_data)


In [24]:
# Restrict the analysis to a single run; change RUN_ID to inspect a different one
RUN_ID = '20240701_193131'
experiment_df = experiment_df[experiment_df['run_id'] == RUN_ID]

In [38]:
# The 100 longest-running experiments, shown without the identifying columns
(
    experiment_df
    .sort_values(by='duration', ascending=False)
    .head(100)
    .drop(columns=['run_id', 'timestamp', 'dataset_name'])
)

Unnamed: 0,model_type,problem_type,grid_type,has_outliers_removed,feature_engineering,scaler,variance_threshold,duration,number_of_combinations,ROC AUC Score,...,Precision,Recall,F1 Score,MSE,MAPE,MAE,RMSE,RMSLE,MSLE,R2
1028,logistic_regression,multi_class_classification,grid_search,False,complex,StandardScaler,0.0,258.038657,24,0.759233,...,0.612978,0.556763,0.579632,,,,,,,
1059,lightgbm_classifier,multi_class_classification,grid_search,False,complex,StandardScaler,0.0,257.593349,27,0.803038,...,0.630288,0.650564,0.639671,,,,,,,
56,lightgbm_classifier,multi_class_classification,grid_search,True,simple,StandardScaler,0.0,254.979841,27,0.748072,...,0.575000,0.552263,0.562875,,,,,,,
40,lightgbm_classifier,multi_class_classification,grid_search,False,none,StandardScaler,0.0,254.970085,27,0.764470,...,0.615568,0.586957,0.599878,,,,,,,
1016,lightgbm_classifier,multi_class_classification,grid_search,False,complex,StandardScaler,0.0,254.363116,27,0.792608,...,0.644730,0.687697,0.659262,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,xgboost_classifier,binary_classification,grid_search,False,complex,StandardScaler,0.0,39.483082,27,0.706851,...,0.684524,0.502183,0.579345,,,,,,,
619,xgboost_regressor,regression,grid_search,False,complex,StandardScaler,0.0,38.862933,27,,...,,,,1.448754e+16,139.263043,4.623335e+07,1.203642e+08,2.388657,5.705683,0.627996
118,random_forest_regressor,regression,grid_search,False,complex,StandardScaler,0.0,38.831868,18,,...,,,,1.406808e+16,232.957081,4.684602e+07,1.186089e+08,2.632454,6.929812,0.638766
842,lightgbm_classifier,binary_classification,grid_search,True,none,StandardScaler,0.0,38.651599,27,0.697297,...,0.530120,0.594595,0.560510,,,,,,,


In [26]:
# Print the column dtypes and non-null counts of experiment_df
experiment_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 432 entries, 0 to 1162
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   run_id                  432 non-null    object 
 1   timestamp               432 non-null    object 
 2   model_type              432 non-null    object 
 3   problem_type            432 non-null    object 
 4   dataset_name            432 non-null    object 
 5   grid_type               432 non-null    object 
 6   has_outliers_removed    432 non-null    bool   
 7   feature_engineering     432 non-null    object 
 8   scaler                  432 non-null    object 
 9   variance_threshold      432 non-null    float64
 10  duration                432 non-null    float64
 11  number_of_combinations  432 non-null    int64  
 12  ROC AUC Score           288 non-null    float64
 13  Accuracy                288 non-null    float64
 14  Precision               288 non-null    float6

In [27]:
# Regression runs only, bucketed by every combination of dataset and preprocessing choices
group_keys = ['problem_type', 'dataset_name', 'has_outliers_removed', 'feature_engineering']
is_regression = experiment_df['problem_type'].eq('regression')
grouped = experiment_df.loc[is_regression].groupby(group_keys)

# Row label of the lowest-MAPE run inside each bucket
idx = grouped['MAPE'].idxmin()

# Pull those rows back out of the full frame
best_models = experiment_df.loc[idx]

# Keep the grouping keys plus the selected model, its scaler and its error metrics
result_reg = best_models.loc[:, group_keys + ['model_type', 'scaler', 'MAPE', 'MAE']]

In [28]:
# Print result_reg as a plain-text table without the index column
print(result_reg.to_string(index=False))

problem_type       dataset_name  has_outliers_removed feature_engineering    model_type         scaler     MAPE          MAE
  regression               full                 False             complex mlp_regressor StandardScaler 1.028038 7.571061e+07
  regression               full                 False                none mlp_regressor StandardScaler 1.028104 7.571061e+07
  regression               full                 False              simple mlp_regressor StandardScaler 1.028132 7.571060e+07
  regression               full                  True             complex mlp_regressor StandardScaler 0.986301 9.417614e+07
  regression               full                  True                none mlp_regressor StandardScaler 0.986277 9.417614e+07
  regression               full                  True              simple mlp_regressor StandardScaler 0.986277 9.417614e+07
  regression  large_productions                 False             complex mlp_regressor StandardScaler 0.999941 2.681107e+08


In [39]:
# Assuming the DataFrame is named experiment_df
grouped = experiment_df[experiment_df['problem_type'] != 'regression'].groupby(
    ['problem_type', 'dataset_name', 'has_outliers_removed', 'feature_engineering'])

# Get the index of the minimum MAPE in each group
idx = grouped['F1 Score'].idxmin()

# Select the rows that correspond to the minimum MAPE in each group
best_models = experiment_df.loc[idx]

# Display the desired columns
result_class = best_models[['problem_type', 'dataset_name', 'has_outliers_removed', 'feature_engineering','model_type', 'scaler', 'F1 Score']]

In [30]:
# Print result_class as a plain-text table without the index column
print(result_class.to_string(index=False))

              problem_type       dataset_name  has_outliers_removed feature_engineering               model_type         scaler  F1 Score
     binary_classification               full                 False             complex      logistic_regression   MinMaxScaler  0.576750
     binary_classification               full                 False             complex           mlp_classifier StandardScaler  0.515385
     binary_classification               full                 False                none      logistic_regression   MinMaxScaler  0.558894
     binary_classification               full                 False                none           mlp_classifier StandardScaler  0.501245
     binary_classification               full                 False              simple      logistic_regression   MinMaxScaler  0.555220
     binary_classification               full                 False              simple       xgboost_classifier StandardScaler  0.541427
     binary_classification        

In [31]:
# Write experiment_df to experiment_data.csv (relative path, so it is created in
# the current working directory); the index is not written
experiment_df.to_csv('experiment_data.csv', index=False)

In [32]:
import plotly.express as px

def plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(experiment_df, problem_type='binary_classification', metric='F1 Score', metric_agg='max', text_format='.3g'):
    """Show one grouped bar chart per dataset comparing a metric across preprocessing choices.

    The rows of ``experiment_df`` whose ``problem_type`` equals ``problem_type`` are
    grouped by ``dataset_name``, ``has_outliers_removed`` and ``feature_engineering``;
    ``metric`` is reduced with ``metric_agg`` inside each group. For every dataset a
    bar chart is shown with the feature-engineering type on the x axis and one bar
    colour per ``has_outliers_removed`` value.

    Parameters
    ----------
    experiment_df : pandas.DataFrame
        Must contain the columns ``problem_type``, ``dataset_name``,
        ``has_outliers_removed``, ``feature_engineering`` and ``metric``.
        The caller's frame is not modified.
    problem_type : str
        Value of the ``problem_type`` column to keep.
    metric : str
        Name of the column to aggregate and plot.
    metric_agg : str or callable
        Aggregation handed to ``DataFrameGroupBy.agg`` (e.g. ``'max'``, ``'min'``).
    text_format : str
        d3-format specifier for the value printed on each bar. The default
        ``'.3g'`` keeps fractional scores readable; pass ``'.2s'`` for
        SI-prefixed labels on very large values.

    Returns
    -------
    None
        Figures are displayed with ``fig.show()``; nothing is returned.
    """
    # Filter the data frame for the specified problem type
    runs = experiment_df[experiment_df['problem_type'] == problem_type]
    if runs.empty:
        # No runs of this problem type: nothing to aggregate or draw
        return

    # Define custom orders for plotting
    feature_engineering_order = ['none', 'simple', 'complex']
    has_outliers_removed_order = [False, True]

    # Group the data by dataset name, outliers removed, and feature engineering
    grouped_data = runs.groupby(['dataset_name', 'has_outliers_removed', 'feature_engineering']).agg({
        metric: metric_agg
    }).reset_index()

    # Iterate over each dataset and create a plot
    for dataset_name, data in grouped_data.groupby('dataset_name'):
        fig = px.bar(
            data,
            x='feature_engineering',
            y=metric,
            color='has_outliers_removed',
            text=metric,  # This places the text on the bars
            category_orders={
                'feature_engineering': feature_engineering_order,
                'has_outliers_removed': has_outliers_removed_order
            },
            title=f'{metric} for {dataset_name}',
            labels={
                'feature_engineering': 'Feature Engineering Type',
                'has_outliers_removed': 'Outliers Removed',
                metric: f'{metric} Value'
            },
            color_discrete_sequence=px.colors.qualitative.Set2,  # Custom color palette
            barmode='group'
        )

        # Bar labels: the previous '.2s' (SI prefix) format rendered a score of
        # 0.58 as '580m', so the format is now configurable with a readable default
        fig.update_traces(texttemplate=f'%{{text:{text_format}}}', textposition='inside')
        fig.update_layout(
            showlegend=True,
            legend_title_text='Outliers Removed'
        )

        fig.show()


In [33]:
# Binary classification: maximum F1 Score per feature-engineering / outlier setting, one chart per dataset
plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(experiment_df, problem_type='binary_classification', metric='F1 Score', metric_agg='max')


In [34]:
# Multi-class classification: maximum F1 Score per feature-engineering / outlier setting, one chart per dataset
plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(experiment_df, problem_type='multi_class_classification', metric='F1 Score', metric_agg='max')

In [35]:
# Regression: minimum MAPE per feature-engineering / outlier setting, one chart per dataset
plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(experiment_df, problem_type='regression', metric='MAPE', metric_agg='min')

In [36]:
# Regression: maximum R2 per feature-engineering / outlier setting, one chart per dataset
plot_one_metric_of_different_datasets_per_feature_engineering_outliers_with_plotly(experiment_df, problem_type='regression', metric='R2', metric_agg='max')