In [None]:
import os

import pandas as pd
from pandas import DataFrame as DF

from mcbn.utils.helper import get_setup
from mcbn.utils.helper import dump_yaml
from mcbn.utils.helper import get_directories_in_path
from mcbn.utils.helper import get_logger
from mcbn.environment.constants import HYPERPARAMS_EVAL_PATH

In [None]:
# Obtain the logger from the project helper and announce the start of this step
logger = get_logger()

logger.info("STEP 3: Getting best hyperparameter choices")

In [None]:
def get_relative_path_of_trained_models(results_row, dataset_eval_path):
    """Return the path of a result's trained models, relative to the current working directory.

    The path is built as <dataset_eval_path>/<base model>/test_<test_index>/<model>,
    where the base model name is the model name with every 'MC' substring removed.

    Args:
        results_row: Any object exposing `model` (str) and `test_index` (number)
            attributes, e.g. a row produced by `DataFrame.itertuples`.
        dataset_eval_path: Directory holding the evaluation of a single dataset.

    Returns:
        The path as a string, relative to `os.getcwd()`.
    """
    test_dir_name = 'test_{}'.format(int(results_row.test_index))
    model_name = results_row.model
    base_model_name = model_name.replace('MC', '')
    abs_path = os.path.join(dataset_eval_path, base_model_name, test_dir_name, model_name)
    return os.path.relpath(abs_path, os.getcwd())


# Keep track of parsed datasets (only the first evaluation encountered per dataset is used)
parsed_datasets = []

# Per-dataset result frames; concatenated once after the scan instead of
# growing a dataframe inside the loop
dataset_frames = []

# Visit evaluations in reverse-sorted name order so the last one comes first
# (assumes evaluation directory names sort chronologically -- TODO confirm).
# Plain files that may sit next to the evaluation directories are ignored.
evaluation_dirs = sorted(
    (entry for entry in os.listdir(HYPERPARAMS_EVAL_PATH)
     if os.path.isdir(os.path.join(HYPERPARAMS_EVAL_PATH, entry))),
    reverse=True)

for eval_dir in evaluation_dirs:

    # Get all dataset-specific subdirs in evaluation dir
    # (presumably returned as bare directory names; verify against the helper)
    eval_path = os.path.join(HYPERPARAMS_EVAL_PATH, eval_dir)
    dataset_dirs = get_directories_in_path(eval_path)

    for dataset_name in dataset_dirs:

        # A later evaluation of this dataset has already been added to the results
        if dataset_name in parsed_datasets:
            continue

        dataset_eval_path = os.path.join(eval_path, dataset_name)
        results_file_path = os.path.join(dataset_eval_path, 'dataset-results.csv')

        # No results file yet (i.e. this evaluation may still be running)
        if not os.path.exists(results_file_path):
            continue

        # Load results; the first CSV column is the stored index, which is
        # discarded in favour of a clean positional index
        df = pd.read_csv(results_file_path, index_col=0).reset_index(drop=True)

        # For each result, add its test index: the position of the row among
        # the rows of the same model, in file order
        df['test_index'] = df.groupby('model').cumcount()

        # For each result, add the path to the trained models for all folds
        df['path'] = [
            get_relative_path_of_trained_models(row, dataset_eval_path)
            for row in df.itertuples(index=False)
        ]

        dataset_frames.append(df)

        # Mark dataset as added
        parsed_datasets.append(dataset_name)

# Store all results of parsed datasets in a single dataframe
# (None when no results file was found at all)
results_df = pd.concat(dataset_frames, ignore_index=True) if dataset_frames else None

In [None]:
# Lowest cv_rmse for every (dataset, model, batch size) combination
min_rmse_per_config = results_df.groupby(['dataset_name', 'model', 'batch_size'])['cv_rmse'].min()
logger.info(min_rmse_per_config)

In [None]:
# For every (dataset, model) pair keep the rows whose cv_rmse equals the pair's
# minimum; rows that tie for the minimum are all kept.
# The string alias 'min' is used instead of the Python builtin, which recent
# pandas versions no longer want to receive as a groupby callable.
group_min_rmse = results_df.groupby(['dataset_name', 'model']).cv_rmse.transform('min')
idx = group_min_rmse == results_df.cv_rmse
best_results_df = results_df[idx]
logger.info(best_results_df)

In [None]:
# Summarize best results in a dict to be dumped as yml

# Columns copied into each per-model entry (null values are left out)
config_dict_keys = ['batch_size', 'lambda', 'dropout', 'cv_rmse', 'path', 'cv_epoch']

# Distinct datasets / models in order of first appearance, so the layout of
# the resulting dict is the same on every run
parsed_datasets = best_results_df.dataset_name.unique().tolist()
parsed_models = best_results_df.model.unique().tolist()

# Every dataset gets an entry for every model; pairs without a result stay empty
best_results_dict = {d: {m: {} for m in parsed_models} for d in parsed_datasets}

# If several rows share a (dataset, model) pair, the last one wins
for _, row in best_results_df.iterrows():
    best_results_dict[row.dataset_name][row.model] = {
        k: row[k] for k in config_dict_keys if not pd.isnull(row[k])
    }

dump_yaml(best_results_dict, os.getcwd(), 'grid_search_results.yml')
logger.info("DONE STEP 3")