## Post processing
#### For each bin, create matrices of size num_models x num_folds containing the normalized log-likelihood for the train and test splits

In [1]:
""" 
IMPORTS
"""
import json
import os

import numpy as np

from post_processing_utils import load_data, load_session_fold_lookup, \
    prepare_data_for_cv, calculate_baseline_test_ll, \
    calculate_glm_test_loglikelihood, calculate_cv_bit_trial, \
    return_glmhmm_nll, return_lapse_nll

In [None]:
"""
PARAMETERS: model-comparison settings shared by the cells below.
"""
# Cross-validation and binning
num_folds = 5  # cross-validation folds
num_bins = 10

# Observation model
C = 2  # output classes
D = 1  # output dimensions

# Model family: one model per latent-state count, plus 2 lapse models
K_max = 5  # largest number of latent states considered
num_models = 2 + K_max

In [None]:
"""
Useful directories
"""
# Root of the binned data. This is a machine-specific absolute path; the
# repository-relative alternative is '../../data/ibl/data_for_cluster/'.
data_dir = '/home/ines/repositories/representation_learning_variability/DATA/GLMHMM/data_for_cluster/'
#data_dir = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/GLMHMM/data_for_cluster/'

# Folder holding the concatenated data for the chosen number of bins.
processed_ibl_data_path = data_dir + "data_by_bin" + str(num_bins) + "global_normalization/"

# Base directory for the global-fit results, with a trailing slash because
# the saving code appends file names directly to it.
# A former follow-up assignment appended 'GLM_HMM_K_' + str(K) here, but K is
# only bound later as a loop variable, so it raised NameError on a fresh
# kernel. K is passed to return_glmhmm_nll separately; presumably that helper
# resolves the per-K sub-directory itself -- TODO confirm.
results_dir = '../../results/ibl_global_fit/'


In [None]:
"""
Loop through bins and folds
"""
# For every bin this cell fills two (num_models x num_folds) arrays with the
# first two values returned by return_glmhmm_nll, records the first entry of
# the returned initialization ordering for each (K, fold) pair, and writes
# the results into results_dir.

# Get all data.
# NOTE: a call to load_animal_list(...) used to sit here. That name is never
# imported and its result was never used, so the call could only raise
# NameError; it has been removed.
animal_file = processed_ibl_data_path + 'all_animals_concat.npz'
inpt, y, bin_data = load_data(animal_file)
bin_fold_lookup_table = load_session_fold_lookup(
    processed_ibl_data_path + 'all_animals_concat_session_fold_lookup.npz')

# Loop through bins.
# NOTE(review): `bin` shadows the builtin and `b` always equals `bin`; both
# names are left unchanged in case other cells read them after the loop.
for b, bin in enumerate(range(num_bins)):

    # Subset of trials of interest for bin
    bin_idx = np.where(bin_data == bin)
    bin_y = y[bin_idx]
    bin_bin_fold_lookup_table = bin_fold_lookup_table[bin_idx]
    bin_inpt = inpt[bin_idx]
    bin_bin_data = bin_data[bin_idx]

    bin_preferred_model_dict = {}
    # Only the "GLM_HMM" entry is evaluated below; the rows reserved for the
    # other models stay at zero in this cell.
    models = ["GLM", "Lapse_Model", "GLM_HMM"]

    cvbt_folds_model = np.zeros((num_models, num_folds))
    cvbt_train_folds_model = np.zeros((num_models, num_folds))

    # Save best initialization for each model-fold combination
    best_init_cvbt_dict = {}
    for fold in range(num_folds):

        test_inpt, test_y, test_nonviolation_mask, this_test_session, \
        train_inpt, train_y, train_nonviolation_mask, this_train_session, M,\
        n_test, n_train = prepare_data_for_cv(
            bin_inpt, bin_y, bin_bin_data, bin_bin_fold_lookup_table, fold)

        # Ines adapted
        # NOTE(review): ll0 and ll0_train are computed but not used further
        # in this cell.
        ll0 = calculate_baseline_test_ll(
            train_y[0, train_nonviolation_mask == 1],
            test_y[0, test_nonviolation_mask == 1], C)
        ll0_train = calculate_baseline_test_ll(
            train_y[0, train_nonviolation_mask == 1],
            train_y[0, train_nonviolation_mask == 1], C)

        # Skip folds whose non-violation test trials contain a single choice
        # type only.
        if len(np.unique(test_y[0, test_nonviolation_mask == 1])) <= 1:
            continue

        for model in models:
            if model != "GLM_HMM":
                continue
            for K in range(2, K_max + 1):
                print("K = " + str(K))
                # Rows 3 .. num_models - 1 hold K = 2 .. K_max.
                model_idx = 3 + (K - 2)
                # Fix: pass the bin-restricted labels (bin_y). The full `y`
                # does not match the bin-restricted inputs, bin ids and fold
                # lookup table passed alongside it.
                cvbt_folds_model[model_idx, fold], \
                cvbt_train_folds_model[
                    model_idx, fold], _, _, init_ordering_by_train = \
                    return_glmhmm_nll(
                        np.hstack((bin_inpt, np.ones((len(bin_inpt), 1)))),
                        bin_y,
                        bin_bin_data, bin_bin_fold_lookup_table, fold,
                        K, D, C, results_dir)
                # Save best initialization to dictionary for later:
                key_for_dict = '/GLM_HMM_K_' + str(K) + '/fold_' + str(
                    fold)
                best_init_cvbt_dict[key_for_dict] = int(
                    init_ordering_by_train[0])

    # Save best initialization directories across folds and models
    # (only GLM-HMM):
    print(cvbt_folds_model)
    print(cvbt_train_folds_model)
    json_dump = json.dumps(best_init_cvbt_dict)
    # NOTE(review): the same file name is written on every bin iteration and
    # the dictionary keys carry no bin id, so only the last bin's dictionary
    # remains on disk -- confirm whether a per-bin file is intended.
    # The context manager closes the file even if the write fails.
    with open(os.path.join(results_dir, "best_init_cvbt_dict.json"), "w") as f:
        f.write(json_dump)
    # Save cvbt_folds_model as numpy array for easy parsing across all
    # models and folds. os.path.join keeps the files inside results_dir
    # whether or not results_dir ends with a path separator.
    np.savez(
        os.path.join(results_dir,
                     'bin_' + str(bin+1) + "_cvbt_folds_model.npz"),
        cvbt_folds_model)
    np.savez(
        os.path.join(results_dir,
                     'bin_' + str(bin+1) + "_cvbt_train_folds_model.npz"),
        cvbt_train_folds_model)
            