## Post processing
#### Create matrices of size num_models x num_folds containing the normalized log-likelihood, one for the train split and one for the test split

In [10]:
""" 
IMPORTS
"""
import json
import os
import numpy as np
from post_processing_utils import load_data, load_session_fold_lookup, \
    prepare_data_for_cv, calculate_baseline_test_ll, \
    calculate_glm_test_loglikelihood, calculate_cv_bit_trial, \
    return_glmhmm_nll, return_lapse_nll
from glm_hmm_utils import load_cluster_arr, load_session_fold_lookup, \
        load_animal_list, load_data, create_violation_mask, \
        launch_glm_hmm_job

In [11]:
""" 
PARAMETERS
"""
# Parameters
C = 2  # number of output classes
num_folds = 5  # number of folds
D = 1  # number of output dimensions
K_max = 5  # maximum number of latent states
num_models = K_max + 2  # model for each latent + 2 lapse models
num_bins = 10
Ks = [2, 3]

In [12]:
"""
Useful directories
"""
data_dir = '../../data/ibl/data_for_cluster/'
results_dir = '../../results/ibl_global_fit/'
data_dir = '/home/ines/repositories/representation_learning_variability/DATA/GLMHMM/data_for_cluster/'
data_dir = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/GLMHMM/data_for_cluster/'
processed_ibl_data_path = data_dir + "data_by_bin" + str(num_bins) + "global_normalization/"


In [13]:
"""
Loop through bins and animals
"""
# For every (bin, animal) pair that has a processed data file, fill two
# num_models x num_folds matrices (test and train splits) with the values
# returned by return_glmhmm_nll, remember the best initialization per
# (K, fold), and write everything under that animal/bin's results directory.
#
# Only the "GLM_HMM" entry of `models` is evaluated here; the rows of the
# matrices that belong to other models stay at zero.

# Get all data
animal_list = load_animal_list(processed_ibl_data_path + 'animal_list.npz')
# NOTE: the list loaded above is immediately replaced by this hand-picked subset.
animal_list = ['DY_013', 'DY_012']
bins_interest = [0, 5, 9]
models = ["GLM", "Lapse_Model", "GLM_HMM"]

# Loop through bins
# NOTE(review): `bin` shadows the builtin of the same name. The name is kept
# because it is a notebook-level global that later cells may read.
for bin in bins_interest:

    for animal in animal_list:
        print(animal)
        animal_file = processed_ibl_data_path + animal + '_bin_' + str(bin+1) + '_processed.npz'

        # Skip animals without data for this bin. Nothing is written for
        # them, so results from a previously processed animal can never be
        # saved under the wrong animal/bin directory.
        if not os.path.exists(animal_file):
            continue

        # Load animal data
        inpt, y, bin_data = load_data(animal_file)
        bin_fold_lookup_table = load_session_fold_lookup(
            processed_ibl_data_path + animal + '_' + str(bin+1) + '_bin_fold_lookup.npz')

        # Input matrix with a column of ones appended; identical for every
        # fold and K, so it is built once per animal/bin.
        inpt_with_ones = np.hstack((inpt, np.ones((len(inpt), 1))))

        final_results_dir = '../../results/ibl_individual_fit/' + animal + '_' + str(bin+1)

        cvbt_folds_model = np.zeros((num_models, num_folds))
        cvbt_train_folds_model = np.zeros((num_models, num_folds))

        # Save best initialization for each model-fold combination
        best_init_cvbt_dict = {}
        for fold in range(num_folds):

            test_inpt, test_y, test_nonviolation_mask, this_test_session, \
                train_inpt, train_y, train_nonviolation_mask, this_train_session, M, \
                n_test, n_train = prepare_data_for_cv(
                    inpt, y, bin_data, bin_fold_lookup_table, fold)

            # Ines adapted
            # Baseline values on the non-violation trials. They are not used
            # further down in this cell; the calls are kept unchanged.
            ll0 = calculate_baseline_test_ll(
                train_y[0, train_nonviolation_mask == 1],
                test_y[0, test_nonviolation_mask == 1], C)
            ll0_train = calculate_baseline_test_ll(
                train_y[0, train_nonviolation_mask == 1],
                train_y[0, train_nonviolation_mask == 1], C)

            # Check if there are choices of both types. Folds whose test
            # split contains a single choice type are skipped: their matrix
            # columns stay at zero and they get no dictionary entry.
            if len(np.unique(test_y[0, test_nonviolation_mask == 1])) <= 1:
                continue

            for model in models:
                if model != "GLM_HMM":
                    continue

                for K in Ks:
                    print("K = " + str(K))
                    # The same '/GLM_HMM_K_<K>/fold_<fold>' suffix serves as
                    # the dictionary key and as the tail of the fit directory.
                    key_for_dict = '/GLM_HMM_K_' + str(K) + '/fold_' + str(fold)
                    results_dir = final_results_dir + key_for_dict

                    # Row of the matrices for a GLM-HMM with K states.
                    model_idx = 3 + (K - 2)
                    (cvbt_folds_model[model_idx, fold],
                     cvbt_train_folds_model[model_idx, fold],
                     _, _, init_ordering_by_train) = return_glmhmm_nll(
                        inpt_with_ones, y, bin_data, bin_fold_lookup_table,
                        fold, K, D, C, results_dir)

                    # Save best initialization to dictionary for later:
                    best_init_cvbt_dict[key_for_dict] = int(
                        init_ordering_by_train[0])

        # Save best initialization directories folds and models per animal per bin
        # (only GLM-HMM). The output folder is created first so that neither
        # write can fail on a missing directory.
        glmhmm_dir = final_results_dir + '/GLMHMM/'
        os.makedirs(glmhmm_dir, exist_ok=True)
        with open(final_results_dir + "/best_init_cvbt_dict.json", "w") as f:
            json.dump(best_init_cvbt_dict, f)

        # Save cvbt_folds_model as numpy array for easy parsing across all
        # models and folds
        np.savez(glmhmm_dir + "cvbt_folds_model.npz", cvbt_folds_model)
        np.savez(glmhmm_dir + "cvbt_train_folds_model.npz",
                 cvbt_train_folds_model)
            

DY_013
K = 2
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_2/fold_0/
K = 3
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_3/fold_0/
K = 2
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_2/fold_1/
K = 3
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_3/fold_1/
K = 2
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_2/fold_2/
K = 3
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_3/fold_2/
K = 2
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_2/fold_3/
K = 3
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_3/fold_3/
K = 2
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_2/fold_4/
K = 3
../../results/ibl_individual_fit/DY_013_1/GLM_HMM_K_3/fold_4/
DY_012
K = 2
../../results/ibl_individual_fit/DY_012_1/GLM_HMM_K_2/fold_0/
K = 3
../../results/ibl_individual_fit/DY_012_1/GLM_HMM_K_3/fold_0/
K = 2
../../results/ibl_individual_fit/DY_012_1/GLM_HMM_K_2/fold_1/
K = 3
../../results/ibl_individual_fit/DY_012_1/GLM_HMM_K_3/fold_1/
K = 2
../../results/ibl_individual