### Create design matrix with IBL mice
#### Normalize data across all animals and all bins

In [1]:
"""
IMPORTS
"""
import numpy as np
from oneibl.onelight import ONE
import numpy.random as npr
import json
from sklearn import preprocessing
from collections import defaultdict
import wget
from zipfile import ZipFile
import os
import pandas as pd
from preprocessing_utils_session_ines import get_animal_name, load_animal_list, load_animal_eid_dict, \
    get_all_unnormalized_data_this_session, create_train_test_sessions, bin_frac

npr.seed(65)

In [2]:
"""
Load data
"""
# Number of bins each animal's trials are split into (passed to bin_frac and
# used in output directory / file names further down the notebook).
bin_num = 10

# Known locations of the GLM-HMM data directory, one per machine this notebook
# is run on. The first one that exists is used; if none exists we fall back to
# the first entry, which is the path the notebook previously hard-coded last.
data_dir_candidates = [
    '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/GLMHMM/',
    '/home/ines/repositories/representation_learning_variability/DATA/GLMHMM/',
]
data_dir = next(
    (candidate for candidate in data_dir_candidates if os.path.isdir(candidate)),
    data_dir_candidates[0])
# Older location, kept for reference:
#data_dir = '/home/ines/repositories/learning_variability/HMMs/Zoes_model/data/ibl/partially_processed/'

# Load animal list/results of partial processing:
animal_list = load_animal_list(
    data_dir + 'animal_list.npz')
animal_eid_dict = load_animal_eid_dict(
    data_dir + 'animal_eid_dict.json')


In [3]:
"""
Create folders to save processed data
"""
#data_dir = '/home/ines/repositories/learning_variability/HMMs/Zoes_model/data/ibl-behavioral-data-Dec-2019/'
# Create directories for saving data. exist_ok=True makes the calls idempotent
# (re-running the cell is harmless) and avoids the check-then-create race of
# a separate os.path.exists() test.
processed_ibl_data_path = data_dir + "data_for_cluster/"
os.makedirs(processed_ibl_data_path, exist_ok=True)
# Also create a subdirectory for storing each animal's bin data:
os.makedirs(
    processed_ibl_data_path + "data_by_bin" + str(bin_num) + "global_normalization/",
    exist_ok=True)

In [4]:
# Keep only animals that have at least `req_num_sessions` entries in
# animal_eid_dict; every other animal is dropped from animal_list.
# NOTE(review): the original comment equated the threshold with 2700 trials
# ("30*90 = 2700"), which does not match a threshold of 10 - TODO confirm
# which figure is intended.
req_num_sessions = 10  # 30*90 = 2700
has_enough_sessions = np.array(
    [len(animal_eid_dict[name]) >= req_num_sessions for name in animal_list],
    dtype=bool)
# Boolean-mask selection; assumes animal_list is a NumPy array (as the
# original `animal_list == animal` comparison already did).
animal_list = animal_list[has_enough_sessions]

In [5]:
#all_trials.to_csv(trials_dir + "learning_private2_one.csv")

In [6]:
# Known locations of the trials table, one per machine this notebook is run
# on. The first existing directory wins; if none exists we fall back to the
# first entry, which is the path the notebook previously hard-coded last.
trials_dir_candidates = [
    '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/',
    '/home/ines/repositories/representation_learning_variability/DATA/',
]
trials_dir = next(
    (candidate for candidate in trials_dir_candidates if os.path.isdir(candidate)),
    trials_dir_candidates[0])

# Read the trials table and pass it through bin_frac together with the
# number of bins; the result is what the processing loop below consumes.
all_trials = pd.read_csv(trials_dir + "learning_private_one.csv")
all_trials = bin_frac(all_trials, bin_num)

In [12]:
# Identify idx in master array where each animal-bin's data starts and ends
# (both inclusive), keyed by "<animal>_bin_<bin number, 1-based>":
animal_start_idx = {}
animal_end_idx = {}

# animal -> 1-based bin number (as a string) of a bin that was skipped.
# NOTE(review): a later skipped bin overwrites an earlier one for the same
# animal, so only the last skipped bin per animal is retained - confirm this
# is intended.
missing_data = {}

# animal -> list of 1-based bin numbers (as strings) that were kept.
final_animal_eid_dict = defaultdict(list)

# Directory that receives every per-animal / per-bin file written below.
bin_out_dir = (processed_ibl_data_path + 'data_by_bin' + str(bin_num)
               + "global_normalization/")

# Master arrays across all animals and all bins. They start out unset and are
# created from the first animal-bin that passes the filter below. (Keying the
# initialisation on "first animal of first bin" would fail with a NameError
# whenever that particular animal-bin is skipped.)
master_inpt = None
master_y = None
master_bin = None
master_bin_fold_lookup_table = None
master_rewarded = None

# WORKHORSE: iterate through each bin and each animal; obtain unnormalized
# data.  Write out each animal-bin's data and then also append it to the
# master arrays.
for b in range(bin_num):

    for animal in animal_list:
        animal, unnormalized_inpt, y, bin_arr, num_viols_50, rewarded = \
            get_all_unnormalized_data_this_session(
                animal, b, all_trials)

        # Only include this animal-bin if the helper reports fewer than 100
        # for num_viols_50; otherwise nothing is written or appended.
        if num_viols_50 >= 100:
            # The printed label uses the 0-based bin index, whereas
            # missing_data and the file names use the 1-based bin number.
            print(str(animal + '_' + str(b) + '_missing'))
            missing_data[animal] = str(b + 1)
            continue

        animal_unnormalized_inpt = np.copy(unnormalized_inpt)
        animal_y = np.copy(y)
        animal_bin = bin_arr
        animal_rewarded = np.copy(rewarded)
        final_animal_eid_dict[animal].append(str(b + 1))

        # (Optional sanity check when debugging: for each unique value of
        # unnormalized_inpt[:, 0], scatter-plot the nan-mean of the matching
        # rows of y to inspect the psychometric curve of this bin.)

        # Write out bin's unnormalized data matrix:
        file_stem = bin_out_dir + animal + '_' + str(b + 1)
        np.savez(
            file_stem + '_unnormalized.npz',
            animal_unnormalized_inpt, animal_y,
            animal_bin)
        np.savez(
            file_stem + '_rewarded.npz',
            animal_rewarded)
        bin_shuffled_folds = create_train_test_sessions(animal_bin, num_folds=5)
        np.savez(
            file_stem + "_bin_fold_lookup" + ".npz",
            bin_shuffled_folds)

        assert animal_rewarded.shape[0] == animal_y.shape[0]
        assert len(animal_bin) == len(bin_shuffled_folds)

        # Now create or append data to master array across all animals across all bins:
        key = animal + '_bin_' + str(b + 1)
        if master_inpt is None:
            # First animal-bin that passed the filter: start the master arrays.
            master_inpt = np.copy(animal_unnormalized_inpt)
            animal_start_idx[key] = 0
            animal_end_idx[key] = master_inpt.shape[0] - 1
            master_y = np.copy(animal_y)
            master_bin = animal_bin
            master_bin_fold_lookup_table = bin_shuffled_folds
            master_rewarded = np.copy(animal_rewarded)
        else:
            # Rows of this animal-bin occupy [start, end] after stacking.
            animal_start_idx[key] = master_inpt.shape[0]
            master_inpt = np.vstack((master_inpt, animal_unnormalized_inpt))
            animal_end_idx[key] = master_inpt.shape[0] - 1
            master_y = np.vstack((master_y, animal_y))
            master_bin = np.concatenate((master_bin, animal_bin))
            master_bin_fold_lookup_table = np.hstack(
                (master_bin_fold_lookup_table, bin_shuffled_folds))  # Changed vstack into hstack
            master_rewarded = np.vstack((master_rewarded, animal_rewarded))

if master_inpt is None:
    raise RuntimeError(
        "No animal/bin passed the violation filter; there is no data to "
        "concatenate")

# After looping through all bins and animals, check that the master arrays
# are mutually consistent before anything is written out:
assert np.shape(master_inpt)[0] == np.shape(master_y)[
    0], "inpt and y not same length"
assert np.shape(master_rewarded)[0] == np.shape(master_y)[
    0], "rewarded and y not same length"
assert len(master_bin) == \
        np.shape(master_bin_fold_lookup_table)[
    0], "number of unique sessions and session fold lookup don't " \
                "match"


# Directory that receives the concatenated and per-key processed files.
bin_out_dir = (processed_ibl_data_path + 'data_by_bin' + str(bin_num)
               + "global_normalization/")

# Normalize across animals: only column 0 of the design matrix is rescaled
# (with sklearn's preprocessing.scale), using all rows of all animals and bins.
normalized_inpt = np.copy(master_inpt)
normalized_inpt[:, 0] = preprocessing.scale(normalized_inpt[:, 0])
np.savez(bin_out_dir + 'all_animals_concat' + '.npz',
         normalized_inpt,
         master_y, master_bin)
np.savez(
    bin_out_dir + 'all_animals_concat_unnormalized' + '.npz',
    master_inpt, master_y, master_bin)
np.savez(
    bin_out_dir + 'all_animals_concat_session_fold_lookup' +
    '.npz',
    master_bin_fold_lookup_table)
np.savez(bin_out_dir + 'all_animals_concat_rewarded' + '.npz',
         master_rewarded)
np.savez(bin_out_dir + 'animal_list.npz',
         animal_list)

# Re-import so that `json` is the module even if an earlier run of this
# notebook rebound the name; the serialized text gets its own name so the
# module is not shadowed for later cells.
import json
final_animal_eid_json = json.dumps(final_animal_eid_dict)
# NOTE(review): this path has no "global_normalization/" component, so the
# file is written next to (not inside) the directory used above. Left
# unchanged because other code may read it from this location - confirm.
with open(processed_ibl_data_path + 'data_by_bin' + str(bin_num) + "final_animal_eid_dict" + ".json", "w") as f:
    f.write(final_animal_eid_json)

# Now write out normalized data (when normalized across all animals) for
# each animal-bin key:
counter = 0
for key in animal_start_idx:
    start_idx = animal_start_idx[key]
    end_idx = animal_end_idx[key]
    # end_idx is inclusive, hence the +1 on the slice bound.
    inpt = normalized_inpt[start_idx:end_idx + 1]
    y = master_y[start_idx:end_idx + 1]
    binn = master_bin[start_idx:end_idx + 1]
    counter += inpt.shape[0]
    np.savez(bin_out_dir + key + '_processed.npz',
             inpt, y,
             binn)

# Every row of the master array must have been written exactly once.
assert counter == master_inpt.shape[0]


CSH_ZAD_024 0
CSH_ZAD_023 0
CSH_ZAD_021 0
CSH_ZAD_019 0
CSH_ZAD_018 0
CSH_ZAD_017 0
CSH_ZAD_016 0
CSH_ZAD_015 0
NYU-21 0
CSH_ZAD_022 0
DY_015 0
DY_012 0
CSHL058 0
DY_014 0
DY_013 0
CSHL060 0
DY_009 0
DY_008 0
CSHL059 0
CSHL046 0
NYU-20 0
CSHL052 0
CSHL045 0
CSHL055 0
NYU-11 0
NYU-11_0_missing
NYU-14 0
CSHL053 0
CSHL047 0
CSHL054 0
CSHL051 0
NYU-13 0
NYU-12 0
NYU-09 0
CSHL049 0
NYU-07 0
CSH_ZAD_009 0
CSHL_011 0
CSH_ZAD_006 0
CSH_ZAD_005 0
CSH_ZAD_001 0
CSH_ZAD_004 0
CSH_ZAD_003 0
CSH_ZAD_002 0
CSHL_014 0
CSHL_013 0
CSHL_012 0
SWC_001 0
CSHL_015 0
IBL_001 0
CSH_ZAD_024 1
CSH_ZAD_023 1
CSH_ZAD_021 1
CSH_ZAD_019 1
CSH_ZAD_018 1
CSH_ZAD_017 1
CSH_ZAD_016 1
CSH_ZAD_015 1
NYU-21 1
CSH_ZAD_022 1
DY_015 1
DY_012 1
CSHL058 1
DY_014 1
DY_013 1
CSHL060 1
DY_009 1
DY_008 1
CSHL059 1
CSHL046 1
NYU-20 1
CSHL052 1
CSHL045 1
CSHL055 1
NYU-11 1
NYU-11_1_missing
NYU-14 1
CSHL053 1
CSHL047 1
CSHL054 1
CSHL051 1
NYU-13 1
NYU-12 1
NYU-09 1
CSHL049 1
NYU-07 1
CSH_ZAD_009 1
CSHL_011 1
CSH_ZAD_006 1
CSH_ZAD_00