### Create design matrix with IBL mice
#### Normalize data across all animals and all bins

In [1]:
"""
IMPORTS
"""
import numpy as np
from oneibl.onelight import ONE
import numpy.random as npr
import json
from sklearn import preprocessing
from collections import defaultdict
import wget
from zipfile import ZipFile
import os
import pandas as pd
from preprocessing_utils_session_ines import get_animal_name, load_animal_list, load_animal_eid_dict, \
    get_all_unnormalized_data_this_session, create_train_test_sessions

# Fix the seed of NumPy's global random generator so that any step drawing
# from numpy.random gives the same result on every run.
# NOTE(review): presumably the fold assignment in create_train_test_sessions
# is the consumer of this seed - confirm against that helper.
npr.seed(65)

In [2]:
"""
Load data
"""
# Root folder of the partially processed GLM-HMM data.
# Alternative locations that have been used for this folder:
#data_dir = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/GLMHMM/'
#data_dir = '/home/ines/repositories/learning_variability/HMMs/Zoes_model/data/ibl/partially_processed/'
data_dir = '/home/ines/repositories/representation_learning_variability/DATA/GLMHMM/'

# Outputs of the partial-processing step: the list of animals and, for each
# animal, the collection of its sessions (its length is used below as the
# animal's session count).
animal_list_file = data_dir + 'animal_list_prof.npz'
animal_eid_dict_file = data_dir + 'animal_eid_dict_prof.json'

animal_list = load_animal_list(animal_list_file)
animal_eid_dict = load_animal_eid_dict(animal_eid_dict_file)


In [3]:
# Number of animals loaded, before the minimum-session filter is applied.
len(animal_list)

288

In [4]:
"""
Create folders to save processed data
"""
#data_dir = '/home/ines/repositories/learning_variability/HMMs/Zoes_model/data/ibl-behavioral-data-Dec-2019/'
# Output tree for the processed data:
#   <data_dir>data_for_cluster/                 files combined across animals
#   <data_dir>data_for_cluster/data_by_animal/  one set of files per animal
processed_ibl_data_path = data_dir + "data_for_cluster/"
# os.makedirs creates missing parent folders, so a single call builds both
# levels; exist_ok=True makes re-running the cell a no-op and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(processed_ibl_data_path + "data_by_animal", exist_ok=True)

In [5]:
# Require that each animal has at least `req_num_sessions` sessions of data;
# animals with fewer sessions are dropped from `animal_list`.
req_num_sessions = 30  # 30*90 = 2700 trials (per the original note)

# Session count of every animal, in the same order as `animal_list`.
sessions_per_animal = np.array(
    [len(animal_eid_dict[animal]) for animal in animal_list], dtype=int)
has_enough_sessions = sessions_per_animal >= req_num_sessions

# Total number of sessions over the animals that are kept.
total_sessions = int(sessions_per_animal[has_enough_sessions].sum())

# One boolean selection instead of calling np.delete once per rejected animal;
# the order of the remaining animals is unchanged.
animal_list = np.asarray(animal_list)[has_enough_sessions]


In [8]:
# Folder holding the trial-level table (alternative location kept below).
#trials_dir = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/'
trials_dir = '/home/ines/repositories/representation_learning_variability/DATA/'

# One table with the trials of all animals; the cells below select rows from
# it through its 'subject_nickname' and 'session' columns.
trials_csv_path = trials_dir + "proficient_data.csv"
all_trials = pd.read_csv(trials_csv_path)


In [10]:
# WORKHORSE: build the design matrix per animal and across all animals.
#
# For every animal, the unnormalized inputs, choices, session labels and
# reward indicators of each accepted session are concatenated, written to
# data_by_animal/, and appended to the cross-animal "master" arrays.  Once all
# animals are processed, the first input column is standardized across all
# animals and the combined and per-animal normalized data are written out.

# Idx in master array where each animal's data starts and ends (inclusive):
animal_start_idx = {}
animal_end_idx = {}

# Animal name, recorded once per session rejected by the violation filter:
missing_data = []

# Sessions that were actually kept for each animal:
final_animal_eid_dict = defaultdict(list)

# A session is only used when its `num_viols_50` (violation count returned by
# get_all_unnormalized_data_this_session) is below this value:
max_viols_per_session = 10


def _join_parts(parts, join):
    """Join a non-empty list of arrays with `join` (np.vstack / np.concatenate).

    A single array is returned unchanged, so its shape is never altered; this
    matches what growing the array pair by pair would produce, without
    re-copying the accumulated data for every new part.
    """
    return parts[0] if len(parts) == 1 else join(parts)


# Cross-animal accumulators (joined once, after the loop):
master_inpt_parts = []
master_y_parts = []
master_session_parts = []
master_fold_lookup_parts = []
master_rewarded_parts = []
num_master_rows = 0
# True for every entry of animal_list whose data was written:
animal_has_data = np.zeros(len(animal_list), dtype=bool)

# Loop through animals
for z, animal in enumerate(animal_list):

    # Select this animal's trials once instead of once per session:
    animal_trials = all_trials.loc[all_trials['subject_nickname'] == animal]
    animal_sessions = np.unique(animal_trials['session'])

    inpt_parts = []
    y_parts = []
    session_parts = []
    rewarded_parts = []

    # Loop through this animal's sessions
    for eid in animal_sessions:
        session_data = animal_trials.loc[animal_trials['session'] == eid]
        # NOTE(review): the helper also returns the animal name, which rebinds
        # `animal` exactly as before; presumably it is the name passed in -
        # confirm against preprocessing_utils_session_ines.
        animal, unnormalized_inpt, y, session, num_viols_50, rewarded = \
            get_all_unnormalized_data_this_session(animal, session_data)

        # Written as "not (<)" so that a NaN count is rejected, as before.
        if not (num_viols_50 < max_viols_per_session):
            print(str(animal + '_missing' + str(num_viols_50)))
            missing_data.append(animal)
            continue

        inpt_parts.append(unnormalized_inpt)
        y_parts.append(y)
        session_parts.append(session)
        rewarded_parts.append(rewarded)
        final_animal_eid_dict[animal].append(eid)

        ## Optional sanity check: psychometric curve of this session.
        # import matplotlib.pyplot as plt
        # plt.figure()
        # x_stim = np.unique(unnormalized_inpt[:, 0])
        # y_stim = np.zeros(len(x_stim))
        # for s, stimulus in enumerate(x_stim):
        #     y_stim[s] = np.nanmean(y[np.where(unnormalized_inpt[:, 0]==stimulus), 0])
        # plt.scatter(x_stim, y_stim)
        # plt.title('Session' + str(eid))
        # plt.ylim([0,1])

    if not inpt_parts:
        # No session passed the filter.  Skip the animal: previously the
        # arrays of the preceding animal were silently saved under this
        # animal's name (or a NameError was raised for the first animal).
        print(str(animal) + '_skipped_no_valid_sessions')
        continue
    animal_has_data[z] = True

    # Concatenate the accepted sessions of this animal:
    animal_unnormalized_inpt = _join_parts(inpt_parts, np.vstack)
    animal_y = _join_parts(y_parts, np.vstack)
    animal_session = _join_parts(session_parts, np.concatenate)
    animal_rewarded = _join_parts(rewarded_parts, np.vstack)
    assert animal_rewarded.shape[0] == animal_y.shape[0]

    # Write out animal's unnormalized data matrix (all sessions concatenated):
    animal_file_prefix = processed_ibl_data_path + 'data_by_animal/' + animal
    np.savez(animal_file_prefix + '_unnormalized.npz',
             animal_unnormalized_inpt, animal_y, animal_session)
    np.savez(animal_file_prefix + '_rewarded.npz', animal_rewarded)
    animal_session_fold_lookup = create_train_test_sessions(
        animal_session, num_folds=5)
    np.savez(animal_file_prefix + "_session_fold_lookup" + ".npz",
             animal_session_fold_lookup)

    # Record where this animal's rows will sit in the master arrays:
    animal_start_idx[animal] = num_master_rows
    num_master_rows += animal_unnormalized_inpt.shape[0]
    animal_end_idx[animal] = num_master_rows - 1

    master_inpt_parts.append(animal_unnormalized_inpt)
    master_y_parts.append(animal_y)
    master_session_parts.append(animal_session)
    master_fold_lookup_parts.append(animal_session_fold_lookup)
    master_rewarded_parts.append(animal_rewarded)

# Everything below runs once, after all animals were processed.  It used to
# sit inside the loop above, which rewrote every cross-animal file and every
# per-animal normalized file on each iteration; the files left on disk after
# the last iteration were the same as the ones written here.
if not master_inpt_parts:
    print('no_animal_with_valid_sessions')
else:
    master_inpt = _join_parts(master_inpt_parts, np.vstack)
    master_y = _join_parts(master_y_parts, np.vstack)
    master_session = _join_parts(master_session_parts, np.concatenate)
    # Per-animal session/fold tables are stacked row-wise (vstack):
    master_session_fold_lookup_table = _join_parts(
        master_fold_lookup_parts, np.vstack)
    master_rewarded = _join_parts(master_rewarded_parts, np.vstack)

    # Write out data from across animals
    assert np.shape(master_inpt)[0] == np.shape(master_y)[
        0], "inpt and y not same length"
    assert np.shape(master_rewarded)[0] == np.shape(master_y)[
        0], "rewarded and y not same length"
    assert len(np.unique(master_session)) == \
           np.shape(master_session_fold_lookup_table)[
               0], "number of unique sessions and session fold lookup don't " \
                   "match"
    # The dataset-size checks of Ashwood et al. (2020) (181530 rows in the
    # design matrix, 37 animals) are left disabled here.

    # Standardize the first input column across all animals; the other
    # columns are left as they are.
    normalized_inpt = np.copy(master_inpt)
    normalized_inpt[:, 0] = preprocessing.scale(normalized_inpt[:, 0])

    np.savez(processed_ibl_data_path + 'all_animals_concat' + '.npz',
             normalized_inpt,
             master_y, master_session)
    np.savez(
        processed_ibl_data_path + 'all_animals_concat_unnormalized' + '.npz',
        master_inpt, master_y, master_session)
    np.savez(
        processed_ibl_data_path + 'all_animals_concat_session_fold_lookup' +
        '.npz',
        master_session_fold_lookup_table)
    np.savez(processed_ibl_data_path + 'all_animals_concat_rewarded' + '.npz',
             master_rewarded)
    # Only list animals whose files exist; identical to animal_list when no
    # animal was skipped.
    np.savez(processed_ibl_data_path + 'data_by_animal/' + 'animal_list.npz',
             np.asarray(animal_list)[animal_has_data])

    # The context manager closes the file even if writing fails, and the
    # `json` module is no longer overwritten by the serialized string.
    with open(processed_ibl_data_path + "final_animal_eid_dict.json", "w") as f:
        f.write(json.dumps(final_animal_eid_dict))

    # Now write out normalized data (when normalized across all animals) for
    # each animal:
    counter = 0
    for animal in animal_start_idx:
        animal_rows = slice(animal_start_idx[animal],
                            animal_end_idx[animal] + 1)
        inpt = normalized_inpt[animal_rows]
        y = master_y[animal_rows]
        session = master_session[animal_rows]
        counter += inpt.shape[0]
        np.savez(processed_ibl_data_path + 'data_by_animal/' + animal + '_processed.npz',
                 inpt, y,
                 session)

    # Every master row must belong to exactly one animal:
    assert counter == master_inpt.shape[0]

CSHL054_missing13
CSHL054_missing10
CSHL054_missing35
CSHL054_missing16
CSHL054_missing21
CSHL_001_missing13
CSHL_002_missing17
CSHL_003_missing10
CSHL_003_missing15
CSHL_007_missing21
CSHL_007_missing10
CSHL_007_missing14
CSHL_007_missing34
CSHL_007_missing16
CSHL_007_missing14
CSHL_007_missing17
CSHL_007_missing12
CSHL_007_missing15
CSHL_007_missing22
CSHL_007_missing12
CSHL_007_missing36
CSHL_007_missing23
CSHL_007_missing10
CSHL_007_missing13
CSHL_007_missing17
CSHL_008_missing20
CSHL_008_missing11
CSHL_008_missing12
CSHL_010_missing13
CSHL_010_missing16
CSHL_012_missing26
CSHL_012_missing38
CSHL_012_missing14
CSHL_012_missing13
CSHL_012_missing32
CSHL_012_missing27
CSHL_012_missing57
CSHL_012_missing30
CSHL_012_missing29
CSHL_014_missing11
CSHL_014_missing18
CSHL_014_missing10
CSHL_014_missing19
CSHL_014_missing15
CSHL_014_missing12
CSHL_014_missing13
CSHL_015_missing11
CSHL_015_missing20
CSHL_015_missing27
CSHL_015_missing18
CSHL_015_missing32
CSHL_015_missing15
CSHL_015_missing4