### Create design matrix with IBL mice
#### Normalize data across all animals and all bins

In [1]:
"""
IMPORTS
"""
import numpy as np
from oneibl.onelight import ONE
import numpy.random as npr
import json
from sklearn import preprocessing
from collections import defaultdict
import wget
from zipfile import ZipFile
import os
import pandas as pd
from preprocessing_utils import get_animal_name, load_animal_list, load_animal_eid_dict, \
    get_all_unnormalized_data_this_session, create_train_test_sessions

# Seed NumPy's global random number generator so that any numpy.random
# draws made later in this notebook are repeatable from run to run.
npr.seed(65)

In [2]:
"""
Load the animal list and the animal -> session-eid mapping
"""
# Machine-specific root folder of the partially processed data; the
# commented-out lines are the same folder on other machines/layouts.
data_dir = '/home/ines/repositories/representation_learning_variability/DATA/GLMHMM/'
#data_dir = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/GLMHMM/'
#data_dir = '/home/ines/repositories/learning_variability/HMMs/Zoes_model/data/ibl/partially_processed/'

# Results of the partial-processing step, both stored directly in data_dir:
animal_list_file = data_dir + 'animal_list.npz'
animal_eid_dict_file = data_dir + 'animal_eid_dict.json'

animal_list = load_animal_list(animal_list_file)
animal_eid_dict = load_animal_eid_dict(animal_eid_dict_file)


In [3]:
"""
Create folders to save processed data
"""
#data_dir = '/home/ines/repositories/learning_variability/HMMs/Zoes_model/data/ibl-behavioral-data-Dec-2019/'
# Create directories for saving data. exist_ok=True makes the calls safe to
# re-run and avoids the check-then-create race of a separate
# os.path.exists() test:
processed_ibl_data_path = data_dir + "data_for_cluster/"
os.makedirs(processed_ibl_data_path, exist_ok=True)
# Also create a subdirectory for storing each animal's data:
os.makedirs(processed_ibl_data_path + "data_by_animal", exist_ok=True)

In [4]:
# Require that each animal has at least 30 sessions of data
# (30 sessions * 90 trials = 2700 trials):
req_num_sessions = 30  # 30*90 = 2700
# TODO: check how many animals are left, 30 sounds like a lot!

# Build a single boolean mask over the animals and apply it once, instead of
# calling np.delete (which copies the whole array) for every rejected animal.
# Order and dtype of the surviving entries are unchanged.
animal_list = np.asarray(animal_list)
has_enough_sessions = np.array(
    [len(animal_eid_dict[animal]) >= req_num_sessions
     for animal in animal_list],
    dtype=bool)
animal_list = animal_list[has_enough_sessions]

In [5]:
# Machine-specific folder that holds the trials table; the commented-out
# line is the same folder on another machine.
trials_dir = '/home/ines/repositories/representation_learning_variability/DATA/'
#trials_dir = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/'

# Read the full trials table into a DataFrame.
trials_csv_file = trials_dir + "profficient_one.csv"
all_trials = pd.read_csv(trials_csv_file)


  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Row index in the master array where each animal's data starts and ends
# (both bounds inclusive):
animal_start_idx = {}
animal_end_idx = {}

# One entry per excluded session (the animal's name). This must be a list:
# entries are appended in the session loop below.
missing_data = []

final_animal_eid_dict = defaultdict(list)
# WORKHORSE: iterate through each animal and each animal's set of eids;
# obtain unnormalized data.  Write out each animal's data and then also
# write to master array

# Loop through animals
for z, animal in enumerate(animal_list):

    animal_sessions = all_trials.loc[
        all_trials['subject_nickname'] == animal, 'session']
    # Loop through this animal's sessions
    # NOTE(review): the animal_* accumulators below are only (re)initialised
    # when a session is accepted. If every session of an animal is excluded,
    # the previous animal's arrays would be saved under this animal's name
    # (or a NameError is raised for the first animal) -- confirm whether such
    # animals can occur and how they should be handled.
    sess_counter = 0
    for eid in animal_sessions:

        # NOTE: `animal` is rebound to the name returned by the helper.
        animal, unnormalized_inpt, y, session, num_viols_50, rewarded = \
            get_all_unnormalized_data_this_session(
                animal, eid, all_trials)
        # Only include the session if num_viols_50 is below 10; otherwise no
        # data is created or appended for it.
        if num_viols_50 < 10:
            # Append data from different sessions of the same animal
            if sess_counter == 0:
                animal_unnormalized_inpt = np.copy(unnormalized_inpt)
                animal_y = np.copy(y)
                animal_session = session
                animal_rewarded = np.copy(rewarded)
            else:
                animal_unnormalized_inpt = np.vstack(
                    (animal_unnormalized_inpt, unnormalized_inpt))
                animal_y = np.vstack((animal_y, y))
                animal_session = np.concatenate((animal_session, session))
                animal_rewarded = np.vstack((animal_rewarded, rewarded))
            sess_counter += 1
            final_animal_eid_dict[animal].append(eid)

            # Optional debugging aid: per-session psychometric curve.
            # import matplotlib.pyplot as plt
            # plt.figure()
            # x_stim = np.unique(unnormalized_inpt[:, 0])
            # y_stim = np.zeros(len(x_stim))
            # for s, stimulus in enumerate(x_stim):
            #     y_stim[s] = np.nanmean(
            #         y[np.where(unnormalized_inpt[:, 0] == stimulus), 0])
            # plt.scatter(x_stim, y_stim)
            # plt.title('Session' + str(session))
            # plt.ylim([0, 1])

        else:
            print(str(animal + '_missing'))
            missing_data.append(animal)

    # Write out animal's unnormalized data matrix (all sessions concatenated):
    np.savez(
        processed_ibl_data_path + 'data_by_animal/' + animal +
        '_unnormalized.npz',
        animal_unnormalized_inpt, animal_y,
        animal_session)
    np.savez(
        processed_ibl_data_path + 'data_by_animal/' + animal +
        '_rewarded.npz',
        animal_rewarded)
    animal_session_fold_lookup = create_train_test_sessions(
        animal_session, num_folds=5)
    np.savez(
        processed_ibl_data_path + 'data_by_animal/' + animal +
        "_session_fold_lookup" +
        ".npz",
        animal_session_fold_lookup)

    assert animal_rewarded.shape[0] == animal_y.shape[0]
    #assert len(animal_bin) == len(bin_shuffled_folds)

    # Now create or append data to master array across all animals:
    if z == 0:
        master_inpt = np.copy(animal_unnormalized_inpt)
        animal_start_idx[animal] = 0
        animal_end_idx[animal] = master_inpt.shape[0] - 1
        master_y = np.copy(animal_y)
        master_session = animal_session
        master_session_fold_lookup_table = animal_session_fold_lookup
        master_rewarded = np.copy(animal_rewarded)
    else:
        animal_start_idx[animal] = master_inpt.shape[0]
        master_inpt = np.vstack((master_inpt, animal_unnormalized_inpt))
        animal_end_idx[animal] = master_inpt.shape[0] - 1
        master_y = np.vstack((master_y, animal_y))
        master_session = np.concatenate((master_session, animal_session))
        master_session_fold_lookup_table = np.hstack(
            (master_session_fold_lookup_table,
             animal_session_fold_lookup))  # Changed vstack into hstack
        master_rewarded = np.vstack((master_rewarded, animal_rewarded))

# ---------------------------------------------------------------------------
# Everything below runs ONCE, after all animals have been accumulated. It was
# previously nested inside the per-animal loop, which checked the final-size
# assertions against partial data and rewrote every output file on each
# iteration. The files left on disk after a complete run are the same.
# ---------------------------------------------------------------------------

# Write out data from across animals
assert np.shape(master_inpt)[0] == np.shape(master_y)[
    0], "inpt and y not same length"
assert np.shape(master_rewarded)[0] == np.shape(master_y)[
    0], "rewarded and y not same length"
assert len(np.unique(master_session)) == \
       np.shape(master_session_fold_lookup_table)[
           0], "number of unique sessions and session fold lookup don't " \
               "match"
# NOTE(review): the two size checks below are hard-coded to the dataset of
# Ashwood et al. (2020); confirm they are still meant to hold for the
# animals/sessions loaded in this notebook.
assert len(master_inpt) == 181530, "design matrix for all IBL animals " \
                                   "should have shape (181530, 3)"
assert len(animal_list) == 37, "37 animals were studied in Ashwood et " \
                               "al. (2020)"

# Standardise the first input column using statistics pooled over all
# animals; the other columns are left as they are.
normalized_inpt = np.copy(master_inpt)
normalized_inpt[:, 0] = preprocessing.scale(normalized_inpt[:, 0])
np.savez(processed_ibl_data_path + 'all_animals_concat' + '.npz',
         normalized_inpt,
         master_y, master_session)
np.savez(
    processed_ibl_data_path + 'all_animals_concat_unnormalized' + '.npz',
    master_inpt, master_y, master_session)
np.savez(
    processed_ibl_data_path + 'all_animals_concat_session_fold_lookup' +
    '.npz',
    master_session_fold_lookup_table)
np.savez(processed_ibl_data_path + 'all_animals_concat_rewarded' + '.npz',
         master_rewarded)
np.savez(processed_ibl_data_path + 'data_by_animal/' + 'animal_list.npz',
         animal_list)

# Serialise the eids actually used per animal. The result is written straight
# to the file so that the name `json` keeps referring to the module, and the
# context manager closes the file even if serialisation fails.
with open(processed_ibl_data_path + "final_animal_eid_dict.json", "w") as f:
    json.dump(final_animal_eid_dict, f)

# Now write out normalized data (when normalized across all animals) for
# each animal:
counter = 0
for animal in animal_start_idx:
    # Stored bounds are inclusive, hence the + 1 on the slice stop.
    rows = slice(animal_start_idx[animal], animal_end_idx[animal] + 1)
    inpt = normalized_inpt[rows]
    y = master_y[rows]
    session = master_session[rows]
    counter += inpt.shape[0]
    np.savez(
        processed_ibl_data_path + 'data_by_animal/' + animal +
        '_processed.npz',
        inpt, y,
        session)

# Every row of the master array must belong to exactly one animal.
assert counter == master_inpt.shape[0]

ZFM-05234 0
PL038 0
UCLA057 0
UCLA056 0
UCLA048 0
UCLA051 0
UCLA054 0
UCLA049 0
NR_0021 0
UCLA052 0
UCLA053 0
ZFM-04306 0
KS093 0
KS096 0
KS095 0
SWC_067 0
KS094 0
KS091 0
KS092 0
UCLA050 0
SWC_065 0
ZFM-04307 0
NR_0019 0
ZFM-03843 0
NR_0020 0
UCLA029 0
KS089 0
PL032 0
UCLA045 0
UCLA044 0
UCLA043 0
UCLA046 0
ZFM-03841 0
PL033 0
KS087 0
KS086 0
KS083 0
UCLA047 0
UCLA030 0
SWC_063 0
PL031 0
SWC_064 0
KS085 0
PL029 0
UCLA033 0
PL028 0
UCLA036 0
KS084 0
UCLA037 0
UCLA035 0
UCLA034 0
NR_0018 0
NR_0017 0
KS080 0
KS075 0
UCLA031 0
KS082 0
SWC_062 0
NYU-58 0
PL026 0
KS074 0
UCLA015 0
UCLA015_0_missing
UCLA016 0
PL027 0
PL019 0
PL025 0
UCLA010 0
PL015 0
UCLA013 0
PL012 0
PL014 0
PL022 0
PL016 0
NR_0014 0
UCLA009 0
UCLA006 0
PL023 0
PL018 0
NYU-54 0
UCLA017 0
NR_0011 0
PL011 0
PL017 0
NR_0010 0
NR_0008 0
UCLA014 0
UCLA004 0
UCLA004_0_missing
NR_0012 0
PL003 0
PL004 0
ZFM-02371 0
UCLA011 0
UCLA008 0
UCLA005 0
UCLA007 0
NYU-47 0
NYU-45 0
ZFM-02373 0
ZFM-02372 0
ZFM-02368 0
KS056 0
KS056_0_missing
