### Create design matrix with IBL mice
#### Normalize data across all animals and all bins

In [1]:
"""
IMPORTS
"""
import numpy as np
from oneibl.onelight import ONE
import numpy.random as npr
import json
from sklearn import preprocessing
from collections import defaultdict
import wget
from zipfile import ZipFile
import os
import pandas as pd
from preprocessing_utils_session_ines import get_animal_name, load_animal_list, load_animal_eid_dict, \
    get_all_unnormalized_data_this_session, create_train_test_sessions

# Seed NumPy's global random generator so that the random draws made
# through `npr` later in this notebook are repeatable from run to run.
npr.seed(65)

In [10]:
"""
Load data
"""
# Root folder with the partially processed inputs.  The trailing slash is
# required: file and folder names are appended by plain string concatenation.
data_dir = '/home/ines/repositories/representation_learning_variability/DATA/GLMHMM/'
# Alternative locations, kept for reference:
#data_dir = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/GLMHMM/'
#data_dir = '/home/ines/repositories/learning_variability/HMMs/Zoes_model/data/ibl/partially_processed/'

# Results of the partial-processing step: the list of animals and the
# dictionary that maps each animal to its sessions (eids).
animal_list_file = data_dir + 'animal_list_prof.npz'
animal_eid_dict_file = data_dir + 'animal_eid_dict_prof.json'
animal_list = load_animal_list(animal_list_file)
animal_eid_dict = load_animal_eid_dict(animal_eid_dict_file)


In [11]:
"""
Create folders to save processed data
"""
#data_dir = '/home/ines/repositories/learning_variability/HMMs/Zoes_model/data/ibl-behavioral-data-Dec-2019/'
# Root folder for everything this notebook writes (trailing slash required,
# later cells append file names by string concatenation):
processed_ibl_data_path = data_dir + "data_for_cluster/"
# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(processed_ibl_data_path, exist_ok=True)
# Subdirectory for storing each animal's data:
os.makedirs(processed_ibl_data_path + "data_by_animal", exist_ok=True)

In [22]:
# Keep only the animals that have at least `req_num_sessions` sessions listed
# in animal_eid_dict, and count the sessions of the animals that are kept.
req_num_sessions = 20
total_sessions = 0
has_enough_sessions = np.zeros(len(animal_list), dtype=bool)
for idx, animal in enumerate(animal_list):
    num_sessions = len(animal_eid_dict[animal])
    print(num_sessions)
    if num_sessions >= req_num_sessions:
        has_enough_sessions[idx] = True
        total_sessions += num_sessions
# Drop every animal that fell below the threshold in one step.
animal_list = np.asarray(animal_list)[has_enough_sessions]


31
70
30
37
35
37
31
54
37
58
72
61
74
61
60
44
62
57
32
40
34
30
36
38
43
32
42
38
68
75
51
52
36
40
83
78
76
67
30
139
134
128
133
36
60
40
43
32
30
34
37
70
39
60
31
33
35
40
36
46
42
40
45
80
66
61
33
49
43
32
36
56
46
32
58
43
93
82
96
86
48
49
42
42
68
89
65
40
40
32
60
43
49
35
43
54
36
31
39
37
37
33
35
42
42
83
100
37
53
53
48
40
43
33
32
55
53
127
36
119
102
47
38
31
73
39
157
112
71
36
32
31
48
50
35
58
37
36
30
39
35
51
50
113
51
59
30
37
51
53
50
65
60
45
75
34
31
34
31
46
77
78
98
77
54
55
52
32
34
35
30
46
32
53
37
43
86
85
85
96


In [26]:
# List the distinct values of the `task_protocol` column of the trials table.
# NOTE(review): `all_trials` is created by the pd.read_csv cell that appears
# further down in this file, so that cell has to be run before this one.
all_trials.task_protocol.unique()

array(['_iblrig_tasks_biasedChoiceWorld6.3.1',
       '_iblrig_tasks_trainingChoiceWorld6.3.1',
       '_iblrig_tasks_biasedChoiceWorld6.2.5',
       '_iblrig_tasks_ephysChoiceWorld6.2.5',
       '_iblrig_tasks_ephysChoiceWorld6.4.0',
       '_iblrig_tasks_biasedChoiceWorld6.4.0',
       '_iblrig_tasks_trainingChoiceWorld6.2.5',
       '_iblrig_tasks_biasedChoiceWorld6.2.4',
       '_iblrig_tasks_trainingChoiceWorld6.1.3',
       '_iblrig_tasks_trainingChoiceWorld6.2.1',
       '_iblrig_tasks_trainingChoiceWorld6.2.0',
       '_iblrig_tasks_biasedChoiceWorld6.2.1',
       '_iblrig_tasks_trainingChoiceWorld6.2.4',
       '_iblrig_tasks_trainingChoiceWorld6.4.0',
       '_iblrig_tasks_biasedChoiceWorld4.1.3',
       '_iblrig_tasks_biasedChoiceWorld5.2.7',
       '_iblrig_tasks_biasedChoiceWorld5.2.5',
       '_iblrig_tasks_biasedChoiceWorld4.0.1',
       '_iblrig_tasks_trainingChoiceWorld4.0.1',
       '_iblrig_tasks_biasedChoiceWorld5.2.9',
       '_iblrig_tasks_biasedChoiceWorld5.1.2',

In [23]:
# Folder that holds the trials table (trailing slash required, the file name
# is appended by string concatenation).
trials_dir = '/home/ines/repositories/representation_learning_variability/DATA/'
#trials_dir = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability/DATA/'
trials_file = trials_dir + "proficient_one.csv"
# One table with the trials of all animals and sessions.
all_trials = pd.read_csv(trials_file)


In [6]:
# WORKHORSE: iterate through each animal and each of that animal's sessions,
# obtain the unnormalized data, write out each animal's data and append it to
# master arrays that span all animals.  Once every animal has been processed,
# the stimulus column is normalized across all animals and the concatenated
# and per-animal normalized files are written.

# Identify idx in master array where each animal's data starts and ends:
animal_start_idx = {}
animal_end_idx = {}

# One entry (the animal name) per session that was excluded because it had
# 10 or more violations:
missing_data = []

# Sessions that were actually included, per animal:
final_animal_eid_dict = defaultdict(list)

# Master arrays across animals; they stay None until the first animal that
# contributes data has been processed.
master_inpt = None
master_y = None
master_session = None
master_session_fold_lookup_table = None
master_rewarded = None

# Loop through animals
for animal in animal_list:

    animal_sessions = np.unique(
        all_trials.loc[all_trials['subject_nickname'] == animal, 'session'])

    # Reset the per-animal accumulators so that an animal without any
    # accepted session can never inherit the previous animal's arrays.
    animal_unnormalized_inpt = None
    animal_y = None
    animal_session = None
    animal_rewarded = None

    # Loop through this animal's sessions
    for eid in animal_sessions:
        session_data = all_trials.loc[
            (all_trials['subject_nickname'] == animal)
            & (all_trials['session'] == eid)]
        animal, unnormalized_inpt, y, session, num_viols_50, rewarded = \
            get_all_unnormalized_data_this_session(animal, session_data)

        if num_viols_50 >= 10:
            # Too many violations: the session is neither created nor appended.
            print(str(animal + '_missing' + str(num_viols_50)))
            missing_data.append(animal)
            continue

        # Append data from different sessions of the same animal
        if animal_unnormalized_inpt is None:
            animal_unnormalized_inpt = np.copy(unnormalized_inpt)
            animal_y = np.copy(y)
            animal_session = session
            animal_rewarded = np.copy(rewarded)
        else:
            animal_unnormalized_inpt = np.vstack(
                (animal_unnormalized_inpt, unnormalized_inpt))
            animal_y = np.vstack((animal_y, y))
            animal_session = np.concatenate((animal_session, session))
            animal_rewarded = np.vstack((animal_rewarded, rewarded))
        final_animal_eid_dict[animal].append(eid)

        # Optional sanity check (disabled): psychometric curve of the session.
        # import matplotlib.pyplot as plt
        # plt.figure()
        # x_stim = np.unique(unnormalized_inpt[:, 0])
        # y_stim = np.zeros(len(x_stim))
        # for s, stimulus in enumerate(x_stim):
        #     y_stim[s] = np.nanmean(
        #         y[np.where(unnormalized_inpt[:, 0] == stimulus), 0])
        # plt.scatter(x_stim, y_stim)
        # plt.title('Session' + str(eid))
        # plt.ylim([0, 1])

    if animal_unnormalized_inpt is None:
        # Nothing to write for this animal; it is also left out of the
        # animal list saved at the end.
        print(str(animal) + '_no_valid_sessions')
        continue

    assert animal_rewarded.shape[0] == animal_y.shape[0]
    #assert len(animal_bin) == len(bin_shuffled_folds)

    # Write out animal's unnormalized data matrix (all sessions concatenated):
    np.savez(
        processed_ibl_data_path + 'data_by_animal/' + animal +
        '_unnormalized.npz',
        animal_unnormalized_inpt, animal_y,
        animal_session)
    np.savez(
        processed_ibl_data_path + 'data_by_animal/' + animal +
        '_rewarded.npz',
        animal_rewarded)
    animal_session_fold_lookup = create_train_test_sessions(animal_session,
                                                            num_folds=5)
    np.savez(
        processed_ibl_data_path + 'data_by_animal/' + animal +
        "_session_fold_lookup" +
        ".npz",
        animal_session_fold_lookup)

    # Now create or append data to master array across all animals:
    if master_inpt is None:
        animal_start_idx[animal] = 0
        master_inpt = np.copy(animal_unnormalized_inpt)
        master_y = np.copy(animal_y)
        master_session = animal_session
        master_session_fold_lookup_table = animal_session_fold_lookup
        master_rewarded = np.copy(animal_rewarded)
    else:
        animal_start_idx[animal] = master_inpt.shape[0]
        master_inpt = np.vstack((master_inpt, animal_unnormalized_inpt))
        master_y = np.vstack((master_y, animal_y))
        master_session = np.concatenate((master_session, animal_session))
        # Lookup tables are stacked row-wise (one row per session):
        master_session_fold_lookup_table = np.vstack(
            (master_session_fold_lookup_table, animal_session_fold_lookup))
        master_rewarded = np.vstack((master_rewarded, animal_rewarded))
    animal_end_idx[animal] = master_inpt.shape[0] - 1

if master_inpt is None:
    print('No animal contributed any session; nothing was written.')
else:
    # Write out data from across animals
    assert np.shape(master_inpt)[0] == np.shape(master_y)[
        0], "inpt and y not same length"
    assert np.shape(master_rewarded)[0] == np.shape(master_y)[
        0], "rewarded and y not same length"
    assert len(np.unique(master_session)) == \
           np.shape(master_session_fold_lookup_table)[
               0], "number of unique sessions and session fold lookup don't " \
                   "match"
    #assert len(master_inpt) == 181530, "design matrix for all IBL animals " \
    #                                   "should have shape (181530, 3)"
    #assert len(animal_list) == 37, "37 animals were studied in Ashwood et " \
    #                               "al. (2020)"

    # Only the first column (stimulus) is standardized, using all animals.
    normalized_inpt = np.copy(master_inpt)
    normalized_inpt[:, 0] = preprocessing.scale(normalized_inpt[:, 0])
    np.savez(processed_ibl_data_path + 'all_animals_concat' + '.npz',
             normalized_inpt,
             master_y, master_session)
    np.savez(
        processed_ibl_data_path + 'all_animals_concat_unnormalized' + '.npz',
        master_inpt, master_y, master_session)
    np.savez(
        processed_ibl_data_path + 'all_animals_concat_session_fold_lookup' +
        '.npz',
        master_session_fold_lookup_table)
    np.savez(processed_ibl_data_path + 'all_animals_concat_rewarded' + '.npz',
             master_rewarded)

    # Save the animals that actually have data files.  When no animal was
    # skipped this is exactly `animal_list`.
    has_data = np.array([name in animal_start_idx for name in animal_list],
                        dtype=bool)
    np.savez(processed_ibl_data_path + 'data_by_animal/' + 'animal_list.npz',
             np.asarray(animal_list)[has_data])

    # The serialized text gets its own name so that the `json` module is not
    # shadowed for code that runs after this cell.
    import json
    eid_dict_json = json.dumps(final_animal_eid_dict)
    with open(processed_ibl_data_path + "final_animal_eid_dict.json",
              "w") as f:
        f.write(eid_dict_json)

    # Now write out normalized data (when normalized across all animals) for
    # each animal:
    counter = 0
    for animal in animal_start_idx:
        rows = slice(animal_start_idx[animal], animal_end_idx[animal] + 1)
        inpt = normalized_inpt[rows]
        y = master_y[rows]
        session = master_session[rows]
        counter += inpt.shape[0]
        np.savez(processed_ibl_data_path + 'data_by_animal/' + animal +
                 '_processed.npz',
                 inpt, y,
                 session)

    # Every master row must belong to exactly one animal.
    assert counter == master_inpt.shape[0]

CSHL045_missing15
CSHL046_missing17
CSHL051_missing15
CSHL051_missing11
CSHL051_missing18
CSHL051_missing11
CSHL051_missing15
CSHL051_missing27
CSHL051_missing14
CSHL051_missing14
CSHL051_missing30
CSHL053_missing18
CSHL054_missing13
CSHL054_missing10
CSHL054_missing35
CSHL054_missing16
CSHL054_missing21
CSHL055_missing22
CSHL055_missing19
CSHL058_missing16
CSHL058_missing10
CSHL058_missing10
CSHL058_missing19
CSHL058_missing26
CSHL058_missing26
CSHL_001_missing13
CSHL_002_missing17
CSHL_003_missing10
CSHL_003_missing15
CSHL_003_missing13
CSHL_003_missing20
CSHL_007_missing21
CSHL_007_missing10
CSHL_007_missing12
CSHL_007_missing14
CSHL_007_missing34
CSHL_007_missing16
CSHL_007_missing14
CSHL_007_missing17
CSHL_007_missing12
CSHL_007_missing32
CSHL_007_missing15
CSHL_007_missing22
CSHL_007_missing12
CSHL_007_missing36
CSHL_007_missing23
CSHL_007_missing28
CSHL_007_missing10
CSHL_007_missing13
CSHL_007_missing17
CSHL_008_missing20
CSHL_008_missing11
CSHL_008_missing12
CSHL_010_missing13

In [10]:
def get_animal_name(eid):
    """Return the animal name contained in a session path.

    The name is the path component that directly follows the first
    'Subjects/' marker in ``eid``.  An ``IndexError`` is raised when the
    marker is absent.
    """
    after_marker = eid.split('Subjects/')[1]
    name, _, _ = after_marker.partition('/')
    return name

"""
def get_raw_data(eid, data_dir):
    print(eid)
    # get session id:
    raw_session_id = eid.split('Subjects/')[1]
    # Get animal:
    animal = raw_session_id.split('/')[0]
    # replace '/' with dash in session ID
    session_id = raw_session_id.replace('/', '-')
    # hack to work with ONE:
    current_dir = os.getcwd()
    os.chdir(data_dir)
    # Get choice data, stim data and rewarded/not rewarded:
    choice = one.load_dataset(eid, '_ibl_trials.choice')
    stim_left = one.load_dataset(eid, '_ibl_trials.contrastLeft')
    stim_right = one.load_dataset(eid, '_ibl_trials.contrastRight')
    rewarded = one.load_dataset(eid, '_ibl_trials.feedbackType')
    bias_probs = one.load_dataset(eid, '_ibl_trials.probabilityLeft')
    os.chdir(current_dir)
    return animal, session_id, stim_left, stim_right, rewarded, choice, \
           bias_probs
"""


def get_raw_data(animal, trials_df):
    """Pull one animal's raw trial variables out of a trials table.

    Parameters
    ----------
    animal : value compared against the 'subject_nickname' column.
    trials_df : DataFrame with the columns 'subject_nickname', 'choice',
        'contrastLeft', 'contrastRight', 'feedbackType', 'probabilityLeft'
        and 'session'.

    Returns
    -------
    (animal, session_id, stim_left, stim_right, rewarded, choice, bias_probs)
    where ``session_id`` is the selected part of the 'session' column (a
    pandas Series) and the remaining trial variables are NumPy arrays.
    The choice values are returned as stored in the table (no remapping).
    """
    print(animal)

    # Rows that belong to the requested animal:
    is_animal = trials_df['subject_nickname'] == animal

    def column(name):
        return np.array(trials_df.loc[is_animal, name])

    choice = column('choice')
    stim_left = column('contrastLeft')
    stim_right = column('contrastRight')
    rewarded = column('feedbackType')
    bias_probs = column('probabilityLeft')
    session_id = trials_df.loc[is_animal, 'session']

    return animal, session_id, stim_left, stim_right, rewarded, choice, \
           bias_probs


def create_stim_vector(stim_left, stim_right):
    """Return the signed contrast ``stim_right - stim_left``.

    NaN entries of either input are treated as 0 before subtracting.
    """
    right = np.nan_to_num(stim_right, nan=0)
    left = np.nan_to_num(stim_left, nan=0)
    return right - left


def create_previous_choice_vector(choice):
    """Build the "previous choice" regressor for a sequence of choices.

    Parameters
    ----------
    choice : sequence of length T with entries 0 (left), 1 (right) or
        -1 (violation).

    Returns
    -------
    previous_choice : array of length T.  Entry t holds the choice made on
        trial t - 1 (entry 0 repeats the first choice).  If that choice was
        a violation it is replaced by the most recent earlier choice that
        was not a violation; if there is no such choice yet, the entry is
        drawn at random from {0, 1}.
    locs_mapping : int array with one row per remapped entry.  Column 0 is
        the location in ``previous_choice`` that was remapped because of a
        violation, column 1 the location it was remapped to.

    Raises
    ------
    IndexError
        If every entry of the shifted choice vector is a violation.
    """
    # `bernoulli` is not imported at the top of this file; import it here so
    # that the random branch below does not fail with a NameError.
    from scipy.stats import bernoulli

    # Shift by one trial; the first trial reuses its own choice.
    previous_choice = np.hstack([np.array(choice[0]), choice])[:-1]
    locs_to_update = np.where(previous_choice == -1)[0]
    locs_with_choice = np.where(previous_choice != -1)[0]
    loc_first_choice = locs_with_choice[0]
    # All locations before the first real choice are violations and are
    # filled at random, so they do not get a row in the mapping.
    locs_mapping = np.zeros((len(locs_to_update) - loc_first_choice, 2),
                            dtype='int')

    for i, loc in enumerate(locs_to_update):
        if loc < loc_first_choice:
            # since no previous choice, bernoulli sample: (note: with loc=1
            # the output of bernoulli.rvs is in {1, 2}, hence the "- 1")
            previous_choice[loc] = bernoulli.rvs(0.5, 1) - 1
        else:
            # locs_with_choice is sorted, so the nearest earlier location
            # holding a real choice is the last one below `loc`.
            earlier_locs = locs_with_choice[locs_with_choice < loc]
            nearest_loc = earlier_locs[-1]
            locs_mapping[i - loc_first_choice, 0] = int(loc)
            locs_mapping[i - loc_first_choice, 1] = int(nearest_loc)
            previous_choice[loc] = previous_choice[nearest_loc]
    assert len(np.unique(
        previous_choice)) <= 2, "previous choice should be in {0, 1}; " + str(
        np.unique(previous_choice))
    return previous_choice, locs_mapping


def create_wsls_covariate(previous_choice, success, locs_mapping):
    '''
    Build the win-stay/lose-switch covariate.

    inputs:
    previous_choice: vector of size T, entries are in {0, 1}; 0 corresponds
    to a left choice, 1 corresponds to a right choice
    success: vector of size T, entries are in {-1, 1}; -1 corresponds to
    failure, 1 corresponds to success
    locs_mapping: array with two columns describing the remapping due to
    violations; column 0 is the remapped location, column 1 the location
    whose value it takes
    output:
    wsls: vector of size T, entries are in {-1, 1}.  1 corresponds to
    previous choice = right and success OR previous choice = left and
    failure; -1 corresponds to previous choice = left and success OR
    previous choice = right and failure
    '''
    # previous choice expressed in {-1, 1} instead of {0, 1}
    signed_previous_choice = 2 * previous_choice - 1
    # reward of the previous trial (the first trial reuses its own reward)
    previous_reward = np.hstack([np.array(success[0]), success])[:-1]
    # Make the previous reward refer to the same trial as the (remapped)
    # previous choice:
    for loc, nearest_loc in locs_mapping:
        previous_reward[loc] = previous_reward[nearest_loc]
    wsls = previous_reward * signed_previous_choice
    assert len(np.unique(wsls)) == 2, "wsls should be in {-1, 1}"
    return wsls


def remap_choice_vals(choice):
    """Recode raw choice values and return them as a list.

    The raw vector has CW = 1 (correct response for stim on left),
    CCW = -1 (correct response for stim on right) and viol = 0.  The
    result uses CW = 0, CCW = 1 and viol = -1.  Any other value raises
    a KeyError.
    """
    recoded = {1: 0, -1: 1, 0: -1}
    new_choice_vector = []
    for raw_value in choice:
        new_choice_vector.append(recoded[raw_value])
    return new_choice_vector


def create_design_mat(choice, stim_left, stim_right, rewarded):
    """Assemble the unnormalized T x 3 design matrix.

    Column 0 is the signed stimulus (stim_right - stim_left), column 1 the
    previous choice mapped to {-1, 1} and column 2 the win-stay/lose-switch
    covariate.
    """
    signed_stim = create_stim_vector(stim_left, stim_right)
    # Recode choices so that the correct response for stim > 0 is 1, for
    # stim < 0 is 0, and a violation is -1.
    recoded_choice = remap_choice_vals(choice)
    previous_choice, locs_mapping = create_previous_choice_vector(
        recoded_choice)
    wsls = create_wsls_covariate(previous_choice, rewarded, locs_mapping)

    num_trials = len(signed_stim)
    design_mat = np.zeros((num_trials, 3))
    design_mat[:, 0] = signed_stim
    design_mat[:, 1] = 2 * previous_choice - 1
    design_mat[:, 2] = wsls
    return design_mat

"""
def get_all_unnormalized_data_this_session(eid, trials_df):
    # Load raw data
    animal, session_id, stim_left, stim_right, rewarded, choice, bias_probs \
        = get_raw_data(eid, trials_df)
    # Subset choice and design_mat to 50-50 entries:
    trials_to_study = np.where(bias_probs == 0.5)[0]
    trials_to_study = np.where(bias_probs <=1)[0]  # redundant code, but makes sure all trials are used
    num_viols_50 = len(np.where(choice[trials_to_study] == 0)[0])  # viols are omissions
    if num_viols_50 < 10:
        # Create design mat = matrix of size T x 3, with entries for
        # stim/past choice/wsls
        unnormalized_inpt = create_design_mat(choice[trials_to_study],
                                              stim_left[trials_to_study],
                                              stim_right[trials_to_study],
                                              rewarded[trials_to_study])
        y = np.expand_dims(remap_choice_vals(choice[trials_to_study]), axis=1)
        session = [session_id for i in range(y.shape[0])]
        rewarded = np.expand_dims(rewarded[trials_to_study], axis=1)
    else:
        unnormalized_inpt = np.zeros((90, 3))
        y = np.zeros((90, 1))
        session = []
        rewarded = np.zeros((90, 1))
    return animal, unnormalized_inpt, y, session, num_viols_50, rewarded
"""

def get_all_unnormalized_data_this_session(animal, trials_df):
    """Build the unnormalized design matrix and outputs for one session.

    Parameters
    ----------
    animal : animal name, matched against 'subject_nickname' in trials_df.
    trials_df : trials table; the caller in this file passes the rows of a
        single session.

    Returns
    -------
    animal : the animal name as returned by get_raw_data.
    unnormalized_inpt : T x 3 design matrix (see create_design_mat).
    y : T x 1 array of remapped choices (see remap_choice_vals).
    session : list with the session id of each of the T trials.
    num_viols_50 : number of studied trials whose raw choice is 0
        (violations).
    rewarded : T x 1 array of the raw reward values.

    When there are 10 or more violations no design matrix is built:
    all-zero placeholders with 90 rows and an empty session list are
    returned instead, and the caller is expected to check num_viols_50.
    """
    # Load raw data
    animal, session_id, stim_left, stim_right, rewarded, choice, bias_probs \
        = get_raw_data(animal, trials_df)
    # All trials with a bias probability <= 1 are used (not only the 50-50
    # ones); trials whose probability is NaN fail the comparison and drop out.
    trials_to_study = np.where(bias_probs <= 1)[0]
    # viols are omissions (raw choice == 0)
    num_viols_50 = int(np.count_nonzero(choice[trials_to_study] == 0))
    if num_viols_50 < 10:
        # Create design mat = matrix of size T x 3, with entries for
        # stim/past choice/wsls
        unnormalized_inpt = create_design_mat(choice[trials_to_study],
                                              stim_left[trials_to_study],
                                              stim_right[trials_to_study],
                                              rewarded[trials_to_study])
        y = np.expand_dims(remap_choice_vals(choice[trials_to_study]), axis=1)
        # One session id per studied trial.  get_raw_data returns the whole
        # 'session' column, so index it per trial instead of repeating the
        # entire column once for every trial.
        session = np.asarray(session_id)[trials_to_study].tolist()
        rewarded = np.expand_dims(rewarded[trials_to_study], axis=1)
    else:
        unnormalized_inpt = np.zeros((90, 3))
        y = np.zeros((90, 1))
        session = []
        rewarded = np.zeros((90, 1))
    return animal, unnormalized_inpt, y, session, num_viols_50, rewarded


def load_animal_list(file):
    """Return the first array stored in the .npz archive ``file``.

    NOTE: the archive is opened with allow_pickle=True, which can execute
    arbitrary code while loading; only use it on files you trust.
    """
    # The context manager closes the archive once the array has been read.
    with np.load(file, allow_pickle=True) as container:
        first_key = container.files[0]
        animal_list = container[first_key]
    return animal_list


def load_animal_eid_dict(file):
    """Read the JSON file ``file`` and return the decoded object."""
    with open(file, 'r') as handle:
        contents = handle.read()
    return json.loads(contents)


def load_data(animal_file):
    """Load the first three arrays of an .npz archive as (inpt, y, session).

    ``y`` is cast to an integer dtype.  An ``IndexError`` is raised when the
    archive holds fewer than three arrays.

    NOTE: the archive is opened with allow_pickle=True, which can execute
    arbitrary code while loading; only use it on files you trust.
    """
    # The context manager closes the archive once the arrays have been read.
    with np.load(animal_file, allow_pickle=True) as container:
        keys = container.files
        inpt = container[keys[0]]
        y = container[keys[1]]
        session = container[keys[2]]
    y = y.astype('int')
    return inpt, y, session


def create_train_test_sessions(session, num_folds=5):
    """Assign every unique session to one of ``num_folds`` folds at random.

    Parameters
    ----------
    session : sequence of session ids (one per trial; repeats allowed).
    num_folds : number of cross-validation folds.

    Returns
    -------
    Object array with one row per unique session: column 0 is the session
    id (as a string), column 1 the fold index assigned to it.

    Raises
    ------
    AssertionError
        If some fold ends up without any session.
    """
    sess_id = np.array(np.unique(session), dtype='str')
    num_sessions = len(sess_id)
    # Map sessions to folds.  np.repeat needs an integer repeat count and
    # np.ceil returns a float, hence the explicit int().
    sessions_per_fold = int(np.ceil(num_sessions / num_folds))
    unshuffled_folds = np.repeat(np.arange(num_folds), sessions_per_fold)
    shuffled_folds = npr.permutation(unshuffled_folds)[:num_sessions]
    # Compare against num_folds rather than a hard-coded 5 so that other
    # fold counts are accepted too.
    assert len(np.unique(
        shuffled_folds)) == num_folds, \
        "require at least one session per fold for each animal!"
    # Look up table of shuffle-folds:
    shuffled_folds = np.array(shuffled_folds, dtype='O')
    session_fold_lookup_table = np.transpose(
        np.vstack([sess_id, shuffled_folds]))
    return session_fold_lookup_table
"""

def create_train_test_sessions(animal_bin, num_folds=5):
    num_folds = 5
    # create a session-fold lookup table
    bin_len = len(animal_bin)

    # Map trials to folds:
    unshuffled_folds = np.repeat(np.arange(num_folds),
                                    np.ceil(bin_len/ num_folds))
    shuffled_folds = npr.permutation(unshuffled_folds)[:bin_len]
    # make sure final size is the same
    shuffled_folds = shuffled_folds[:bin_len]
    
    return shuffled_folds
"""

'\n\ndef create_train_test_sessions(animal_bin, num_folds=5):\n    num_folds = 5\n    # create a session-fold lookup table\n    bin_len = len(animal_bin)\n\n    # Map trials to folds:\n    unshuffled_folds = np.repeat(np.arange(num_folds),\n                                    np.ceil(bin_len/ num_folds))\n    shuffled_folds = npr.permutation(unshuffled_folds)[:bin_len]\n    # make sure final size is the same\n    shuffled_folds = shuffled_folds[:bin_len]\n    \n    return shuffled_folds\n'