In [1]:
prefix = '/home/ines/repositories/'
# prefix = '/Users/ineslaranjeira/Documents/Repositories/'

In [37]:
""" 
IMPORTS
"""
import os
import numpy as np
import pickle
import pandas as pd
from one.api import ONE
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore
from matplotlib.colors import ListedColormap

# Get my functions
from functions import idxs_from_files#, state_identifiability

one = ONE(mode='remote')

# Parameters

In [42]:
# LOAD DATA
data_path = prefix + 'representation_learning_variability/paper-individuality/data/design_matrices/kcenia/'
paw_wavelet_path = prefix + 'representation_learning_variability/paper-individuality/data/paw_wavelets/1_camera_setup/'
wheel_wavelet_path = prefix + 'representation_learning_variability/paper-individuality/data/wheel_wavelets/1_camera_setup/'

results_path = prefix + 'representation_learning_variability/paper-individuality/data/kcenias_states/'

all_files = os.listdir(data_path)
design_matrices = [item for item in all_files if 'design_matrix' in item and 'standardized' not in item]
idxs, mouse_names = idxs_from_files(design_matrices)

""" FITTING PARAMETERS """
num_iters = 100
num_states = 2
num_train_batches = 5
method='prior'
fit_method='em'
threshold = 0


optimal_k = 8
optimal_k_wheel = 3

whisker_params = str(10)+'_'+method+'_'+fit_method+'_zsc_'+'True/'
licking_params = str(num_train_batches)+'_'+method+'_'+fit_method+'_zsc_'+'False/'

states_path = prefix + 'representation_learning_variability/paper-individuality/data/hmm/most_likely_states/'
paw_wavelet_states_path = prefix + 'representation_learning_variability/paper-individuality/data/paw_most_likely_states/'
wheel_wavelet_states_path = prefix + 'representation_learning_variability/paper-individuality/data/wheel_most_likely_states/'


# Individual sessions

In [27]:
# Identify sessions available to process
sessions_to_process = []
for m, mat in enumerate(idxs):
    mouse_name = mat[37:]
    session = mat[:36]
    fit_id = str(mouse_name + session)
    whisker_filename = os.path.join(states_path + whisker_params, 'whisker_me' + '_' + fit_id)
    licks_filename = os.path.join(states_path + licking_params, 'Lick count' + '_' + fit_id)
    paw_wavelet_filename = os.path.join(paw_wavelet_states_path, "most_likely_states_" + str(optimal_k) + '_' + fit_id+'.npy')
    wheel_wavelet_filename = os.path.join(wheel_wavelet_states_path, "most_likely_states_" + str(optimal_k_wheel) + '_' + fit_id+'.npy')

    # if os.path.exists(whisker_filename) and os.path.exists(licks_filename) and os.path.exists(paw_wavelet_filename) and os.path.exists(wheel_wavelet_filename):
    if os.path.exists(whisker_filename) and os.path.exists(licks_filename) and os.path.exists(wheel_wavelet_filename):
        sessions_to_process.append((mouse_name, session))

print(f"Found {len(sessions_to_process)} sessions to process.")

Found 37 sessions to process.


In [30]:
wheel_fix_mapping = {0:2, 1:0, 2:1}

In [38]:
def state_identifiability(session_states, use_sets):
    # Create new mapping depending on empirical data for each state
    for v, var in enumerate(use_sets):
        var_states = var+'_states'
        
        # For an empty variable, do not make changes (wavelet)
        if len(var) == 0:
            var_0 = np.nan
            var_1 = np.nan
        elif var == ['avg_wheel_vel']:
            var_0 = np.nanmean(np.abs(session_states.loc[session_states[var_states]==0, var]))
            var_1 = np.nanmean(np.abs(session_states.loc[session_states[var_states]==1, var]))
        elif var == ['left_X', 'left_Y', 'right_X', 'right_Y']:
            var_0 = np.array(np.abs(np.diff(session_states.loc[session_states[var_states]==0, var])))
            var_1 = np.array(np.abs(np.diff(session_states.loc[session_states[var_states]==0, var])))
        elif var == ['nose_x', 'nose_Y']:
            print('Not implemented yet')
        else:
            var_0 = session_states.loc[session_states[var_states]==0, var]
            var_1 = session_states.loc[session_states[var_states]==1, var]
        
        if np.nanmean(var_0)> np.nanmean(var_1):
            session_states[var_states] = session_states[var_states] * -1 + 1
    return session_states

In [43]:
var_interest = ['whisker_me', 'Lick count', 'wheel']
path_sets = [states_path + whisker_params, states_path+licking_params, wheel_wavelet_states_path]

states_trial_type = pd.DataFrame(columns=['mouse_name', 'session',  'most_likely_states', 
                                          'identifiable_states', 'Bin'])

for m, mat in enumerate(sessions_to_process):

    mouse_name = mat[0]
    session = mat[1]

    fit_id = str(mouse_name+session)
    trials_file = data_path + "session_trials_" + str(session) + '_'  + mouse_name
    session_trials = pd.read_parquet(trials_file, engine='pyarrow').reset_index() 
    
    session_states = pd.DataFrame()
    for v, var in enumerate(var_interest):

        # Get most likely states filename
        use_path = path_sets[v]
        var_name = var_interest[v]
        use_k = optimal_k if var=='paw' else optimal_k_wheel
        states_filename = os.path.join(use_path, 
                                       f"{'most_likely_states_'+str(use_k)+ '_' + fit_id + '.npy' if var_name=='paw' or var_name=='wheel' else var_name+ '_' + fit_id}")
        
        column_name = var + '_states'
        # Load design matrix used to obtain states and compare sizes
        if var == 'wheel':
            use_vars = ['avg_wheel_vel0.5', 'avg_wheel_vel1.0', 'avg_wheel_vel2.0', 'avg_wheel_vel4.0', 'avg_wheel_vel8.0', 
                            'avg_wheel_vel16.0', 'avg_wheel_vel32.0']   
            filename = wheel_wavelet_path + "wheel_vel_wavelets_" + str(session) + '_'  + mouse_name
            design_matrix = pd.read_parquet(filename)
            most_likely_states = np.load(open(states_filename, "rb"))
            # Rename states to be more intuitive
            replace_func = np.vectorize(wheel_fix_mapping.get)
            most_likely_states = replace_func(most_likely_states)
            # not_nan_indices = ~np.isnan(np.array(design_matrix)).any(axis=1)

            not_nan_indices = ~np.isnan(np.array(design_matrix[use_vars])).any(axis=1)
            assert np.sum(not_nan_indices) == len(most_likely_states)
            design_matrix[column_name] = design_matrix['Bin'] * np.nan
            design_matrix[column_name].loc[not_nan_indices] = most_likely_states
        
        # elif var == 'paw':
        #     use_vars = ['l_paw_x0.5', 'l_paw_x1.0', 'l_paw_x2.0', 'l_paw_x4.0', 'l_paw_x8.0', 
        #             'l_paw_y0.5', 'l_paw_y1.0', 'l_paw_y2.0', 'l_paw_y4.0', 'l_paw_y8.0', 
        #             'r_paw_x0.5', 'r_paw_x1.0', 'r_paw_x2.0', 'r_paw_x4.0', 'r_paw_x8.0', 
        #             'r_paw_y0.5', 'r_paw_y1.0', 'r_paw_y2.0', 'r_paw_y4.0', 'r_paw_y8.0']
        #     filename = paw_wavelet_path + "paw_vel_wavelets_" + str(session) + '_'  + mouse_name
        #     design_matrix = pd.read_parquet(filename)
        #     most_likely_states = np.load(open(states_filename, "rb"))
        #     # Rename states to be more intuitive
        #     replace_func = np.vectorize(paw_fix_mapping.get)
        #     most_likely_states = replace_func(most_likely_states)
        #     not_nan_indices = ~np.isnan(np.array(design_matrix[use_vars])).any(axis=1)
        #     assert np.sum(not_nan_indices) == len(most_likely_states)
        #     design_matrix[column_name] = desigresults_pathn_matrix['Bin'] * np.nan
        #     design_matrix[column_name].loc[not_nan_indices] = most_likely_states

        else:
            most_likely_states, _, _ = pickle.load(open(states_filename, "rb"))
            filename = paw_wavelet_path + "paw_vel_wavelets_" + str(session) + '_'  + mouse_name
            design_matrix = pd.read_parquet(filename)
            not_nan_indices = ~np.isnan(np.array(design_matrix)).any(axis=1)
            num_timesteps = np.shape(design_matrix.dropna())[0]
            
            # This was an error, now fixed with a hack
            if var == 'whisker_me':
                num_train_batches = 10
            elif var == 'Lick count':
                num_train_batches = 5
            hmm_length = (num_timesteps // num_train_batches) * num_train_batches
            assert hmm_length == len(most_likely_states)
            use_indices = not_nan_indices[:hmm_length]
            design_matrix[column_name] = design_matrix['Bin'] * np.nan
            rows = design_matrix.index[not_nan_indices][:hmm_length]
            design_matrix.loc[rows, column_name] = most_likely_states

        # Join data
        if v == 0:
            session_states = design_matrix[['Bin', 'Lick count', 'avg_wheel_vel', 'whisker_me', 
                                            'l_paw_x', 'l_paw_y', 'r_paw_x', 'r_paw_y', column_name]]
        else:
            session_states = session_states.merge(design_matrix[['Bin', column_name]], on='Bin', how='outer')
            
    # Transform states into identifiable states
    sets_to_identify = ['whisker_me', 'Lick count']
    session_states = state_identifiability(session_states, sets_to_identify)

    # Combine states
    session_states = session_states.dropna() 
    c_states = np.array(session_states[['whisker_me_states', 'Lick count_states', 'wheel_states']].astype(int))
    combined_states = np.array([''.join(map(str, row)) for row in c_states])
    session_states['identifiable_states'] = combined_states
    
    """ SAVE DATA """       
    # Save unnormalized design matrix
    filename = results_path + "states_matrix" + str(session) + '_'  + mouse_name
    session_states.to_parquet(filename, compression='gzip')  

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  design_matrix[column_name].loc[not_nan_indices] = most_likely_states
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c