# 1.4 State sequence variability

In [2]:

""" 
IMPORTS
"""
import os
import autograd.numpy as np
import seaborn as sns
from collections import defaultdict
import pandas as pd

from one.api import ONE
from pprint import pprint
import matplotlib.pyplot as plt
from scipy.stats import mode

# --Machine learning and statistics
from sklearn.metrics.pairwise import cosine_similarity

# Get my functions
functions_path =  '/home/ines/repositories/representation_learning_variability/Models/Sub-trial//2_fit_models/'
# functions_path = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability//Models/Sub-trial//2_fit_models/'
os.chdir(functions_path)
from preprocessing_functions import idxs_from_files
functions_path =  '/home/ines/repositories/representation_learning_variability/Models/Sub-trial//3_postprocess_results/'
# functions_path = '/Users/ineslaranjeira/Documents/Repositories/representation_learning_variability//Models/Sub-trial//2_fit_models/'
os.chdir(functions_path)
from postprocessing_functions import  define_trial_types, bin_sequence
from plotting_functions import plot_binned_sequence
functions_path =  '/home/ines/repositories/representation_learning_variability/Functions/'
os.chdir(functions_path)
from data_processing import save_and_log
one = ONE(mode='remote')

## Parameters

In [3]:
# Parameters
bin_size = 0.017
num_states = 2
threshold = 0.05
optimal_k = 4

save_path = '/home/ines/repositories/representation_learning_variability/DATA/Sub-trial/Results/'  + str(bin_size) + '/'+str(num_states)+'_states/most_likely_states/'

# LOAD DATA
data_path ='/home/ines/repositories/representation_learning_variability/DATA/Sub-trial/Design matrix/' + 'v5_15Jan2025/' + str(bin_size) + '/'
all_files = os.listdir(data_path)
design_matrices = [item for item in all_files if 'design_matrix' in item and 'standardized' not in item]
idxs, mouse_names = idxs_from_files(design_matrices, bin_size)

states_path =  '/home/ines/repositories/representation_learning_variability/DATA/Sub-trial/Results/' + str(bin_size) + '/'+str(num_states)+'_states/most_likely_states/'
wavelet_states_path = '/home/ines/repositories/representation_learning_variability/DATA/Sub-trial/Results/'  + str(bin_size) + '/wavelet_transform_states/'

path_sets = [wavelet_states_path, states_path, states_path]


# Individual sessions

In [4]:
# Identify sessions available to process
sessions_to_process = []
for m, mat in enumerate(idxs):
    mouse_name = mat[37:]
    session = mat[:36]
    fit_id = str(mouse_name + session)
    whisker_filename = os.path.join(states_path, "most_likely_states" + 'whisker_me' + '_' + fit_id)
    licks_filename = os.path.join(states_path, "most_likely_states" + 'Lick count' + '_' + fit_id)
    wavelet_filename = os.path.join(wavelet_states_path, "most_likely_states_" + str(optimal_k) + '_' + fit_id)

    if os.path.exists(whisker_filename) and os.path.exists(licks_filename) and os.path.exists(wavelet_filename):
        sessions_to_process.append((mouse_name, session))

print(f"Found {len(sessions_to_process)} sessions to process.")

Found 215 sessions to process.


## Get sequences per trial epoch

In [5]:
# Load preprocessed data
results_path = '/home/ines/repositories/representation_learning_variability/Models/Sub-trial/3_postprocess_results/'
filename_states = str(results_path + 'states_trial_type02-21-2025')
states_file = pd.read_parquet(filename_states, engine='pyarrow')

## Trial epoch barcoding

In [8]:
all_sequences

Unnamed: 0,mouse_name,y,binned_sequence


In [9]:
trial_type_agg = ['correct_str', 'contrast_str', 'block_str', 'choice']
# trial_type_agg = ['correct_str']

plot = False
length = 5

all_sequences = pd.DataFrame(columns=['mouse_name', 'y', 'binned_sequence'])
for m, mat in enumerate(sessions_to_process):

    mouse_name = mat[0]
    session = mat[1]
    fit_id = str(mouse_name+session)

    states_trial = states_file.loc[states_file['session']==session]
    num_states = len(states_trial['most_likely_states'].unique())

    """ Define trial types  """
    states_df = define_trial_types(states_trial, trial_type_agg)
    vars = ['sample', 'trial_type', 'broader_label', 'mouse_name', 'most_likely_states']

    df_grouped = states_trial.groupby(vars[:-1])['most_likely_states'].apply(list).reset_index()
    df_grouped.rename(columns={'most_likely_states': 'sequence'}, inplace=True)
    df_grouped['binned_sequence'] = df_grouped['sequence'].apply(lambda seq: bin_sequence(seq, target_length=length))

    if plot == True:
        states_to_append = np.arange(0, num_states)
        for i in range(10):
            plot_binned_sequence(df_grouped, i, states_to_append)

    df_grouped['y'] = df_grouped['trial_type'] + df_grouped['broader_label']
    all_sequences = pd.concat([all_sequences, df_grouped[['mouse_name', 'y', 'binned_sequence']]], ignore_index=True)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  states_trial_type['correct_str'] = states_trial_type['correct']
  states_trial_type.loc[states_trial_type['correct_str']==1., 'correct_str'] = 'correct'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  states_trial_type['contrast_str'] = states_trial_type['contrast'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returni

In [16]:
np.shape(all_sequences['binned_sequence'].apply(lambda x: np.array(x)).to_numpy())

(541350,)

In [12]:
all_sequences['binned_sequence'].astype(np.array)

TypeError: Cannot interpret '<built-in function array>' as a data type

In [None]:
X = np.array([[1, 2, 3], 
              [4, 5, 6], 
              [7, 8, 9]])

# Compute pairwise cosine similarity
cos_sim_matrix = cosine_similarity(X)


In [12]:
len(all_sequences.loc[all_sequences['mouse_name']=='ZFM-02372', 'y'].unique())

223