In [48]:
import os
import pandas as pd
from pathlib import Path
from harp_resources import process, utils
from analysis_functions import *

In [49]:
mouse_info = {'B2M1': {'sex': 'M', 'area': 'V2M'},
              'B2M4': {'sex': 'M', 'area': 'V2M'},
              'B2M5': {'sex': 'M', 'area': 'V2M'},
              'B2M6': {'sex': 'M', 'area': 'V2M'},
              'B3M1': {'sex': 'M', 'area': 'V2M'},
              'B3M2': {'sex': 'M', 'area': 'V2M'},
              'B3M3': {'sex': 'F', 'area': 'V1'},
              'B3M4': {'sex': 'M', 'area': 'V2M'},
              'B3M5': {'sex': 'M', 'area': 'V2M'},
              'B3M6': {'sex': 'F', 'area': 'V2M'},
              'B3M7': {'sex': 'F', 'area': 'V2M'},
              'B3M8': {'sex': 'F', 'area': 'V2M'},
              'B0M0': {'sex': 'F', 'area': 'V2M'},
             }

session_info = {'220824': 'day1',
                '230824': 'day2',
                '190824': 'day1',
                '200824': 'day2',
                '120824': 'day1',
                '130824': 'day2',
                '070824': 'day1',
                '080824': 'day2',
               }

## Defining paths for grab or G8

In [50]:
rootdir = '/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-Regular_130824'#Enter root path
h5_paths = []
eventpaths = []
for dirpath, subdirs, files in os.walk(rootdir):
    for x in files:
        if '.h5' in x:
            eventpaths.append(dirpath)
            h5_paths.append(dirpath+'/'+x)

In [51]:
# Expression unit testing
'''h5_paths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5',
            '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5']
eventpaths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0',
             '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0']'''

"h5_paths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5',\n            '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5']\neventpaths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0',\n             '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0']"

### Loading data streams

In [52]:

import numpy as np
def load_h5_streams_to_dict(data_paths):
    '''
    Takes list of H5 file paths and, loads streams into dictionary, and save to dictionary named by mouse ID
    '''
    reconstructed_dict = {}  # Dictionary to save streams
    
    for input_file in data_paths:
        
        name = input_file.split('/')[-1][-7:-3]  # Extract mouse ID from file name
        
        if not os.path.exists(input_file):
            print(f'ERROR: {input_file} does not exist.')
            return None
        
        with h5py.File(input_file, 'r') as h5file:
            print(f'reconstructing streams for mouse {name}, from session folder: {input_file.split("/")[-3]}')
            
            common_index = h5file['HARP_timestamps'][:]
            reconstructed_streams = {}
            
            for source_name in h5file.keys():
                if source_name == 'HARP_timestamps':
                    continue
                
                reconstructed_streams[source_name] = {}
                source_group = h5file[source_name]
                
                for stream_name in source_group.keys():
                    stream_data = source_group[stream_name][:]
                    length_difference = len(common_index) - len(stream_data)
                    
                    # Pad or truncate to match common_index length
                    if len(stream_data) < len(common_index):
                        padding = np.full(len(common_index) - len(stream_data), np.nan)
                        stream_data = np.concatenate([stream_data, padding])
                        print(f"{source_name} - {stream_name}: Length difference: {length_difference}")
                        print(f"missing data, advicable to ensure correct alignment \n ")
                    elif len(stream_data) > len(common_index):
                        stream_data = stream_data[:len(common_index)]
                    
                    reconstructed_streams[source_name][stream_name] = pd.Series(data=stream_data, index=common_index)
        if name not in reconstructed_dict.keys():   
            reconstructed_dict[name] = reconstructed_streams
            print(f'  --> {name} streams reconstructed and added to dictionary \n')
        else: 
            reconstructed_dict[f'{name}_2'] = reconstructed_streams
            print(f'  --> {name} streams_2 reconstructed and added to dictionary \n')
    
    return reconstructed_dict




In [53]:
stream_dict_dict = load_h5_streams_to_dict(h5_paths)

reconstructing streams for mouse B2M4, from session folder: G8_MMclosed-and-Regular_130824
  --> B2M4 streams reconstructed and added to dictionary 

reconstructing streams for mouse B2M5, from session folder: G8_MMclosed-and-Regular_130824
  --> B2M5 streams reconstructed and added to dictionary 

reconstructing streams for mouse B3M1, from session folder: G8_MMclosed-and-Regular_130824
  --> B3M1 streams reconstructed and added to dictionary 

reconstructing streams for mouse B3M2, from session folder: G8_MMclosed-and-Regular_130824
  --> B3M2 streams reconstructed and added to dictionary 



In [54]:
stream_dict_dict.keys()

dict_keys(['B2M4', 'B2M5', 'B3M1', 'B3M2'])

In [55]:
#Make a cut_info dict for the mouse with missing data
#cut_info = {'B2M5': 521, 'B3M2':174}

In [56]:
def make_dataframes(stream_dict_dict, cut_info = {}):
    data_dict = {}
    for mouse, streamdict in stream_dict_dict.items():
        
        print(f'\n--Making dataframe for {mouse}--')
        #Getting fluorescence traces
        try: 
            fluorescence = streamdict['Photometry']['470_dfF'] #Using '470_dfF' only
        except KeyError:
            fluorescence = streamdict['Photometry']['CH1-470']
        print('flourescence 470 extracted')
    
        #Getting mouse movement data and converting to cm / second
        movementX = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0X(46)'])*100
        movementY = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0Y(46)'])*100
        print('movement on x and Y axis extracted')
    
        #Getting eye movements and pupil diameter
        if 'SleapVideoData2' in streamdict:
            eye_center_x = streamdict['SleapVideoData2']['Ellipse.Center.X']
            eye_center_y = streamdict['SleapVideoData2']['Ellipse.Center.Y']
            eye_diameter = streamdict['SleapVideoData2']['Ellipse.Diameter']
            print('eye movement data extracted')
        else: 
            print('There was no eye movement data available for ', mouse)
    
        #Getting visual stimuli event times
        event = streamdict['ONIX']['Photodiode']
        print('photdiode halt info extracted')
        
        time = movementX.index - movementX.index[0]
        print('time in seconds from 0 extracted form X direction movement')
        
        dict = {'470_dfF': fluorescence, 'movementX': movementX, 'movementY': movementY, 'event': event,
            'Seconds': time}
        #dict = {'470_dfF': fluorescence, 'movementX': movementX, 'movementY': movementY, 'event': event,
         #   'TimeStamp': time, 'eye_x': eye_center_x, 'eye_y': eye_center_y, 'pupil_diameter': eye_diameter}
        
        df = pd.DataFrame(dict)
        print('dataframe created with columns: ', df.columns)
        #if mouse in cut_info:
            #df = df.iloc[:-cut_info[mouse]]
        
        df['event'] = df['event'].astype(bool) #In case column is not bool
        #Reversing, so that a halt appearst when 'event'==True
        df['event'] = ~df['event']
        print('Event column as bool, True values corresponding to halts')
        
        df.reset_index(inplace=False)
        
        data_dict[mouse]= df
    return data_dict

In [57]:
data_dict = make_dataframes(stream_dict_dict )
names = [name for name in data_dict]


--Making dataframe for B2M4--
flourescence 470 extracted
movement on x and Y axis extracted
There was no eye movement data available for  B2M4
photdiode halt info extracted
time in seconds from 0 extracted form X direction movement
dataframe created with columns:  Index(['470_dfF', 'movementX', 'movementY', 'event', 'Seconds'], dtype='object')
Event column as bool, True values corresponding to halts

--Making dataframe for B2M5--
flourescence 470 extracted
movement on x and Y axis extracted
There was no eye movement data available for  B2M5
photdiode halt info extracted
time in seconds from 0 extracted form X direction movement
dataframe created with columns:  Index(['470_dfF', 'movementX', 'movementY', 'event', 'Seconds'], dtype='object')
Event column as bool, True values corresponding to halts

--Making dataframe for B3M1--
flourescence 470 extracted
movement on x and Y axis extracted
There was no eye movement data available for  B3M1
photdiode halt info extracted
time in seconds fr

In [58]:
data_dict[names[0]]

Unnamed: 0,470_dfF,movementX,movementY,event,Seconds
1.200027e+06,0.000000,-4.480278e-03,7.701030e-03,False,0.0000
1.200027e+06,0.000002,-4.448916e-03,7.701030e-03,False,0.0001
1.200027e+06,0.000004,-4.417554e-03,7.701030e-03,False,0.0002
1.200027e+06,0.000005,-4.386192e-03,7.701030e-03,False,0.0003
1.200027e+06,0.000007,-4.354830e-03,7.701030e-03,False,0.0004
...,...,...,...,...,...
1.202208e+06,-0.735011,1.715617e-06,9.660696e-08,False,2181.4743
1.202208e+06,-0.736111,1.286712e-06,7.245522e-08,False,2181.4744
1.202208e+06,-0.737211,8.578083e-07,4.830348e-08,False,2181.4745
1.202208e+06,-0.738310,4.289041e-07,2.415174e-08,False,2181.4746


In [59]:
for mouse, df in data_dict.items():
    percent_true = (len(df.loc[df['event']==True])*100)/len(df)
    print(f'for {mouse} the True values makes up {percent_true:.2f} % of the total df lenght' )
    if percent_true > 50:
        print('This is more than 50 %, which may be too much, consider inversing True/False or check experiment protocol for mouse')

for B2M4 the True values makes up 7.72 % of the total df lenght
for B2M5 the True values makes up 10.28 % of the total df lenght
for B3M1 the True values makes up 5.94 % of the total df lenght
for B3M2 the True values makes up 11.86 % of the total df lenght


### Loading Experiment events and session info

In [60]:
event_dict = {}
for eventpath in eventpaths:
    ExpEvents = read_ExperimentEvents(Path(eventpath))
    ExpEvents.set_index('Seconds', inplace = True)
    ExpEvents.index = ExpEvents.index.round(4)
    name = eventpath.split('/')[-1][-4:]
    ExpEvents['experiment'] = eventpath.split('/')[-2].split('_')[1]
    for key, item in session_info.items():
        if key in eventpath.split('/')[-2]:
            ExpEvents['session']=item
    if name not in event_dict.keys():  
        event_dict[name] = ExpEvents
    else:
        event_dict[f'{name}_2'] = ExpEvents


In [61]:
event_dict[names[0]].head(10)

Unnamed: 0_level_0,Value,experiment,session
Seconds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1200041.0,Sync signal started,MMclosed-and-Regular,day2
1200048.0,LinearNormal block started,MMclosed-and-Regular,day2
1200048.0,Homing platform,MMclosed-and-Regular,day2
1200049.0,Wait for run threshold...,MMclosed-and-Regular,day2
1200075.0,Check halt probability,MMclosed-and-Regular,day2
1200075.0,No halt,MMclosed-and-Regular,day2
1200075.0,Wait for run threshold...,MMclosed-and-Regular,day2
1200113.0,Check halt probability,MMclosed-and-Regular,day2
1200113.0,No halt,MMclosed-and-Regular,day2
1200113.0,Wait for run threshold...,MMclosed-and-Regular,day2


### Adding events (and non-events) and session info to data
1) add_experiment_events() takes the event data, and inserts the values for every timepoint into an event column in the main data
* Sometimes, there are multiple event strings during the same timepoint. Then, if a crucial event takes place during this, this one will be priotized
* The crucial events can be defined from strings they contain in the first line of the add_experiment_events function.
* In no crucial event are at that timepoint, all the events will be assinged to the timepoint in the main df as one string, seperated by a comma (use .split(',') later if it becomes necessary to seperate them during analysis)

2) The No_halt events are used to make a column where the no-halt events are used to make a bool similar to the 'event' (halt) column
   * These can later be used as control as they appear when there could have been a halt but there was none
   * The number of no-halt events is controlled to ensure that all of them were actually used.

In [62]:
data_dict = add_experiment_events(data_dict, event_dict,mouse_info)

Added new ExperimentEvents for B2M4
Added new ExperimentEvents for B2M5
Added new ExperimentEvents for B3M1
Added new ExperimentEvents for B3M2


In [63]:
data_dict[names[0]].ExperimentEvents.unique() #Check random mouse to see what events are present

array([nan, 'Sync signal started', 'LinearNormal block started',
       'Homing platform', 'Wait for run threshold...',
       'Check halt probability', 'Block timer elapsed',
       'LinearRegularMismatch block started', 'Halt delay: 1s',
       'Not running skipping halt', 'Apply halt: 1s',
       'LinearMismatch block started', 'No halt'], dtype=object)

In [64]:
data_dict = add_no_halt_column(data_dict, event_dict)

No_halt events added to B2M4
  Correct number of no-halt events for B2M4

No_halt events added to B2M5
  Correct number of no-halt events for B2M5

No_halt events added to B3M1
  Correct number of no-halt events for B3M1

No_halt events added to B3M2
  Correct number of no-halt events for B3M2



#### Add block columns
For each mouse and corresponding df, update the df to include columns for each block of the experiment for easy slicing later in analysis.
The add_block_columns() function will also test if each of the created block columns contains at least one True value and that there are no temporal overlaps 

In [65]:
for name, df in data_dict.items():
    print('\n updating data for ', name,'...')
    blocks_added_df = add_block_columns(df, event_dict[name])
    blocks_added_df.replace({})
    data_dict[name] = blocks_added_df

check_block_overlap(data_dict)


 updating data for  B2M4 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started

 updating data for  B2M5 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started

 updating data for  B3M1 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started

 updating data for  B3M2 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started
For B2M4: No overlapping True values, and each _block column has at least one True value
For B2M5: No overlapping True values, and each _block column has at least one True value
For B3M1: No overlapping True values, and each _block column has at least one True value
For B3M2: No overlapping True values, and each _block column has at least one True value


### The downsample_data function can be used to make the datset smaller. 
1) Make an empty dict to fill with downsampled versions of the dfs.
2) Loop though the mice and dfs, and use the function for each
3) Set the name of the time column to use and decide on the frequency of the output df datapoints
        * Ensure that all the columns that you want to keep has a corresponding dict key in aggregation_functions in the downsample_data() funciton. 
4) Assign the resulting df to the corresponding mouse
5) test_event_numbers as a way to test if all events (no-halt events as they are frequent, can be changed) survived the downsampling. 

NB: Can be slow with large datasets, check your email

In [41]:
def downsample_data(df, time_col='Seconds', interval=0.001):
    '''
    Uses pandas resample and aggregate functions to downsample the data to the desired interval. 
    * Note: Aggregation functions must be applied for each variable that is to be included.
    https://pandas.pydata.org/docs/reference/api/pandas.core.resample.Resampler.aggregate.html
    * Note: because the donwsampling keeps the first non-NaN value in each interval, some values could be lost.
    '''
    # Convert the Seconds column to a TimedeltaIndex
    df = df.set_index(pd.to_timedelta(df[time_col], unit='s'))

    #define aggregation functions for all possible columns
    aggregation_functions = {
        '470_dfF': 'mean', # takes the mean signal of the datapoints going into each new downsampled datapoint
        '560_dfF': 'mean',
        'movementX': 'mean',
        'movementY': 'mean',
        'event': 'any', # events column is a bool, and if there is any True values in the interval, the downsampled datapoint will be True
        'ExperimentEvents': lambda x: x.dropna().iloc[0] if not x.dropna().empty else None, #first non-NaN value in the interval 
        'Experiment': 'first', # All values should be the same, so it can always just take the first string value
        'Session': 'first',
        'mouseID': 'first',
        'sex': 'first',
        'area': 'first',
        'No_halt': 'any', 
        'LinearMismatch_block': 'any', 
        'LinearPlaybackMismatch_block': 'any',
        'LinearRegular_block': 'any',
        'LinearClosedloopMismatch_block':'any',
        'LinearRegularMismatch_block':'any',
        'LinearNormal_block':'any',
    }

    # Filter aggregation_functions to only include columns present in df
    aggregation_functions = {key: func for key, func in aggregation_functions.items() if key in df.columns}

    print('downsampling...')
    # Resample with the specified interval and apply the filtered aggregations
    downsampled_df = df.resample(f'{interval}s').agg(aggregation_functions)

    # Reset the index to make the Seconds column normal again
    downsampled_df = downsampled_df.reset_index()
    downsampled_df[time_col] = downsampled_df[time_col].dt.total_seconds()  # Convert Timedelta back to seconds

    # Forward fill for categorical columns if needed, only if they exist in downsampled_df
    categorical_cols = ['Experiment', 'Session', 'mouseID', 'sex', 'area']
    for col in categorical_cols:
        if col in downsampled_df.columns:
            downsampled_df[col] = downsampled_df[col].ffill()

    # Remove consecutive duplicate values in the 'ExperimentEvents' column, if it exists
    if 'ExperimentEvents' in downsampled_df.columns:
        downsampled_df['ExperimentEvents'] = downsampled_df['ExperimentEvents'].where(
            downsampled_df['ExperimentEvents'] != downsampled_df['ExperimentEvents'].shift()
        )

    return downsampled_df

In [42]:
def test_event_numbers(downsampled_data, original_data, mouse):
    '''
    Counts number of True values in the No_halt columns in the original and the downsampled data
    This will indicate whether information was lost in the downsampling.
    If the original events somehow has been upsampled previously (for example if the tolerance was set too high in add_experiment_events()), 
    repeatings of the same event can also lead to fewer True events in the downsampled df.
    '''
    nohalt_down = len(downsampled_data.loc[downsampled_data['No_halt']==True])
    nohalt_original = len(original_data.loc[original_data['No_halt']==True])
    if nohalt_down != nohalt_original:
        print(f'mouse{mouse}')
        print(f'There are actually {nohalt_original} no-halts, but the downsampled data only contains {nohalt_down}')
        print('Should re-run the downsampling. Try changing interval lenght. Othewise, consider not downsampling\n')
    if nohalt_down == nohalt_original:
        print(f'mouse{mouse}')
        print(f'There are {nohalt_original} no-halts, and downsampled data contains {nohalt_down}\n')


In [66]:
downsampled_dict = {}
for mouse, df in data_dict.items():
    downsampled_df = downsample_data(df, time_col='Seconds', interval=0.001)
    downsampled_dict[mouse] = downsampled_df
    test_event_numbers(downsampled_df, df, mouse)


downsampling...
mouseB2M4
There are 46 no-halts, and downsampled data contains 46

downsampling...
mouseB2M5
There are 50 no-halts, and downsampled data contains 50

downsampling...
mouseB3M1
There are 14 no-halts, and downsampled data contains 14

downsampling...
mouseB3M2
There are 52 no-halts, and downsampled data contains 52



In [71]:

downsampled_dict[names[0]].loc[downsampled_dict[names[0]].No_halt == True]

Unnamed: 0,Seconds,470_dfF,movementX,movementY,event,ExperimentEvents,Experiment,Session,mouseID,sex,area,No_halt,LinearMismatch_block,LinearRegularMismatch_block,LinearNormal_block
48580,48.58,-0.194849,0.106138,0.116581,False,Wait for run threshold...,MMclosed-and-Regular,day2,B2M4,M,V2M,True,False,False,True
85780,85.78,-2.623289,0.071801,-0.001758,False,Wait for run threshold...,MMclosed-and-Regular,day2,B2M4,M,V2M,True,False,False,True
119984,119.984,3.988452,0.088557,0.032794,False,Check halt probability,MMclosed-and-Regular,day2,B2M4,M,V2M,True,False,False,True
151000,151.0,-0.317275,0.102905,-0.018893,False,Wait for run threshold...,MMclosed-and-Regular,day2,B2M4,M,V2M,True,False,False,True
189160,189.16,2.738187,0.040383,0.013785,False,Wait for run threshold...,MMclosed-and-Regular,day2,B2M4,M,V2M,True,False,False,True
245820,245.82,0.684096,0.099785,0.047015,False,Wait for run threshold...,MMclosed-and-Regular,day2,B2M4,M,V2M,True,False,False,True
507071,507.071,3.28047,0.111897,-0.008445,False,Check halt probability,MMclosed-and-Regular,day2,B2M4,M,V2M,True,False,False,True
637340,637.34,1.778559,-0.010813,0.007701,False,Wait for run threshold...,MMclosed-and-Regular,day2,B2M4,M,V2M,True,False,False,True
1308540,1308.54,-2.215302,0.13549,-0.028006,False,Check halt probability,MMclosed-and-Regular,day2,B2M4,M,V2M,True,True,False,False
1318744,1318.744,-0.809255,0.158008,0.01598,False,Wait for run threshold...,MMclosed-and-Regular,day2,B2M4,M,V2M,True,True,False,False


### Concat and reindex

In [72]:
All_data = pd.concat([Data for Data in downsampled_dict.values()])


# Set a file name and save

!!! Make sure to change file save names before running the below cell

In [73]:
All_data

Unnamed: 0,Seconds,470_dfF,movementX,movementY,event,ExperimentEvents,Experiment,Session,mouseID,sex,area,No_halt,LinearMismatch_block,LinearRegularMismatch_block,LinearNormal_block
0,0.000,0.000008,-0.004339,0.007701,False,,MMclosed-and-Regular,day2,B2M4,M,V2M,False,False,False,False
1,0.001,0.000026,-0.004026,0.007701,False,,MMclosed-and-Regular,day2,B2M4,M,V2M,False,False,False,False
2,0.002,0.000045,-0.003712,0.007701,False,,MMclosed-and-Regular,day2,B2M4,M,V2M,False,False,False,False
3,0.003,0.000063,-0.003398,0.007701,False,,MMclosed-and-Regular,day2,B2M4,M,V2M,False,False,False,False
4,0.004,0.000081,-0.003085,0.007701,False,,MMclosed-and-Regular,day2,B2M4,M,V2M,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1928186,1928.186,0.902084,0.000014,-0.000072,False,,MMclosed-and-Regular,day2,B3M2,M,V2M,False,False,False,False
1928187,1928.187,0.926105,0.000011,-0.000056,False,,MMclosed-and-Regular,day2,B3M2,M,V2M,False,False,False,False
1928188,1928.188,0.950127,0.000008,-0.000039,False,,MMclosed-and-Regular,day2,B3M2,M,V2M,False,False,False,False
1928189,1928.189,0.974148,0.000005,-0.000023,False,,MMclosed-and-Regular,day2,B3M2,M,V2M,False,False,False,False


In [74]:

if 'day1' in All_data.Session.values:
    All_data.to_csv('Mismatch_analysis/G8_MMclosed_regular_session1.csv', index=False) #Change name
if 'day2' in All_data.Session.values:
    All_data.to_csv('Mismatch_analysis/G8_MMclosed_regular_session2.csv', index=False) #Change name
    


In [None]:
All_data