In [30]:
import os
import pandas as pd
from pathlib import Path
from harp_resources import process, utils
from analysis_functions import *

In [31]:
mouse_info = {'B2M1': {'sex': 'M', 'area': 'V2M'},
              'B2M4': {'sex': 'M', 'area': 'V2M'},
              'B2M5': {'sex': 'M', 'area': 'V2M'},
              'B2M6': {'sex': 'M', 'area': 'V2M'},
              'B3M1': {'sex': 'M', 'area': 'V2M'},
              'B3M2': {'sex': 'M', 'area': 'V2M'},
              'B3M3': {'sex': 'F', 'area': 'V1'},
              'B3M4': {'sex': 'M', 'area': 'V2M'},
              'B3M5': {'sex': 'M', 'area': 'V2M'},
              'B3M6': {'sex': 'F', 'area': 'V2M'},
              'B3M7': {'sex': 'F', 'area': 'V2M'},
              'B3M8': {'sex': 'F', 'area': 'V2M'},
              'B0M0': {'sex': 'F', 'area': 'V2M'},
             }

session_info = {'220824': 'day1',
                '230824': 'day2',
                '190824': 'day1',
                '200824': 'day2',
                '120824': 'day1',
                '130824': 'day2',
                '070824': 'day1',
                '080824': 'day2',
               }

## Defining paths for grab or G8

In [32]:
rootdir = '/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-Regular_130824'#Enter root path
h5_paths = []
eventpaths = []
for dirpath, subdirs, files in os.walk(rootdir):
    for x in files:
        if '.h5' in x:
            eventpaths.append(dirpath)
            h5_paths.append(dirpath+'/'+x)

In [33]:
# Expression unit testing
'''h5_paths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5',
            '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5']
eventpaths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0',
             '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0']'''

"h5_paths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5',\n            '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5']\neventpaths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0',\n             '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0']"

### Loading data streams

In [41]:

import numpy as np
def load_h5_streams_to_dict(data_paths):
    '''
    Takes list of H5 file paths and, loads streams into dictionary, and save to dictionary named by mouse ID
    '''
    reconstructed_dict = {}  # Dictionary to save streams
    
    for input_file in data_paths:
        
        name = input_file.split('/')[-1][-7:-3]  # Extract mouse ID from file name
        
        if not os.path.exists(input_file):
            print(f'ERROR: {input_file} does not exist.')
            return None
        
        with h5py.File(input_file, 'r') as h5file:
            print(f'reconstructing streams for mouse {name}, from session folder: {input_file.split("/")[-3]}')
            
            common_index = h5file['HARP_timestamps'][:]
            reconstructed_streams = {}
            
            for source_name in h5file.keys():
                if source_name == 'HARP_timestamps':
                    continue
                
                reconstructed_streams[source_name] = {}
                source_group = h5file[source_name]
                
                for stream_name in source_group.keys():
                    stream_data = source_group[stream_name][:]
                    length_difference = len(common_index) - len(stream_data)
                    
                    # Pad or truncate to match common_index length
                    if len(stream_data) < len(common_index):
                        padding = np.full(len(common_index) - len(stream_data), np.nan)
                        stream_data = np.concatenate([stream_data, padding])
                        print(f"{source_name} - {stream_name}: Length difference: {length_difference}")
                        print(f"missing data, advicable to ensure correct alignment \n ")
                    elif len(stream_data) > len(common_index):
                        stream_data = stream_data[:len(common_index)]
                    
                    reconstructed_streams[source_name][stream_name] = pd.Series(data=stream_data, index=common_index)
        if name not in reconstructed_dict.keys():   
            reconstructed_dict[name] = reconstructed_streams
            print(f'  --> {name} streams reconstructed and added to dictionary \n')
        else: 
            reconstructed_dict[f'{name}_2'] = reconstructed_streams
            print(f'  --> {name} streams_2 reconstructed and added to dictionary \n')
    
    return reconstructed_dict




In [42]:
stream_dict_dict = load_h5_streams_to_dict(h5_paths)

/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-Regular_130824/2024-08-13T09-44-04_B2M4/resampled_streams_2024-08-13T09-44-04_B2M4.h5
reconstructing streams for mouse B2M4, from session folder: G8_MMclosed-and-Regular_130824
  --> B2M4 streams reconstructed and added to dictionary 

/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-Regular_130824/2024-08-13T10-36-27_B2M5/resampled_streams_2024-08-13T10-36-27_B2M5.h5
reconstructing streams for mouse B2M5, from session folder: G8_MMclosed-and-Regular_130824
  --> B2M5 streams reconstructed and added to dictionary 

/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-Regular_130824/2024-08-13T11-20-23_B3M1/resampled_streams_2024-08-13T11-20-23_B3M1.h5
reconstructing streams for mouse B3M1, from session folder: G8_MMclosed-and-Regular_130824
  --> B3M1 streams reconstructed and added to dictionary 

/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-Regular_130824/2024-08-13T12-07-21_B3M2/

In [40]:
stream_dict_dict['B2M4']

{'H1': {'OpticalTrackingRead0X(46)': 1.200027e+06   -0.211667
  1.200027e+06   -0.210185
  1.200027e+06   -0.208703
  1.200027e+06   -0.207222
  1.200027e+06   -0.205740
                    ...   
  1.202208e+06    0.000081
  1.202208e+06    0.000061
  1.202208e+06    0.000041
  1.202208e+06    0.000020
  1.202208e+06    0.000000
  Length: 21814748, dtype: float64,
  'OpticalTrackingRead0Y(46)': 1.200027e+06    0.363828
  1.200027e+06    0.363828
  1.200027e+06    0.363828
  1.200027e+06    0.363828
  1.200027e+06    0.363828
                    ...   
  1.202208e+06    0.000005
  1.202208e+06    0.000003
  1.202208e+06    0.000002
  1.202208e+06    0.000001
  1.202208e+06    0.000000
  Length: 21814748, dtype: float64},
 'H2': {},
 'ONIX': {'Photodiode': 1.200027e+06    True
  1.200027e+06    True
  1.200027e+06    True
  1.200027e+06    True
  1.200027e+06    True
                  ... 
  1.202208e+06    True
  1.202208e+06    True
  1.202208e+06    True
  1.202208e+06    True
  1.20

In [37]:
#Make a cut_info dict for the mouse with missing data
#cut_info = {'B2M5': 521, 'B3M2':174}

In [38]:
def make_dataframes(stream_dict_dict, cut_info = {}):
    data_dict = {}
    for mouse, streamdict in stream_dict_dict.items():
        
        print(f'\n--Making dataframe for {mouse}--')
        #Getting fluorescence traces
        try: 
            fluorescence = streamdict['Photometry']['470_dfF'] #Using '470_dfF' only
        except KeyError:
            fluorescence = streamdict['Photometry']['CH1-470']
        print('flourescence 470 extracted')
    
        #Getting mouse movement data and converting to cm / second
        movementX = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0X(46)'])*100
        movementY = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0Y(46)'])*100
        print('movement on x and Y axis extracted')
    
        #Getting eye movements and pupil diameter
        if 'SleapVideoData2' in streamdict:
            eye_center_x = streamdict['SleapVideoData2']['Ellipse.Center.X']
            eye_center_y = streamdict['SleapVideoData2']['Ellipse.Center.Y']
            eye_diameter = streamdict['SleapVideoData2']['Ellipse.Diameter']
            print('eye movement data extracted')
        else: 
            print('There was no eye movement data available for ', mouse)
    
        #Getting visual stimuli event times
        event = streamdict['ONIX']['Photodiode']
        print('photdiode halt info extracted')
        
        time = movementX.index - movementX.index[0]
        print('time in seconds from 0 extracted form X direction movement')
        
        dict = {'470_dfF': fluorescence, 'movementX': movementX, 'movementY': movementY, 'event': event,
            'Seconds': time}
        #dict = {'470_dfF': fluorescence, 'movementX': movementX, 'movementY': movementY, 'event': event,
         #   'TimeStamp': time, 'eye_x': eye_center_x, 'eye_y': eye_center_y, 'pupil_diameter': eye_diameter}
        
        df = pd.DataFrame(dict)
        print('dataframe created with columns: ', df.columns)
        #if mouse in cut_info:
            #df = df.iloc[:-cut_info[mouse]]
        
        df['event'] = df['event'].astype(bool) #In case column is not bool
        #Reversing, so that a halt appearst when 'event'==True
        df['event'] = ~df['event']
        print('Event column as bool, True values corresponding to halts')
        
        df.reset_index(inplace=False)
        
        data_dict[mouse]= df
    return data_dict

In [39]:
data_dict = make_dataframes(stream_dict_dict )
names = [name for name in data_dict]


--Making dataframe for B2M4--
flourescence 470 extracted
movement on x and Y axis extracted
There was no eye movement data available for  B2M4
photdiode halt info extracted
time in seconds from 0 extracted form X direction movement
dataframe created with columns:  Index(['470_dfF', 'movementX', 'movementY', 'event', 'Seconds'], dtype='object')
Event column as bool, True values corresponding to halts

--Making dataframe for B2M5--
flourescence 470 extracted
movement on x and Y axis extracted
There was no eye movement data available for  B2M5
photdiode halt info extracted
time in seconds from 0 extracted form X direction movement
dataframe created with columns:  Index(['470_dfF', 'movementX', 'movementY', 'event', 'Seconds'], dtype='object')
Event column as bool, True values corresponding to halts

--Making dataframe for B3M1--
flourescence 470 extracted
movement on x and Y axis extracted
There was no eye movement data available for  B3M1
photdiode halt info extracted
time in seconds fr

In [11]:
data_dict['B3M3']

Unnamed: 0,470_dfF,movementX,movementY,event,Seconds
1.139104e+06,0.000000,0.000000e+00,0.000000,False,0.0000
1.139104e+06,0.000014,-4.634770e-05,0.000398,False,0.0001
1.139104e+06,0.000027,-9.269540e-05,0.000797,False,0.0002
1.139104e+06,0.000041,-1.390431e-04,0.001195,False,0.0003
1.139104e+06,0.000054,-1.853908e-04,0.001593,False,0.0004
...,...,...,...,...,...
1.141097e+06,5.178327,1.895550e-06,0.000000,False,1992.7267
1.141097e+06,5.177970,1.421663e-06,0.000000,False,1992.7268
1.141097e+06,5.177613,9.477751e-07,0.000000,False,1992.7269
1.141097e+06,5.177256,4.738876e-07,0.000000,False,1992.7270


In [12]:
for mouse, df in data_dict.items():
    percent_true = (len(df.loc[df['event']==True])*100)/len(df)
    print(f'for {mouse} the True values makes up {percent_true:.2f} % of the total df lenght' )
    if percent_true > 50:
        print('This is more than 50 %, which may be too much, consider inversing True/False or check experiment protocol for mouse')

for B2M4 the True values makes up 0.59 % of the total df lenght
for B2M5 the True values makes up 8.72 % of the total df lenght
for B3M1 the True values makes up 7.00 % of the total df lenght
for B3M2 the True values makes up 12.90 % of the total df lenght
for B3M3 the True values makes up 47.53 % of the total df lenght


### Loading Experiment events and session info

In [13]:
event_dict = {}
for eventpath in eventpaths:
    ExpEvents = read_ExperimentEvents(Path(eventpath))
    ExpEvents.set_index('Seconds', inplace = True)
    ExpEvents.index = ExpEvents.index.round(4)
    name = eventpath.split('/')[-1][-4:]
    ExpEvents['experiment'] = eventpath.split('/')[-2].split('_')[1]
    for key, item in session_info.items():
        if key in eventpath.split('/')[-2]:
            ExpEvents['session']=item
    if name not in event_dict.keys():  
        event_dict[name] = ExpEvents
    else:
        event_dict[f'{name}_2'] = ExpEvents


In [14]:
event_dict['B3M3'].head()

Unnamed: 0_level_0,Value,experiment,session
Seconds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1139112.0,LinearNormal block started,MMclosed,day1
1139112.0,Homing platform,MMclosed,day1
1139112.0,Wait for run threshold...,MMclosed,day1
1139121.0,Logging started,MMclosed,day1
1139127.0,Check halt probability,MMclosed,day1


### Adding events (and non-events) and session info to data

In [15]:
data_dict = add_experiment_events(data_dict, event_dict,mouse_info)

Added new ExperimentEvents for B2M4
Added new ExperimentEvents for B2M5
Added new ExperimentEvents for B3M1
Added new ExperimentEvents for B3M2
Added new ExperimentEvents for B3M3


In [16]:
data_dict[names[0]].ExperimentEvents.unique() #Check random mouse to see what events are present

array([nan, 'Sync signal started', 'LinearNormal block started',
       'Homing platform', 'Wait for run threshold...', 'Logging started',
       'Block timer elapsed', 'LinearMismatch block started',
       'Halt delay: 1s', 'Apply halt: 1s'], dtype=object)

In [17]:
data_dict = add_no_halt_column(data_dict, event_dict)

No_halt events added to B2M4
  Correct number of no-halt events for B2M4

No_halt events added to B2M5
  Correct number of no-halt events for B2M5

No_halt events added to B3M1
  Correct number of no-halt events for B3M1

No_halt events added to B3M2
  Correct number of no-halt events for B3M2

No_halt events added to B3M3
  Correct number of no-halt events for B3M3



#### Add block columns

In [18]:
for name, df in data_dict.items():
    print('updating data for ', name,'...')
    blocks_added_df = add_block_columns(df, event_dict[name])
    blocks_added_df.replace({})
    data_dict[name] = blocks_added_df

check_block_overlap(data_dict)

updating data for  B2M4 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started
updating data for  B2M5 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started
updating data for  B3M1 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started
updating data for  B3M2 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started
updating data for  B3M3 ...
LinearNormal block started
LinearRegularMismatch block started
LinearMismatch block started
For B2M4: No overlapping True values, and each _block column has at least one True value
For B2M5: No overlapping True values, and each _block column has at least one True value
For B3M1: No overlapping True values, and each _block column has at least one True value
For B3M2: No overlapping True values, and each _block column has at least one True value
For B3M3: No overlapping True values, and ea

In [19]:
data_dict

{'B2M4':                470_dfF  movementX  movementY  event    Seconds  \
 1.126976e+06  0.000000   0.000000   0.000000  False     0.0000   
 1.126976e+06 -0.000005   0.000314   0.000128  False     0.0001   
 1.126976e+06 -0.000009   0.000627   0.000257  False     0.0002   
 1.126976e+06 -0.000014   0.000941   0.000385  False     0.0003   
 1.126976e+06 -0.000018   0.001254   0.000513  False     0.0004   
 ...                ...        ...        ...    ...        ...   
 1.128949e+06 -0.129943   0.000006   0.000000  False  1972.5663   
 1.128949e+06 -0.128553   0.000005   0.000000  False  1972.5664   
 1.128949e+06 -0.127162   0.000003   0.000000  False  1972.5665   
 1.128949e+06 -0.125772   0.000002   0.000000  False  1972.5666   
 1.128949e+06 -0.124382   0.000000   0.000000  False  1972.5667   
 
              ExperimentEvents Experiment Session mouseID sex area  No_halt  \
 1.126976e+06              NaN   MMclosed    day1    B2M4   M  V2M    False   
 1.126976e+06              N

In [20]:
def downsample_data(df, time_col='Seconds', interval=0.001):
    # Convert the Seconds column to a TimedeltaIndex
    df = df.set_index(pd.to_timedelta(df[time_col], unit='s'))

    # Define aggregation functions for all possible columns
    aggregation_functions = {
        '470_dfF': 'mean',
        'movementX': 'mean',
        'movementY': 'mean',
        'event': 'any',
        'ExperimentEvents': lambda x: x.dropna().iloc[0] if not x.dropna().empty else None,
        'Experiment': 'first',
        'Session': 'first',
        'mouseID': 'first',
        'sex': 'first',
        'area': 'first',
        'No_halt': 'any',
        'LinearMismatch_block': 'any',
        'LinearPlaybackMismatch_block': 'any',
        'LinearRegular_block': 'any',
        'LinearClosedloopMismatch_block':'any',
    }

    # Filter aggregation_functions to only include columns present in df
    aggregation_functions = {key: func for key, func in aggregation_functions.items() if key in df.columns}

    # Resample with the specified interval and apply the filtered aggregations
    downsampled_df = df.resample(f'{interval}s').agg(aggregation_functions)

    # Reset the index to make the Seconds column normal again
    downsampled_df = downsampled_df.reset_index()
    downsampled_df[time_col] = downsampled_df[time_col].dt.total_seconds()  # Convert Timedelta back to seconds

    # Forward fill for categorical columns if needed, only if they exist in downsampled_df
    categorical_cols = ['Experiment', 'Session', 'mouseID', 'sex', 'area']
    for col in categorical_cols:
        if col in downsampled_df.columns:
            downsampled_df[col] = downsampled_df[col].ffill()

    # Remove consecutive duplicate values in the 'ExperimentEvents' column, if it exists
    if 'ExperimentEvents' in downsampled_df.columns:
        downsampled_df['ExperimentEvents'] = downsampled_df['ExperimentEvents'].where(
            downsampled_df['ExperimentEvents'] != downsampled_df['ExperimentEvents'].shift()
        )

    return downsampled_df



In [21]:
def test_event_numbers(downsampled_data, original_data, mouse):
    nohalt_down = len(downsampled_data.loc[downsampled_data['No_halt']==True])
    nohalt_original = len(original_data.loc[original_data['No_halt']==True])
    if nohalt_down != nohalt_original:
        print(f'mouse{mouse}')
        print(f'There are actually {nohalt_original} no-halts, but the downsampled data only contains {nohalt_down}')
        print('Should re-run the downsampling. Try changing interval lenght. Othewise, consider not downsampling\n')
    if nohalt_down == nohalt_original:
        print(f'mouse{mouse}')
        print(f'There are {nohalt_original} no-halts, and downsampled data contains {nohalt_down}\n')
    
    

In [22]:
downsampled_dict = {}
for mouse, df in data_dict.items():
    downsampled_df = downsample_data(data_dict[mouse], time_col='Seconds', interval=0.001)
    downsampled_dict[mouse] = downsampled_df
    test_event_numbers(downsampled_df, df, mouse)


mouseB2M4
There are 23 no-halts, and downsampled data contains 23

mouseB2M5
There are 45 no-halts, and downsampled data contains 45

mouseB3M1
There are 42 no-halts, and downsampled data contains 42

mouseB3M2
There are 58 no-halts, and downsampled data contains 58

mouseB3M3
There are 41 no-halts, and downsampled data contains 41



In [23]:
mouse = 'B3M1'
downsampled_dict[mouse].loc[downsampled_dict[mouse].No_halt == True]

Unnamed: 0,Seconds,470_dfF,movementX,movementY,event,ExperimentEvents,Experiment,Session,mouseID,sex,area,No_halt,LinearMismatch_block
51988,51.988,-1.809708,0.13123,0.028391,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
71731,71.731,1.185743,0.096071,-0.028969,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
100068,100.068,5.754631,0.115885,-0.010176,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
136995,136.995,0.613546,0.10088,0.012168,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
191343,191.343,-3.002144,0.135672,0.024271,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
285423,285.423,2.174369,0.148732,0.055383,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
301623,301.623,2.784171,0.062531,0.028237,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
320405,320.405,6.144165,0.080952,0.040225,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
341855,341.855,4.794115,0.01192,0.003799,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False
372104,372.104,2.303832,0.166803,0.022038,False,Wait for run threshold...,MMclosed,day1,B3M1,M,V2M,True,False


# Set a file name and save

!!! Make sure to change file save names before running the below cell

In [26]:

for mouse, Data in downsampled_dict.items():
    Data = Data.reset_index()
    Data.index.name = 'Time'
    if 'day1' in Data.Session.values:
        Data.to_csv('G8_MMclosed_regular_session1.csv', index=False) #Change name
    if 'day2' in Data.Session.values:
        Data.to_csv('G8_MMclosed_regular_session2.csv', index=False) #Change name
        