In [29]:
import os
import pandas as pd
from pathlib import Path
from harp_resources import process, utils
from analysis_functions import *

In [30]:
mouse_info = {'B2M1': {'sex': 'M', 'area': 'V2M'},
              'B2M4': {'sex': 'M', 'area': 'V2M'},
              'B2M5': {'sex': 'M', 'area': 'V2M'},
              'B2M6': {'sex': 'M', 'area': 'V2M'},
              'B3M1': {'sex': 'M', 'area': 'V2M'},
              'B3M2': {'sex': 'M', 'area': 'V2M'},
              'B3M3': {'sex': 'F', 'area': 'V1'},
              'B3M4': {'sex': 'M', 'area': 'V2M'},
              'B3M5': {'sex': 'M', 'area': 'V2M'},
              'B3M6': {'sex': 'F', 'area': 'V2M'},
              'B3M7': {'sex': 'F', 'area': 'V2M'},
              'B3M8': {'sex': 'F', 'area': 'V2M'},
              'B0M0': {'sex': 'F', 'area': 'V2M'},
             }

session_info = {'220824': 'day1',
                '230824': 'day2',
                '190824': 'day1',
                '200824': 'day2',
                '120824': 'day1',
                '130824': 'day2',
                '070824': 'day1',
                '080824': 'day2',
               }

## Defining paths for grab or G8

In [31]:
rootdir = '/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-open_080824' #Enter root path
rootdir = 
h5_paths = []
eventpaths = []
for dirpath, subdirs, files in os.walk(rootdir):
    for x in files:
        if '.h5' in x:
            eventpaths.append(dirpath)
            h5_paths.append(dirpath+'/'+x)

SyntaxError: invalid syntax (4024593825.py, line 2)

In [32]:
# Expression unit testing
h5_paths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5',
            '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5']
eventpaths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0',
             '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0']

### Loading data streams

In [33]:

import numpy as np
def load_h5_streams_to_dict(data_paths):
    '''
    Takes list of H5 file paths and, loads streams into dictionary, and save to dictionary named by mouse ID
    '''
    reconstructed_dict = {}  # Dictionary to save streams
    
    for input_file in data_paths:
        name = input_file.split('/')[-1][-7:-3]  # Extract mouse ID from file name
        
        if not os.path.exists(input_file):
            print(f'ERROR: {input_file} does not exist.')
            return None
        
        with h5py.File(input_file, 'r') as h5file:
            print(f'reconstructing streams for mouse {name}, from session folder: {input_file.split("/")[-3]}')
            
            common_index = h5file['HARP_timestamps'][:]
            reconstructed_streams = {}
            
            for source_name in h5file.keys():
                if source_name == 'HARP_timestamps':
                    continue
                
                reconstructed_streams[source_name] = {}
                source_group = h5file[source_name]
                
                for stream_name in source_group.keys():
                    stream_data = source_group[stream_name][:]
                    length_difference = len(common_index) - len(stream_data)
                    
                    # Pad or truncate to match common_index length
                    if len(stream_data) < len(common_index):
                        padding = np.full(len(common_index) - len(stream_data), np.nan)
                        stream_data = np.concatenate([stream_data, padding])
                        print(f"{source_name} - {stream_name}: Length difference: {length_difference}")
                        print(f"missing data, advicable to ensure correct alignment \n ")
                    elif len(stream_data) > len(common_index):
                        stream_data = stream_data[:len(common_index)]
                    
                    reconstructed_streams[source_name][stream_name] = pd.Series(data=stream_data, index=common_index)
        if name not in reconstructed_dict.keys():   
            reconstructed_dict[name] = reconstructed_streams
            print(f'  --> {name} streams reconstructed and added to dictionary \n')
        else: 
            reconstructed_dict[f'{name}_2'] = reconstructed_streams
            print(f'  --> {name} streams_2 reconstructed and added to dictionary \n')
    
    return reconstructed_dict




In [34]:
stream_dict_dict = load_h5_streams_to_dict(h5_paths)

reconstructing streams for mouse B0M0, from session folder: Fake_data_080824
  --> B0M0 streams reconstructed and added to dictionary 

reconstructing streams for mouse B0M0, from session folder: Fake_data_070824
  --> B0M0 streams_2 reconstructed and added to dictionary 



In [46]:
stream_dict_dict['B0M0']

{'H1': {'OpticalTrackingRead0X(46)': 762869.6790    0.000000
  762869.6791    0.000010
  762869.6792    0.000021
  762869.6793    0.000031
  762869.6794    0.000042
                   ...   
  765063.2686   -0.367112
  765063.2687   -0.367121
  765063.2688   -0.367131
  765063.2689   -0.367141
  765063.2690   -0.367151
  Length: 21935901, dtype: float64,
  'OpticalTrackingRead0Y(46)': 762869.6790    0.0
  762869.6791    1.0
  762869.6792    1.0
  762869.6793    1.0
  762869.6794    1.0
                ... 
  765063.2686    0.0
  765063.2687    0.0
  765063.2688    0.0
  765063.2689    0.0
  765063.2690    0.0
  Length: 21935901, dtype: float64},
 'H2': {},
 'ONIX': {'Photodiode': 762869.6790    True
  762869.6791    True
  762869.6792    True
  762869.6793    True
  762869.6794    True
                 ... 
  765063.2686    True
  765063.2687    True
  765063.2688    True
  765063.2689    True
  765063.2690    True
  Length: 21935901, dtype: bool},
 'Photometry': {'CH1-470': 762869.679

In [47]:
#Make a cut_info dict for the mouse with missing data
#cut_info = {'B2M5': 521, 'B3M2':174}

In [48]:
def make_dataframes(stream_dict_dict, cut_info = {}):
    data_dict = {}
    for mouse, streamdict in stream_dict_dict.items():
        
        print(f'\n--Making dataframe for {mouse}--')
        #Getting fluorescence traces
        try: 
            fluorescence = streamdict['Photometry']['470_dfF'] #Using '470_dfF' only
        except KeyError:
            fluorescence = streamdict['Photometry']['CH1-470']
        print('flourescence 470 extracted')
    
        #Getting mouse movement data and converting to cm / second
        movementX = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0X(46)'])*100
        movementY = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0Y(46)'])*100
        print('movement on x and Y axis extracted')
    
        #Getting eye movements and pupil diameter
        if 'SleapVideoData2' in streamdict:
            eye_center_x = streamdict['SleapVideoData2']['Ellipse.Center.X']
            eye_center_y = streamdict['SleapVideoData2']['Ellipse.Center.Y']
            eye_diameter = streamdict['SleapVideoData2']['Ellipse.Diameter']
            print('eye movement data extracted')
        else: 
            print('There was no eye movement data available for ', mouse)
    
        #Getting visual stimuli event times
        event = streamdict['ONIX']['Photodiode']
        print('photdiode halt info extracted')
        
        time = movementX.index - movementX.index[0]
        print('time in seconds from 0 extracted form X direction movement')
        
        dict = {'470_dfF': fluorescence, 'movementX': movementX, 'movementY': movementY, 'event': event,
            'Seconds': time}
        #dict = {'470_dfF': fluorescence, 'movementX': movementX, 'movementY': movementY, 'event': event,
         #   'TimeStamp': time, 'eye_x': eye_center_x, 'eye_y': eye_center_y, 'pupil_diameter': eye_diameter}
        
        df = pd.DataFrame(dict)
        print('dataframe created with columns: ', df.columns)
        #if mouse in cut_info:
            #df = df.iloc[:-cut_info[mouse]]
        
        df['event'] = df['event'].astype(bool) #In case column is not bool
        #Reversing, so that a halt appearst when 'event'==True
        df['event'] = ~df['event']
        print('Event column as bool, True values corresponding to halts')
        
        df.reset_index(inplace=False)
        
        data_dict[mouse]= df
    return data_dict

In [49]:
data_dict = make_dataframes(stream_dict_dict )
names = [name for name in data_dict]


--Making dataframe for B0M0--
flourescence 470 extracted
movement on x and Y axis extracted
There was no eye movement data available for  B0M0
photdiode halt info extracted
time in seconds from 0 extracted form X direction movement
dataframe created with columns:  Index(['470_dfF', 'movementX', 'movementY', 'event', 'Seconds'], dtype='object')
Event column as bool, True values corresponding to halts

--Making dataframe for B0M0_2--
flourescence 470 extracted
movement on x and Y axis extracted
There was no eye movement data available for  B0M0_2
photdiode halt info extracted
time in seconds from 0 extracted form X direction movement
dataframe created with columns:  Index(['470_dfF', 'movementX', 'movementY', 'event', 'Seconds'], dtype='object')
Event column as bool, True values corresponding to halts


In [50]:
data_dict['B0M0']

Unnamed: 0,470_dfF,movementX,movementY,event,Seconds
762869.6790,0.000000,0.000000e+00,0.000000,False,0.0000
762869.6791,0.000008,2.216568e-07,0.021167,False,0.0001
762869.6792,0.000016,4.433136e-07,0.021167,False,0.0002
762869.6793,0.000024,6.649704e-07,0.021167,False,0.0003
762869.6794,0.000031,8.866273e-07,0.021167,False,0.0004
...,...,...,...,...,...
765063.2686,0.482469,-7.770532e-03,0.000000,False,2193.5896
765063.2687,0.482462,-7.770738e-03,0.000000,False,2193.5897
765063.2688,0.482456,-7.770944e-03,0.000000,False,2193.5898
765063.2689,0.482449,-7.771150e-03,0.000000,False,2193.5899


In [51]:
for mouse, df in data_dict.items():
    percent_true = (len(df.loc[df['event']==True])*100)/len(df)
    print(f'for {mouse} the True values makes up {percent_true:.2f} % of the total df lenght' )
    if percent_true > 50:
        print('This is more than 50 %, which may be too much, consider inversing True/False or check experiment protocol for mouse')

for B0M0 the True values makes up 4.00 % of the total df lenght
for B0M0_2 the True values makes up 2.00 % of the total df lenght


### Loading Experiment events and session info

In [52]:
event_dict = {}
for eventpath in eventpaths:
    ExpEvents = read_ExperimentEvents(Path(eventpath))
    ExpEvents.set_index('Seconds', inplace = True)
    ExpEvents.index = ExpEvents.index.round(4)
    name = eventpath.split('/')[-1][-4:]
    ExpEvents['experiment'] = eventpath.split('/')[-2].split('_')[1]
    for key, item in session_info.items():
        if key in eventpath.split('/')[-2]:
            ExpEvents['session']=item
    if name not in event_dict.keys():  
        event_dict[name] = ExpEvents
    else:
        event_dict[f'{name}_2'] = ExpEvents


In [53]:
event_dict['B0M0'].head()

Unnamed: 0_level_0,Value,experiment,session
Seconds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
762890.227,No halt,data,day2
762946.7576,No halt,data,day2
763052.2132,No halt,data,day2
763296.9694,Block timer elapsed,data,day2
763482.422,No halt,data,day2


### Adding events (and non-events) and session info to data

In [17]:
data_dict = add_experiment_events(data_dict, event_dict,mouse_info)

Added new ExperimentEvents for B0M0
Added new ExperimentEvents for B0M0_2


In [18]:
data_dict[names[0]].ExperimentEvents.unique() #Check random mouse to see what events are present

array([nan, 'No halt', 'LinearClosedloopMismatch block started',
       'No_halt', 'Block timer elapsed',
       'LinearPlaybackMismatch block started'], dtype=object)

In [19]:
data_dict = add_no_halt_column(data_dict, event_dict)

No_halt events added to B0M0
  Correct number of no-halt events for B0M0

No_halt events added to B0M0_2
  Correct number of no-halt events for B0M0_2



#### Add block columns

In [20]:
for name, df in data_dict.items():
    print('updating data for ', name,'...')
    blocks_added_df = add_block_columns(df, event_dict[name])
    blocks_added_df.replace({})
    data_dict[name] = blocks_added_df

check_block_overlap(data_dict)

updating data for  B0M0 ...
LinearClosedloopMismatch block started
LinearPlaybackMismatch block started
LinearClosedloopMismatch block started
LinearPlaybackMismatch block started
updating data for  B0M0_2 ...
LinearClosedloopMismatch block started
LinearPlaybackMismatch block started
For B0M0: No overlapping True values, and each _block column has at least one True value
For B0M0_2: No overlapping True values, and each _block column has at least one True value


In [21]:
data_dict

{'B0M0':               470_dfF     movementX  movementY  event    Seconds  \
 762869.6790  0.000000  0.000000e+00   0.000000  False     0.0000   
 762869.6791  0.000008  2.216568e-07   0.021167  False     0.0001   
 762869.6792  0.000016  4.433136e-07   0.021167  False     0.0002   
 762869.6793  0.000024  6.649704e-07   0.021167  False     0.0003   
 762869.6794  0.000031  8.866273e-07   0.021167  False     0.0004   
 ...               ...           ...        ...    ...        ...   
 765063.2686  0.482469 -7.770532e-03   0.000000  False  2193.5896   
 765063.2687  0.482462 -7.770738e-03   0.000000  False  2193.5897   
 765063.2688  0.482456 -7.770944e-03   0.000000  False  2193.5898   
 765063.2689  0.482449 -7.771150e-03   0.000000  False  2193.5899   
 765063.2690  0.482442 -7.771356e-03   0.000000  False  2193.5900   
 
             ExperimentEvents Experiment Session mouseID sex area  No_halt  \
 762869.6790              NaN       data    day2    B0M0   F  V2M    False   
 76286

In [23]:
def downsample_data(df, time_col='Seconds', interval=0.001):
    # Convert the Seconds column to a TimedeltaIndex
    df = df.set_index(pd.to_timedelta(df[time_col], unit='s'))

    # Define aggregation functions for all possible columns
    aggregation_functions = {
        '470_dfF': 'mean',
        'movementX': 'mean',
        'movementY': 'mean',
        'event': 'any',
        'ExperimentEvents': lambda x: x.dropna().iloc[0] if not x.dropna().empty else None,
        'Experiment': 'first',
        'Session': 'first',
        'mouseID': 'first',
        'sex': 'first',
        'area': 'first',
        'No_halt': 'any',
        'LinearMismatch_block': 'any',
        'LinearPlaybackMismatch_block': 'any',
        'LinearRegular_block': 'any',
        'LinearClosedloopMismatch_block':'any',
    }

    # Filter aggregation_functions to only include columns present in df
    aggregation_functions = {key: func for key, func in aggregation_functions.items() if key in df.columns}

    # Resample with the specified interval and apply the filtered aggregations
    downsampled_df = df.resample(f'{interval}s').agg(aggregation_functions)

    # Reset the index to make the Seconds column normal again
    downsampled_df = downsampled_df.reset_index()
    downsampled_df[time_col] = downsampled_df[time_col].dt.total_seconds()  # Convert Timedelta back to seconds

    # Forward fill for categorical columns if needed, only if they exist in downsampled_df
    categorical_cols = ['Experiment', 'Session', 'mouseID', 'sex', 'area']
    for col in categorical_cols:
        if col in downsampled_df.columns:
            downsampled_df[col] = downsampled_df[col].ffill()

    # Remove consecutive duplicate values in the 'ExperimentEvents' column, if it exists
    if 'ExperimentEvents' in downsampled_df.columns:
        downsampled_df['ExperimentEvents'] = downsampled_df['ExperimentEvents'].where(
            downsampled_df['ExperimentEvents'] != downsampled_df['ExperimentEvents'].shift()
        )

    return downsampled_df



In [24]:
def test_event_numbers(downsampled_data, original_data, mouse):
    nohalt_down = len(downsampled_data.loc[downsampled_data['No_halt']==True])
    nohalt_original = len(original_data.loc[original_data['No_halt']==True])
    if nohalt_down != nohalt_original:
        print(f'mouse{mouse}')
        print(f'There are actually {nohalt_original} no-halts, but the downsampled data only contains {nohalt_down}')
        print('Should re-run the downsampling. Try changing interval lenght. Othewise, consider not downsampling\n')
    if nohalt_down == nohalt_original:
        print(f'mouse{mouse}')
        print(f'There are {nohalt_original} no-halts, and downsampled data contains {nohalt_down}\n')
    
    

In [25]:
downsampled_dict = {}
for mouse, df in data_dict.items():
    downsampled_df = downsample_data(data_dict[mouse], time_col='Seconds', interval=0.001)
    downsampled_dict[mouse] = downsampled_df
    test_event_numbers(downsampled_df, df, mouse)


mouseB0M0
There are 15 no-halts, and downsampled data contains 15

mouseB0M0_2
There are 15 no-halts, and downsampled data contains 15



In [26]:
downsampled_dict['B0M0'].loc[downsampled_dict['B0M0'].No_halt == True]

Unnamed: 0,Seconds,470_dfF,movementX,movementY,event,ExperimentEvents,Experiment,Session,mouseID,sex,area,No_halt,LinearPlaybackMismatch_block,LinearClosedloopMismatch_block
20548,20.548,0.999072,0.017693,0.021167,True,No halt,data,day2,B0M0,F,V2M,True,False,False
77078,77.078,-0.22745,0.020667,0.021167,True,No halt,data,day2,B0M0,F,V2M,True,False,False
182534,182.534,0.980254,0.005552,0.021167,False,No halt,data,day2,B0M0,F,V2M,True,False,True
612743,612.743,-0.84194,0.020578,0.021167,False,No halt,data,day2,B0M0,F,V2M,True,True,False
618904,618.904,-0.9963,0.019422,0.021167,False,No halt,data,day2,B0M0,F,V2M,True,True,False
669952,669.952,0.709743,0.018278,0.021167,True,No halt,data,day2,B0M0,F,V2M,True,True,False
819508,819.508,0.999255,-0.017762,0.0,True,No halt,data,day2,B0M0,F,V2M,True,True,False
900610,900.61,0.998851,0.001352,0.021167,False,No halt,data,day2,B0M0,F,V2M,True,True,False
924107,924.107,-0.317032,0.012248,0.021167,True,No halt,data,day2,B0M0,F,V2M,True,True,False
971850,971.85,0.802057,0.020026,0.021167,True,No halt,data,day2,B0M0,F,V2M,True,True,False


In [27]:
for mouse, Data in downsampled_dict.items():
    Data = Data.reset_index()
    Data.index.name = 'Time'
    if 'day1' in Data.Session.values:
        Data.to_csv('Fake_data_session1.csv', index=False) 
    if 'day2' in Data.Session.values:
        Data.to_csv('Fake_data_session2.csv', index=False) 
        