In [1]:
import os
import pandas as pd
from pathlib import Path
from harp_resources import process, utils
from analysis_functions import *

In [2]:
# Per-mouse metadata, keyed by mouse ID. Each record is (mouse ID, sex, area).
mouse_info = {
    mouse_id: {'sex': sex, 'area': area}
    for mouse_id, sex, area in [
        ('B2M1', 'M', 'V2M'),
        ('B2M4', 'M', 'V2M'),
        ('B2M5', 'M', 'V2M'),
        ('B2M6', 'M', 'V2M'),
        ('B3M1', 'M', 'V2M'),
        ('B3M2', 'M', 'V2M'),
        ('B3M3', 'F', 'V1'),
        ('B3M4', 'M', 'V2M'),
        ('B3M5', 'M', 'V2M'),
        ('B3M6', 'F', 'V2M'),
        ('B3M7', 'F', 'V2M'),
        ('B3M8', 'F', 'V2M'),
    ]
}

# Session label keyed by a date string; the keys are matched as substrings of
# the session folder name when the experiment events are loaded.
# Each pair below is (day1 date, day2 date) -- presumably DDMMYY, confirm.
session_info = {
    date: label
    for date_pair in [
        ('220824', '230824'),
        ('190824', '200824'),
        ('120824', '130824'),
        ('070824', '080824'),
    ]
    for date, label in zip(date_pair, ('day1', 'day2'))
}

## Defining paths for grab or G8

In [3]:
rootdir = '/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-open_080824' #Enter root path
h5_paths = []    # full path of every .h5 file found below rootdir
eventpaths = []  # folder holding each of those files (same order as h5_paths)
for dirpath, subdirs, files in os.walk(rootdir):
    for x in files:
        # Test the extension itself rather than a substring: "'.h5' in x" would
        # also accept names such as 'data.h5.bak', and the loader slices the
        # mouse ID out of the file name assuming it ends in '.h5'.
        if x.endswith('.h5'):
            eventpaths.append(dirpath)
            # Paths are joined with '/' on purpose: later cells split them on '/'.
            h5_paths.append(dirpath+'/'+x)

### Loading data streams

In [4]:

import numpy as np
def load_h5_streams_to_dict(data_paths):
    '''
    Rebuild the data streams stored in a list of H5 files.

    For every file, each dataset found under each top-level group (except
    'HARP_timestamps') becomes a pandas Series indexed by the file's
    'HARP_timestamps' array. Datasets shorter than the timestamps are padded
    at the end with NaN (and reported); longer ones are truncated silently.

    Returns a dict {mouse ID: {source name: {stream name: Series}}}, where the
    mouse ID is the four characters preceding the last three characters of the
    file name. Returns None as soon as one of the paths does not exist.

    NOTE(review): h5py is used here but not imported explicitly in this
    notebook -- presumably it arrives through a star import; confirm.
    '''
    streams_by_mouse = {}

    for h5_path in data_paths:
        path_parts = h5_path.split('/')
        name = path_parts[-1][-7:-3]  # mouse ID taken from the file name

        if not os.path.exists(h5_path):
            print(f'ERROR: {h5_path} does not exist.')
            return None

        with h5py.File(h5_path, 'r') as h5file:
            print(f'reconstructing streams for mouse {name}, from session folder: {path_parts[-3]}')

            timestamps = h5file['HARP_timestamps'][:]
            n_samples = len(timestamps)
            streams_by_source = {}

            for source_name in h5file.keys():
                if source_name == 'HARP_timestamps':
                    continue

                group = h5file[source_name]
                source_streams = {}
                streams_by_source[source_name] = source_streams

                for stream_name in group.keys():
                    values = group[stream_name][:]
                    shortfall = n_samples - len(values)

                    if shortfall > 0:
                        # Too short: fill the tail with NaN and tell the user.
                        values = np.concatenate([values, np.full(shortfall, np.nan)])
                        print(f"{source_name} - {stream_name}: Length difference: {shortfall}")
                        print("missing data, advicable to ensure correct alignment \n ")
                    elif shortfall < 0:
                        # Too long: keep only as many samples as there are timestamps.
                        values = values[:n_samples]

                    source_streams[stream_name] = pd.Series(data=values, index=timestamps)

        streams_by_mouse[name] = streams_by_source
        print(f'  --> {name} streams reconstructed and added to dictionary \n')

    return streams_by_mouse




In [5]:
# Load every discovered H5 file into {mouse ID: streams}. The loader returns
# None instead of a dict as soon as one of the paths does not exist.
stream_dict_dict = load_h5_streams_to_dict(h5_paths)

reconstructing streams for mouse B2M5, from session folder: G8_MMclosed-and-open_080824
ONIX - Photodiode: Length difference: 521
missing data, advicable to ensure correct alignment 
 
Photometry - 410_dfF: Length difference: 521
missing data, advicable to ensure correct alignment 
 
Photometry - 470_dfF: Length difference: 521
missing data, advicable to ensure correct alignment 
 
Photometry - 560_dfF: Length difference: 521
missing data, advicable to ensure correct alignment 
 
  --> B2M5 streams reconstructed and added to dictionary 

reconstructing streams for mouse B2M4, from session folder: G8_MMclosed-and-open_080824
  --> B2M4 streams reconstructed and added to dictionary 

reconstructing streams for mouse B3M3, from session folder: G8_MMclosed-and-open_080824
  --> B3M3 streams reconstructed and added to dictionary 

reconstructing streams for mouse B3M1, from session folder: G8_MMclosed-and-open_080824
  --> B3M1 streams reconstructed and added to dictionary 

reconstructing 

In [6]:
# cut_info dict for the mice with missing data: mouse ID -> number of samples
# (for B2M5 this equals the "Length difference" of 521 reported while loading).
# NOTE(review): make_dataframes accepts this dict but does not currently trim
# anything with it, so these values have no effect on the output.
cut_info = {'B2M5': 521, 'B3M2':174}

In [7]:
def make_dataframes(stream_dict_dict, cut_info=None):
    '''
    Build one DataFrame per mouse from the reconstructed stream dictionaries.

    Parameters
    ----------
    stream_dict_dict : dict
        Maps mouse ID -> nested stream dict. The entries read are
        ['Photometry']['470_dfF'], ['H1']['OpticalTrackingRead0X(46)'],
        ['H1']['OpticalTrackingRead0Y(46)'] and ['ONIX']['Photodiode'].
    cut_info : dict, optional
        Accepted for backward compatibility only. Trimming of trailing samples
        is disabled, so this argument is not used.

    Returns
    -------
    dict
        Maps mouse ID -> DataFrame with columns '470_dfF', 'movementX',
        'movementY', 'event' and 'Seconds'. 'event' is the boolean photodiode
        signal inverted, so that a halt shows up as True. 'Seconds' is the time
        elapsed since the first movement sample.
    '''
    data_dict = {}
    for mouse, streamdict in stream_dict_dict.items():
        # Fluorescence trace ('470_dfF' only)
        fluorescence = streamdict['Photometry']['470_dfF']

        # Mouse movement, unit-converted by the harp helper and scaled by 100
        # (cm / second according to the original author's note)
        movementX = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0X(46)'])*100
        movementY = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0Y(46)'])*100

        # Eye tracking is not part of the output frame; only report when absent
        if 'SleapVideoData2' not in streamdict:
            print('There was no eye movement data available for ', mouse)

        # Visual stimulus signal
        event = streamdict['ONIX']['Photodiode']

        seconds = movementX.index - movementX.index[0]

        columns = {'470_dfF': fluorescence, 'movementX': movementX, 'movementY': movementY,
                   'event': event, 'Seconds': seconds}
        df = pd.DataFrame(columns)

        # Cast to bool in case the column is not bool, then invert so that a
        # halt appears when 'event' == True.
        # NOTE(review): NaN casts to True, so NaN-padded samples end up False.
        df['event'] = ~df['event'].astype(bool)

        data_dict[mouse] = df
    return data_dict

In [8]:
# One DataFrame per mouse, plus the list of mouse IDs in loading order
data_dict = make_dataframes(stream_dict_dict, cut_info=cut_info)
names = list(data_dict)

There was no eye movement data available for  B2M5
There was no eye movement data available for  B2M4
There was no eye movement data available for  B3M1
There was no eye movement data available for  B3M2


In [9]:
# Sanity check: the True rows in 'event' should be a small share of each frame
for mouse, df in data_dict.items():
    n_true = int((df['event'] == True).sum())
    percent_true = (n_true*100)/len(df)
    print(f'for {mouse} the True values makes up {percent_true:.2f} % of the total df lenght' )
    if not percent_true <= 50:
        print('This is more than 50 %, which may be too much, consider inversing True/False or check experiment protocol for mouse')

for B2M5 the True values makes up 1.54 % of the total df lenght
for B2M4 the True values makes up 1.06 % of the total df lenght
for B3M3 the True values makes up 1.35 % of the total df lenght
for B3M1 the True values makes up 1.46 % of the total df lenght
for B3M2 the True values makes up 1.45 % of the total df lenght


### Loading Experiment events and session info

In [10]:
# Experiment event log per mouse, indexed by 'Seconds' rounded to 4 decimals
event_dict = {}
for eventpath in eventpaths:
    path_parts = eventpath.split('/')
    session_folder = path_parts[-2]   # parent folder of the mouse folder
    name = path_parts[-1][-4:]        # last four characters = mouse ID

    ExpEvents = read_ExperimentEvents(Path(eventpath))
    ExpEvents.set_index('Seconds', inplace = True)
    ExpEvents.index = ExpEvents.index.round(4)

    # Experiment name is the second '_'-separated token of the session folder
    ExpEvents['experiment'] = session_folder.split('_')[1]

    # Session label: the last session_info key found in the folder name wins;
    # the column is left out entirely when no key matches.
    matching_labels = [label for date, label in session_info.items() if date in session_folder]
    if matching_labels:
        ExpEvents['session'] = matching_labels[-1]

    event_dict[name] = ExpEvents


### Adding events (and non-events) and session info to data

In [11]:
# Rebinds data_dict to the helper's result.
# NOTE(review): add_experiment_events is not defined in this notebook
# (presumably it comes from analysis_functions) -- judging by its arguments and
# printed output it attaches the event log and mouse metadata; confirm.
data_dict = add_experiment_events(data_dict, event_dict,mouse_info)

Added new ExperimentEvents for B2M5
Added new ExperimentEvents for B2M4
Added new ExperimentEvents for B3M3
Added new ExperimentEvents for B3M1
Added new ExperimentEvents for B3M2


In [12]:
# names[1] is simply the second mouse loaded (a fixed pick, not a random one),
# so this needs at least two mice in data_dict.
data_dict[names[1]].ExperimentEvents.unique() #Check one mouse to see what events are present

array([nan, 'Sync signal started', 'LinearMismatch block started',
       'Homing platform', 'Wait for run threshold...',
       'Halt delay: 0.461283987230288s', 'Apply halt: 1s',
       'Halt delay: 0.596169990625312s', 'Halt delay: 0.438826428558131s',
       'Halt delay: 0.467696949452021s', 'Halt delay: 0.410214138268593s',
       'Halt delay: 0.570348088522604s', 'Halt delay: 0.173700632934319s',
       'Halt delay: 0.215768420563903s', 'Halt delay: 0.361336558387306s',
       'Halt delay: 0.208732219603254s', 'Halt delay: 0.531457154653714s',
       'Halt delay: 0.181430522995736s', 'Halt delay: 0.367234571868197s',
       'Halt delay: 0.125860518229129s', 'Halt delay: 0.508705370458171s',
       'Halt delay: 0.385718896559309s',
       'LinearPlaybackMismatch block started'], dtype=object)

In [13]:
# Rebinds data_dict to the helper's result.
# NOTE(review): add_no_halt_column is not defined in this notebook (presumably
# from analysis_functions); later cells read a 'No_halt' column, which is
# presumably what it adds -- confirm.
data_dict = add_no_halt_column(data_dict, event_dict)

No_halt events added to B2M5
  Correct number of no-halt events for B2M5

No_halt events added to B2M4
  Correct number of no-halt events for B2M4

No_halt events added to B3M3
  Correct number of no-halt events for B3M3

No_halt events added to B3M1
  Correct number of no-halt events for B3M1

No_halt events added to B3M2
  Correct number of no-halt events for B3M2



#### Add block columns

In [14]:
for name, df in data_dict.items():
    print('updating data for ', name)
    # Rebinding an existing key while iterating is safe: the key set is unchanged.
    # (A former `blocks_added_df.replace({})` call was dropped: its result was
    # discarded, so it had no effect.)
    data_dict[name] = add_block_columns(df, event_dict[name])

# Report, per mouse, whether the _block columns overlap / are populated
check_block_overlap(data_dict)

updating data for  B2M5
updating data for  B2M4
updating data for  B3M3
updating data for  B3M1
updating data for  B3M2
For B2M5: No overlapping True values, and each _block column has at least one True value
For B2M4: No overlapping True values, and each _block column has at least one True value
For B3M3: No overlapping True values, and each _block column has at least one True value
For B3M1: No overlapping True values, and each _block column has at least one True value
For B3M2: No overlapping True values, and each _block column has at least one True value


In [15]:
def downsample_data(df, time_col='Seconds', interval=0.001):
    '''
    Downsample a DataFrame onto a regular time grid.

    Rows are grouped into consecutive bins of `interval` seconds based on
    `time_col`, and each known column is aggregated per bin:
    mean for '470_dfF', 'movementX', 'movementY'; any for 'event', 'No_halt'
    and the three '*_block' columns; first for the metadata columns; first
    non-missing value for 'ExperimentEvents'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain `time_col` holding times in seconds.
    time_col : str
        Name of the time column (default 'Seconds').
    interval : float
        Bin width in seconds (default 0.001).

    Returns
    -------
    pandas.DataFrame
        One row per bin with `time_col` (bin start, in seconds) followed by the
        aggregated columns. Columns that have no aggregation rule below are not
        part of the result.
    '''
    def _first_valid(values):
        # First non-missing entry of the bin, or None when the bin has none.
        valid = values.dropna()
        return valid.iloc[0] if not valid.empty else None

    # Convert the Seconds column to a TimedeltaIndex
    df = df.set_index(pd.to_timedelta(df[time_col], unit='s'))

    # Define aggregation functions for all possible columns
    aggregation_functions = {
        '470_dfF': 'mean',
        'movementX': 'mean',
        'movementY': 'mean',
        'event': 'any',
        'ExperimentEvents': _first_valid,
        'Experiment': 'first',
        'Session': 'first',
        'mouseID': 'first',
        'sex': 'first',
        'area': 'first',
        'No_halt': 'any',
        'LinearMismatch_block': 'any',
        'LinearPlaybackMismatch_block': 'any',
        'LinearRegular_block': 'any'
    }

    # Filter aggregation_functions to only include columns present in df
    aggregation_functions = {key: func for key, func in aggregation_functions.items() if key in df.columns}

    # Pass the bin width as a Timedelta rather than a formatted string: small
    # floats format in scientific notation (1e-05 -> '1e-05s'), which is not a
    # valid frequency string.
    bin_width = pd.to_timedelta(interval, unit='s')
    downsampled_df = df.resample(bin_width).agg(aggregation_functions)

    # Reset the index to make the Seconds column normal again
    downsampled_df = downsampled_df.reset_index()
    downsampled_df[time_col] = downsampled_df[time_col].dt.total_seconds()  # Convert Timedelta back to seconds

    # Forward fill the metadata columns so that empty bins inherit the last value
    categorical_cols = ['Experiment', 'Session', 'mouseID', 'sex', 'area']
    for col in categorical_cols:
        if col in downsampled_df.columns:
            downsampled_df[col] = downsampled_df[col].ffill()

    # Remove consecutive duplicate values in the 'ExperimentEvents' column:
    # only the first bin of a run of identical labels keeps the label
    if 'ExperimentEvents' in downsampled_df.columns:
        downsampled_df['ExperimentEvents'] = downsampled_df['ExperimentEvents'].where(
            downsampled_df['ExperimentEvents'] != downsampled_df['ExperimentEvents'].shift()
        )

    return downsampled_df



In [16]:
def test_event_numbers(downsampled_data, original_data, mouse):
    nohalt_down = len(downsampled_data.loc[downsampled_data['No_halt']==True])
    nohalt_original = len(original_data.loc[original_data['No_halt']==True])
    if nohalt_down != nohalt_original:
        print(f'mouse{mouse}')
        print(f'There are actually {nohalt_original} no-halts, but the downsampled data only contains {nohalt_down}')
        print('Should re-run the downsampling. Try changing interval lenght. Othewise, consider not downsampling\n')
    if nohalt_down == nohalt_original:
        print(f'mouse{mouse}')
        print(f'There are {nohalt_original} no-halts, and downsampled data contains {nohalt_down}\n')
    
    

In [17]:
# Downsample every mouse to 1 ms bins and verify the No_halt count each time
downsampled_dict = {}
for mouse, df in data_dict.items():
    downsampled_df = downsample_data(df, time_col='Seconds', interval=0.001)
    test_event_numbers(downsampled_df, df, mouse)
    downsampled_dict[mouse] = downsampled_df


mouseB2M5
There are 29 no-halts, and downsampled data contains 29

mouseB2M4
There are 30 no-halts, and downsampled data contains 30

mouseB3M3
There are 60 no-halts, and downsampled data contains 60

mouseB3M1
There are 49 no-halts, and downsampled data contains 49

mouseB3M2
There are 55 no-halts, and downsampled data contains 55



In [18]:
# NOTE(review): pooling_data is not defined in this notebook (presumably from
# analysis_functions). The next cell treats its result as a frame with a
# two-level index ('level_0' / 'level_1' after reset_index) -- confirm.
Data = pooling_data(downsampled_dict)

In [19]:
# Flatten the index: reset_index() is assumed to expose it as 'level_0' and
# 'level_1'; 'level_0' is discarded and the numeric 'level_1' becomes the index.
Data = (
    Data
    .reset_index()
    .drop(columns=['level_0'])
    .set_index('level_1')
)
Data.index.name = 'Time'

In [20]:
# Show the pooled frame (pandas truncates the rendering). Only the last
# expression of a cell is displayed, so the former single-mouse selection on
# the line above had no visible effect and failed with fewer than four mice.
# To inspect one mouse, make `Data.loc[Data.mouseID == names[3]]` the last line.
Data

Unnamed: 0_level_0,Seconds,470_dfF,movementX,movementY,event,ExperimentEvents,Experiment,Session,mouseID,sex,area,No_halt,LinearMismatch_block,LinearPlaybackMismatch_block
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.000,0.000011,0.002708,0.000000,False,,MMclosed-and-open,day2,B2M5,M,V2M,False,False,False
1,0.001,0.000034,0.002753,0.000000,False,,MMclosed-and-open,day2,B2M5,M,V2M,False,False,False
2,0.002,0.000058,0.002798,0.000000,False,,MMclosed-and-open,day2,B2M5,M,V2M,False,False,False
3,0.003,0.000081,0.002843,0.000000,False,,MMclosed-and-open,day2,B2M5,M,V2M,False,False,False
4,0.004,0.000104,0.002888,0.000000,False,,MMclosed-and-open,day2,B2M5,M,V2M,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968235,1968.235,,0.000127,0.000026,False,,MMclosed-and-open,day2,B3M2,M,V2M,False,False,False
1968236,1968.236,,0.000097,0.000020,False,,MMclosed-and-open,day2,B3M2,M,V2M,False,False,False
1968237,1968.237,,0.000067,0.000014,False,,MMclosed-and-open,day2,B3M2,M,V2M,False,False,False
1968238,1968.238,,0.000037,0.000008,False,,MMclosed-and-open,day2,B3M2,M,V2M,False,False,False


In [21]:
#!!!!Remember to change the name of the csv file so that the previous one is not overwritten
# index=False: the 'Time' index is not written to the file; the 'Seconds'
# column still carries the time axis.
Data.to_csv('G8_MMclosed_open_session2.csv', index=False) 