# Read H5 files to csv
Read h5 files created in Data_extration.ipynb

In [None]:
import os
import pandas as pd
from pathlib import Path
from harp_resources import process, utils
import h5py
#from analysis_functions import *

In [None]:
# Per-mouse metadata keyed by the 4-character mouse ID.
# add_experiment_events() looks entries up with the first four characters of each
# data_dict key and copies 'sex' and 'area' into the per-mouse DataFrame.
mouse_info = {'B2M1': {'sex': 'M', 'area': 'V2M'},
              'B2M4': {'sex': 'M', 'area': 'V2M'},
              'B2M5': {'sex': 'M', 'area': 'V2M'},
              'B2M6': {'sex': 'M', 'area': 'V2M'},
              'B3M1': {'sex': 'M', 'area': 'V2M'},
              'B3M2': {'sex': 'M', 'area': 'V2M'},
              'B3M3': {'sex': 'F', 'area': 'V1'},
              'B3M4': {'sex': 'M', 'area': 'V2M'},
              'B3M5': {'sex': 'M', 'area': 'V2M'},
              'B3M6': {'sex': 'F', 'area': 'V2M'},
              'B3M7': {'sex': 'F', 'area': 'V2M'},
              'B3M8': {'sex': 'F', 'area': 'V2M'},
              'B0M0': {'sex': 'F', 'area': 'V2M'},
             }

# Maps a date string to a session label. The keys are matched as substrings of a
# folder name in the event path when the experiment events are loaded.
# NOTE(review): keys look like DDMMYY recording dates - confirm.
session_info = {'220824': 'day1',
                '230824': 'day2',
                '190824': 'day1',
                '200824': 'day2',
                '120824': 'day1',
                '130824': 'day2',
                '070824': 'day1',
                '080824': 'day2',
               }

#'SleapVideoData2'] = ['Ellipse.Diameter', 'Ellipse.Center.X', 'Ellipse.Center.Y']

## Defining paths for grab or G8

In [None]:
rootdir = '/Volumes/RanczLab/20240730_Mismatch_Experiment/G8_MMclosed-and-Regular_130824'#Enter root path
# NOTE: this second assignment overrides the one above; keep only the root you want to process.
rootdir = '/Users/nora/Desktop/Cohort0_GCaMP_example/2024-08-08T10-05-26_B3M3'

# Collect every H5 file below rootdir together with the folder it lives in.
# h5_paths[i] and eventpaths[i] always refer to the same session folder.
h5_paths = []
eventpaths = []
for dirpath, subdirs, files in os.walk(rootdir):
    subdirs.sort()  # walk sub-folders in a fixed order so repeated runs give the same ordering
    for x in sorted(files):
        # Match on the extension only; a plain substring test would also pick up
        # names such as 'data.h5.bak'.
        if x.endswith('.h5'):
            eventpaths.append(dirpath)
            # Paths are joined with '/' because later cells split them on '/'.
            h5_paths.append(dirpath+'/'+x)

In [None]:
h5_paths

In [None]:
# Expression unit testing
'''h5_paths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5',
            '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0/resampled_streams_Test_streams_B0M0.h5']
eventpaths = ['/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_080824/Test_streams_B0M0',
             '/Volumes/RanczLab/20240730_Mismatch_Experiment/Fake_data_070824/Test_streams_B0M0']'''

### Loading data streams

In [None]:

import numpy as np
def load_h5_streams_to_dict(data_paths):
    '''
    Load H5 stream files into a nested dictionary keyed by mouse ID.

    Every top-level group in a file (except 'HARP_timestamps') is treated as a
    source, and every dataset inside it is turned into a pandas Series indexed
    by the file's 'HARP_timestamps'. Streams shorter than the timestamp vector
    are padded with NaN at the end; longer streams are truncated.

    Parameters
    ----------
    data_paths : list of str
        '/'-separated paths to the H5 files. The mouse ID is taken from the four
        characters directly before the '.h5' extension of the file name.

    Returns
    -------
    dict or None
        {mouse_id: {source_name: {stream_name: pd.Series}}}. When the same mouse
        ID occurs more than once, later files are stored as '<id>_2', '<id>_3', ...
        Returns None as soon as one of the paths does not exist.
    '''
    reconstructed_dict = {}  # Dictionary to save streams

    for input_file in data_paths:

        path_parts = input_file.split('/')
        name = path_parts[-1][-7:-3]  # Extract mouse ID from file name

        if not os.path.exists(input_file):
            print(f'ERROR: {input_file} does not exist.')
            return None

        # The session folder sits two levels above the file; shallow paths have none.
        session_folder = path_parts[-3] if len(path_parts) >= 3 else ''

        with h5py.File(input_file, 'r') as h5file:
            print(f'reconstructing streams for mouse {name}, from session folder: {session_folder}')

            common_index = h5file['HARP_timestamps'][:]
            n_samples = len(common_index)
            reconstructed_streams = {}

            for source_name in h5file.keys():
                if source_name == 'HARP_timestamps':
                    continue

                reconstructed_streams[source_name] = {}
                source_group = h5file[source_name]

                for stream_name in source_group.keys():
                    stream_data = source_group[stream_name][:]

                    # Pad or truncate to match common_index length
                    if len(stream_data) < n_samples:
                        length_difference = n_samples - len(stream_data)
                        padding = np.full(length_difference, np.nan)
                        stream_data = np.concatenate([stream_data, padding])
                        print(f"{source_name} - {stream_name}: Length difference: {length_difference}")
                        print(f"missing data, advicable to ensure correct alignment \n ")
                    elif len(stream_data) > n_samples:
                        stream_data = stream_data[:n_samples]

                    reconstructed_streams[source_name][stream_name] = pd.Series(data=stream_data, index=common_index)

        # Find the first free key so that a third (or later) session of the same
        # mouse does not overwrite an earlier one.
        key = name
        occurrence = 1
        while key in reconstructed_dict:
            occurrence += 1
            key = f'{name}_{occurrence}'
        reconstructed_dict[key] = reconstructed_streams

        if occurrence == 1:
            print(f'  --> {name} streams reconstructed and added to dictionary \n')
        else:
            print(f'  --> {name} streams_{occurrence} reconstructed and added to dictionary \n')

    return reconstructed_dict




In [None]:
# Load all discovered H5 files into {mouse: {source: {stream: Series}}}.
# load_h5_streams_to_dict returns None if one of the paths does not exist.
stream_dict_dict = load_h5_streams_to_dict(h5_paths)

In [None]:
stream_dict_dict.keys()

In [None]:
stream_dict_dict['B3M3']#['Photometry']

In [None]:
#Make a cut_info dict for the mouse with missing data
#cut_info = {'B2M5': 521, 'B3M2':174}

In [None]:
def make_dataframes(stream_dict_dict, cut_info=None):
    '''
    Build one DataFrame per mouse from the reconstructed stream dictionaries.

    Each frame contains the fluorescence trace (column '470_dfF'), X/Y movement,
    the inverted photodiode signal ('event', True corresponds to a halt) and the
    elapsed time in 'Seconds'. Eye-tracking streams are looked up when present
    but are not added to the frame by this variant.

    The frame keeps the index of the input Series (it is deliberately NOT reset),
    because add_experiment_events() merges the events on that index.

    Parameters
    ----------
    stream_dict_dict : dict
        {mouse: {source_name: {stream_name: pd.Series}}}
    cut_info : dict or None
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    dict
        {mouse: pd.DataFrame}
    '''
    data_dict = {}
    for mouse, streamdict in stream_dict_dict.items():

        print(f'\n--Making dataframe for {mouse}--')
        #Getting fluorescence traces
        try:
            fluorescence = streamdict['Photometry']['470_dfF'] #Using '470_dfF' only
        except KeyError:
            # Fall back to the 'CH1-470' stream when no '470_dfF' stream is stored
            fluorescence = streamdict['Photometry']['CH1-470']
        print('flourescence 470 extracted')

        #Getting mouse movement data and converting to cm / second
        movementX = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0X(46)'])*100
        movementY = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0Y(46)'])*100
        print('movement on x and Y axis extracted')

        #Getting eye movements and pupil diameter
        #'SleapVideoData2' = ['Ellipse.Diameter', 'Ellipse.Center.X', 'Ellipse.Center.Y']
        if 'SleapVideoData2' in streamdict:
            # Looked up so that missing eye streams fail early; not stored in the frame here
            eye_center_x = streamdict['SleapVideoData2']['Ellipse.Center.X']
            eye_center_y = streamdict['SleapVideoData2']['Ellipse.Center.Y']
            eye_diameter = streamdict['SleapVideoData2']['Ellipse.Diameter']
            print('eye movement data extracted')
        else:
            print('There was no eye movement data available for ', mouse)

        #Getting visual stimuli event times
        event = streamdict['ONIX']['Photodiode']
        print('photdiode halt info extracted')

        time = movementX.index - movementX.index[0]
        print('time in seconds from 0 extracted form X direction movement')

        frame_columns = {'470_dfF': fluorescence, 'movementX': movementX, 'movementY': movementY,
                         'event': event, 'Seconds': time}

        df = pd.DataFrame(frame_columns)
        print('dataframe created with columns: ', df.columns)

        df['event'] = df['event'].astype(bool) #In case column is not bool
        #Reversing, so that a halt appears when 'event'==True
        df['event'] = ~df['event']
        print('Event column as bool, True values corresponding to halts')

        data_dict[mouse] = df
    return data_dict

In [None]:
def make_dataframes(stream_dict_dict, fluorescense_traces=('470_dfF',), cut_info=None):
    '''
    Build one DataFrame per mouse from the reconstructed stream dictionaries.

    Each frame contains X/Y movement, the inverted photodiode signal ('event',
    True corresponds to a halt), the elapsed time in 'Seconds', the requested
    fluorescence traces and, when a 'SleapVideoData2' source exists, the columns
    'eye_x', 'eye_y' and 'pupil_diameter'.

    The frame keeps the index of the input Series (it is deliberately NOT reset),
    because add_experiment_events() merges the events on that index.

    Parameters
    ----------
    stream_dict_dict : dict
        {mouse: {source_name: {stream_name: pd.Series}}}
    fluorescense_traces : iterable of str
        Names of the streams under 'Photometry' to include. Traces that are not
        available for a mouse are reported and skipped.
    cut_info : dict or None
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    dict
        {mouse: pd.DataFrame}
    '''
    data_dict = {}
    for mouse, streamdict in stream_dict_dict.items():

        print(f'\n--Making dataframe for {mouse}--')
        # Getting fluorescence traces
        fluorescence_dict = {}
        for trace in fluorescense_traces:
            try:
                fluorescence_dict[trace] = streamdict['Photometry'][trace]
            except KeyError:
                print(f'Trace {trace} not available')

        # Getting mouse movement data and converting to cm / second
        movementX = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0X(46)']) * 100
        movementY = process.running_unit_conversion(streamdict['H1']['OpticalTrackingRead0Y(46)']) * 100
        print('movement on x and Y axis extracted')

        # Getting eye movements and pupil diameter
        # 'SleapVideoData2' = ['Ellipse.Diameter', 'Ellipse.Center.X', 'Ellipse.Center.Y']
        eye_data_available = 'SleapVideoData2' in streamdict
        if eye_data_available:
            eye_center_x = streamdict['SleapVideoData2']['Ellipse.Center.X']
            eye_center_y = streamdict['SleapVideoData2']['Ellipse.Center.Y']
            eye_diameter = streamdict['SleapVideoData2']['Ellipse.Diameter']
            print('eye movement data extracted')
        else:
            print('There was no eye movement data available for ', mouse)

        # Getting visual stimuli event times
        event = streamdict['ONIX']['Photodiode']
        print('photodiode halt info extracted')

        time = movementX.index - movementX.index[0]
        print('time in seconds from 0 extracted from X direction movement')

        # Creating the dictionary for the DataFrame
        data_dict_for_df = {
            'movementX': movementX,
            'movementY': movementY,
            'event': event,
            'Seconds': time,
        }
        data_dict_for_df.update(fluorescence_dict)

        # Adding eye data to the dictionary if available
        if eye_data_available:
            data_dict_for_df['eye_x'] = eye_center_x
            data_dict_for_df['eye_y'] = eye_center_y
            data_dict_for_df['pupil_diameter'] = eye_diameter

        df = pd.DataFrame(data_dict_for_df)
        print('dataframe created with columns: ', df.columns)

        df['event'] = df['event'].astype(bool)  # In case column is not bool
        # Reversing, so that a halt appears when 'event'==True
        df['event'] = ~df['event']
        print('Event column as bool, True values corresponding to halts')

        data_dict[mouse] = df
    return data_dict


In [None]:
# Build the per-mouse DataFrames with both the dF/F and the z-scored 470 traces,
# and keep the dictionary keys as a list for quick positional access.
data_dict = make_dataframes(
    stream_dict_dict,
    fluorescense_traces=['470_dfF', 'z_470'],
)
names = list(data_dict)

In [None]:
data_dict[names[0]]

In [None]:
# Sanity check: report which share of each frame is flagged as halt ('event' True).
for mouse, df in data_dict.items():
    n_true_rows = len(df[df['event'] == True])
    percent_true = (n_true_rows * 100) / len(df)
    print(f'for {mouse} the True values makes up {percent_true:.2f} % of the total df lenght' )
    if not percent_true <= 50:
        print('This is more than 50 %, which may be too much, consider inversing True/False or check experiment protocol for mouse')

### Loading Experiment events and session info

In [None]:
eventpath = '/Users/nora/Desktop/Cohort0_GCaMP_example/2024-08-08T10-05-26_B3M3/'
eventpath.split('/')[-2].split('_')[-1]

In [None]:
# Build one ExperimentEvents table per session folder found in eventpaths.
# Keys are the mouse ID (last four characters of the session folder name); a
# repeated mouse ID gets the suffix '_2', '_3', ...
event_dict = {}
for eventpath in eventpaths:
    # Path() ignores a trailing slash, so .name is always the session folder itself
    session_dir = Path(eventpath)
    experiment_folder = session_dir.parent.name

    ExpEvents = process.read_ExperimentEvents(session_dir)
    ExpEvents.set_index('Seconds', inplace = True)
    ExpEvents.index = ExpEvents.index.round(4)

    name = session_dir.name[-4:]
    # Experiment label: last '_'-separated token of the folder above the session folder
    ExpEvents['experiment'] = experiment_folder.split('_')[-1]
    # Session label: set when one of the session_info keys occurs in that folder name
    for key, item in session_info.items():
        if key in experiment_folder:
            ExpEvents['session']=item

    # Pick the first free key so that no earlier session of the same mouse is overwritten
    event_key = name
    occurrence = 1
    while event_key in event_dict:
        occurrence += 1
        event_key = f'{name}_{occurrence}'
    event_dict[event_key] = ExpEvents

In [None]:
event_dict['B3M3']['experiment']='closedopenMM'

### Adding events (and non-events) and session info to data
1) add_experiment_events() takes the event data, and inserts the values for every timepoint into an event column in the main data
* Sometimes there are multiple event strings at the same timepoint. If one of them is a crucial event, that event will be prioritized.
* The crucial events can be defined from strings they contain in the first line of the add_experiment_events function.
* If no crucial event occurs at that timepoint, all the events will be assigned to the timepoint in the main df as one string, separated by commas (use .split(',') later if it becomes necessary to separate them during analysis)

2) The No_halt events are used to make a column where the no-halt events are used to make a bool similar to the 'event' (halt) column
   * These can later be used as control as they appear when there could have been a halt but there was none
   * The number of no-halt events is controlled to ensure that all of them were actually used.

In [None]:
def add_experiment_events(data_dict, events_dict, mouse_info):
    '''
    Add experiment events and mouse/session metadata to every mouse DataFrame.

    For each key in data_dict the 'Value' column of the matching events table is
    merged onto the data by index (exact matches only, because the tolerance is
    0) and stored as 'ExperimentEvents'. The columns 'mouseID', 'sex' and 'area'
    are filled from mouse_info, and 'Experiment' / 'Session' are copied from the
    events table when it provides 'experiment' / 'session' columns.

    Parameters
    ----------
    data_dict : dict
        {mouse_key: pd.DataFrame}; updated in place and also returned.
    events_dict : dict
        {mouse_key: pd.DataFrame} with a 'Value' column, indexed like the data.
    mouse_info : dict
        {mouse_id: {'sex': ..., 'area': ...}}; looked up with the first four
        characters of mouse_key, so keys such as 'B3M3_2' map to 'B3M3'.

    Returns
    -------
    dict
        The same data_dict, with every frame replaced by its merged version.

    Raises
    ------
    KeyError
        If a mouse key is missing from events_dict or its ID from mouse_info.
    '''
    # Iterate over a snapshot of the keys because the values are replaced in the loop
    for mouse_key in list(data_dict):
        # merge_asof requires both indices to be sorted
        main_df = data_dict[mouse_key].sort_index()
        event_df = events_dict[mouse_key].sort_index()

        # Perform a merge_asof on the index to add 'Value' with backward matching
        merged_df = pd.merge_asof(
            main_df,
            event_df[['Value']],  # Only select the 'Value' column from event_df
            left_index=True,
            right_index=True,
            direction='backward',
            tolerance=0  # Adjust tolerance for matching on the index
        )

        # Rename the 'Value' column to 'ExperimentEvents'
        if 'ExperimentEvents' in merged_df.columns:
            merged_df['ExperimentEvents'] = merged_df.pop('Value')  # Replace existing column with the new 'Value' column
            print(f'Pre-existing ExperimentEvents column was replaced with new for {mouse_key}')
        else:
            merged_df = merged_df.rename(columns={'Value': 'ExperimentEvents'})  # Add new column
            print(f'Added new ExperimentEvents for {mouse_key}')

        # Add experiment/session metadata when the events table carries it.
        # Guarded, because the 'session' column only exists when a session was recognised.
        for source_col, target_col in (('experiment', 'Experiment'), ('session', 'Session')):
            if source_col in event_df.columns and not event_df.empty:
                merged_df[target_col] = event_df[source_col].iloc[0]

        # Add mouse ID, sex, and brain area
        mouse_info_name = mouse_key[:4]
        merged_df['mouseID'] = mouse_info_name
        merged_df['sex'] = mouse_info[mouse_info_name]['sex']
        merged_df['area'] = mouse_info[mouse_info_name]['area']

        # Update the dictionary with the merged DataFrame
        data_dict[mouse_key] = merged_df

    return data_dict

In [None]:
# Merge the experiment events and the mouse metadata into every per-mouse DataFrame
# (add_experiment_events updates data_dict in place and returns it).
data_dict = add_experiment_events(data_dict, event_dict,mouse_info)

In [None]:
data_dict[names[0]].ExperimentEvents.unique() #Check random mouse to see what events are present

In [None]:
# NOTE(review): presumably adds the boolean 'No_halt' column that later cells read;
# the implementation lives in harp_resources.process - confirm there.
data_dict = process.add_no_halt_column(data_dict, event_dict)

#### Add block columns
For each mouse and corresponding df, update the df to include columns for each block of the experiment for easy slicing later in analysis.
The add_block_columns() function will also test if each of the created block columns contains at least one True value and that there are no temporal overlaps 

In [None]:
# Add the block columns to every mouse DataFrame, then check the result for
# overlapping blocks. Only existing keys are reassigned, so updating data_dict
# while iterating over it is safe.
for name, df in data_dict.items():
    print('\n updating data for ', name,'...')
    blocks_added_df = process.add_block_columns(df, event_dict[name])
    data_dict[name] = blocks_added_df

process.check_block_overlap(data_dict)

In [None]:
process.check_block_overlap(data_dict)

### The downsample_data function can be used to make the dataset smaller.
1) Make an empty dict to fill with downsampled versions of the dfs.
2) Loop through the mice and dfs, and use the function for each
3) Set the name of the time column to use and decide on the frequency of the output df datapoints
        * Ensure that every column that you want to keep has a corresponding dict key in aggregation_functions in the downsample_data() function.
4) Assign the resulting df to the corresponding mouse
5) test_event_numbers as a way to test if all events (no-halt events as they are frequent, can be changed) survived the downsampling. 

NB: Can be slow with large datasets, check your email

In [None]:
def downsample_data(df, time_col='Seconds', interval=0.001, extra_aggregations=None):
    '''
    Uses pandas resample and aggregate functions to downsample the data to the desired interval.
    * Note: Aggregation functions must be applied for each variable that is to be included.
    https://pandas.pydata.org/docs/reference/api/pandas.core.resample.Resampler.aggregate.html
    * Note: because the downsampling keeps the first non-NaN value in each interval, some values could be lost.

    Parameters
    ----------
    df : pd.DataFrame
        Data with a numeric time column (in seconds).
    time_col : str
        Name of the time column used for binning.
    interval : float
        Bin width in seconds.
    extra_aggregations : dict or None
        Optional {column: aggregation} entries that extend or override the
        built-in table, e.g. {'z_470': 'mean'}.

    Returns
    -------
    pd.DataFrame
        One row per bin, with time_col (bin start, in seconds) followed by the
        aggregated columns. Columns without an aggregation function are dropped;
        their names are printed so the loss is visible.
    '''
    def _first_valid(values):
        # First non-NaN value in the interval, or None when there is none
        valid = values.dropna()
        return valid.iloc[0] if not valid.empty else None

    # Convert the Seconds column to a TimedeltaIndex
    df = df.set_index(pd.to_timedelta(df[time_col], unit='s'))

    #define aggregation functions for all possible columns
    aggregation_functions = {
        '470_dfF': 'mean', # takes the mean signal of the datapoints going into each new downsampled datapoint
        '560_dfF': 'mean',
        'movementX': 'mean',
        'movementY': 'mean',
        'event': 'any', # events column is a bool, and if there is any True values in the interval, the downsampled datapoint will be True
        'ExperimentEvents': _first_valid,
        'Experiment': 'first', # All values should be the same, so it can always just take the first string value
        'Session': 'first',
        'mouseID': 'first',
        'sex': 'first',
        'area': 'first',
        'No_halt': 'any',
        'LinearMismatch_block': 'any',
        'LinearPlaybackMismatch_block': 'any',
        'LinearRegular_block': 'any',
        'LinearClosedloopMismatch_block':'any',
        'LinearRegularMismatch_block':'any',
        'LinearNormal_block':'any',
    }
    if extra_aggregations:
        aggregation_functions.update(extra_aggregations)

    # Filter aggregation_functions to only include columns present in df
    aggregation_functions = {key: func for key, func in aggregation_functions.items() if key in df.columns}

    # Make it visible which columns will not survive the downsampling
    dropped_columns = [col for col in df.columns if col != time_col and col not in aggregation_functions]
    if dropped_columns:
        print(f'columns without an aggregation function will be dropped: {dropped_columns}')

    print('downsampling...')
    # Resample with the specified interval and apply the filtered aggregations.
    # A Timedelta is used instead of a string such as f'{interval}s', because small
    # intervals are formatted in scientific notation ('5e-05s'), which is not a
    # valid frequency string.
    downsampled_df = df.resample(pd.to_timedelta(interval, unit='s')).agg(aggregation_functions)

    # Reset the index to make the Seconds column normal again
    downsampled_df = downsampled_df.reset_index()
    downsampled_df[time_col] = downsampled_df[time_col].dt.total_seconds()  # Convert Timedelta back to seconds

    # Forward fill for categorical columns if needed, only if they exist in downsampled_df
    categorical_cols = ['Experiment', 'Session', 'mouseID', 'sex', 'area']
    for col in categorical_cols:
        if col in downsampled_df.columns:
            downsampled_df[col] = downsampled_df[col].ffill()

    # Remove consecutive duplicate values in the 'ExperimentEvents' column, if it exists
    if 'ExperimentEvents' in downsampled_df.columns:
        downsampled_df['ExperimentEvents'] = downsampled_df['ExperimentEvents'].where(
            downsampled_df['ExperimentEvents'] != downsampled_df['ExperimentEvents'].shift()
        )

    return downsampled_df

In [None]:
def test_event_numbers(downsampled_data, original_data, mouse):
    '''
    Compare how many rows have No_halt == True before and after downsampling.

    A mismatch indicates that information was lost in the downsampling. Fewer True
    rows in the downsampled data can also appear when the original events were
    upsampled earlier (for example if the tolerance was set too high in
    add_experiment_events()), because repeats of the same event collapse into one.

    Only prints a report for the given mouse; nothing is returned.
    '''
    def _count_no_halts(frame):
        # Number of rows flagged as no-halt events
        return len(frame[frame['No_halt'] == True])

    nohalt_down = _count_no_halts(downsampled_data)
    nohalt_original = _count_no_halts(original_data)

    print(f'mouse{mouse}')
    if nohalt_down == nohalt_original:
        print(f'There are {nohalt_original} no-halts, and downsampled data contains {nohalt_down}\n')
    else:
        print(f'There are actually {nohalt_original} no-halts, but the downsampled data only contains {nohalt_down}')
        print('Should re-run the downsampling. Try changing interval lenght. Othewise, consider not downsampling\n')


In [None]:
# Downsample every mouse DataFrame into 0.001 s bins of the 'Seconds' column and
# compare event counts before and after.
# NOTE(review): this calls process.downsample_data / process.test_event_numbers from
# harp_resources, not the same-named functions defined in this notebook - confirm
# which versions are meant to be used.
downsampled_dict = {}
for mouse, df in data_dict.items():
    downsampled_df = process.downsample_data(df, time_col='Seconds', interval=0.001)
    downsampled_dict[mouse] = downsampled_df
    process.test_event_numbers(downsampled_df, df, mouse)


In [None]:

downsampled_dict[names[0]].loc[downsampled_dict[names[0]].No_halt == True]

### Concat and reindex

In [None]:
# Stack the downsampled frames of all mice into one table. ignore_index=True gives the
# combined table a fresh 0..n-1 index; without it every mouse would contribute the
# same row labels again.
All_data = pd.concat(list(downsampled_dict.values()), ignore_index=True)


# Set a file name and save

!!! Make sure to change file save names before running the below cell

In [None]:
All_data

In [None]:

# Output file per session label - change the names before running.
session_files = {
    'day1': 'Mismatch_analysis/G8_MMclosed_regular_session1.csv',
    'day2': 'Mismatch_analysis/G8_MMclosed_regular_session2.csv',
}

# The 'Session' column only exists when session metadata was added to the data,
# so check for it instead of failing with an AttributeError.
if 'Session' in All_data.columns:
    sessions_present = set(All_data['Session'].dropna())
else:
    sessions_present = set()
    print("All_data has no 'Session' column, so the session cannot be determined.")

saved_any = False
for session_label, csv_path in session_files.items():
    if session_label in sessions_present:
        All_data.to_csv(csv_path, index=False)
        saved_any = True

if not saved_any:
    print('No day1/day2 session found in All_data - nothing was saved by this cell.')
    


In [None]:
All_data

In [None]:
# Save the downsampled data of mouse 'B3M3' on its own, without the DataFrame index.
downsampled_dict['B3M3'].to_csv('Mismatch_analysis/B3M3_G8_MMclosed_session1.csv', index=False) #Change name