File to create and test data structures for this project

In [29]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
import mat73
from scipy.io import loadmat
import re
import os
import json

In [2]:
### TODO: refine this design; it will need more specificity.
### Essentially, PatientData will hold recording-info objects for the movie, preSleepFR and postSleepFR epochs.
### We will have a method that extracts a recording given just its start/end time (unix or relative).
### Is it more robust to deal only with absolute unix time? We will have to convert to relative time at some point.
### Is it better for the PatientData class to hold relative information -- a dictionary of movie, preSleep, postSleep FR times?
### Or should movie, etc. be hardcoded? Having a RecordingInfo class is likely the better option.

@dataclass
class RecordingInfo:
    """Contains metadata about a recording session.

    One instance describes a single experimental epoch by its start time,
    end time and experiment type.

    NOTE(review): the field names suggest Unix timestamps; the unit
    (seconds vs. a finer resolution) is not established here -- confirm
    against the timing files.
    """
    # Start of the session (presumably a Unix timestamp -- TODO confirm units).
    start_unix: float
    # End of the session, in the same time base as start_unix.
    end_unix: float
    # Label of the epoch this session belongs to.
    experiment_type: str  # e.g., 'movie', 'preSleep', 'postSleep'

In [3]:
@dataclass
class Neuron:
    """
    Class for a single Neuron in a single patient
    pid: str pid of patient (redundancy maybe but better safe than sorry)
    area: str | None  recording area
    spikes: list | np.ndarray all spike times (in seconds)
    """
    neuron_id: str
    pid: str
    spike_times: np.ndarray
    area: str | None = None
    metadata: dict | None = None
    
    @property
    def firing_rate(self, window: Tuple[float, float] = None) -> float:
        """
        Function to get firing rate of neuron within a certain time period (default whole recording)

        times are in seconds
        """
        if window:
            spikes = self.spike_times[(self.spike_times >= window[0]) & (self.spike_times <= window[1])]
            duration = window[1] - window[0]
        else:
            spikes = self.spike_times
            duration = self.spike_times[-1] - self.spike_times[0]

        assert duration > 0, "Duration < 0, Error"

        return len(spikes) / duration
        


In [45]:
class PatientData:
    """
    Contains all relevant information for a single patient

    - movie drift adjusted times with CSV (create new csv to use every time or run code every time -- not expensive so will do second for reproducibility)
    - patient info from all exp epochs (start unix, pre/post, etc etc)
    - recordings!!  
        - dictionary with all neurons and firing times? but also want to be able to filter by brain area


    - methods for analysis?
        - can be functions for general, not specific to patient
        - make analysis class?
            - would have functions for heatmaps, decoders, etc?
        - these will clutter patient class, I mainly just want all data for a single patient concentrated in one place, easy to use and access
    """

    def __init__(self, pid: str):
        # want to load csv, fix correlation issue
        # so call function to load csv, multiple times by coefficient, 
        # get concept onsets from the 
        self.pid = pid


        d = Dataloader() # type: ignore

        self.neurons: list[Neuron] = d.get_all_patient_neurons(self.pid)


    def _load_data(self) -> None:
        """
        takes pid, loads spike data, json timing info data, concept onset data
            spike data function
                - want all neurons recorded with a list of times that they spike relative to recording start
                - go through all mat files, create instances of Neuron class for each Neuron
                    - list of neuron class
                - neuron class 
                    - area
                    - spike times list/array
                    - firing rate method - start/end time optional argument
                    
            json timing info function
            concept onset function
                - adjust timing csv with correlation factor in json
                - get relative times for each recall concept onset
        """


    
    

Brainstorming notes for the patient class (these contain good thoughts worth keeping)


    Contains all relevant information for a single patient

    - movie drift adjusted times with CSV (create new csv to use every time or run code every time -- not expensive so will do second for reproducibility)
    - patient info from all exp epochs (start unix, pre/post, etc etc)
    - recordings!!  
        - dictionary with all neurons and firing times? but also want to be able to filter by brain area


    - methods for analysis?
        - can be functions for general, not specific to patient
        - make analysis class?
            - would have functions for heatmaps, decoders, etc?
        - these will clutter patient class, I mainly just want all data for a single patient concentrated in one place, easy to use and access


In [None]:
class Dataloader:
    """Class to contain functions that locate and load a patient's spike data."""

    def parse_filename(self, filename):
        """
        Split a spike-file name into its channel label and area name.

        The name (with '.mat' removed) is split on '-':
            - two parts, e.g. 'GA2-RAH7'       -> base 'RAH7'
            - three parts, e.g. 'GA3-RSUB-PHG1' -> base 'RSUB-PHG1'
            - anything else                     -> (name without '.mat', None)
        The area name is the base with its trailing digits removed.

        Returns:
            Tuple ``(base, area_name)``; ``area_name`` is None when the base
            does not end in digits or the name has an unexpected shape.
        """
        stem = filename.replace('.mat', '')
        parts = stem.split('-')

        if len(parts) == 2:  # Normal case like GA2-RAH7
            base = parts[-1]
        elif len(parts) == 3:  # Case with hyphenated area like GA3-RSUB-PHG1
            base = '-'.join(parts[1:])  # Join with hyphen to preserve structure
        else:
            return (stem, None)

        # Group 1 is everything before the trailing digits (the area name),
        # group 2 the trailing digits themselves.
        match = re.match(r'(.*?[-]?\w+?)(\d+)$', base)
        if match:
            return base, match.group(1)
        return (base, None)  # None for no area name

    def _get_neurons_from_mat(self, file_path, pid):
        """
        Load spike data from a .mat file, handling different MATLAB file versions.

        The file must contain 'cluster_class' (column 0 is used as the cluster
        id, column 1 as the spike time) and 'timestampsStart'.

        Returns:
            A list with one Neuron per distinct cluster id, each carrying that
            cluster's spike times and the file's 'timestampsStart' value in
            ``metadata['ts_start']``.
        """
        try:
            data = loadmat(file_path)
        except (NotImplementedError, TypeError):
            # scipy cannot read MATLAB v7.3 files; fall back to mat73.
            data = mat73.loadmat(file_path)

        cluster_class = data['cluster_class']

        # timestampsStart may arrive as a plain scalar, a 0-d array or a
        # nested array depending on the reader; take its first element.
        ts_start = float(np.asarray(data["timestampsStart"]).ravel()[0])

        # basename (not split('/')) so paths with other separators also work.
        base, area_name = self.parse_filename(os.path.basename(file_path))

        cluster_ids = cluster_class[:, 0]
        return [
            Neuron(
                neuron_id=f"{base}-{int(cluster_id)}",
                pid=pid,
                spike_times=cluster_class[cluster_ids == cluster_id, 1],
                area=area_name,
                metadata={'ts_start': ts_start},
            )
            for cluster_id in np.unique(cluster_ids)
        ]

    def get_all_patient_neurons(self, pid, base_dir="./Data"):
        """
        Collect every Neuron stored on disk for one patient.

        Expected layout:
            <base_dir>/<dir containing pid>/<exp dir with at least two '-'>/
                CSC_micro_spikes/*.mat

        Args:
            pid: patient id; every directory in ``base_dir`` whose name
                contains it is searched.
            base_dir: root data directory.

        Returns:
            List of all neurons found. Directory listings are sorted so the
            order is the same on every run and platform.
        """
        neurons = []
        for patient_dir in sorted(os.listdir(base_dir)):  # e.g. 566_MovieParadigm
            patient_path = os.path.join(base_dir, patient_dir)
            # NOTE(review): substring match, so pid '56' also matches '566_...'.
            if pid not in patient_dir or not os.path.isdir(patient_path):
                continue

            for exp_dir in sorted(os.listdir(patient_path)):
                exp_path = os.path.join(patient_path, exp_dir)
                # Only Experiment-5-6-7 style directories hold spiking files.
                if len(exp_dir.split('-')) <= 2 or not os.path.isdir(exp_path):
                    continue

                spikes_dir = os.path.join(exp_path, 'CSC_micro_spikes')
                for file in sorted(os.listdir(spikes_dir)):
                    # Skip stray non-MATLAB files (e.g. .DS_Store) that the
                    # .mat readers would fail on.
                    if not file.endswith('.mat'):
                        continue
                    file_path = os.path.join(spikes_dir, file)
                    neurons += self._get_neurons_from_mat(file_path=file_path, pid=pid)
        return neurons  # list of all neurons
    
    
        





testing space

In [None]:
def package_data(base_dir):
    """
    Group the spike data of every 'times_manual_*.mat' file by recording area.

    Input: base directory holding the spike files

    Returns: dict mapping each area name to a list of
    (cluster_class, ts_start) tuples, one tuple per file of that area.

    NOTE(review): relies on module-level parse_filename and load_spike_data,
    which are not defined in the visible part of this file -- confirm they
    exist before use. This function is slated to be replaced by a method that
    walks all mat files and builds one Neuron instance per neuron.
    """
    grouped = {}
    for entry in os.listdir(base_dir):
        is_spike_file = entry.startswith('times_manual_') and entry.endswith('.mat')
        if not is_spike_file:
            continue

        parsed = parse_filename(entry)
        if not parsed:
            continue
        _, area_name = parsed

        cluster_class, ts_start = load_spike_data(os.path.join(base_dir, entry))

        # Start a new list the first time an area is seen, then append.
        grouped.setdefault(area_name, []).append((cluster_class, ts_start))
    return grouped


In [None]:
# Scratch check: load one spike file and look at its contents.
# NOTE(review): load_spike_data is not defined in the visible part of this
# file -- presumably it came from an earlier notebook cell; confirm before
# re-running.
path = "./Data/562_MovieParadigm/Experiment-5-6-7/CSC_micro_spikes/times_manual_GA1-RAI2.mat"
cluster_class, ts_start = load_spike_data(path)
cluster_class[100:130, :]  # rows 100-129, all columns (result is discarded)
np.max(cluster_class, )  # maximum over the whole array


(39582,)
(214701,)
(17156,)
(47242,)


array([3.25699902e+00, 3.32734251e+00, 3.45099878e+00, ...,
       4.13640798e+04, 4.13640838e+04, 4.13640864e+04])

In [9]:
# parse_filename is a Dataloader method (this file has no module-level
# function of that name), so call it on an instance. `path` comes from the
# scratch cell above.
base, area_name = Dataloader().parse_filename(os.path.basename(path))
print(base, area_name)

RAI2 RAI


In [37]:
# Scratch check: load every neuron of one patient and inspect the result.
d = Dataloader()
pid = '566'
# The loader method is named get_all_patient_neurons (no leading underscore).
res = d.get_all_patient_neurons(pid=pid)
n1 = res[1]
print(n1)
len(res)

Neuron(neuron_id='LSTG6-1', pid='566', spike_times=array([3.49606180e+00, 4.35087299e+00, 4.86249804e+00, ...,
       4.44968594e+04, 4.44968747e+04, 4.44969399e+04]), area='LSTG', metadata={'ts_start': 1691269348.894448})


169

In [51]:
# Build one PatientData object per patient id, keyed by that id.
# Each construction loads the patient's neurons from disk via Dataloader.
pids = ['562', '563', '566']
patients = {}
for pid in pids:
    patients[pid] = PatientData(pid=pid)