# Debugging Data Loading and Structure
This notebook explores the raw data files, their internal structure, and the preprocessing pipeline outputs.

In [2]:
import os

import scipy.io as sio

# Locate the hippocampus raw data file. The repo-relative path is tried first
# (a later cell notes the notebook lives in 'notebooks/'); the original absolute
# path is kept as a fallback so the cell still runs on the authoring machine.
hc_relative_path = "../data/raw/hc_data_raw.mat"
hc_fallback_path = "/Users/kidus/Desktop/Neural_Decoding/Neural_Decoding_DS_Final/data/raw/hc_data_raw.mat"
mat_files = [hc_relative_path if os.path.exists(hc_relative_path) else hc_fallback_path]
print(f"Found {len(mat_files)} .mat files:")
for f in mat_files:
    print(f"  - {f}")

# Load the file and summarise every variable it contains.
# mat_files always holds exactly one entry, so no emptiness guard is needed;
# a missing file surfaces as the error raised by sio.loadmat.
mat_path = mat_files[0]
mat_data = sio.loadmat(mat_path)

print(f"\n--- Structure of {mat_path} ---")
print(f"Keys: {list(mat_data.keys())}")

for key, value in mat_data.items():
    if key.startswith('__'):
        continue  # keys with a leading '__' are metadata entries, not data variables
    print(f"\n{key}:")
    print(f"  Type: {type(value)}")
    # Fall back to 'N/A' for values that expose no shape / dtype attribute
    print(f"  Shape: {getattr(value, 'shape', 'N/A')}")
    print(f"  Dtype: {getattr(value, 'dtype', 'N/A')}")

Found 1 .mat files:
  - /Users/kidus/Desktop/Neural_Decoding/Neural_Decoding_DS_Final/data/raw/hc_data_raw.mat

--- Structure of /Users/kidus/Desktop/Neural_Decoding/Neural_Decoding_DS_Final/data/raw/hc_data_raw.mat ---
Keys: ['__header__', '__version__', '__globals__', 'spike_times', 'pos_times', 'pos']

spike_times:
  Type: <class 'numpy.ndarray'>
  Shape: (58, 1)
  Dtype: object

pos_times:
  Type: <class 'numpy.ndarray'>
  Shape: (1, 219089)
  Dtype: float64

pos:
  Type: <class 'numpy.ndarray'>
  Shape: (219089, 2)
  Dtype: float64


## Data Exploration: Hippocampus Dataset
The cell above loads the hippocampus raw data file (`hc_data_raw.mat`) and prints the type, shape, and dtype of each variable it contains.

In [3]:
# Locate the M1 raw data file. The repo-relative path is tried first (the same
# order the later preprocessing cell uses for this file); the original absolute
# path is kept as a fallback so the cell still runs on the authoring machine.
m1_relative_path = "../data/raw/m1_data_raw.mat"
m1_fallback_path = "/Users/kidus/Desktop/Neural_Decoding/Neural_Decoding_DS_Final/data/raw/m1_data_raw.mat"
mat_files = [m1_relative_path if os.path.exists(m1_relative_path) else m1_fallback_path]
print(f"Found {len(mat_files)} .mat files:")
for f in mat_files:
    print(f"  - {f}")

# Load the file and summarise every variable it contains.
# mat_files always holds exactly one entry, so no emptiness guard is needed;
# a missing file surfaces as the error raised by sio.loadmat.
mat_path = mat_files[0]
mat_data = sio.loadmat(mat_path)

print(f"\n--- Structure of {mat_path} ---")
print(f"Keys: {list(mat_data.keys())}")

for key, value in mat_data.items():
    if key.startswith('__'):
        continue  # keys with a leading '__' are metadata entries, not data variables
    print(f"\n{key}:")
    print(f"  Type: {type(value)}")
    # Fall back to 'N/A' for values that expose no shape / dtype attribute
    print(f"  Shape: {getattr(value, 'shape', 'N/A')}")
    print(f"  Dtype: {getattr(value, 'dtype', 'N/A')}")

Found 1 .mat files:
  - /Users/kidus/Desktop/Neural_Decoding/Neural_Decoding_DS_Final/data/raw/m1_data_raw.mat

--- Structure of /Users/kidus/Desktop/Neural_Decoding/Neural_Decoding_DS_Final/data/raw/m1_data_raw.mat ---
Keys: ['__header__', '__version__', '__globals__', 'spike_times', 'vels', 'vel_times', 'pos', 'acc']

spike_times:
  Type: <class 'numpy.ndarray'>
  Shape: (164, 1)
  Dtype: object

vels:
  Type: <class 'numpy.ndarray'>
  Shape: (1264999, 2)
  Dtype: float64

vel_times:
  Type: <class 'numpy.ndarray'>
  Shape: (1264999, 1)
  Dtype: float64

pos:
  Type: <class 'numpy.ndarray'>
  Shape: (1264999, 2)
  Dtype: float64

acc:
  Type: <class 'numpy.ndarray'>
  Shape: (1264999, 2)
  Dtype: float64


## Data Exploration: Motor Cortex Dataset
The cell above loads the M1 raw data file (`m1_data_raw.mat`) and prints the type, shape, and dtype of each variable it contains.

In [10]:
# Load, preprocess, and analyze output structure
import sys
import os
from pathlib import Path

# Make the 'src' directory (a sibling of the current working directory) importable
# so that the neural_decoding package can be resolved from this notebook.
current_dir = Path.cwd()
src_path = current_dir.parent / "src"
if sys.path.count(str(src_path)) == 0:
    sys.path.append(str(src_path))

from neural_decoding.data.loader import load_dataset
from neural_decoding.main import run_preprocessing, DEFAULT_BIN_SIZE, DEFAULT_START_TIME

# Prefer the path relative to 'notebooks/'; otherwise fall back to the absolute location
mat_path = Path("../data/raw/m1_data_raw.mat")
mat_path = (
    mat_path
    if mat_path.exists()
    else Path("/Users/kidus/Desktop/Neural_Decoding/Neural_Decoding_DS_Final/data/raw/m1_data_raw.mat")
)

# Load the dataset and pick out the entries handed to the preprocessing step
raw_data = load_dataset(mat_path)
neural_data = raw_data["spike_times"]
outputs = raw_data["outputs"], raw_data["output_times"]

# Preprocess using the DEFAULT_* constants exported by neural_decoding.main
config = dict(bin_size=DEFAULT_BIN_SIZE, start_time=DEFAULT_START_TIME)
X_train, X_test, y_train, y_test = run_preprocessing(neural_data, outputs, config)

# Analyze output structure
def print_array_info(name, arr):
    """Print a one-line summary of an array: shape, dtype, minimum and maximum.

    Args:
        name: Label printed in front of the summary.
        arr: Array object exposing ``shape``, ``dtype``, ``size``, ``min()``
            and ``max()`` (e.g. a NumPy array).

    An empty array has no minimum or maximum (NumPy raises ``ValueError`` when
    reducing a zero-size array), so ``n/a`` is printed for both values in that
    case instead of aborting the cell.
    """
    if arr.size == 0:
        lowest = highest = "n/a"
    else:
        lowest, highest = arr.min(), arr.max()
    print(f"{name}: shape={arr.shape}, dtype={arr.dtype}, min={lowest}, max={highest}")

# Summarise each of the four splits returned by the preprocessing step
for split_name, split_array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print_array_info(split_name, split_array)

2026-01-23 17:46:57,005 - main.py - run_preprocessing - INFO - Preprocessing data...
2026-01-23 17:46:57,008 - main.py - run_preprocessing - DEBUG - Binning spikes with bin_size=0.050
2026-01-23 17:47:15,406 - main.py - run_preprocessing - DEBUG - Creating train/test split with bins_before=0, bins_after=0, bins_current=1
2026-01-23 17:47:15,461 - main.py - run_preprocessing - INFO - Preprocessing complete: X_train shape=(17710, 164), X_test shape=(5060, 164)


X_train: shape=(17710, 164), dtype=float64, min=0.0, max=13.0
X_test: shape=(5060, 164), dtype=float64, min=0.0, max=12.0
y_train: shape=(17710, 2), dtype=float64, min=-31.79176907599171, max=33.244782071208746
y_test: shape=(5060, 2), dtype=float64, min=-29.870950396348764, max=32.23741366414379


## Data Preprocessing Pipeline Check
Run the full preprocessing pipeline on the M1 dataset and verify the shapes of training and test splits.

X_train: shape=(17710, 164), dtype=float64, min=0.0, max=13.0
Training feature matrix. 17,710 time bins (samples), 164 neurons (features). Each value is the binned spike count for a neuron in a time bin.

X_test: shape=(5060, 164), dtype=float64, min=0.0, max=12.0
Test feature matrix. 5,060 time bins, 164 neurons. Same format as X_train, but for the test set.

y_train: shape=(17710, 2), dtype=float64, min=-31.79..., max=33.24...
Training target/output matrix. 17,710 samples, 2 output variables (e.g., position X and Y). Values are the behavioral measurements aligned to each time bin.

y_test: shape=(5060, 2), dtype=float64, min=-29.87..., max=32.23...
Test target/output matrix. 5,060 samples, 2 output variables. Same format as y_train, but for the test set.



In [11]:
# Analyze detailed structure of spike times loading
import numpy as np
import scipy.io as sio
from pathlib import Path

# Read the raw M1 .mat file: relative path first, absolute path as the fallback
mat_path = Path("../data/raw/m1_data_raw.mat")
mat_path = (
    mat_path
    if mat_path.exists()
    else Path("/Users/kidus/Desktop/Neural_Decoding/Neural_Decoding_DS_Final/data/raw/m1_data_raw.mat")
)
data = sio.loadmat(mat_path)

# Step 1: look at the 'spike_times' entry exactly as loadmat returned it
print("1. data['spike_times'] (Loaded from .mat):")
spike_times_raw = data["spike_times"]
first_cell = spike_times_raw[0, 0]
print(f"   Type: {type(spike_times_raw)}")
print(f"   Shape: {spike_times_raw.shape}")
print(f"   Dtype: {spike_times_raw.dtype}")
print(f"   Content example (first element): {first_cell}")
print(f"   Type of first element: {type(first_cell)}")

# Step 2: walk one entry (row 0) through the same conversion used for every row below
print("\n2. Processing loop (Transformation):")
row_index = 0
raw_entry = spike_times_raw[row_index, 0]
print(f"   Raw element shape: {raw_entry.shape}")
entry_array = np.array(raw_entry)
print(f"   Converted to np.array shape: {entry_array.shape}")
entry_flat = entry_array.flatten()
print(f"   Flattened shape: {entry_flat.shape}")
print(f"   First 5 spike times: {entry_flat[:5]}")

# Step 3: apply the conversion to every row, giving a list with one 1-D array per row
print("\n3. spike_times (Final List):")
spike_times = []
for neuron_cell in spike_times_raw[:, 0]:
    spike_times.append(np.array(neuron_cell).flatten())
first_converted = spike_times[0]
print(f"   Type: {type(spike_times)}")
print(f"   Length: {len(spike_times)}")
print(f"   Type of element 0: {type(first_converted)}")
print(f"   Shape of element 0: {first_converted.shape}")

1. data['spike_times'] (Loaded from .mat):
   Type: <class 'numpy.ndarray'>
   Shape: (164, 1)
   Dtype: object
   Content example (first element): [[4.47300000e-01 4.91366667e-01 1.31610000e+00 ... 1.30698600e+03
  1.30727720e+03 1.30728827e+03]]
   Type of first element: <class 'numpy.ndarray'>

2. Processing loop (Transformation):
   Raw element shape: (1, 15215)
   Converted to np.array shape: (1, 15215)
   Flattened shape: (15215,)
   First 5 spike times: [0.4473     0.49136667 1.3161     2.5238     2.80613333]

3. spike_times (Final List):
   Type: <class 'list'>
   Length: 164
   Type of element 0: <class 'numpy.ndarray'>
   Shape of element 0: (15215,)


## Deep Dive: Spike Times Loading and Formatting
Detailed analysis of how `spike_times` are extracted and transformed from the raw MATLAB cell array into Python list format.