In [1]:
import torch
import neuroprobe.config as neuroprobe_config

# Sanity check: confirm that the neuroprobe config's ROOT_DIR points at the
# braintreebank data location you expect, and show the configured sampling rate.
braintreebank_root = neuroprobe_config.ROOT_DIR
sampling_rate_hz = neuroprobe_config.SAMPLING_RATE
print("Expected braintreebank data at:", braintreebank_root)
print("Sampling rate:", sampling_rate_hz, "Hz")

Expected braintreebank data at: /om2/user/zaho/braintreebank/braintreebank/
Sampling rate: 2048 Hz


## The BrainTreebankSubject Class

In [2]:
from neuroprobe.braintreebank_subject import BrainTreebankSubject

# Subject to load; `subject` is reused by the later cells.
subject_id = 1

# cache=True keeps the loaded neural data in RAM, which makes loading faster --
# only enable it if you have enough memory.
subject = BrainTreebankSubject(
    subject_id,
    allow_corrupted=False,
    cache=True,
    dtype=torch.float32,
)
print("Electrode labels:", subject.electrode_labels) # list of electrode labels

# Optional: restrict the subject to a chosen set of electrodes.
# NOTE: if you change this subset while using cache=True, clear the cache
# afterwards with subject.clear_neural_data_cache().
electrode_subset = ['F3aOFa2', 'F3aOFa3', 'F3aOFa4', 'F3aOFa7']
subject.set_electrode_subset(electrode_subset)
print("Electrode labels after subsetting:", subject.electrode_labels)

Electrode labels: ['F3aOFa2', 'F3aOFa3', 'F3aOFa4', 'F3aOFa7', 'F3aOFa8', 'F3aOFa9', 'F3aOFa10', 'F3aOFa11', 'F3aOFa12', 'F3aOFa13', 'F3aOFa14', 'F3aOFa15', 'F3aOFa16', 'F3bIaOFb1', 'F3bIaOFb2', 'F3bIaOFb3', 'F3bIaOFb4', 'F3bIaOFb5', 'F3bIaOFb6', 'F3bIaOFb7', 'F3bIaOFb8', 'F3bIaOFb9', 'F3bIaOFb10', 'F3bIaOFb11', 'F3bIaOFb12', 'F3bIaOFb13', 'F3bIaOFb14', 'F3bIaOFb15', 'F3bIaOFb16', 'F3cId1', 'F3cId2', 'F3cId3', 'F3cId4', 'F3cId5', 'F3cId6', 'F3cId7', 'F3cId8', 'F3cId9', 'F3cId10', 'T1aIb1', 'T1aIb2', 'T1aIb3', 'T1aIb4', 'T1aIb5', 'T1aIb6', 'T1aIb7', 'T1aIb8', 'T2aA1', 'T2aA2', 'T2aA3', 'T2aA4', 'T2aA5', 'T2aA6', 'T2aA7', 'T2aA8', 'T2aA9', 'T2aA10', 'T2aA11', 'T2aA12', 'T2bHa1', 'T2bHa3', 'T2bHa4', 'T2bHa5', 'T2bHa7', 'T2bHa8', 'T2bHa9', 'T2bHa10', 'T2bHa11', 'T2bHa12', 'T2bHa13', 'T2bHa14', 'T1bIc1', 'T1bIc2', 'T1bIc3', 'T1bIc4', 'T1bIc5', 'T1bIc6', 'T1bIc7', 'T1bIc8', 'F3dIe1', 'F3dIe2', 'F3dIe3', 'F3dIe4', 'F3dIe5', 'F3dIe6', 'F3dIe7', 'F3dIe8', 'F3dIe9', 'F3dIe10', 'F3dIe14', 'T3aHb6

Loading the electrode data and electrode coordinates

In [3]:
# Trial to work with; `trial_id` is reused by the later cells.
trial_id = 1

# Window to fetch. Leaving both bounds as None loads the whole trial.
window_from, window_to = None, None

subject.load_neural_data(trial_id)

print("All neural data shape:")
# Shape is (n_electrodes, n_samples). Only the shape is kept so the full tensor
# is not held in a notebook variable.
# For a single electrode, use subject.get_electrode_data(trial_id, electrode_label).
neural_data_shape = subject.get_all_electrode_data(
    trial_id,
    window_from=window_from,
    window_to=window_to,
).shape
print(neural_data_shape)

print("\nElectrode coordinates:")
# L, P, I coordinates of the electrodes.
electrode_coordinates = subject.get_electrode_coordinates()
print(electrode_coordinates)

All neural data shape:
torch.Size([4, 21401009])

Electrode coordinates:
tensor([[116., 131.,  71.],
        [113., 131.,  72.],
        [109., 130.,  74.],
        [ 98., 127.,  79.]])


## The BrainTreebankSubjectTrialBenchmarkDataset Class

In [4]:
from neuroprobe.datasets import BrainTreebankSubjectTrialBenchmarkDataset

# Task to decode. Options for eval_name (from the Neuroprobe paper):
#   frame_brightness, global_flow, local_flow, global_flow_angle, local_flow_angle, face_num, volume, pitch, delta_volume,
#   delta_pitch, speech, onset, gpt2_surprisal, word_length, word_gap, word_index, word_head_pos, word_part_speech, speaker
eval_name = "volume"

# Benchmark size switches:
#   lite=True -> Neuroprobe-Lite (the default); lite=False -> Neuroprobe-Full
#   nano=True -> Neuroprobe-Nano; nano=False -> Neuroprobe-Lite (if lite is True)
lite = True
nano = False

# output_indices=True  -> each item holds the sample indices into the neural data as a tuple (index_from, index_to)
# output_indices=False -> each item holds the neural data directly
output_indices = False

# Window around each word onset, expressed in samples.
start_neural_data_before_word_onset = 0  # samples of neural data to include before each word onset
end_neural_data_after_word_onset = neuroprobe_config.SAMPLING_RATE * 1  # samples to include after each word onset -- here 1 second

# Collect the dataset options in one place so the constructor call stays short.
dataset_options = dict(
    dtype=torch.float32,
    eval_name=eval_name,
    output_indices=output_indices,
    start_neural_data_before_word_onset=start_neural_data_before_word_onset,
    end_neural_data_after_word_onset=end_neural_data_after_word_onset,
    lite=lite,
    nano=nano,
)
dataset = BrainTreebankSubjectTrialBenchmarkDataset(subject, trial_id, **dataset_options)
# P.S. on partial caching: allowing the neural data to be cached only partially (when just part of it
# is needed for this dataset) is better turned off when running multiple evals back to back, and
# better turned on for a single eval.
# NOTE(review): the option is not passed in this call -- presumably a constructor argument; confirm its exact name.

print("Items in the dataset:", len(dataset), "\n")
first_item = dataset[0]  # (neural data, label)
print("The first item:", first_item[0], f"label = {first_item[1]}", sep="\n")

Items in the dataset: 3500 

The first item:
tensor([[-133.1872, -132.9213, -129.9971,  ..., -100.7544, -103.1470,
         -104.7420],
        [  36.1546,   31.6353,   26.5843,  ...,  -85.0697,  -82.1454,
          -80.2845],
        [  10.6337,    7.9753,    6.3802,  ...,  -70.7141,  -70.4483,
          -68.8533],
        [ -27.9135,  -33.2303,  -35.3571,  ..., -151.2645, -153.1254,
         -151.2645]])
label = 0


## Train/Test Splits

In this example, we generate 5 train/test splits for the Single Subject Single Movie (SS-SM) evaluation.

All available split functions: `generate_splits_SS_SM`, `generate_splits_SS_DM`, `generate_splits_DS_DM`, `generate_splits_DS_SM`.

In [5]:
import neuroprobe.train_test_splits as neuroprobe_train_test_splits

# Dataset parameters forwarded to the split generator (the values defined in the dataset cell).
split_options = dict(
    dtype=torch.float32,
    output_indices=output_indices,
    start_neural_data_before_word_onset=start_neural_data_before_word_onset,
    end_neural_data_after_word_onset=end_neural_data_after_word_onset,
    lite=lite,
    nano=nano,
)

# Both results have length k_folds: element i of train_datasets / test_datasets is the
# BrainTreebankSubjectTrialBenchmarkDataset for the train / test side of split i.
train_datasets, test_datasets = neuroprobe_train_test_splits.generate_splits_SS_SM(
    subject, trial_id, eval_name, **split_options
)
print("len(train_datasets) = len(test_datasets) = k_folds =", len(train_datasets))

len(train_datasets) = len(test_datasets) = k_folds = 5


## Example Logistic Regression on SS-SM

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

def _dataset_to_numpy(dataset):
    """Convert a benchmark dataset into arrays usable by scikit-learn.

    Every item is fetched exactly once, so a dataset that builds its items on
    access is not read twice (once for the features and once for the labels).

    Args:
        dataset: dataset supporting len() and integer indexing; item[0] is the
            neural data tensor and item[1] is the label.

    Returns:
        Tuple (X, y): X has one row per item containing the flattened neural
        data, y contains the matching labels.
    """
    features, labels = [], []
    # Index explicitly via len() instead of relying on the implicit
    # __getitem__/IndexError iteration protocol.
    for item_idx in range(len(dataset)):
        item = dataset[item_idx]
        features.append(np.asarray(item[0].flatten()))
        labels.append(item[1])
    return np.array(features), np.array(labels)


n_folds = len(train_datasets)
for fold_number, (train_dataset, test_dataset) in enumerate(zip(train_datasets, test_datasets), start=1):
    print(f"Fold {fold_number} of {n_folds}")

    # Convert the PyTorch datasets to numpy arrays for scikit-learn
    X_train, y_train = _dataset_to_numpy(train_dataset)
    X_test, y_test = _dataset_to_numpy(test_dataset)

    # Standardize the data: the scaler is fitted on the training fold only and
    # then applied unchanged to the test fold.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train logistic regression
    clf = LogisticRegression(random_state=42, max_iter=1000, tol=1e-3)
    clf.fit(X_train, y_train)

    # Evaluate the model on both sides of the split
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print(f"\t Train accuracy: {train_score:.3f} | Test accuracy: {test_score:.3f}")

Fold 1 of 5
	 Train accuracy: 0.998 | Test accuracy: 0.463
Fold 2 of 5
	 Train accuracy: 0.998 | Test accuracy: 0.549
Fold 3 of 5
	 Train accuracy: 0.999 | Test accuracy: 0.511
Fold 4 of 5
	 Train accuracy: 1.000 | Test accuracy: 0.461
Fold 5 of 5
	 Train accuracy: 0.995 | Test accuracy: 0.490
