This notebook is adapted for running on a local machine.

# 1. General data prep

In [10]:
## Importing packages

from config import *

In [11]:
# --- Sorting fMRI files ---

# Gather the path of every movie-session BOLD run on disk.
# The list is ordered lexicographically so that, within one subject,
# the runs appear in ascending run number.
all_files = glob.glob(
    "Data/ds000113/sub-*/ses-movie/func/sub-*_ses-movie_task-movie_run-*_bold.nii.gz"
)
all_files.sort()

def extract_subject_id(path):
    """Return the digits following the first ``sub-`` in ``path``.

    The digits are returned as a string exactly as they appear (leading
    zeros kept). ``None`` is returned when ``path`` contains no
    ``sub-<digits>`` token.
    """
    found = re.search(r"sub-(\d+)", path)
    if found is None:
        return None
    return found.group(1)

# Map each subject ID to its run files, in the order they appear in all_files.
subject_runs = defaultdict(list)

for path in all_files:
    subject_id = extract_subject_id(path)
    if not subject_id:
        # Path carries no recognisable subject ID: leave it out.
        continue
    subject_runs[subject_id].append(path)

# 2. Cleaning up the emotion annotations

In [12]:
# --- Creating a common dataframe with all the emotions ---
# The paths are sorted (as for the fMRI files) so that the participant order,
# and therefore the column order of any table built from it, does not depend
# on the order in which the filesystem happens to list the files.
allemotion = sorted(glob.glob('Data/gump_emotions/raw/av1o0*.csv'))
if not allemotion:
    # Without this guard pd.concat fails with "No objects to concatenate",
    # which hides the real cause (missing annotation files).
    raise FileNotFoundError(
        "No emotion annotation files match 'Data/gump_emotions/raw/av1o0*.csv'"
    )

emotiondf = []
for file in allemotion:
    df = pd.read_csv(file)
    df['participant'] = file  # the file path is used as the participant ID
    # Removing the non-labeled moments
    emotiondf.append(df.dropna(subset=['emotion']))

emotions = pd.concat(emotiondf, ignore_index=True)

# Adding the TR correspondents to every label
TR = 2  # TR in seconds chosen based on the fMRI configuration

# The onset is rounded down and the offset rounded up, so a label covers
# every TR it touches, even partially.
emotions['start_tr'] = np.floor(emotions['start'] / TR).astype(int)
emotions['end_tr']   = np.ceil(emotions['end'] / TR).astype(int)


In [None]:
# --- Creating a new dataframe of the format: ----
#
#           Participant 1 | Participant 2 | ... | Participant 7 | General Emotion
# TR1       Emotion_x        NaN                    Emotion_x     Emotion_x
# TR2
# .
# .
# TR_final
#
# One row per global TR, one column per participant; a cell holds the emotion
# that participant labelled for that TR (NaN when unlabelled).

max_tr = emotions['end_tr'].max()
n_TRs  = max_tr
participants = emotions['participant'].unique()

# dtype=object: the cells receive string labels. Starting from a float64 NaN
# frame would make every assignment below an implicit float -> object upcast,
# which pandas has deprecated (FutureWarning since 2.1).
wide = pd.DataFrame(
    np.nan,
    index=range(n_TRs),
    columns=participants,
    dtype=object,
)

# itertuples avoids building a Series per row, as iterrows does.
for pid, start_tr, end_tr, label in emotions[
    ['participant', 'start_tr', 'end_tr', 'emotion']
].itertuples(index=False, name=None):
    # .loc slices include the end label, hence end_tr - 1.
    # A later annotation of the same participant overwrites an earlier one
    # on any TR they share.
    wide.loc[start_tr:end_tr - 1, pid] = label

# Most frequent label per TR across participants (NaNs ignored). mode() returns
# its values sorted, so column 0 is the first of any tied labels.
wide['general_emotion'] = wide.mode(axis=1)[0]

# Renaming the columns and indices
wide.index.name      = 'global_TR'
wide.columns.name    = 'participant'

In [14]:
# --- Keeping only the labels agreed upon by half of the participants ---

# Every column except the aggregated one is a participant column.
participant_cols = [col for col in wide.columns if col != 'general_emotion']
n_participants = len(participant_cols)
half_thresh = math.ceil(n_participants / 2)  # half, rounded up


def half_of_all_consensus(row):
    """Tell whether the most frequent label of ``row`` reaches ``half_thresh``.

    Only the participant columns are counted and NaNs are ignored.
    A row without any label yields ``False``.
    """
    votes = row[participant_cols].value_counts(dropna=True)
    # value_counts sorts by descending frequency: position 0 is the top label.
    return (not votes.empty) and votes.iloc[0] >= half_thresh


mask = wide.apply(half_of_all_consensus, axis=1)

wide_consensus_all = wide.loc[mask].copy()
print(f"Kept {len(wide_consensus_all)} of {len(wide)} TRs where ≥{half_thresh} participants (half of {n_participants}) agreed")


Kept 1010 of 3526 TRs where ≥5 participants (half of 9) agreed


In [15]:
#--- Determining the individual fMRI segments in terms of TRs as reported in the underlying paper (Labs et al. ) ---

TR = 2.0

# (start, end) of each of the eight segments, in seconds.
segments = [
    (   0.0,  902.0),
    ( 886.0, 1768.0),
    (1752.0, 2628.0),
    (2612.0, 3588.0),
    (3572.0, 4496.0),
    (4480.0, 5358.0),
    (5342.0, 6426.0),
    (6410.0, 7086.0),
]

# Length of every run in TRs (duration divided by TR, truncated to int).
run_lengths = []
for seg_start, seg_end in segments:
    run_lengths.append(int((seg_end - seg_start) / TR))

# First global TR of every run, obtained by laying the runs end to end.
# NOTE(review): consecutive segments overlap by 16 s (each one starts before
# the previous one ends), whereas these starts assume no overlap. If the
# annotation times live on the overlapping timeline of `segments`, the
# global-TR -> (run, local TR) mapping drifts by 8 TRs per run - confirm.
run_starts = np.cumsum([0, *run_lengths[:-1]])

In [16]:
#--- Inspecting amount of labels ---
# Number of consensus TRs per aggregated emotion, most frequent first.
general_labels = wide_consensus_all.loc[:, 'general_emotion']
label_counts = general_labels.value_counts()
print(label_counts)

general_emotion
FEAR              236
SADNESS           235
HAPPINESS         206
ANGERRAGE         143
LOVE               78
DISAPPOINTMENT     25
ADMIRATION         18
CONTEMPT           12
PRIDE              11
COMPASSION         10
SHAME              10
REMORSE             9
GLOATING            6
GRATITUDE           4
RELIEF              3
HOPE                2
HATE                2
Name: count, dtype: int64


In [17]:
#--- Balancing the labels ---

# Value to keep for each emotion
threshold = 70

# reset_index turns the 'global_TR' index into a regular column so that the
# sampled TRs can be read back after sampling.
df = wide_consensus_all[['general_emotion']].reset_index()
counts = df['general_emotion'].value_counts()
keep = counts[counts >= threshold].index  # emotions with enough TRs
df = df[df['general_emotion'].isin(keep)]

# Draw `threshold` TRs from every remaining emotion. Each group is sampled
# with its own random_state=0, exactly as before; iterating over the groups
# explicitly avoids DataFrameGroupBy.apply operating on the grouping column,
# which pandas has deprecated.
samples = [
    grp.sample(n=threshold, random_state=0)
    for _, grp in df.groupby('general_emotion')
]
# pd.concat refuses an empty list; fall back to an empty frame of same columns.
balanced = pd.concat(samples) if samples else df.iloc[0:0]

sampled_TRs = balanced['global_TR'].values
wide_balanced = wide_consensus_all.loc[sampled_TRs].copy()


  .apply(lambda grp: grp.sample(n=threshold, random_state=0))


# 3. Data prep for the CNN

In [18]:
#--- Creating a metadata dictionary for the fMRI data ---

def tr_to_run_idx(tr):
    """Locate a global TR inside the runs.

    Returns ``(run index, TR relative to the start of that run)``, both
    0-based, using ``run_starts`` and ``run_lengths``.
    Raises ``ValueError`` when ``tr`` falls in no run.
    """
    for ri, start in enumerate(run_starts):
        if start <= tr < start + run_lengths[ri]:
            return ri, int(tr - start)
    raise ValueError(f"TR {tr} out of bounds")


CACHE_PATH = 'Xmetadata.pkl'
# The cache is reused whenever the file exists; it is not checked against the
# current `wide_balanced` or `subject_runs`. Delete it after changing either.
if os.path.exists(CACHE_PATH):
    # pickle.load can execute arbitrary code: only load a cache file that was
    # written by this notebook.
    with open(CACHE_PATH, 'rb') as f:
        examples = pickle.load(f)
    print(f"Loaded {len(examples)} examples from cache.")
else:
    examples = []
    for subj, run_files in tqdm(subject_runs.items()):
        # Run files are addressed by position, so every subject must provide
        # exactly one file per segment; otherwise a volume would be read from
        # the wrong run (or the lookup would fail with a bare IndexError).
        if len(run_files) != len(run_lengths):
            raise ValueError(
                f"Subject {subj} has {len(run_files)} run files, "
                f"expected {len(run_lengths)}"
            )
        for global_tr in wide_balanced.index:
            run_idx, local_tr = tr_to_run_idx(global_tr)
            filepath = run_files[run_idx]
            img      = nib.load(filepath)
            # Only the requested volume is read (last axis indexed by TR).
            vol3d    = img.dataobj[..., local_tr]
            examples.append({
                'subject': subj,
                'run_idx': run_idx+1,     # stored 1-based
                'TR_local': local_tr,
                'label':   wide_balanced.at[global_tr,'general_emotion'],
                'volume':  vol3d,
                'affine':  img.affine
            })

    with open(CACHE_PATH, 'wb') as f:
        pickle.dump(examples, f)
    print(f"Computed and cached {len(examples)} examples.")

Loaded 5250 examples from cache.


In [19]:
#--- Creating a training/testing split ---
labels = [ex['label'] for ex in examples]
indices = list(range(len(examples)))

# 80/20 split of the example indices, stratified on the label only.
# Neither the subject nor the TR is used for grouping, so volumes of one
# subject (or of one TR seen by several subjects) can land on both sides.
train_idx, val_idx = train_test_split(
    indices,
    test_size=0.2,
    stratify=labels,
    random_state=42
)

# Saving the indices of validation dataset for later usage
VAL_INDICES_PATH = 'valindices.pkl'
with open(VAL_INDICES_PATH, 'wb') as f:
    pickle.dump(val_idx, f)
# Report the path that was actually written (the message used to name a
# different file, 'val_indices.pkl').
print(f"Saved {len(val_idx)} validation indices to {VAL_INDICES_PATH}.")

Saved 1050 validation indices to val_indices.pkl.


In [20]:
#--- Necessary preparations for the model ---

# Creating a mapping between labels and their integer indices
# (labels are sorted so the mapping is the same on every run)
unique_labels = sorted({ex['label'] for ex in examples})
label_to_int  = {lab: i for i, lab in enumerate(unique_labels)}

# Processing the metadata into lists
vol_list, label_list, meta_list = [], [], []
for ex in examples:
    vol = ex['volume']  # shape = (X, Y, Z), numpy

    # Normalizing the data: z-score over the whole volume; the epsilon keeps
    # the division finite for a constant volume.
    mu, sigma = vol.mean(), vol.std()
    # The result is stored as float32, the dtype the volumes are exported in.
    # The cast yields the same values as casting at export time, but avoids
    # holding a float64 copy of every volume in the meantime.
    vol = ((vol - mu) / (sigma + 1e-6)).astype(np.float32, copy=False)

    # Filling the lists and reshaping the training data for the model
    vol_list.append(vol[np.newaxis, ...])            # now (1, X, Y, Z)
    label_list.append(label_to_int[ex['label']])
    meta_list.append({
        'subject':  ex['subject'],
        'run_idx':  ex['run_idx'],
        'TR_local': ex['TR_local'],
        'affine':   ex['affine'],
    })

# 4. Exporting the data

In [None]:
#--- Generating and exporting X_train, y_train, X_val, y_val ---

# Stack the per-example lists into arrays
X = np.stack(vol_list)    # Shape: (N, 1, X, Y, Z)
y = np.array(label_list)  # Shape: (N,)

# Select the rows of each split with the index lists computed earlier
X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]

# Output directory (created on first use)
base = 'readydata'
os.makedirs(base, exist_ok=True)

# Write one tensor file per array: volumes as float32, labels as int64.
# Each array is converted right before it is saved.
for fname, arr, as_dtype in (
    ('X_train.pt', X_train, torch.float32),
    ('y_train.pt', y_train, torch.int64),
    ('X_val.pt',   X_val,   torch.float32),
    ('y_val.pt',   y_val,   torch.int64),
):
    torch.save(torch.from_numpy(arr).to(as_dtype), f'{base}/{fname}')

print(f"Exported datasets to '{base}' directory.")