## Sync phoneme times to excerpt times
Get phoneme onset times and convert them to MEG timestamps for all annotated excerpts. The result is saved to `/mnt/sphere/nbl/processed_meg/{subject}/ses-{session}/emcom/phon_events.csv`.


In [1]:
# load packages
import numpy as np
import pandas as pd
from pathlib import Path
import os
import pickle
import mne
import sys
import re
import textgrid
from mne.io import read_raw_fif
from mne.preprocessing.ica import read_ica
from mne import (pick_types, find_events, Epochs, Evoked, compute_covariance, write_cov, read_cov, setup_source_space,
                 write_source_spaces, make_bem_model, make_bem_solution, make_bem_solution, make_forward_solution,
                 write_forward_solution, read_forward_solution, write_bem_solution, convert_forward_solution, read_epochs, 
                 grade_to_vertices, read_source_estimate)
from mne.minimum_norm import make_inverse_operator, apply_inverse_epochs, apply_inverse, write_inverse_operator, read_inverse_operator
from mne.preprocessing import ICA
from mne.viz import plot_source_estimates
import matplotlib.pyplot as plt
# have MNE report messages at INFO level
mne.set_log_level(verbose='INFO')
# raise numpy's summarization threshold to sys.maxsize so printed arrays are
# shown in full instead of being elided with "..."
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# session info
subject = 'nbl_004'
session = '01'

# emcom folder of this subject/session; the session folder is named 'ses-<session>'
emcom_dir = os.path.join('/mnt/sphere/nbl/processed_meg', subject, f'ses-{session}', 'emcom')

# event table of this session (one row per played stimulus, see preview below)
events_csv = os.path.join(emcom_dir, 'emcom_events.csv')
events_df = pd.read_csv(events_csv)
events_df.head()

Unnamed: 0,t_start_meg,t_start_psychopy,file_handle,stim_file,stim_index,stim_condition,rate_condition,side_condition,progress,rating,subjectID,subjectIndex,date,FIF
0,82.809,97.634136,s3903b-ex02,stims/s3903b-ex02_normed.wav,34.0,speech,emotion,cis,1,"[(0.011560299433767796, 0.0), (0.0150975994765...",nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...
1,119.865,145.243336,s0305a-ex01,stims/s0305a-ex01_normed.wav,32.0,speech,emotion,cis,2,"[(0.01022220030426979, 0.0), (0.01457380037754...",nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...
2,168.99,221.60411,s2702a_110_339,stims/s2702a_110_339_normed.wav,4.0,speech,emotion,cis,3,"[(0.011198499239981174, 0.0), (0.0145920999348...",nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...
3,243.954,267.312181,dutch-m3_01,stims/dutch-m3_01_normed.wav,71.0,speech,emotion,cis,4,"[(0.011219799518585205, 0.0), (0.0145713994279...",nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...
4,291.431,302.191275,s2302a_81_174,stims/s2302a_81_174_normed.wav,13.0,speech,emotion,cis,5,"[(0.010308399796485901, 0.0), (0.0137200998142...",nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...


In [3]:
# load phoneme data
stim_times_metadata_fname = '/mnt/sphere/nbl/analysis/code/speech_annotations/stim-times.tsv'
stim_times_addons_fname = '/mnt/sphere/nbl/analysis/code/speech_annotations/stim-times_addon.tsv'

# base annotation table: read the TSV, discard its first row, renumber from 0
# NOTE(review): the first row is dropped unconditionally — presumably it is not
# a phoneme row; confirm against the file.
stim_times_raw = pd.read_csv(stim_times_metadata_fname, sep="\t")
stim_times_df = stim_times_raw.drop(stim_times_raw.index[0]).reset_index(drop=True)

# add-on annotation table: read the TSV and remove its 'Unnamed: 0' column
stim_times_addons_df = (
    pd.read_csv(stim_times_addons_fname, sep="\t")
    .drop(columns=['Unnamed: 0'])
)

In [4]:
# Check whether any excerpt is annotated in both the base and the add-on table.
base_excerpts = stim_times_df[['ex_name']].drop_duplicates()
addon_excerpts = stim_times_addons_df[['ex_name']].drop_duplicates()

# outer merge with indicator=True labels every excerpt name in the '_merge'
# column as 'left_only', 'right_only' or 'both'
cmp = pd.merge(base_excerpts, addon_excerpts,
               on='ex_name', how='outer', indicator=True, validate='one_to_one')

in_both = cmp['_merge'] == 'both'
print('excerpts occuring in both:', cmp.loc[in_both, 'ex_name'].tolist())

excerpts occuring in both: []


In [5]:
# Stack the base annotations and the add-on annotations into one table.
# ignore_index=True gives the combined table a fresh 0..n-1 index; without it
# each input keeps its own 0-based row labels and the result carries duplicate
# index labels.
stim_annots_df = pd.concat([stim_times_df, stim_times_addons_df], ignore_index=True)
print(len(stim_annots_df.ex_name.unique()), 'excerpts annotated')
stim_annots_df.head()

141 excerpts annotated


Unnamed: 0,ex_name,t_start,duration,word,pos,phone
0,sayEnglishWords,0.04978310533,0.1429955972,at,IN,ae
1,sayEnglishWords,0.1927787025,0.1192273526,at,IN,t
2,sayEnglishWords,0.3120060552,0.02015110185,the,DT,dh
3,sayEnglishWords,0.332157157,0.03694368673,the,DT,ah
4,sayEnglishWords,0.36910084,0.1729636242,prompt,NN,p


In [6]:
# Report excerpts that were played in this session but have no phoneme annotations.
stim_played = events_df['file_handle'].dropna().unique()
stim_annotated = stim_annots_df['ex_name'].dropna().unique()

# names present in stim_played but absent from stim_annotated
missing = np.setdiff1d(stim_played, stim_annotated)
if len(missing) > 0:
    print("Missing from stim_annots_df.ex_name:", missing)

Missing from stim_annots_df.ex_name: ['arabic_f01' 'arabic_f02' 'arabic_f03' 'arabic_m01' 'arabic_m02'
 'arabic_m03' 'arabic_m04' 'arabic_m05' 'dutch-f1_01' 'dutch-f1_02'
 'dutch-f2_01' 'dutch-f2_02' 'dutch-f3_01' 'dutch-f3_02' 'dutch-m1_01'
 'dutch-m1_02' 'dutch-m2_01' 'dutch-m2_02' 'dutch-m3_01' 'dutch-m3_02']


In [7]:
# Build one output row per (played excerpt, annotated phoneme) pair.

# Split the annotation table by excerpt once, up front, so that each event below
# is a dictionary lookup instead of a boolean scan over every annotation row.
# Rows inside each group keep the order they have in stim_annots_df.
phones_by_excerpt = {
    ex_name: ex_phones
    for ex_name, ex_phones in stim_annots_df.groupby('ex_name', sort=False)
}

# annotation columns copied unchanged from the phoneme row
phone_fields = ['phone', 'word', 'pos', 'duration']
# event columns copied unchanged onto every phoneme row of that event
event_fields = ['stim_file', 'stim_index', 'stim_condition', 'rate_condition',
                'side_condition', 'subjectID', 'subjectIndex', 'date', 'FIF']

phones_data = []
for _, event_row in events_df.iterrows():
    phone_rows = phones_by_excerpt.get(event_row['file_handle'])
    if phone_rows is None:
        continue  # skip excerpts not found in stim_annots_df

    # per-event values are computed once and reused for all of its phonemes
    event_onset = float(event_row['t_start_meg'])
    event_info = {field: event_row[field] for field in event_fields}

    for _, phone_row in phone_rows.iterrows():
        combined_row = {
            # time of phoneme onset: the event's t_start_meg plus the phoneme's
            # t_start from the annotation table
            't_start_meg': event_onset + float(phone_row['t_start']),
            'file_handle': event_row['file_handle'],
        }
        # add information from stim times metadata
        combined_row.update((field, phone_row[field]) for field in phone_fields)
        # add other event data
        combined_row.update(event_info)
        phones_data.append(combined_row)

phones_df = pd.DataFrame(phones_data)
print(len(phones_df), 'phonemes identified')
phones_df.head()

21705 phonemes identified


Unnamed: 0,t_start_meg,file_handle,phone,word,pos,duration,stim_file,stim_index,stim_condition,rate_condition,side_condition,subjectID,subjectIndex,date,FIF
0,82.909,s3903b-ex02,VOCNOISE,<VOCNOISE>,,0.058396,stims/s3903b-ex02_normed.wav,34.0,speech,emotion,cis,nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...
1,82.967396,s3903b-ex02,g,got,VBD,0.037198,stims/s3903b-ex02_normed.wav,34.0,speech,emotion,cis,nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...
2,83.004594,s3903b-ex02,aa,got,VBD,0.035182,stims/s3903b-ex02_normed.wav,34.0,speech,emotion,cis,nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...
3,83.039776,s3903b-ex02,dx_t,got,VBD,0.044818,stims/s3903b-ex02_normed.wav,34.0,speech,emotion,cis,nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...
4,83.084594,s3903b-ex02,ih,a,DT,0.09,stims/s3903b-ex02_normed.wav,34.0,speech,emotion,cis,nbl_004,0,2025-10-03_14h06m47s,/mnt/sphere/nbl/processed_meg/nbl_004/ses-01/e...


In [8]:
# write the phoneme-level event table into the session's emcom folder (no index column)
phon_events_path = Path(emcom_dir) / 'phon_events.csv'
phones_df.to_csv(phon_events_path, index=False)