In [69]:
import ast
import pandas as pd
import mne
import mne_bids
from tqdm.auto import tqdm
import yaml

In [None]:
# NOTE(review): bare attribute reference with no effect on the analysis; looks
# like leftover interactive inspection. `yaml` is not used anywhere else in the
# visible notebook, so this cell is a candidate for removal.
yaml.safe_load

In [63]:
# Dataset root handed to `mne_bids.BIDSPath(root=...)` when locating event files.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
ROOT_DIR = "/om/data/public/language-eeg/MEG-MASC/"

In [64]:
# Old slow and unnecessary code to load entire BIDS representation and then
# extract event data.
#
# def parse_annotations(raw: mne.io.BaseRaw, kind) -> pd.DataFrame:
#     rows = []
#     for annot in raw.annotations:
#         d = eval(annot.pop("description"))
#         if d["kind"] == kind:
#             for k, v in annot.items():
#                 assert k not in d
#                 d[k] = v
#             rows.append(d)
#     return pd.DataFrame(rows)
#
# def load_all_annotations(subject, session, task):
#     try:
#         bids_path = mne_bids.BIDSPath(
#             subject=subject,
#             session=session,
#             task=task,
#             datatype="meg",
#             root=ROOT_DIR,
#         )
#         raw = mne_bids.read_raw_bids(bids_path)
#     except Exception as e:
#         # TODO
#         print(e)
#         return None
    
#     sounds_df = parse_annotations(raw, "sound")
#     phonemes_df = parse_annotations(raw, "phoneme")
#     words_df = parse_annotations(raw, "word")
    
#     return sounds_df, phonemes_df, words_df

In [91]:
def parse_annotations(x, kind) -> pd.DataFrame:
    """Extract the annotations of one ``kind`` from an events table.

    Each row of ``x`` carries a ``trial_type`` cell holding the textual form
    of a Python dict literal (for example ``"{'kind': 'word', ...}"``). For
    every row whose parsed dict has ``kind`` equal to the requested value, the
    dict entries are combined with the row's remaining columns into one
    output record.

    Parameters
    ----------
    x : pd.DataFrame
        Events table containing a ``trial_type`` column.
    kind : str
        Annotation kind to keep.

    Returns
    -------
    pd.DataFrame
        One row per matching annotation, in input order. Empty when nothing
        matches.

    Raises
    ------
    AssertionError
        If a column of ``x`` has the same name as a key of the parsed dict.
    """
    wanted = f"{kind}"
    needle = f"'kind': '{wanted}'"

    rows = []
    for _, annot in x.iterrows():
        data_str = annot.pop("trial_type")

        # A missing cell is NaN (a float), not a string; it cannot describe an
        # annotation of any kind, and `in` on it would raise TypeError.
        if not isinstance(data_str, str):
            continue

        # Cheap substring pre-filter: skip most rows without parsing them.
        if needle not in data_str:
            continue

        d = ast.literal_eval(data_str)

        # The substring can also occur inside a nested value, so the parsed
        # dict is the authority on which kind this row really is.
        if not isinstance(d, dict) or d.get("kind") != wanted:
            continue

        for k, v in annot.items():
            assert k not in d, f"column {k!r} clashes with an annotation key"
            d[k] = v
        rows.append(d)
    return pd.DataFrame(rows)


def load_all_annotations(subject, session, task):
    """Read one recording's events file and split it by annotation kind.

    Builds the BIDS path of the ``events`` ``.tsv`` file for the given
    subject, session and task (``meg`` datatype, rooted at ``ROOT_DIR``),
    reads it as a tab-separated table, and runs ``parse_annotations`` on that
    table once per kind.

    Returns
    -------
    dict
        Maps ``"sound"``, ``"word"`` and ``"phoneme"`` (in that order) to the
        DataFrame produced by ``parse_annotations`` for that kind.
    """
    events_tsv = mne_bids.BIDSPath(
        root=ROOT_DIR,
        subject=subject,
        session=session,
        task=task,
        datatype="meg",
        suffix="events",
        extension=".tsv",
    )
    events = pd.read_csv(str(events_tsv), sep="\t")

    annotations = {}
    for kind in ("sound", "word", "phoneme"):
        annotations[kind] = parse_annotations(events, kind)
    return annotations

In [104]:
# Load annotations for session "0" of every (subject, task) pair.
# Keys are (subject, task) tuples; values are the per-kind dicts returned by
# load_all_annotations.
all_dfs = {
    (subj, task_id): load_all_annotations(subj, "0", task_id)
    for subj in tqdm(["01", "02", "03"], desc="subject")
    for task_id in tqdm(["0", "1", "2", "3"], desc="task")
}

subject:   0%|          | 0/3 [00:00<?, ?it/s]

task:   0%|          | 0/4 [00:00<?, ?it/s]

task:   0%|          | 0/4 [00:00<?, ?it/s]

task:   0%|          | 0/4 [00:00<?, ?it/s]

## Check words

In [93]:
# Word annotations for task "0" of subjects 01 and 02.
word_01_0 = all_dfs[("01", "0")]["word"]
word_02_0 = all_dfs[("02", "0")]["word"]

In [94]:
def baseline(xs):
    """Shift ``xs`` so that its smallest value becomes zero."""
    earliest = xs.min()
    return xs - earliest

# Re-express every word onset relative to the earliest onset within its
# "sound" group, so timings can be compared across subjects.
word_01_0["baseline_onset"] = word_01_0.groupby("sound")["onset"].transform(baseline)
word_02_0["baseline_onset"] = word_02_0.groupby("sound")["onset"].transform(baseline)

In [95]:
# Subject 01, task "0": word annotations, including the added `baseline_onset`.
word_01_0

Unnamed: 0,story,story_uid,sound_id,kind,meg_file,start,sound,word,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,baseline_onset
0,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.000000,stimuli/audio/lw1_0.wav,Tara,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.30,697,23506,0.00
1,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.310000,stimuli/audio/lw1_0.wav,stood,0.0,sentence,1.0,205.0,Allison,1.0,23.816,0.24,698,23816,0.31
2,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,0.550000,stimuli/audio/lw1_0.wav,stock,0.0,sentence,2.0,205.0,Allison,1.0,24.056,0.37,699,24056,0.55
3,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,1.080000,stimuli/audio/lw1_0.wav,still,0.0,sentence,3.0,205.0,Allison,1.0,24.586,0.40,700,24586,1.08
4,lw1,0.0,0.0,word,A0167_MASC_1_16Mar17_01.con,1.630000,stimuli/audio/lw1_0.wav,waiting,0.0,sentence,4.0,205.0,Allison,1.0,25.136,0.41,701,25136,1.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.070000,stimuli/audio/lw1_3.wav,end,52.0,sentence,15.0,205.0,Allison,1.0,361.097,0.17,3119,361097,49.97
664,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.250000,stimuli/audio/lw1_3.wav,for,52.0,sentence,16.0,205.0,Allison,1.0,361.277,0.14,3120,361277,50.15
665,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,50.460000,stimuli/audio/lw1_3.wav,project,52.0,sentence,18.0,205.0,Allison,1.0,361.487,0.58,3121,361487,50.36
666,lw1,0.0,3.0,word,A0167_MASC_1_16Mar17_01.con,51.179999,stimuli/audio/lw1_3.wav,and,52.0,sentence,19.0,205.0,Allison,1.0,362.207,0.15,3122,362207,51.08


In [96]:
# Subject 02, task "0": word annotations, including the added `baseline_onset`.
word_02_0

Unnamed: 0,story,story_uid,sound_id,kind,meg_file,start,sound,word,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,baseline_onset
0,lw1,0.0,0.0,word,A0168_MASC_1_25Mar17_01.con,0.000000,stimuli/audio/lw1_0.wav,Tara,0.0,sentence,0.0,205.0,Allison,1.0,18.389,0.30,697,18389,0.00
1,lw1,0.0,0.0,word,A0168_MASC_1_25Mar17_01.con,0.310000,stimuli/audio/lw1_0.wav,stood,0.0,sentence,1.0,205.0,Allison,1.0,18.699,0.24,698,18699,0.31
2,lw1,0.0,0.0,word,A0168_MASC_1_25Mar17_01.con,0.550000,stimuli/audio/lw1_0.wav,stock,0.0,sentence,2.0,205.0,Allison,1.0,18.939,0.37,699,18939,0.55
3,lw1,0.0,0.0,word,A0168_MASC_1_25Mar17_01.con,1.080000,stimuli/audio/lw1_0.wav,still,0.0,sentence,3.0,205.0,Allison,1.0,19.469,0.40,700,19469,1.08
4,lw1,0.0,0.0,word,A0168_MASC_1_25Mar17_01.con,1.630000,stimuli/audio/lw1_0.wav,waiting,0.0,sentence,4.0,205.0,Allison,1.0,20.019,0.41,701,20019,1.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,lw1,0.0,3.0,word,A0168_MASC_1_25Mar17_01.con,50.070000,stimuli/audio/lw1_3.wav,end,52.0,sentence,15.0,205.0,Allison,1.0,356.763,0.17,3119,356763,49.97
664,lw1,0.0,3.0,word,A0168_MASC_1_25Mar17_01.con,50.250000,stimuli/audio/lw1_3.wav,for,52.0,sentence,16.0,205.0,Allison,1.0,356.943,0.14,3120,356943,50.15
665,lw1,0.0,3.0,word,A0168_MASC_1_25Mar17_01.con,50.460000,stimuli/audio/lw1_3.wav,project,52.0,sentence,18.0,205.0,Allison,1.0,357.153,0.58,3121,357153,50.36
666,lw1,0.0,3.0,word,A0168_MASC_1_25Mar17_01.con,51.179999,stimuli/audio/lw1_3.wav,and,52.0,sentence,19.0,205.0,Allison,1.0,357.873,0.15,3122,357873,51.08


In [97]:
# The per-sound relative word onsets must be the same for both subjects.
pd.testing.assert_series_equal(
    word_01_0["baseline_onset"],
    word_02_0["baseline_onset"],
)

In [109]:
# Subject 02, task "1": the "sound" annotations.
all_dfs[("02", "1")]["sound"]

Unnamed: 0,index,subject,story,story_uid,sound_id,kind,meg_file,start,sound,onset,duration,value,sample
0,15.0,A0168,cable_spool_fort,1.0,0.0,sound,A0168_MASC_2_25Mar17_01.con,0.0,stimuli/audio/cable_spool_fort_0.0.wav,28.305,0.0,1,28305
1,17.0,A0168,cable_spool_fort,1.0,1.0,sound,A0168_MASC_2_25Mar17_01.con,0.0,stimuli/audio/cable_spool_fort_1.0.wav,134.283,0.0,2,134283
2,19.0,A0168,cable_spool_fort,1.0,2.0,sound,A0168_MASC_2_25Mar17_01.con,0.0,stimuli/audio/cable_spool_fort_2.0.wav,275.61,0.0,3,275610
3,21.0,A0168,cable_spool_fort,1.0,3.0,sound,A0168_MASC_2_25Mar17_01.con,0.0,stimuli/audio/cable_spool_fort_3.0.wav,423.887,0.0,4,423887
4,23.0,A0168,cable_spool_fort,1.0,4.0,sound,A0168_MASC_2_25Mar17_01.con,0.0,stimuli/audio/cable_spool_fort_4.0.wav,536.782,0.0,5,536782
5,25.0,A0168,cable_spool_fort,1.0,5.0,sound,A0168_MASC_2_25Mar17_01.con,0.0,stimuli/audio/cable_spool_fort_5.0.wav,665.826,0.0,6,665826


## Check phonemes

In [98]:
# Phoneme annotations for task "0" of subjects 01 and 02.
phoneme_01_0 = all_dfs[("01", "0")]["phoneme"]
phoneme_02_0 = all_dfs[("02", "0")]["phoneme"]

In [99]:
# NOTE(review): duplicates the `baseline` definition in the "Check words"
# section; keep the two in sync or consolidate them into one.
def baseline(xs):
    """Shift ``xs`` so that its smallest value becomes zero."""
    earliest = xs.min()
    return xs - earliest

# Re-express every phoneme onset relative to the earliest onset within its
# "sound" group, so timings can be compared across subjects.
phoneme_01_0["baseline_onset"] = phoneme_01_0.groupby("sound")["onset"].transform(baseline)
phoneme_02_0["baseline_onset"] = phoneme_02_0.groupby("sound")["onset"].transform(baseline)

In [100]:
# Subject 01, task "0": phoneme annotations, including the added `baseline_onset`.
phoneme_01_0

Unnamed: 0,story,story_uid,sound_id,kind,meg_file,start,sound,phoneme,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,baseline_onset
0,lw1,0.0,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.00,stimuli/audio/lw1_0.wav,t_B,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.08,5,23506,0.00
1,lw1,0.0,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.08,stimuli/audio/lw1_0.wav,eh_I,0.0,sentence,0.0,205.0,Allison,1.0,23.586,0.09,6,23586,0.08
2,lw1,0.0,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.17,stimuli/audio/lw1_0.wav,r_I,0.0,sentence,0.0,205.0,Allison,1.0,23.676,0.07,7,23676,0.17
3,lw1,0.0,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.24,stimuli/audio/lw1_0.wav,ah_E,0.0,sentence,0.0,205.0,Allison,1.0,23.746,0.06,8,23746,0.24
4,lw1,0.0,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.31,stimuli/audio/lw1_0.wav,s_B,0.0,sentence,1.0,205.0,Allison,1.0,23.816,0.06,9,23816,0.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2457,lw1,0.0,3.0,phoneme,A0167_MASC_1_16Mar17_01.con,51.85,stimuli/audio/lw1_3.wav,p_I,52.0,sentence,22.0,205.0,Allison,1.0,362.877,0.09,2962,362877,51.75
2458,lw1,0.0,3.0,phoneme,A0167_MASC_1_16Mar17_01.con,51.94,stimuli/audio/lw1_3.wav,iy_I,52.0,sentence,22.0,205.0,Allison,1.0,362.967,0.09,2963,362967,51.84
2459,lw1,0.0,3.0,phoneme,A0167_MASC_1_16Mar17_01.con,52.03,stimuli/audio/lw1_3.wav,sh_I,52.0,sentence,22.0,205.0,Allison,1.0,363.057,0.08,2964,363057,51.93
2460,lw1,0.0,3.0,phoneme,A0167_MASC_1_16Mar17_01.con,52.11,stimuli/audio/lw1_3.wav,iy_I,52.0,sentence,22.0,205.0,Allison,1.0,363.137,0.01,2965,363137,52.01


In [101]:
# Subject 02, task "0": phoneme annotations, including the added `baseline_onset`.
phoneme_02_0

Unnamed: 0,story,story_uid,sound_id,kind,meg_file,start,sound,phoneme,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,baseline_onset
0,lw1,0.0,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,0.00,stimuli/audio/lw1_0.wav,t_B,0.0,sentence,0.0,205.0,Allison,1.0,18.389,0.08,5,18389,0.00
1,lw1,0.0,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,0.08,stimuli/audio/lw1_0.wav,eh_I,0.0,sentence,0.0,205.0,Allison,1.0,18.469,0.09,6,18469,0.08
2,lw1,0.0,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,0.17,stimuli/audio/lw1_0.wav,r_I,0.0,sentence,0.0,205.0,Allison,1.0,18.559,0.07,7,18559,0.17
3,lw1,0.0,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,0.24,stimuli/audio/lw1_0.wav,ah_E,0.0,sentence,0.0,205.0,Allison,1.0,18.629,0.06,8,18629,0.24
4,lw1,0.0,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,0.31,stimuli/audio/lw1_0.wav,s_B,0.0,sentence,1.0,205.0,Allison,1.0,18.699,0.06,9,18699,0.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2457,lw1,0.0,3.0,phoneme,A0168_MASC_1_25Mar17_01.con,51.85,stimuli/audio/lw1_3.wav,p_I,52.0,sentence,22.0,205.0,Allison,1.0,358.543,0.09,2962,358543,51.75
2458,lw1,0.0,3.0,phoneme,A0168_MASC_1_25Mar17_01.con,51.94,stimuli/audio/lw1_3.wav,iy_I,52.0,sentence,22.0,205.0,Allison,1.0,358.633,0.09,2963,358633,51.84
2459,lw1,0.0,3.0,phoneme,A0168_MASC_1_25Mar17_01.con,52.03,stimuli/audio/lw1_3.wav,sh_I,52.0,sentence,22.0,205.0,Allison,1.0,358.723,0.08,2964,358723,51.93
2460,lw1,0.0,3.0,phoneme,A0168_MASC_1_25Mar17_01.con,52.11,stimuli/audio/lw1_3.wav,iy_I,52.0,sentence,22.0,205.0,Allison,1.0,358.803,0.01,2965,358803,52.01


In [102]:
# The per-sound relative phoneme onsets must be the same for both subjects.
pd.testing.assert_series_equal(
    phoneme_01_0["baseline_onset"],
    phoneme_02_0["baseline_onset"],
)

## Conclusion

OK, so the deal is that, for each subject (possibly for each session?) for a given task, the subject is presented with a story split into a series of chunks ("sounds").
The sounds have different onsets in the time series by subject (and possibly also by session).
So the data need to be realigned/truncated to make the stimuli line up.

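Once that's done, however, it looks like the word and phoneme onsets (measured relative to the earliest onset within each sound) match exactly, at least for subjects 01 and 02 on task 0, which is what was checked above. Which makes sense.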