In [1]:
import pandas as pd
import numpy as np

In [19]:
# One entry per recording. "key" identifies the recording (subject, session,
# story); every other item maps a table name to the CSV file holding it.
presentations = [
    {
        "key": {"subject": subject, "session": 0, "story_name": "lw1"},
        **{table: f"lw1.{subject}.0.{table}.csv" for table in ("sound", "word", "phoneme")},
    }
    for subject in ("01", "02")
]
# Destination of the alignment table written by the last cell of the notebook.
output_path = "session_alignment.csv"

In [20]:
def load_presentation(presentation_dict):
    """Load every CSV of one presentation and tag its rows with the recording key.

    Parameters
    ----------
    presentation_dict : dict
        Must contain a ``"key"`` entry, itself a dict of column name ->
        constant value. Every other entry maps a table name to the path of a
        CSV file. The dict is not modified.

    Returns
    -------
    dict
        Maps each table name (every entry except ``"key"``) to the DataFrame
        read from its CSV, using the first CSV column as the index and with
        one constant column added per item of the recording key.

    Raises
    ------
    KeyError
        If ``presentation_dict`` has no ``"key"`` entry.
    """
    # Look the key up instead of popping it: popping mutated the caller's dict,
    # so loading the same presentation a second time failed with KeyError.
    recording_key = presentation_dict["key"]
    return {
        df_key: pd.read_csv(path, index_col=0).assign(**recording_key)
        for df_key, path in presentation_dict.items()
        if df_key != "key"
    }

In [21]:
# Load each presentation, then transpose the list of per-presentation dicts
# into a single dict holding one concatenated frame per table name.
all_presentations = list(map(load_presentation, presentations))
all_presentations = {
    table: pd.concat([frames[table] for frames in all_presentations])
    for table in all_presentations[0]
}
all_presentations

{'sound':    index subject story  story_uid  sound_id   kind  \
 0    5.0      01   lw1        0.0       0.0  sound   
 1    7.0      01   lw1        0.0       1.0  sound   
 2    9.0      01   lw1        0.0       2.0  sound   
 3   11.0      01   lw1        0.0       3.0  sound   
 0    5.0      02   lw1        0.0       0.0  sound   
 1    7.0      02   lw1        0.0       1.0  sound   
 2    9.0      02   lw1        0.0       2.0  sound   
 3   11.0      02   lw1        0.0       3.0  sound   
 
                       meg_file  start                      sound    onset  \
 0  A0167_MASC_1_16Mar17_01.con    0.0  stimuli/audio/lw1_0.0.wav   23.506   
 1  A0167_MASC_1_16Mar17_01.con    0.0  stimuli/audio/lw1_1.0.wav  127.185   
 2  A0167_MASC_1_16Mar17_01.con    0.0  stimuli/audio/lw1_2.0.wav  210.048   
 3  A0167_MASC_1_16Mar17_01.con    0.0  stimuli/audio/lw1_3.0.wav  311.027   
 0  A0168_MASC_1_25Mar17_01.con    0.0  stimuli/audio/lw1_0.0.wav   18.389   
 1  A0168_MASC_1_25Mar17_0

In [35]:
# Collect, per subject, the sorted list of (story_name, sound_id) pairs that
# were presented, and require every subject to match the first one.
all_sounds = (
    all_presentations["sound"]
    .groupby("subject")
    .apply(lambda subject_rows: sorted(subject_rows.groupby(["story_name", "sound_id"]).groups.keys()))
    .tolist()
)
assert all(subject_sounds == all_sounds[0] for subject_sounds in all_sounds[1:])

In [39]:
# Onset of every sound within each recording, indexed by the columns that
# identify one sound presentation.
index_key = ["subject", "session", "story_name", "sound_id"]
presentation_onsets = (
    all_presentations["sound"]
    .loc[:, [*index_key, "onset"]]
    .set_index(index_key)
)
presentation_onsets

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,onset
subject,session,story_name,sound_id,Unnamed: 4_level_1
1,0,lw1,0.0,23.506
1,0,lw1,1.0,127.185
1,0,lw1,2.0,210.048
1,0,lw1,3.0,311.027
2,0,lw1,0.0,18.389
2,0,lw1,1.0,121.868
2,0,lw1,2.0,207.648
2,0,lw1,3.0,306.693


## Check words

In [92]:
# Attach to each word the onset of the sound it belongs to (aligned on the
# presentation index) and express the word onset relative to that sound onset.
w = all_presentations["word"].set_index(index_key).assign(
    onset_sound=presentation_onsets["onset"],
    onset_baselined=lambda words: (words["onset"] - words["onset_sound"]).round(4),
)
w

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,story,story_uid,kind,meg_file,start,sound,word,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,onset_sound,onset_baselined
subject,session,story_name,sound_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
01,0,lw1,0.0,lw1,0.0,word,A0167_MASC_1_16Mar17_01.con,0.000000,stimuli/audio/lw1_0.wav,Tara,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.30,697,23506,23.506,0.00
01,0,lw1,0.0,lw1,0.0,word,A0167_MASC_1_16Mar17_01.con,0.310000,stimuli/audio/lw1_0.wav,stood,0.0,sentence,1.0,205.0,Allison,1.0,23.816,0.24,698,23816,23.506,0.31
01,0,lw1,0.0,lw1,0.0,word,A0167_MASC_1_16Mar17_01.con,0.550000,stimuli/audio/lw1_0.wav,stock,0.0,sentence,2.0,205.0,Allison,1.0,24.056,0.37,699,24056,23.506,0.55
01,0,lw1,0.0,lw1,0.0,word,A0167_MASC_1_16Mar17_01.con,1.080000,stimuli/audio/lw1_0.wav,still,0.0,sentence,3.0,205.0,Allison,1.0,24.586,0.40,700,24586,23.506,1.08
01,0,lw1,0.0,lw1,0.0,word,A0167_MASC_1_16Mar17_01.con,1.630000,stimuli/audio/lw1_0.wav,waiting,0.0,sentence,4.0,205.0,Allison,1.0,25.136,0.41,701,25136,23.506,1.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
02,0,lw1,3.0,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,50.070000,stimuli/audio/lw1_3.wav,end,52.0,sentence,15.0,205.0,Allison,1.0,356.763,0.17,3119,356763,306.693,50.07
02,0,lw1,3.0,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,50.250000,stimuli/audio/lw1_3.wav,for,52.0,sentence,16.0,205.0,Allison,1.0,356.943,0.14,3120,356943,306.693,50.25
02,0,lw1,3.0,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,50.460000,stimuli/audio/lw1_3.wav,project,52.0,sentence,18.0,205.0,Allison,1.0,357.153,0.58,3121,357153,306.693,50.46
02,0,lw1,3.0,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,51.179999,stimuli/audio/lw1_3.wav,and,52.0,sentence,19.0,205.0,Allison,1.0,357.873,0.15,3122,357873,306.693,51.18


In [94]:
# Gather the rows of each word presentation and count how many distinct
# sound-relative onsets they carry. Every count should be 1.
num_unique_word_onsets = (
    w.groupby(["story_name", "sequence_id", "word_index", "condition"])
    .apply(lambda word_rows: len(set(word_rows["onset_baselined"])))
)
assert set(num_unique_word_onsets.values) == {1}

In [97]:
# Each (sound file, relative onset) pair should occur exactly once per
# loaded presentation, i.e. N times in total.
expected_word_count = len(presentations)
num_per_word_onset = w.groupby(["sound", "onset_baselined"]).size()
assert set(num_per_word_onset) == {expected_word_count}

In [88]:
# TODO check that these and the word lists are the same for all subjects/sessions
# Preview: first 10 rows per subject of the "pseudo_words" condition.
(
    w.reset_index()
    .set_index(["condition", "subject"])
    .loc["pseudo_words"]
    .groupby("subject")
    .head(10)
)

Unnamed: 0_level_0,session,story_name,sound_id,story,story_uid,kind,meg_file,start,sound,word,...,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,onset_sound,onset_baselined
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,lw1,3.0,lw1,0.0,word,A0167_MASC_1_16Mar17_01.con,33.23,stimuli/audio/lw1_3.wav,ro,...,20.0,205.0,Allison,1.0,344.257,0.15,3071,344257,311.027,33.23
2,0,lw1,3.0,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,33.23,stimuli/audio/lw1_3.wav,ro,...,20.0,205.0,Allison,1.0,339.923,0.15,3071,339923,306.693,33.23


## Check phonemes

In [98]:
# Attach to each phoneme the onset of the sound it belongs to (aligned on the
# presentation index) and express the phoneme onset relative to that sound onset.
p = all_presentations["phoneme"].set_index(index_key).assign(
    onset_sound=presentation_onsets["onset"],
    onset_baselined=lambda phonemes: (phonemes["onset"] - phonemes["onset_sound"]).round(4),
)
p

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,story,story_uid,kind,meg_file,start,sound,phoneme,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,onset_sound,onset_baselined
subject,session,story_name,sound_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
01,0,lw1,0.0,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.00,stimuli/audio/lw1_0.wav,t_B,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.08,5,23506,23.506,0.00
01,0,lw1,0.0,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.08,stimuli/audio/lw1_0.wav,eh_I,0.0,sentence,0.0,205.0,Allison,1.0,23.586,0.09,6,23586,23.506,0.08
01,0,lw1,0.0,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.17,stimuli/audio/lw1_0.wav,r_I,0.0,sentence,0.0,205.0,Allison,1.0,23.676,0.07,7,23676,23.506,0.17
01,0,lw1,0.0,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.24,stimuli/audio/lw1_0.wav,ah_E,0.0,sentence,0.0,205.0,Allison,1.0,23.746,0.06,8,23746,23.506,0.24
01,0,lw1,0.0,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.31,stimuli/audio/lw1_0.wav,s_B,0.0,sentence,1.0,205.0,Allison,1.0,23.816,0.06,9,23816,23.506,0.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
02,0,lw1,3.0,lw1,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,51.85,stimuli/audio/lw1_3.wav,p_I,52.0,sentence,22.0,205.0,Allison,1.0,358.543,0.09,2962,358543,306.693,51.85
02,0,lw1,3.0,lw1,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,51.94,stimuli/audio/lw1_3.wav,iy_I,52.0,sentence,22.0,205.0,Allison,1.0,358.633,0.09,2963,358633,306.693,51.94
02,0,lw1,3.0,lw1,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,52.03,stimuli/audio/lw1_3.wav,sh_I,52.0,sentence,22.0,205.0,Allison,1.0,358.723,0.08,2964,358723,306.693,52.03
02,0,lw1,3.0,lw1,0.0,phoneme,A0168_MASC_1_25Mar17_01.con,52.11,stimuli/audio/lw1_3.wav,iy_I,52.0,sentence,22.0,205.0,Allison,1.0,358.803,0.01,2965,358803,306.693,52.11


In [102]:
# Gather the rows of each phoneme presentation and count how many distinct
# sound-relative onsets they carry. Every count should be 1.
num_unique_phoneme_onsets = (
    p.groupby(["story_name", "sequence_id", "word_index", "condition", "value"])
    .apply(lambda phoneme_rows: len(set(phoneme_rows["onset_baselined"])))
)
assert set(num_unique_phoneme_onsets.values) == {1}

In [104]:
# And there should be N presentations per phoneme: each (sound file, relative
# onset) pair must occur exactly once per loaded presentation.
# This has to group the phoneme table `p`; grouping the word table `w` here
# only repeated the word check and left phonemes unverified.
num_per_phoneme_onset = p.groupby(["sound", "onset_baselined"]).size()
assert set(num_per_phoneme_onset) == {len(presentations)}

## Build and save a canonical alignment

In [123]:
# Use first session as the canonical alignment. Arbitrary.
# "First" is the first (subject, session) group yielded by the groupby.
sessions = all_presentations["sound"].groupby(["subject", "session"])
_, sound_sequence = next(iter(sessions))
canonical_sound_sequence = sound_sequence.loc[:, ["story_name", "sound_id"]].drop_duplicates()

In [126]:
# For every sound of the canonical sequence, list its onset in each recording.
# NOTE: this rebinds `presentation_onsets`, replacing the indexed frame of the
# same name that the word/phoneme cells assign as a column.
sound_onsets = all_presentations["sound"][["story_name", "sound_id", "subject", "session", "onset"]]
presentation_onsets = canonical_sound_sequence.merge(
    sound_onsets,
    how="left",
    on=["story_name", "sound_id"],
)

presentation_onsets

Unnamed: 0,story_name,sound_id,subject,session,onset
0,lw1,0.0,1,0,23.506
1,lw1,0.0,2,0,18.389
2,lw1,1.0,1,0,127.185
3,lw1,1.0,2,0,121.868
4,lw1,2.0,1,0,210.048
5,lw1,2.0,2,0,207.648
6,lw1,3.0,1,0,311.027
7,lw1,3.0,2,0,306.693


In [127]:
# Write the alignment table to `output_path`. The row index is written as the
# first, unnamed CSV column (the pandas default, spelled out here).
presentation_onsets.to_csv(output_path, index=True)