In [142]:
import logging
from pprint import pprint

import pandas as pd
import numpy as np

In [168]:
# One entry per recording. "key" identifies the recording; every other entry
# maps an event kind to the CSV file holding those events. File names follow
# the pattern <story_name>.<subject>.<session>.<event kind>.csv.
presentations = [
    {
        "key": {"subject": subject, "session": session, "story_name": story_name},
        **{
            event_kind: f"{story_name}.{subject}.{session}.{event_kind}.csv"
            for event_kind in ("sound", "word", "phoneme")
        },
    }
    for subject, session, story_name in (
        ("01", 0, "lw1"),
        ("02", 0, "lw1"),
        ("01", 0, "cable_spool_fort"),
    )
]
# Destination of the alignment table written by the last cell.
output_path = "session_alignment.csv"

In [169]:
def load_presentation(presentation_dict):
    """Load the event tables of one presentation.

    Parameters
    ----------
    presentation_dict : dict
        Must contain a ``"key"`` entry: a dict of column name -> value.
        Every other entry maps a table name to the path of a CSV file.

    Returns
    -------
    dict
        Table name -> DataFrame read from the CSV (first CSV column used as
        the index), with one constant column added per ``"key"`` entry.

    Raises
    ------
    KeyError
        If ``presentation_dict`` has no ``"key"`` entry.
    """
    # Read the key instead of popping it: the argument is left untouched, so
    # the same dict can be loaded again (e.g. when the calling cell is re-run).
    recording_key = presentation_dict["key"]
    return {
        df_key: pd.read_csv(path, index_col=0).assign(**recording_key)
        for df_key, path in presentation_dict.items()
        if df_key != "key"
    }

In [170]:
# Load every presentation. Each config dict is passed as a shallow copy so the
# shared `presentations` list can never be modified by the loader and this cell
# can be re-run without re-running the config cell.
loaded_presentations = [load_presentation(dict(presentation_i)) for presentation_i in presentations]
# transpose: list of {table name: frame} -> {table name: frames concatenated over presentations}
all_presentations = {key: pd.concat([loaded_i[key] for loaded_i in loaded_presentations])
                     for key in loaded_presentations[0].keys()}

In [171]:
# Possible typo in the sound table: other representations denote paths as
# e.g. stimuli/audio/lw1_0.wav, but here they read stimuli/audio/lw1_0.0.wav.
# Drop the extra ".0" (literal match, not a regex) so the paths agree.
sound_table = all_presentations["sound"]
sound_table["sound"] = sound_table["sound"].str.replace(".0.wav", ".wav", regex=False)
sound_table

Unnamed: 0,index,subject,story,story_uid,sound_id,kind,meg_file,start,sound,onset,duration,value,sample,session,story_name
0,5.0,1,lw1,0.0,0.0,sound,A0167_MASC_1_16Mar17_01.con,0.0,stimuli/audio/lw1_0.wav,23.506,0.0,2,23506,0,lw1
1,7.0,1,lw1,0.0,1.0,sound,A0167_MASC_1_16Mar17_01.con,0.0,stimuli/audio/lw1_1.wav,127.185,0.0,3,127185,0,lw1
2,9.0,1,lw1,0.0,2.0,sound,A0167_MASC_1_16Mar17_01.con,0.0,stimuli/audio/lw1_2.wav,210.048,0.0,4,210048,0,lw1
3,11.0,1,lw1,0.0,3.0,sound,A0167_MASC_1_16Mar17_01.con,0.0,stimuli/audio/lw1_3.wav,311.027,0.0,1,311027,0,lw1
0,5.0,2,lw1,0.0,0.0,sound,A0168_MASC_1_25Mar17_01.con,0.0,stimuli/audio/lw1_0.wav,18.389,0.0,2,18389,0,lw1
1,7.0,2,lw1,0.0,1.0,sound,A0168_MASC_1_25Mar17_01.con,0.0,stimuli/audio/lw1_1.wav,121.868,0.0,3,121868,0,lw1
2,9.0,2,lw1,0.0,2.0,sound,A0168_MASC_1_25Mar17_01.con,0.0,stimuli/audio/lw1_2.wav,207.648,0.0,4,207648,0,lw1
3,11.0,2,lw1,0.0,3.0,sound,A0168_MASC_1_25Mar17_01.con,0.0,stimuli/audio/lw1_3.wav,306.693,0.0,1,306693,0,lw1
0,14.0,1,cable_spool_fort,1.0,0.0,sound,A0167_MASC_2_16Mar17_01.con,0.0,stimuli/audio/cable_spool_fort_0.wav,8.245,0.0,1,8245,0,cable_spool_fort
1,16.0,1,cable_spool_fort,1.0,1.0,sound,A0167_MASC_2_16Mar17_01.con,0.0,stimuli/audio/cable_spool_fort_1.wav,114.957,0.0,2,114957,0,cable_spool_fort


In [172]:
# For every (subject, session), collect the sorted (story_name, sound_id) pairs
# that were presented. The groups are iterated directly rather than through
# groupby.apply, which is deprecated (pandas >= 2.2) when the applied function
# also receives the grouping columns. Group order is the same: sorted by key.
all_sounds = [
    sorted(session_sounds.groupby(["story_name", "sound_id"]).groups.keys())
    for _, session_sounds in all_presentations["sound"].groupby(["subject", "session"])
]

pprint(all_sounds[0])
# Compare every subject--session against the first one; one warning per mismatch.
for sounds_i in all_sounds[1:]:
    if set(sounds_i) != set(all_sounds[0]):
        logging.warning("Some subject--sessions have different amounts of sounds")



[('cable_spool_fort', 0.0),
 ('cable_spool_fort', 1.0),
 ('cable_spool_fort', 2.0),
 ('cable_spool_fort', 3.0),
 ('cable_spool_fort', 4.0),
 ('cable_spool_fort', 5.0),
 ('lw1', 0.0),
 ('lw1', 1.0),
 ('lw1', 2.0),
 ('lw1', 3.0)]


In [173]:
# Sound start points: one "onset" column, indexed by the columns in index_key.
index_key = ["subject", "session", "story_name", "sound_id", "sound"]
presentation_onsets = (
    all_presentations["sound"][[*index_key, "onset"]]
    .set_index(index_key)
)
presentation_onsets

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,onset
subject,session,story_name,sound_id,sound,Unnamed: 5_level_1
1,0,lw1,0.0,stimuli/audio/lw1_0.wav,23.506
1,0,lw1,1.0,stimuli/audio/lw1_1.wav,127.185
1,0,lw1,2.0,stimuli/audio/lw1_2.wav,210.048
1,0,lw1,3.0,stimuli/audio/lw1_3.wav,311.027
2,0,lw1,0.0,stimuli/audio/lw1_0.wav,18.389
2,0,lw1,1.0,stimuli/audio/lw1_1.wav,121.868
2,0,lw1,2.0,stimuli/audio/lw1_2.wav,207.648
2,0,lw1,3.0,stimuli/audio/lw1_3.wav,306.693
1,0,cable_spool_fort,0.0,stimuli/audio/cable_spool_fort_0.wav,8.245
1,0,cable_spool_fort,1.0,stimuli/audio/cable_spool_fort_1.wav,114.957


## Check words

In [187]:
# Index the word events like the sound onsets so the assignment below aligns
# each word with the onset of its sound.
w = all_presentations["word"].set_index(index_key)
w["onset_sound"] = presentation_onsets
# Word onset relative to the onset of its sound, rounded to 4 decimals.
w["onset_baselined"] = w["onset"].sub(w["onset_sound"]).round(4)
w.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,story,story_uid,kind,meg_file,start,word,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,onset_sound,onset_baselined
subject,session,story_name,sound_id,sound,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
01,0,cable_spool_fort,0.0,stimuli/audio/cable_spool_fort_0.wav,cable_spool_fort,1.0,word,A0167_MASC_2_16Mar17_01.con,0.000000,The,0.0,sentence,0.0,160.0,Ava,1.0,8.245,0.13,982,8245,8.245,0.00
01,0,cable_spool_fort,0.0,stimuli/audio/cable_spool_fort_0.wav,cable_spool_fort,1.0,word,A0167_MASC_2_16Mar17_01.con,0.130000,Cable,0.0,sentence,1.0,160.0,Ava,1.0,8.375,0.48,983,8375,8.245,0.13
01,0,cable_spool_fort,0.0,stimuli/audio/cable_spool_fort_0.wav,cable_spool_fort,1.0,word,A0167_MASC_2_16Mar17_01.con,1.020000,Fort,0.0,sentence,3.0,160.0,Ava,1.0,9.265,0.42,984,9265,8.245,1.02
01,0,cable_spool_fort,0.0,stimuli/audio/cable_spool_fort_0.wav,cable_spool_fort,1.0,word,A0167_MASC_2_16Mar17_01.con,1.470000,by,0.0,sentence,4.0,160.0,Ava,1.0,9.715,0.23,985,9715,8.245,1.47
01,0,cable_spool_fort,0.0,stimuli/audio/cable_spool_fort_0.wav,cable_spool_fort,1.0,word,A0167_MASC_2_16Mar17_01.con,1.700000,Bill,0.0,sentence,5.0,160.0,Ava,1.0,9.945,0.25,986,9945,8.245,1.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
02,0,lw1,3.0,stimuli/audio/lw1_3.wav,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,50.070000,end,52.0,sentence,15.0,205.0,Allison,1.0,356.763,0.17,3119,356763,306.693,50.07
02,0,lw1,3.0,stimuli/audio/lw1_3.wav,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,50.250000,for,52.0,sentence,16.0,205.0,Allison,1.0,356.943,0.14,3120,356943,306.693,50.25
02,0,lw1,3.0,stimuli/audio/lw1_3.wav,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,50.460000,project,52.0,sentence,18.0,205.0,Allison,1.0,357.153,0.58,3121,357153,306.693,50.46
02,0,lw1,3.0,stimuli/audio/lw1_3.wav,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,51.179999,and,52.0,sentence,19.0,205.0,Allison,1.0,357.873,0.15,3122,357873,306.693,51.18


In [188]:
# For each word presentation, get number of different onsets relative to sound onset.
# Should be 1 for everything.
# The column is selected before apply so the function only sees the baselined
# onsets; DataFrameGroupBy.apply on a frame that still contains the grouping
# columns is deprecated in pandas >= 2.2.
num_unique_word_onsets = (
    w.groupby(["story_name", "sequence_id", "word_index", "condition"])["onset_baselined"]
    .apply(lambda onsets: len(set(onsets)))
)
# The message is only built on failure and lists the offending groups.
assert set(num_unique_word_onsets.values) == {1}, (
    "word presentations with inconsistent baselined onsets:\n"
    f"{num_unique_word_onsets[num_unique_word_onsets != 1]}"
)

In [196]:
# A sound played N times must yield exactly N word events at every baselined onset.
for sound, sound_presentations in presentation_onsets.groupby("sound"):
    expected_count = len(sound_presentations)
    events_per_onset = w.xs(sound, level="sound").groupby("onset_baselined").size()
    assert set(events_per_onset) == {expected_count}

In [197]:
# TODO check that these and the word lists are the same for all subjects/sessions
# Preview: first 10 rows per subject of the words whose condition is "pseudo_words".
words_by_condition = w.reset_index().set_index(["condition", "subject"])
pseudo_words = words_by_condition.loc["pseudo_words"]
pseudo_words.groupby("subject").head(10)

Unnamed: 0_level_0,session,story_name,sound_id,sound,story,story_uid,kind,meg_file,start,word,...,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,onset_sound,onset_baselined
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,lw1,3.0,stimuli/audio/lw1_3.wav,lw1,0.0,word,A0167_MASC_1_16Mar17_01.con,33.23,ro,...,20.0,205.0,Allison,1.0,344.257,0.15,3071,344257,311.027,33.23
2,0,lw1,3.0,stimuli/audio/lw1_3.wav,lw1,0.0,word,A0168_MASC_1_25Mar17_01.con,33.23,ro,...,20.0,205.0,Allison,1.0,339.923,0.15,3071,339923,306.693,33.23


## Check phonemes

In [199]:
# Index the phoneme events like the sound onsets so the assignment below aligns
# each phoneme with the onset of its sound.
p = all_presentations["phoneme"].set_index(index_key)
p["onset_sound"] = presentation_onsets
# Phoneme onset relative to the onset of its sound, rounded to 4 decimals.
p["onset_baselined"] = p["onset"].sub(p["onset_sound"]).round(4)
p

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,story,story_uid,kind,meg_file,start,phoneme,sequence_id,condition,word_index,speech_rate,voice,pronounced,onset,duration,value,sample,onset_sound,onset_baselined
subject,session,story_name,sound_id,sound,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,lw1,0.0,stimuli/audio/lw1_0.wav,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.000000,t_B,0.0,sentence,0.0,205.0,Allison,1.0,23.506,0.08,5,23506,23.506,0.00
1,0,lw1,0.0,stimuli/audio/lw1_0.wav,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.080000,eh_I,0.0,sentence,0.0,205.0,Allison,1.0,23.586,0.09,6,23586,23.506,0.08
1,0,lw1,0.0,stimuli/audio/lw1_0.wav,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.170000,r_I,0.0,sentence,0.0,205.0,Allison,1.0,23.676,0.07,7,23676,23.506,0.17
1,0,lw1,0.0,stimuli/audio/lw1_0.wav,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.240000,ah_E,0.0,sentence,0.0,205.0,Allison,1.0,23.746,0.06,8,23746,23.506,0.24
1,0,lw1,0.0,stimuli/audio/lw1_0.wav,lw1,0.0,phoneme,A0167_MASC_1_16Mar17_01.con,0.310000,s_B,0.0,sentence,1.0,205.0,Allison,1.0,23.816,0.06,9,23816,23.506,0.31
1,0,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,0,cable_spool_fort,5.0,stimuli/audio/cable_spool_fort_5.wav,cable_spool_fort,1.0,phoneme,A0167_MASC_2_16Mar17_01.con,53.719999,ao_B,128.0,sentence,2.0,190.0,Allison,1.0,700.486,0.07,6478,700486,646.766,53.72
1,0,cable_spool_fort,5.0,stimuli/audio/cable_spool_fort_5.wav,cable_spool_fort,1.0,phoneme,A0167_MASC_2_16Mar17_01.con,53.789999,l_E,128.0,sentence,2.0,190.0,Allison,1.0,700.556,0.07,6479,700556,646.766,53.79
1,0,cable_spool_fort,5.0,stimuli/audio/cable_spool_fort_5.wav,cable_spool_fort,1.0,phoneme,A0167_MASC_2_16Mar17_01.con,53.859999,r_B,128.0,sentence,3.0,190.0,Allison,1.0,700.626,0.09,6480,700626,646.766,53.86
1,0,cable_spool_fort,5.0,stimuli/audio/cable_spool_fort_5.wav,cable_spool_fort,1.0,phoneme,A0167_MASC_2_16Mar17_01.con,53.949999,ay_I,128.0,sentence,3.0,190.0,Allison,1.0,700.716,0.14,6481,700716,646.766,53.95


In [200]:
# For each phoneme presentation, get number of different onsets relative to sound onset.
# Should be 1 for everything.
# The column is selected before apply so the function only sees the baselined
# onsets; DataFrameGroupBy.apply on a frame that still contains the grouping
# columns is deprecated in pandas >= 2.2.
num_unique_phoneme_onsets = (
    p.groupby(["story_name", "sequence_id", "word_index", "condition", "value"])["onset_baselined"]
    .apply(lambda onsets: len(set(onsets)))
)
# The message is only built on failure and lists the offending groups.
assert set(num_unique_phoneme_onsets.values) == {1}, (
    "phoneme presentations with inconsistent baselined onsets:\n"
    f"{num_unique_phoneme_onsets[num_unique_phoneme_onsets != 1]}"
)

In [201]:
# A sound played N times must yield exactly N phoneme events at every baselined onset.
for sound, sound_presentations in presentation_onsets.groupby("sound"):
    expected_count = len(sound_presentations)
    events_per_onset = p.xs(sound, level="sound").groupby("onset_baselined").size()
    assert set(events_per_onset) == {expected_count}

## Build and save a canonical alignment

In [202]:
# The canonical alignment is taken from the first (subject, session) group
# yielded by the groupby. The choice is arbitrary.
sessions = all_presentations["sound"].groupby(["subject", "session"])
_, sound_sequence = next(iter(sessions))
canonical_sound_sequence = sound_sequence.loc[:, ["story_name", "sound_id"]].drop_duplicates()

In [203]:
# Left-join the onsets of every subject/session onto the canonical sound
# sequence; the join keys are the two columns the frames share.
# NOTE(review): this rebinds `presentation_onsets`, which earlier cells use as
# a frame indexed by index_key -- re-running those cells after this one will
# see the new flat layout instead.
presentation_onsets = canonical_sound_sequence.merge(
    all_presentations["sound"][["story_name", "sound_id", "subject", "session", "onset"]],
    on=["story_name", "sound_id"],
    how="left",
)

presentation_onsets

Unnamed: 0,story_name,sound_id,subject,session,onset
0,lw1,0.0,1,0,23.506
1,lw1,0.0,2,0,18.389
2,lw1,1.0,1,0,127.185
3,lw1,1.0,2,0,121.868
4,lw1,2.0,1,0,210.048
5,lw1,2.0,2,0,207.648
6,lw1,3.0,1,0,311.027
7,lw1,3.0,2,0,306.693
8,cable_spool_fort,0.0,1,0,8.245
9,cable_spool_fort,1.0,1,0,114.957


In [204]:
# Write the alignment table to output_path as CSV (the row index is written too).
presentation_onsets.to_csv(path_or_buf=output_path)