This notebook reproduces the N400 ERP result on the pre-computed Frank et al. 2015 epoched data.

NB: to be consistent with others, the exported data frame uses the following indexing conventions:
- sentence_idx: 1-based
- subject_idx: 1-based
- word_idx: 0-based

Sorry about that. :)

In [1]:
import scipy.io
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path
from functools import reduce

# Parent of the current working directory, resolved to an absolute path.
# Used at the end of the notebook as the base directory for the CSV export.
basedir = Path("..").resolve()

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-7jhxn9no because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
# Location of the pre-computed stimuli/ERP MATLAB file.
erp_mat_path = "/om/data/public/language-eeg/frank2015/stimuli_erp.mat"

# simplify_cells=True asks scipy to return a simplified dict structure for
# MATLAB cell arrays/structs (see the scipy.io.loadmat documentation).
data = scipy.io.loadmat(
    erp_mat_path,
    simplify_cells=True,
)

In [3]:
# List the variables contained in the loaded .mat file.
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'ERP', 'artefact', 'reject', 'ERPbase', 'dH_pos', 'dH', 'sentences', 'surp_rnn', 'surp_psg', 'surp_pos_psg', 'surp_pos_rnn', 'wordlength', 'surp_ngram', 'surp_ngramfull', 'surp_pos_ngram', 'logwordfreq', 'sentence_position', 'presentation_order', 'SOAs'])

## Prepare data

### N400 results and annotations

In [4]:
# ERP component names. The position of "N400" in this list is used by
# make_signal_df to index the last axis of each per-sentence ERP array.
# NOTE(review): this order is assumed to match the component order stored in
# the .mat file -- confirm against the dataset documentation.
data_components = [
    "ELAN",
    "LAN",
    "N400",
    "EPNP",
    "P600",
    "PNP",
]
n400_idx = data_components.index("N400")

In [5]:
def make_signal_df(key, target_key=None):
    """Build a long-format frame of N400 values from ``data[key]``.

    ``data[key]`` is iterated sentence by sentence. Each element is indexed as
    ``mat[:, :, n400_idx]``: rows become ``word_idx``, columns become
    ``subject_idx``, and the last axis selects the N400 component.

    Relies on the notebook-level globals ``data`` and ``n400_idx``.

    Parameters
    ----------
    key : str
        Key into ``data``.
    target_key : str, optional
        Name of the value column in the result; falls back to ``key``.

    Returns
    -------
    pandas.DataFrame
        One column (``target_key``), indexed by
        ``(subject_idx, sentence_idx, word_idx)``, all 0-based.
    """
    value_name = target_key or key

    per_sentence = {}
    for sent_pos, sent_mat in enumerate(data[key]):
        # words x subjects slice for the N400 component
        wide = pd.DataFrame(
            sent_mat[:, :, n400_idx],
            index=pd.RangeIndex(sent_mat.shape[0], name="word_idx"),
        )
        # One row per (word, subject) pair.
        per_sentence[sent_pos] = wide.reset_index().melt(
            id_vars=["word_idx"],
            var_name="subject_idx",
            value_name=value_name,
        )

    # The dict keys become the "sentence_idx" level; "idx" is the throwaway
    # row counter left over from melt.
    stacked = pd.concat(per_sentence, names=["sentence_idx", "idx"])
    flat = stacked.reset_index().drop(columns=["idx"])
    return flat.set_index(["subject_idx", "sentence_idx", "word_idx"])

In [6]:
# Pair each N400 value from "ERP" with the corresponding "ERPbase" value,
# aligned on the shared (subject_idx, sentence_idx, word_idx) index.
erp_n400 = make_signal_df("ERP", "value_N400")
erpbase_n400 = make_signal_df("ERPbase", "base_N400")
n400_df = pd.merge(erp_n400, erpbase_n400, left_index=True, right_index=True)
n400_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value_N400,base_N400
subject_idx,sentence_idx,word_idx,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,-4.587560,-4.639654
0,0,1,-1.859916,-2.745968
0,0,2,-0.326976,-0.486989
0,0,3,3.984934,1.054487
0,0,4,6.883699,2.721698
...,...,...,...,...
23,204,7,-2.024615,-4.434225
23,204,8,4.551344,1.269528
23,204,9,4.102198,-2.200850
23,204,10,-2.131469,5.741549


In [7]:
def make_feature_df(key, target_key=None, final_axis_name="subject_idx"):
    """Build a long-format frame from the per-sentence 2-D arrays in ``data[key]``.

    ``data[key]`` is iterated sentence by sentence. Each element is treated as
    a 2-D array whose rows become ``word_idx`` and whose columns become the
    level named by ``final_axis_name``.

    Relies on the notebook-level global ``data``.

    Parameters
    ----------
    key : str
        Key into ``data``.
    target_key : str, optional
        Name of the value column in the result; falls back to ``key``.
    final_axis_name : str, default "subject_idx"
        Name of the index level built from the array's column axis.

    Returns
    -------
    pandas.DataFrame
        One column (``target_key``), indexed by
        ``(final_axis_name, sentence_idx, word_idx)``, all 0-based.
    """
    target_key = target_key or key

    long_frames = {}
    for sentence_idx, mat in enumerate(data[key]):
        wide = pd.DataFrame(mat, index=pd.RangeIndex(mat.shape[0], name="word_idx"))
        # One row per (word, final-axis entry) pair.
        long_frames[sentence_idx] = wide.reset_index().melt(
            id_vars=["word_idx"], var_name=final_axis_name, value_name=target_key)

    # "idx" is the throwaway row counter left over from melt.
    flat = pd.concat(long_frames, names=["sentence_idx", "idx"]) \
        .reset_index().drop(columns=["idx"])
    # Index on the caller-supplied axis name; hard-coding "subject_idx" here
    # raised KeyError for any non-default final_axis_name.
    return flat.set_index([final_axis_name, "sentence_idx", "word_idx"])

In [8]:
# Features stored per participant; each is joined onto the N400 frame via the
# shared (subject_idx, sentence_idx, word_idx) index.
by_participant_features = ["artefact", "reject"]

results_df = n400_df
for feature in by_participant_features:
    results_df = pd.merge(results_df, make_feature_df(feature),
                          left_index=True, right_index=True)
results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value_N400,base_N400,artefact,reject
subject_idx,sentence_idx,word_idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,-4.587560,-4.639654,0,1
0,0,1,-1.859916,-2.745968,0,1
0,0,2,-0.326976,-0.486989,0,0
0,0,3,3.984934,1.054487,0,0
0,0,4,6.883699,2.721698,0,1
...,...,...,...,...,...,...
23,204,7,-2.024615,-4.434225,0,0
23,204,8,4.551344,1.269528,0,0
23,204,9,4.102198,-2.200850,0,0
23,204,10,-2.131469,5.741549,0,0


### Stimulus features

In [9]:
def make_surprisal_df(key, target_key=None, i_offset=0):
    """Build a wide frame from the per-sentence 2-D arrays in ``data[key]``.

    Each element of ``data[key]`` is treated as a 2-D array with one row per
    word. Its columns are kept as separate columns named
    ``f"{target_key}_{i_offset + j}"`` for column position ``j``.

    Relies on the notebook-level global ``data``.

    Parameters
    ----------
    key : str
        Key into ``data``.
    target_key : str, optional
        Prefix for the generated column names; falls back to ``key``.
    i_offset : int, default 0
        Number added to each column position when building its name.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(sentence_idx, word_idx)``, both 0-based.
    """
    prefix = target_key or key

    frames_by_sentence = {}
    for sent_pos, sent_mat in enumerate(data[key]):
        n_words, n_cols = sent_mat.shape[0], sent_mat.shape[1]
        column_names = [f"{prefix}_{i_offset+col}" for col in range(n_cols)]
        frames_by_sentence[sent_pos] = pd.DataFrame(
            sent_mat,
            index=pd.RangeIndex(n_words, name="word_idx"),
            columns=column_names,
        )

    return pd.concat(frames_by_sentence, names=["sentence_idx", "word_idx"])

In [10]:
# Column suffixes start at 2 for the n-gram surprisals and at 1 for the RNN
# surprisals (set through i_offset).
ngram_surprisal = make_surprisal_df("surp_ngram", "surp_ngram_order", i_offset=2)
rnn_surprisal = make_surprisal_df("surp_rnn", "surp_rnn_size", i_offset=1)

surprisal_df = pd.concat([ngram_surprisal, rnn_surprisal], axis=1)
surprisal_df

Unnamed: 0_level_0,Unnamed: 1_level_0,surp_ngram_order_2,surp_ngram_order_3,surp_ngram_order_4,surp_rnn_size_1,surp_rnn_size_2,surp_rnn_size_3,surp_rnn_size_4,surp_rnn_size_5,surp_rnn_size_6,surp_rnn_size_7,surp_rnn_size_8,surp_rnn_size_9,surp_rnn_size_10
sentence_idx,word_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0,3.044363,3.044340,3.044340,3.482672,3.145689,3.121044,2.511534,2.865847,2.992646,3.094798,2.977805,3.289495,2.993416
0,1,5.447571,5.298179,5.292745,9.514440,8.214565,7.030273,6.441496,5.929814,5.246744,5.131305,5.886331,5.891977,5.654219
0,2,4.487278,3.561708,3.084290,9.111075,8.330564,6.914139,5.580521,5.000933,4.906635,4.620777,4.347193,4.178630,4.391322
0,3,1.639671,1.415385,0.676790,3.594287,3.210452,2.787990,2.840195,1.975472,1.707842,1.536145,1.337063,1.115086,1.451231
0,4,7.144369,4.562181,4.629370,8.459698,8.541803,8.242652,8.169230,8.290066,8.272437,7.855889,7.984851,7.794711,8.075972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,7,3.851212,4.149281,4.245483,4.631127,3.924689,3.901125,3.368505,3.373968,3.190540,2.908144,2.281913,2.811087,2.694225
204,8,4.030146,4.339636,4.272032,8.139806,7.945024,6.722867,6.237772,5.792343,5.422487,4.335986,3.894010,3.503744,3.809468
204,9,6.416153,6.987241,6.910058,10.800107,9.959173,8.835783,8.886982,8.520737,8.073103,8.324292,6.935927,6.700170,6.845393
204,10,3.648262,2.545669,2.342811,4.568191,3.884360,3.874658,3.318967,3.432938,3.411648,3.273675,2.664433,3.156238,3.111732


In [11]:
def make_control_df(key):
    """Build a single-column frame from the per-sentence sequences in ``data[key]``.

    Each element of ``data[key]`` is treated as a 1-D sequence with one value
    per word. The sequences are stacked and labelled with their position in
    ``data[key]``.

    Relies on the notebook-level global ``data``.

    Parameters
    ----------
    key : str
        Key into ``data``; also used as the name of the value column.

    Returns
    -------
    pandas.DataFrame
        One column (``key``), indexed by ``(sentence_idx, word_idx)``,
        both 0-based.
    """
    per_sentence_values = data[key]

    frames = []
    for word_values in per_sentence_values:
        word_index = pd.RangeIndex(len(word_values), name="word_idx")
        frames.append(pd.DataFrame({key: word_values}, index=word_index))

    sentence_keys = np.arange(len(per_sentence_values))
    return pd.concat(frames, keys=sentence_keys, names=["sentence_idx"])

In [12]:
# Per-word control variables, joined on the shared (sentence_idx, word_idx) index.
control_features = ["logwordfreq", "wordlength"]

first_feature, *remaining_features = control_features
control_df = make_control_df(first_feature)
for feature in remaining_features:
    control_df = pd.merge(control_df, make_control_df(feature),
                          left_index=True, right_index=True)
control_df

Unnamed: 0_level_0,Unnamed: 1_level_0,logwordfreq,wordlength
sentence_idx,word_idx,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,-5.250051,1
0,1,-8.384545,6
0,2,-8.332967,4
0,3,-5.541296,3
0,4,-6.162017,5
...,...,...,...
204,7,-5.442780,3
204,8,-7.989015,4
204,9,-8.380188,6
204,10,-5.442780,3


## Merge and analyze

In [19]:
# Attach the per-word stimulus tables (surprisal, then controls) to the
# per-participant results by joining on their indexes.
merged_df = results_df
for stimulus_df in (surprisal_df, control_df):
    merged_df = pd.merge(merged_df, stimulus_df, left_index=True, right_index=True)

# The joins must neither drop nor duplicate any row.
assert len(n400_df) == len(merged_df)

In [20]:
# Fix indexing: shift sentence_idx and subject_idx to 1-based; word_idx stays 0-based.
# NOTE: this rebinds merged_df, so re-running this cell shifts the indices again.
shifted = merged_df.reset_index().assign(
    sentence_idx=lambda frame: frame["sentence_idx"] + 1,
    subject_idx=lambda frame: frame["subject_idx"] + 1,
)
merged_df = shifted.set_index(["subject_idx", "sentence_idx", "word_idx"]).sort_index()

In [21]:
# Display the final table: subject_idx and sentence_idx are now 1-based, word_idx is 0-based.
merged_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value_N400,base_N400,artefact,reject,surp_ngram_order_2,surp_ngram_order_3,surp_ngram_order_4,surp_rnn_size_1,surp_rnn_size_2,surp_rnn_size_3,surp_rnn_size_4,surp_rnn_size_5,surp_rnn_size_6,surp_rnn_size_7,surp_rnn_size_8,surp_rnn_size_9,surp_rnn_size_10,logwordfreq,wordlength
subject_idx,sentence_idx,word_idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,-4.587560,-4.639654,0,1,3.044363,3.044340,3.044340,3.482672,3.145689,3.121044,2.511534,2.865847,2.992646,3.094798,2.977805,3.289495,2.993416,-5.250051,1
1,1,1,-1.859916,-2.745968,0,1,5.447571,5.298179,5.292745,9.514440,8.214565,7.030273,6.441496,5.929814,5.246744,5.131305,5.886331,5.891977,5.654219,-8.384545,6
1,1,2,-0.326976,-0.486989,0,0,4.487278,3.561708,3.084290,9.111075,8.330564,6.914139,5.580521,5.000933,4.906635,4.620777,4.347193,4.178630,4.391322,-8.332967,4
1,1,3,3.984934,1.054487,0,0,1.639671,1.415385,0.676790,3.594287,3.210452,2.787990,2.840195,1.975472,1.707842,1.536145,1.337063,1.115086,1.451231,-5.541296,3
1,1,4,6.883699,2.721698,0,1,7.144369,4.562181,4.629370,8.459698,8.541803,8.242652,8.169230,8.290066,8.272437,7.855889,7.984851,7.794711,8.075972,-6.162017,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,205,7,-2.024615,-4.434225,0,0,3.851212,4.149281,4.245483,4.631127,3.924689,3.901125,3.368505,3.373968,3.190540,2.908144,2.281913,2.811087,2.694225,-5.442780,3
24,205,8,4.551344,1.269528,0,0,4.030146,4.339636,4.272032,8.139806,7.945024,6.722867,6.237772,5.792343,5.422487,4.335986,3.894010,3.503744,3.809468,-7.989015,4
24,205,9,4.102198,-2.200850,0,0,6.416153,6.987241,6.910058,10.800107,9.959173,8.835783,8.886982,8.520737,8.073103,8.324292,6.935927,6.700170,6.845393,-8.380188,6
24,205,10,-2.131469,5.741549,0,0,3.648262,2.545669,2.342811,4.568191,3.884360,3.874658,3.318967,3.432938,3.411648,3.273675,2.664433,3.156238,3.111732,-5.442780,3


In [22]:
# Export the merged table as CSV under <basedir>/output.
output_path = basedir / "output/frank_erp_n400.csv"
# Create the output directory if needed, so the export does not fail when
# "output/" has not been created yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
merged_df.to_csv(output_path)