# Pipeline 2: Single trial linear mixed effects time-series modeling

In [1]:
from pathlib import Path
import logging
import os
import functools
import re
import pprint as pp
import datetime
import warnings

import pandas as pd
import fitgrid
import fitgrid.utils as fgutil

from udck19_filenames import (
    EEG_EPOCHS_DIR, EEG_MODELING_DIR, 
    PREPOCHS_TRMD_EEG_F,
)

from udck19_utils import (
    get_udck19_logger,
    check_ENV,
    N_EPOCH_SAMPS,  # epoch length in samples
    N_TRMD_EEG_EPOCHS,  # number of epochs after EEG screening in pipeline_1
    EEG_SCREEN_COL,  # HDF5 dataset key
    EEG_EXPT_SPECS,
    EEG_26_STREAMS,
    RHS_VARS,
    LMER_MODELS,
    LMER_MODELS_BY_EXPT,
    check_epochs_shape, 
    standardize,
    fit_lmer_formulas,
)

# enforce active conda env
# NOTE(review): check_ENV comes from udck19_utils; presumably it fails when
# the expected conda environment is not active -- confirm in udck19_utils
check_ENV()

# logging config
# The notebook name is bound to __file__ by hand and handed to the logger
# factory; the captured cell output shows log records prefixed with it.
__file__ = 'udck19_pipeline_2.ipynb'
# logging.shutdown() flushes and closes all logging handlers before the
# logger is (re)created -- presumably so that re-running this cell does not
# keep handlers from an earlier run; TODO confirm against get_udck19_logger
logging.shutdown()
LOGGER = get_udck19_logger(__file__)

# wall-clock start of the whole pipeline; the last cell subtracts this from
# the stop time to report the total elapsed time
pipeline_start = datetime.datetime.now()

# record the software environment and start time in the log
LOGGER.info(f"""
udck19 Supplementary Materials 2
CONDA_DEFAULT_ENV: {os.environ['CONDA_DEFAULT_ENV']}
pandas: {pd.__version__} 
fitgrid: {fitgrid.__version__}
Start {pipeline_start.strftime("%d.%b %Y %H:%M:%S")}
""")

udck19_pipeline_2.ipynb:INFO:
udck19 Supplementary Materials 2
CONDA_DEFAULT_ENV: udck19_pnas_110819
pandas: 0.25.2 
fitgrid: 0.4.6
Start 08.Nov 2019 09:11:49



# Notebook globals

In [2]:
# N_EPOCH_SAMPS is the epoch length in samples after downsampling to 125Hz == 8 ms sampling period
# N_TRMD_EEG_EPOCHS is the number of epochs after EEG artifact screening in pipeline_1

# (N_EPOCH_SAMPS, N_TRMD_EEG_EPOCHS) = (375, 12046)
# NOTE(review): the captured output of the data loading cell reports
# "timestamps: 375, epochs 12043" -- confirm which epoch count is current
# EEG_SCREEN_COL = 'eeg_screen'

# PRERUN switches between a reduced run (4 channels, stepped time slice,
# output under the "prerun" directory) and the full analysis; see the
# cell that branches on it.
PRERUN = False  # set to True for the reduced pre-run

# Optionally pre-run

In [3]:
# Select the time points, channels, output directory, and output file name
# prefix for this run.
if PRERUN:
    # reduced run: 4 channels and a stepped slice of the Time index
    step = 5
    time_slice = pd.IndexSlice[:, slice(-200, 600, step)]
    LMER_CHANNELS = ['MiPf', 'MiCe', 'MiPa', 'MiOc']
    modl_path = EEG_MODELING_DIR / "prerun"
    # tag prerun output files with the step size and channel count
    pfx = f'step{step}_chans{len(LMER_CHANNELS)}_'
else:
    # full run: all time points, all 26 EEG data streams, no file prefix
    time_slice = pd.IndexSlice[:, :]
    LMER_CHANNELS = EEG_26_STREAMS
    modl_path = EEG_MODELING_DIR
    pfx = ""

# fail early, before any model fitting, if the output directory is missing
assert modl_path.exists(), f"modeling output directory not found: {modl_path}"

# Set the LMER fitter

In [4]:
# Keyword options bound into the fitter: lmer modeler, the selected channels
# as the left-hand sides, parallel fitting on 26 cores, REML switched off.
lmer_fit_opts = dict(
    modeler='lmer',
    LHS=LMER_CHANNELS,
    parallel=True,
    n_cores=26,
    REML=False,
)

# fitgrid summarize with the options above pre-filled; the fitting cells
# pass this callable to fit_lmer_formulas
lmer_fitter = functools.partial(fgutil.summary.summarize, **lmer_fit_opts)

# Model 3 data sets pooled

* `expt` as random effect

* `expt` as fixed effect

Notes:

* Prerun timing: about 0.3 hrs for 4 channels, step 5, time slice -200 to 600, n_cores=32, with light CPU traffic.


In [5]:
# Write each named model set and its pretty-printed formulas to the log.
for model_set, formulas in LMER_MODELS.items():
    LOGGER.info(f"""
    {model_set}
    {pp.pformat(formulas)}
    """)

udck19_pipeline_2.ipynb:INFO:
    lmer_acz_ranef
    ['article_cloze_z + (article_cloze_z | expt) + (article_cloze_z | sub_id) + '
 '(article_cloze_z | article_item_id)',
 'article_cloze_z + (article_cloze_z | expt) + (article_cloze_z | sub_id) + (1 '
 '| article_item_id)',
 'article_cloze_z + (article_cloze_z | expt) + (1 | sub_id) + (article_cloze_z '
 '| article_item_id)',
 'article_cloze_z + (1 | expt) + (article_cloze_z | sub_id) + (article_cloze_z '
 '| article_item_id)',
 'article_cloze_z + (article_cloze_z | expt) + (1 | sub_id) + (1 | '
 'article_item_id)',
 'article_cloze_z + (1 | expt) + (article_cloze_z | sub_id) + (1 | '
 'article_item_id)',
 'article_cloze_z + (1 | expt) + (1 | sub_id) + (article_cloze_z | '
 'article_item_id)',
 'article_cloze_z + (1 | expt) + (1 | sub_id) + (1 | article_item_id)',
 'article_cloze_z + (1 | sub_id) + (1 | article_item_id)',
 'article_cloze_z + (1 | expt) + (1 | sub_id)',
 'article_cloze_z + (1 | expt) + (1 | article_item_id)',
 '(article_

## Read single trial data and standardize cloze

In [6]:
# Read the prepared, artifact screened single trial epochs and index the
# rows by epoch and time stamp.
LOGGER.info(f"Loading prepared and artifact screened epochs: {PREPOCHS_TRMD_EEG_F}")
prepochs_trmd_eeg_df = (
    pd.read_hdf(PREPOCHS_TRMD_EEG_F, EEG_SCREEN_COL, mode='r')
    .reset_index()
    .set_index(["Epoch_idx", "Time"])
)

# sanity check: single trial epochs as screened in pipeline_1
epochs_shape = check_epochs_shape(prepochs_trmd_eeg_df)
assert (N_EPOCH_SAMPS, N_TRMD_EEG_EPOCHS) == epochs_shape, (
    f"expected (samples, epochs) {(N_EPOCH_SAMPS, N_TRMD_EEG_EPOCHS)}, "
    f"got {epochs_shape}"
)

# every remaining row must be marked as accepted by the EEG screen
assert (prepochs_trmd_eeg_df[EEG_SCREEN_COL] == 'accept').all(), (
    f"rows not marked 'accept' found in column {EEG_SCREEN_COL}"
)

# standardize cloze values after exclusions; the logged column listing
# shows the result gains *_z columns next to the originals
prepochs_trmd_eeg_df, prepochs_trmd_eeg_means_sds = standardize(
    prepochs_trmd_eeg_df,
    ['article_cloze', 'ART_noun_cloze', 'NA_noun_cloze', ]
)

# every channel and model predictor must be available as a column or an
# index level before the data is handed to fitgrid
missing_vars = [
    col
    for col in EEG_26_STREAMS + RHS_VARS
    if col not in prepochs_trmd_eeg_df.columns
    and col not in prepochs_trmd_eeg_df.index.names
]
assert not missing_vars, f"missing modeling variables: {missing_vars}"

# summary of the loaded data for the log
msg = f"""
Prepared epochs after dropping EEG artifacts
    timestamps: {N_EPOCH_SAMPS}, epochs {N_TRMD_EEG_EPOCHS}
    index names: {prepochs_trmd_eeg_df.index.names}
    columns: {prepochs_trmd_eeg_df.columns}
    means and sds: {prepochs_trmd_eeg_means_sds}
"""
LOGGER.info(msg)

# first and last rows, for a visual check in the log
LOGGER.info(prepochs_trmd_eeg_df.head())
LOGGER.info(prepochs_trmd_eeg_df.tail())

udck19_pipeline_2.ipynb:INFO:Loading prepared and artifact screened epochs: /mnt/cube/home/turbach/papers/udck19/analysis/data/epochs/prepochs_trimd_eeg.h5
udck19_pipeline_2.ipynb:INFO:
Prepared epochs after dropping EEG artifacts
    timestamps: 375, epochs 12043
    index names: ['Epoch_idx', 'Time']
    columns: Index(['expt', 'sub_id', 'item_id', 'h5_dataset', 'dataset_index',
       'event_code', 'regex_match', 'regex_anchor', 'garv_reject', 'article',
       'adjective', 'noun', 'article_cloze', 'ART_noun_cloze', 'NA_noun_cloze',
       'lle', 'lhz', 'MiPf', 'LLPf', 'RLPf', 'LMPf', 'RMPf', 'LDFr', 'RDFr',
       'LLFr', 'RLFr', 'LMFr', 'RMFr', 'LMCe', 'RMCe', 'MiCe', 'MiPa', 'LDCe',
       'RDCe', 'LDPa', 'RDPa', 'LMOc', 'RMOc', 'LLTe', 'RLTe', 'LLOc', 'RLOc',
       'MiOc', 'A2', 'HEOG', 'rle', 'rhz', 'article_item_id', 'ptp_excursion',
       'blocked', 'garv_blink', 'garv_screen', 'eeg_screen', 'article_cloze_z',
       'ART_noun_cloze_z', 'NA_noun_cloze_z'],
      dtype='obje

# Load the data into `fitgrid` and run the model fitting

In [7]:
# Load the selected predictors and channels into fitgrid epochs.
prepochs_trmd_eeg_fg = fitgrid.epochs_from_dataframe(
    prepochs_trmd_eeg_df
    .loc[time_slice, RHS_VARS + LMER_CHANNELS],  # prerun slicing, if any
    epoch_id='Epoch_idx',
    time='Time',
    channels=LMER_CHANNELS
)

if not PRERUN:
    # enforce for full data set analysis: nothing may be lost in the load
    fg_shape = check_epochs_shape(
        prepochs_trmd_eeg_fg.table.set_index('Time', append=True)
    )
    assert (N_EPOCH_SAMPS, N_TRMD_EEG_EPOCHS) == fg_shape, (
        f"expected (samples, epochs) {(N_EPOCH_SAMPS, N_TRMD_EEG_EPOCHS)}, "
        f"got {fg_shape}"
    )

start_time = datetime.datetime.now()
LOGGER.info(f"Start modeling: {start_time.strftime('%d.%b %Y %H:%M:%S')}")

# fit each named set of formulas and write one HDF5 file per set
for model_set, formulas in LMER_MODELS.items():
    LOGGER.info(f"{model_set}")

    # suppress only the pandas FutureWarning for rpy2 DataFrame.from_items;
    # other warning categories stay visible
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        fit_lmer_formulas(
            prepochs_trmd_eeg_fg,
            lmer_fitter,
            formulas,
            modl_path / (pfx + model_set + ".h5"),
            LOGGER
        )

elapsed = datetime.datetime.now() - start_time
LOGGER.info(f"Elapsed time modeling: {elapsed}")

udck19_pipeline_2.ipynb:INFO:Start modeling: 08.Nov 2019 09:12:14
udck19_pipeline_2.ipynb:INFO:lmer_acz_ranef
udck19_pipeline_2.ipynb:INFO:
    random effects structures
    file: /mnt/cube/home/turbach/papers/udck19/analysis/measures/modeling/prerun/step5_chans4_lmer_acz_ranef.h5
    ['article_cloze_z + (article_cloze_z | expt) + (article_cloze_z | sub_id) + '
 '(article_cloze_z | article_item_id)',
 'article_cloze_z + (article_cloze_z | expt) + (article_cloze_z | sub_id) + (1 '
 '| article_item_id)',
 'article_cloze_z + (article_cloze_z | expt) + (1 | sub_id) + (article_cloze_z '
 '| article_item_id)',
 'article_cloze_z + (1 | expt) + (article_cloze_z | sub_id) + (article_cloze_z '
 '| article_item_id)',
 'article_cloze_z + (article_cloze_z | expt) + (1 | sub_id) + (1 | '
 'article_item_id)',
 'article_cloze_z + (1 | expt) + (article_cloze_z | sub_id) + (1 | '
 'article_item_id)',
 'article_cloze_z + (1 | expt) + (1 | sub_id) + (article_cloze_z | '
 'article_item_id)',
 'article_cloz

100%|██████████| 21/21 [00:03<00:00,  5.62it/s]
udck19_pipeline_2.ipynb:INFO:Elapsed time: 0:00:14.816006

udck19_pipeline_2.ipynb:INFO:Elapsed time modeling: 0:17:07.811246


# Model the 3 Experiments separately

In [8]:
 # Log the formulas that are fit to each experiment's data separately.
 # NOTE(review): every line of this cell, including the lines inside the
 # triple-quoted string, starts with one space. The captured log output
 # shows no leading space, so the cell input appears to be dedented before
 # it runs; as plain Python the indented first statement would not compile.
 # Consider removing the leading space from the cell.
 LOGGER.info(f"""
 Models for each EEG dataset separately
 {pp.pformat(LMER_MODELS_BY_EXPT)}
 """)

udck19_pipeline_2.ipynb:INFO:
Models for each EEG dataset separately
['article_cloze_z + (article_cloze_z | sub_id) + (1 | article_item_id)',
 '(article_cloze_z | sub_id) + (1 | article_item_id)',
 'article_cloze_z + (1 | sub_id) + (1 | article_item_id)',
 '(1 | sub_id) + (1 | article_item_id)']



## Slice out experiment data, standardize cloze per data set, fit the models

In [9]:
%%time

# For each experiment: slice out its epochs, re-standardize cloze within
# that experiment, load the data into fitgrid, and fit the per-experiment
# formulas to one HDF5 file.
start_time = datetime.datetime.now()
LOGGER.info(f"Start modeling each experiment: {start_time.strftime('%d.%b %Y %H:%M:%S')}")

for expt in EEG_EXPT_SPECS.keys():
    LOGGER.info(f"LMER modeling {expt}")

    # drop the pooled z-scores so they can be recomputed from this
    # experiment's rows only
    prepochs_trmd_eeg_expt_df = (
        prepochs_trmd_eeg_df
        .query("expt == @expt")
        .drop(columns=['article_cloze_z', 'ART_noun_cloze_z', 'NA_noun_cloze_z'])
    )

    prepochs_trmd_eeg_expt_df, prepochs_trmd_eeg_expt_means_sds = standardize(
        prepochs_trmd_eeg_expt_df,
        ['article_cloze', 'ART_noun_cloze', 'NA_noun_cloze', ]
    )
    LOGGER.info(f"Experiment: {expt}: {prepochs_trmd_eeg_expt_means_sds}")

    # load for fitgrid
    expt_fg = fitgrid.epochs_from_dataframe(
        prepochs_trmd_eeg_expt_df
        .loc[time_slice, LMER_CHANNELS + RHS_VARS],  # prerun slicing, if any
        epoch_id="Epoch_idx",
        time='Time',
        channels=LMER_CHANNELS
    )

    # the table must hold this experiment and nothing else; comparing the
    # full list of labels also rejects an empty or mixed table
    expt_labels = list(expt_fg.table['expt'].unique())
    assert expt_labels == [expt], (
        f"expected only {expt} epochs, found {expt_labels}"
    )
    out_f = modl_path / f"{pfx}lmer_acz_comp_{expt}.h5"

    # suppress only the pandas FutureWarning for rpy2 DataFrame.from_items;
    # other warning categories stay visible
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        fit_lmer_formulas(
            expt_fg,
            lmer_fitter,
            LMER_MODELS_BY_EXPT,
            out_f,
            LOGGER
        )

elapsed = datetime.datetime.now() - start_time
LOGGER.info(f"Elapsed time modeling experiments separately: {elapsed}")

udck19_pipeline_2.ipynb:INFO:Start modeling each experiment: 08.Nov 2019 09:29:22
udck19_pipeline_2.ipynb:INFO:LMER modeling eeg_1
udck19_pipeline_2.ipynb:INFO:Experiment: eeg_1: {'article_cloze': {'mean': 0.374555243445773, 'sd': 0.35027258879514567}, 'ART_noun_cloze': {'mean': 0.4624236891387456, 'sd': 0.4193737235680949}, 'NA_noun_cloze': {'mean': 0.4028398876403854, 'sd': 0.412367204390152}}
udck19_pipeline_2.ipynb:INFO:
    random effects structures
    file: /mnt/cube/home/turbach/papers/udck19/analysis/measures/modeling/prerun/step5_chans4_lmer_acz_comp_eeg_1.h5
    ['article_cloze_z + (article_cloze_z | sub_id) + (1 | article_item_id)',
 '(article_cloze_z | sub_id) + (1 | article_item_id)',
 'article_cloze_z + (1 | sub_id) + (1 | article_item_id)',
 '(1 | sub_id) + (1 | article_item_id)']
    
udck19_pipeline_2.ipynb:INFO:removing previous /mnt/cube/home/turbach/papers/udck19/analysis/measures/modeling/prerun/step5_chans4_lmer_acz_comp_eeg_1.h5
udck19_pipeline_2.ipynb:INFO:fitt

CPU times: user 1min 40s, sys: 59.5 s, total: 2min 40s
Wall time: 2min 46s


In [10]:
# Report when the pipeline finished and how long the whole run took,
# measured from pipeline_start set in the first cell.
pipeline_stop = datetime.datetime.now()
elapsed = pipeline_stop - pipeline_start

finish_msg = f"""
Done {pipeline_stop:%d.%b %Y %H:%M:%S}
Elapsed time: {elapsed}
"""
LOGGER.info(finish_msg)

udck19_pipeline_2.ipynb:INFO:
Done 08.Nov 2019 09:32:09
Elapsed time: 0:20:19.984226

