# Nuts & bots of `PyMC` nodes and PPC

In [1]:
%matplotlib inline

# Preparation
import os, hddm, time, csv
import glob
import kabuki 
import datetime
from datetime import date
from copy import deepcopy

import pymc as pm
import hddm
import kabuki
print("The current HDDM version is: ", hddm.__version__)
from patsy import dmatrix, demo_data

import arviz as az
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns

# import sparse # test whether package `sparse` is installed; doesn't matter if not installed.

from p_tqdm import p_map
from functools import partial

# set the color of plots
from cycler import cycler
plt.rcParams['axes.prop_cycle'] = cycler(color='bgrcmykw')



The current HDDM version is:  0.8.0


In [21]:
import feather

##  Related scripts

https://github.com/hddm-devs/kabuki/blob/master/kabuki/analyze.py#L287

## Understand `pymc` knode

In HDDM, PPC is based on `pymc` knode, which makes it very convenient.

In [2]:
data = hddm.load_csv('/opt/conda/lib/python3.7/site-packages/hddm/examples/cavanagh_theta_nn.csv')
data.tail()

Unnamed: 0,subj_idx,stim,rt,response,theta,dbs,conf
3983,13,LL,1.45,0.0,-1.237166,0,HC
3984,13,WL,0.711,1.0,-0.37745,0,LC
3985,13,WL,0.784,1.0,-0.694194,0,LC
3986,13,LL,2.35,0.0,-0.546536,0,HC
3987,13,WW,1.25,1.0,0.752388,0,HC


We cna easily check the design matrix of parameter `a` for the example mentioned in the HDDM official toturails.

## Fit a model
 With `a` has the same regression model as in the official tutorial, and `v` as within effect.

In [3]:
# define a function to run model in parallel

# M0_0: base model: simplified
def run_m0(id, df=None, samples=None, burn=None, save_name="ms0"): 
#     print('running model %i'%id);
    import hddm
    
    dbname = save_name + '_chain_%i.db'%id 
    mname  = save_name + '_chain_%i'%id    
    m = hddm.HDDM(df)
    m.find_starting_values()
    m.sample(samples, burn=burn, dbname=dbname, db='pickle') # it's neccessary to save the model data
    m.save(mname)
    
    return m

# M1: base model: full model
def run_m1(id, df=None, samples=None, burn=None, save_name="ms1"): 
#     print('running model %i'%id);
    import hddm
    
    dbname = save_name + '_chain_%i.db'%id 
    mname  = save_name + '_chain_%i'%id    
    m = hddm.HDDM(df, include=['z', 'sv', 'sz', 'st'])
    m.find_starting_values()
    m.sample(samples, burn=burn, dbname=dbname, db='pickle') # it's neccessary to save the model data
    m.save(mname)
    
    return m


# M2: treat within-subj as between-subj: full model
def run_m2(id, df=None, samples=None, burn=None, save_name="ms2"): 
#     print('running model %i'%id);
    import hddm
    
    dbname = save_name + '_chain_%i.db'%id 
    mname  = save_name + '_chain_%i'%id    
    m = hddm.HDDM(df, include=['z', 'sv', 'st', 'sz'], 
                  depends_on={'v': 'conf'})
    m.find_starting_values()
    m.sample(samples, burn=burn, dbname=dbname, db='pickle') # it's neccessary to save the model data
    m.save(mname)
    
    return m


# M3: regression model (varying intercept)
def run_m3(id, df=None, samples=None, burn=None, save_name="ms3"): 
#     print('running model %i'%id);
    import hddm
    
    dbname = save_name + '_chain_%i.db'%id 
    mname  = save_name + '_chain_%i'%id   
    m = hddm.HDDMRegressor(data,  
                           "v ~ C(conf, Treatment('LC'))", 
                           group_only_regressors=True,
                           keep_regressor_trace=True,
                           include=['z', 'sv', 'st', 'sz'])
    m.find_starting_values()
    m.sample(samples, burn=burn, dbname=dbname, db='pickle') # it's neccessary to save the model data
    m.save(mname)
    
    return m

# M4: regression model (varying intercept and slope)
def run_m4(id, df=None, samples=None, burn=None, save_name="ms4"): 
#     print('running model %i'%id);
    import hddm
    
    dbname = save_name + '_chain_%i.db'%id 
    mname  = save_name + '_chain_%i'%id   
    m = hddm.HDDMRegressor(data,
                           "v ~ C(conf, Treatment('LC'))", 
                           group_only_regressors=False,
                           keep_regressor_trace=True,
                           include=['z', 'sv', 'st', 'sz'])
    m.find_starting_values()
    m.sample(samples, burn=burn, dbname=dbname, db='pickle') # it's neccessary to save the model data
    m.save(mname)
    
    return m

# M5: regression model + theta as an additional predictor of `a`
def run_m5(id, df=None, samples=None, burn=None, save_name="ms5"): 
#     print('running model %i'%id);
    import hddm
    
    dbname = save_name + '_chain_%i.db'%id 
    mname  = save_name + '_chain_%i'%id
    m = hddm.HDDMRegressor(data,
                           "a ~ theta:C(conf, Treatment('LC'))",
                           depends_on={'v': 'conf'},
                           group_only_regressors=False,
                           keep_regressor_trace=True,
                           include=['z', 'sv', 'st', 'sz'])
    m.find_starting_values()
    m.sample(samples, burn=burn, dbname=dbname, db='pickle') # it's neccessary to save the model data
    m.save(mname)
    
    return m

# M6: Regression for both parameters
def run_m6(id, df=None, samples=None, burn=None, save_name="ms6"): 
    import hddm
    
    dbname = save_name + '_chain_%i.db'%id 
    mname  = save_name + '_chain_%i'%id
    a_reg = {'model': "a ~ theta:C(conf, Treatment('LC'))", 'link_func': lambda x: x}
    v_reg = {'model': "v ~ C(conf, Treatment('LC'))", 'link_func': lambda x: x}
    reg_descr = [a_reg, v_reg]
    
    m = hddm.HDDMRegressor(data,
                           reg_descr,
                           group_only_regressors=False,
                           keep_regressor_trace=True,
                           include=['z', 'sv', 'st', 'sz'])
    m.find_starting_values()
    m.sample(samples, burn=burn, dbname=dbname, db='pickle') # it's neccessary to save the model data
    m.save(mname)
    
    return m

# M7: Regression for both parameters
def run_m7(id, df=None, samples=None, burn=None, save_name="ms7"): 
    import hddm
    
    dbname = save_name + '_chain_%i.db'%id 
    mname  = save_name + '_chain_%i'%id
    a_reg = {'model': "a ~ theta:C(conf, Treatment('LC')):C(dbs, Treatment('0'))", 'link_func': lambda x: x}
    v_reg = {'model': "v ~ C(conf, Treatment('LC'))", 'link_func': lambda x: x}
    reg_descr = [a_reg, v_reg]
    
    m = hddm.HDDMRegressor(data,
                           reg_descr,
                           group_only_regressors=False,
                           keep_regressor_trace=True,
                           include=['z', 'sv', 'st', 'sz'])
    m.find_starting_values()
    m.sample(samples, burn=burn, dbname=dbname, db='pickle') # it's neccessary to save the model data
    m.save(mname)
    
    return m


In [4]:
# parameters for model fitting
samples = 2000
burn = 500
chains = 4
test_mode = True

In [5]:
%%time

# below are for run multiple models
## Step 1: run models in parallel
file_path = "/home/jovyan/hddm/temp/"

if test_mode:
    model_func = [run_m2, run_m5, run_m7]
    
    models_name = ["ms2", 
                   "ms5", 
                   "ms7"]
    
    models = {"ms2": [], 
              "ms5": [], 
              "ms7": []}

else: 
    model_func = [run_m0, run_m1, run_m2, run_m3, 
                  run_m4, run_m5, run_m6, run_m7]

    # models = [[]] * len(model_func) hcp, using list in list is very tricky here, when loading 
    # models by for loop, it will end up with each list in the list "models" has 32 elements. 

    models_name = ["ms0", "ms1", "ms2", 
                   "ms3", "ms4", "ms5", 
                   "ms6", "ms7"]
        
    models = {"ms0": [], "ms1": [], "ms2": [], 
              "ms3": [], "ms4": [], "ms5": [], 
              "ms6": [], "ms7": []}

for ii in range(len(model_func)):
    #print(model_func[ii])
    #print(model_name[ii])
    file_full_path = file_path + "*" + models_name[ii] + "_chain_*[!db]" 
    # print(file_full_path)
    file_names = glob.glob(file_full_path, recursive=False)
    print(file_names)
    
    if file_names:
        for jj in file_names:
            print('current loading: ', jj, '\n')
            models[models_name[ii]].append(hddm.load(jj))
    else:
        print('current estimating: models_name[ii]')
        models[models_name[ii]] = p_map(partial(model_func[ii], df=data, samples=samples, 
                                         burn=burn),
                                 range(chains))

['/home/jovyan/hddm/temp/ms2_chain_2', '/home/jovyan/hddm/temp/ms2_chain_3', '/home/jovyan/hddm/temp/ms2_chain_1', '/home/jovyan/hddm/temp/ms2_chain_0']
current loading:  /home/jovyan/hddm/temp/ms2_chain_2 

current loading:  /home/jovyan/hddm/temp/ms2_chain_3 

current loading:  /home/jovyan/hddm/temp/ms2_chain_1 

current loading:  /home/jovyan/hddm/temp/ms2_chain_0 

['/home/jovyan/hddm/temp/ms5_chain_0', '/home/jovyan/hddm/temp/ms5_chain_3', '/home/jovyan/hddm/temp/ms5_chain_2', '/home/jovyan/hddm/temp/ms5_chain_1']
current loading:  /home/jovyan/hddm/temp/ms5_chain_0 

current loading:  /home/jovyan/hddm/temp/ms5_chain_3 

current loading:  /home/jovyan/hddm/temp/ms5_chain_2 

current loading:  /home/jovyan/hddm/temp/ms5_chain_1 

['/home/jovyan/hddm/temp/ms7_chain_3', '/home/jovyan/hddm/temp/ms7_chain_2', '/home/jovyan/hddm/temp/ms7_chain_1', '/home/jovyan/hddm/temp/ms7_chain_0']
current loading:  /home/jovyan/hddm/temp/ms7_chain_3 

current loading:  /home/jovyan/hddm/temp/ms7_c

## Check the PPC step by step


In [6]:
ms2_tmp = models['ms2'][0]
ms5_tmp = models['ms5'][0]
ms7_tmp = models['ms7'][0]
ms7_tmp

<hddm.models.hddm_regression.HDDMRegressor at 0x7f4513876e90>

## Model's input

In [7]:
ms2_tmp.data

Unnamed: 0,subj_idx,stim,rt,response,theta,dbs,conf
0,0,LL,1.210,1.0,0.656275,1,HC
1,0,WL,1.630,1.0,-0.327889,1,LC
2,0,WW,1.030,1.0,-0.480285,1,HC
3,0,WL,2.770,1.0,1.927427,1,LC
4,0,WW,-1.140,0.0,-0.213236,1,HC
...,...,...,...,...,...,...,...
3983,13,LL,-1.450,0.0,-1.237166,0,HC
3984,13,WL,0.711,1.0,-0.377450,0,LC
3985,13,WL,0.784,1.0,-0.694194,0,LC
3986,13,LL,-2.350,0.0,-0.546536,0,HC


Check all the nodes in the model.

We can see from the column names that there are stochastic nodes, observed nodes, and hidden nodes.

For stochastic nodes, there are also their means, stds etc.

In [8]:
ms2_tmp.nodes_db

Unnamed: 0,knode_name,stochastic,observed,subj,node,tag,depends,hidden,subj_idx,stim,...,dbs,conf,mean,std,2.5q,25q,50q,75q,97.5q,mc err
a,a,True,False,False,a,(),[],False,,,...,,,1.84844,0.107967,1.6438,1.77627,1.8448,1.9157,2.07038,0.00378011
a_std,a_std,True,False,False,a_std,(),[],False,,,...,,,0.386355,0.0950326,0.247214,0.318321,0.369564,0.435426,0.612872,0.00379249
a_rate,a_rate,False,False,False,a_rate,(),[],True,,,...,,,,,,,,,,
a_shape,a_shape,False,False,False,a_shape,(),[],True,,,...,,,,,,,,,,
a_subj.0,a_subj,True,False,True,a_subj.0,(),[subj_idx],False,0,,...,,,2.04782,0.0724406,1.91379,1.99802,2.04271,2.0958,2.19478,0.00318888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wfpt(LC).9,wfpt,False,True,False,wfpt(LC).9,"(LC,)","[conf, subj_idx]",False,9,,...,,LC,,,,,,,,
wfpt(LC).10,wfpt,False,True,False,wfpt(LC).10,"(LC,)","[conf, subj_idx]",False,10,,...,,LC,,,,,,,,
wfpt(LC).11,wfpt,False,True,False,wfpt(LC).11,"(LC,)","[conf, subj_idx]",False,11,,...,,LC,,,,,,,,
wfpt(LC).12,wfpt,False,True,False,wfpt(LC).12,"(LC,)","[conf, subj_idx]",False,12,,...,,LC,,,,,,,,


We can get the summary of stochastics as below. For the values of each draw in MCMC, we need go the traces.

In [9]:
ms2_tmp.get_stochastics()

Unnamed: 0,knode_name,stochastic,observed,subj,node,tag,depends,hidden,subj_idx,stim,...,dbs,conf,mean,std,2.5q,25q,50q,75q,97.5q,mc err
a,a,True,False,False,a,(),[],False,,,...,,,1.84844,0.107967,1.6438,1.77627,1.8448,1.9157,2.07038,0.00378011
a_std,a_std,True,False,False,a_std,(),[],False,,,...,,,0.386355,0.0950326,0.247214,0.318321,0.369564,0.435426,0.612872,0.00379249
a_subj.0,a_subj,True,False,True,a_subj.0,(),[subj_idx],False,0,,...,,,2.04782,0.0724406,1.91379,1.99802,2.04271,2.0958,2.19478,0.00318888
a_subj.1,a_subj,True,False,True,a_subj.1,(),[subj_idx],False,1,,...,,,1.98539,0.0798777,1.83147,1.93025,1.98678,2.03662,2.1478,0.00360076
a_subj.2,a_subj,True,False,True,a_subj.2,(),[subj_idx],False,2,,...,,,1.40045,0.0558243,1.29475,1.36298,1.39743,1.43813,1.50963,0.00264581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
z_subj_trans.9,z_subj_trans,True,False,True,z_subj_trans.9,(),[subj_idx],True,9,,...,,,,,,,,,,
z_subj_trans.10,z_subj_trans,True,False,True,z_subj_trans.10,(),[subj_idx],True,10,,...,,,,,,,,,,
z_subj_trans.11,z_subj_trans,True,False,True,z_subj_trans.11,(),[subj_idx],True,11,,...,,,,,,,,,,
z_subj_trans.12,z_subj_trans,True,False,True,z_subj_trans.12,(),[subj_idx],True,12,,...,,,,,,,,,,


In [10]:
ms2_tmp_traces = ms2_tmp.get_traces()
ms2_tmp_traces.columns

Index(['a', 'a_std', 'a_subj.0', 'a_subj.1', 'a_subj.2', 'a_subj.3',
       'a_subj.4', 'a_subj.5', 'a_subj.6', 'a_subj.7', 'a_subj.8', 'a_subj.9',
       'a_subj.10', 'a_subj.11', 'a_subj.12', 'a_subj.13', 'v(HC)', 'v(LC)',
       'v_std', 'v_subj(HC).0', 'v_subj(HC).1', 'v_subj(HC).2', 'v_subj(HC).3',
       'v_subj(HC).4', 'v_subj(HC).5', 'v_subj(HC).6', 'v_subj(HC).7',
       'v_subj(HC).8', 'v_subj(HC).9', 'v_subj(HC).10', 'v_subj(HC).11',
       'v_subj(HC).12', 'v_subj(HC).13', 'v_subj(LC).0', 'v_subj(LC).1',
       'v_subj(LC).2', 'v_subj(LC).3', 'v_subj(LC).4', 'v_subj(LC).5',
       'v_subj(LC).6', 'v_subj(LC).7', 'v_subj(LC).8', 'v_subj(LC).9',
       'v_subj(LC).10', 'v_subj(LC).11', 'v_subj(LC).12', 'v_subj(LC).13', 't',
       't_std', 't_subj.0', 't_subj.1', 't_subj.2', 't_subj.3', 't_subj.4',
       't_subj.5', 't_subj.6', 't_subj.7', 't_subj.8', 't_subj.9', 't_subj.10',
       't_subj.11', 't_subj.12', 't_subj.13', 'sv', 'sz', 'st', 'z_trans',
       'z_std', 'z_subj_t

## Observeds

The MDDM model object is an `pymc` model object *per se*, so it have almost all the properties of `pymc` objects, but more.

The observed nodes are the nodes for input data (observed = True).

Depends on the model specification, the number of observeds are different. Here in a regression model without depends_on settings, the observeds are equal to the number of participants.

In [11]:
ms2_tmp.get_observeds()

Unnamed: 0,knode_name,stochastic,observed,subj,node,tag,depends,hidden,subj_idx,stim,...,dbs,conf,mean,std,2.5q,25q,50q,75q,97.5q,mc err
wfpt(HC).0,wfpt,False,True,False,wfpt(HC).0,"(HC,)","[conf, subj_idx]",False,0,,...,,HC,,,,,,,,
wfpt(HC).1,wfpt,False,True,False,wfpt(HC).1,"(HC,)","[conf, subj_idx]",False,1,,...,,HC,,,,,,,,
wfpt(HC).2,wfpt,False,True,False,wfpt(HC).2,"(HC,)","[conf, subj_idx]",False,2,,...,,HC,,,,,,,,
wfpt(HC).3,wfpt,False,True,False,wfpt(HC).3,"(HC,)","[conf, subj_idx]",False,3,,...,,HC,,,,,,,,
wfpt(HC).4,wfpt,False,True,False,wfpt(HC).4,"(HC,)","[conf, subj_idx]",False,4,,...,,HC,,,,,,,,
wfpt(HC).5,wfpt,False,True,False,wfpt(HC).5,"(HC,)","[conf, subj_idx]",False,5,,...,,HC,,,,,,,,
wfpt(HC).6,wfpt,False,True,False,wfpt(HC).6,"(HC,)","[conf, subj_idx]",False,6,,...,,HC,,,,,,,,
wfpt(HC).7,wfpt,False,True,False,wfpt(HC).7,"(HC,)","[conf, subj_idx]",False,7,,...,,HC,,,,,,,,
wfpt(HC).8,wfpt,False,True,False,wfpt(HC).8,"(HC,)","[conf, subj_idx]",False,8,,...,,HC,,,,,,,,
wfpt(HC).9,wfpt,False,True,False,wfpt(HC).9,"(HC,)","[conf, subj_idx]",False,9,,...,,HC,,,,,,,,


We can also iterate the observeds:

In [12]:
from post_pred_gen_redifined import _parents_to_random_posterior_sample
from post_pred_gen_redifined import _post_pred_generate
from post_pred_gen_redifined import post_pred_gen

from pointwise_loglik_gen import _pointwise_like_generate
from pointwise_loglik_gen import pointwise_like_gen

In [13]:
iter_data_tmp2 = ((name2, ms2_tmp.data.loc[obs['node'].value.index]) for name2, obs in ms2_tmp.iter_observeds())
iter_data_tmp2

iter_data_tmp5 = ((name5, ms5_tmp.data.loc[obs5['node'].value.index]) for name5, obs5 in ms5_tmp.iter_observeds())
iter_data_tmp5

iter_data_tmp7 = ((name7, ms7_tmp.data.loc[obs7['node'].value.index]) for name7, obs7 in ms7_tmp.iter_observeds())
iter_data_tmp7

<generator object <genexpr> at 0x7f4516d3b5d0>

In [14]:
for name2, data2 in iter_data_tmp2:
#     print(name2)
    node2 = ms2_tmp.get_data_nodes(data2.index) # get the node corresponding to data.index.
    
    if name2 == 'wfpt(HC).0':
        print("subject 0's HC data")
        node2_HC = node2
    elif name2 == 'wfpt(LC).0':
        print("subject 0's LC data")
        node2_LC = node2
    # node
    
#     for i, parent in enumerate(node.extended_parents):
#         if name == 'wfpt.13':
#             print("Order of extended_parent: ", i)
#             print(parent)

subject 0's HC data
subject 0's LC data


In [15]:
for name5, data5 in iter_data_tmp5:
    print(name5)
    node5 = ms5_tmp.get_data_nodes(data5.index) # get the node corresponding to data.index.
    # node
    
    for i, parent in enumerate(node5.extended_parents):
        if name5 == 'wfpt.13':
            print("Order of extended_parent: ", i)
            print(parent)

wfpt(HC).0
wfpt(HC).1
wfpt(HC).2
wfpt(HC).3
wfpt(HC).4
wfpt(HC).5
wfpt(HC).6
wfpt(HC).7
wfpt(HC).8
wfpt(HC).9
wfpt(HC).10
wfpt(HC).11
wfpt(HC).12
wfpt(HC).13
wfpt(LC).0
wfpt(LC).1
wfpt(LC).2
wfpt(LC).3
wfpt(LC).4
wfpt(LC).5
wfpt(LC).6
wfpt(LC).7
wfpt(LC).8
wfpt(LC).9
wfpt(LC).10
wfpt(LC).11
wfpt(LC).12
wfpt(LC).13


In [16]:
df_pll_tmp2 = _pointwise_like_generate(node2_HC, samples=None, data=None, append_data=False)
df_pll_tmp2 = pd.concat(df_pll_tmp2, names=['draw'], 
                            keys = list(range(len(df_pll_tmp2))))
df_pll_tmp2

Unnamed: 0_level_0,Unnamed: 1_level_0,log_lik
draw,trial_idx,Unnamed: 2_level_1
0,0,-0.775333
0,2,-0.881349
0,4,-1.246707
0,6,-1.674462
0,8,-1.373326
...,...,...
1499,290,-1.862179
1499,291,-1.103772
1499,292,-2.511639
1499,295,-1.094928


In [22]:
df_pll_tmp2

Unnamed: 0_level_0,Unnamed: 1_level_0,log_lik
draw,trial_idx,Unnamed: 2_level_1
0,0,-0.775333
0,2,-0.881349
0,4,-1.246707
0,6,-1.674462
0,8,-1.373326
...,...,...
1499,290,-1.862179
1499,291,-1.103772
1499,292,-2.511639
1499,295,-1.094928


In [24]:
pingInfoFilePath = "./df_pll_tmp2.ftr"
df_pll_tmp2.reset_index().to_feather(pingInfoFilePath)

In [25]:
readFrame = pd.read_feather(pingInfoFilePath, columns=None, use_threads=True)
readFrame

Unnamed: 0,draw,trial_idx,log_lik
0,0,0,-0.775333
1,0,2,-0.881349
2,0,4,-1.246707
3,0,6,-1.674462
4,0,8,-1.373326
...,...,...,...
220495,1499,290,-1.862179
220496,1499,291,-1.103772
220497,1499,292,-2.511639
220498,1499,295,-1.094928


In [17]:
df_pll_tmp5 = _pointwise_like_generate(node5, samples=None, data=None, append_data=False)
df_pll_tmp5 = pd.concat(df_pll_tmp5, names=['draw'], 
                            keys = list(range(len(df_pll_tmp5))))
df_pll_tmp5

Unnamed: 0_level_0,Unnamed: 1_level_0,log_lik
draw,trial_idx,Unnamed: 2_level_1
0,3715,-0.663994
0,3716,-1.433283
0,3718,-1.373292
0,3719,-2.307738
0,3724,-1.883402
...,...,...
1499,3979,-1.127020
1499,3980,-1.395583
1499,3981,-1.040972
1499,3984,-1.775895


In [18]:
np.isinf(df_pll_tmp5).values.sum()

0

In [22]:
datasets = []

##############################
# Iterate the posterior and generate likelihood for each data point

for i, parent in enumerate(node5.extended_parents):
    if not isinstance(parent, pm.Node): # Skip non-stochastic nodes
        continue
    else:
        mc_len = len(parent.trace())
        break
# # samples=samples
# if samples is None:
#     samples = mc_len
#     # print("Number of samples is equal to length of MCMC trace.")

# assert samples, "Can not determine the number of samples"
                

In [25]:
bottom_node = node5
for sample in range(mc_len):
    _parents_to_random_posterior_sample(bottom_node, pos = sample)

    param_dict = deepcopy(bottom_node.parents.value)

    # for regressor models
    if 'reg_outcomes' in param_dict:
        del param_dict['reg_outcomes']

        pointwise_lik = bottom_node.value.copy()
        pointwise_lik.index.names = ['trial_idx']        # change the index label as "trial_idx"
        pointwise_lik.drop(['rt'],axis=1,inplace=True) # drop 'rt' b/c not gonna use it.

        for i in bottom_node.value.index:
            # get current params
            for p in bottom_node.parents['reg_outcomes']:
                param_dict[p] = bottom_node.parents.value[p].loc[i].item()

            # calculate the point-wise likelihood.
            tmp_lik = hddm.wfpt.pdf_array(x = np.array(bottom_node.value.loc[i]),
                                          v = np.array(param_dict['v']),
                                           a = np.array(param_dict['a']), 
                                           t = np.array(param_dict['t']),
                                           p_outlier = param_dict['p_outlier'],
                                           sv = param_dict['sv'],
                                           z = param_dict['z'],
                                           sz = param_dict['sz'],
                                           st = param_dict['st'])
            pointwise_lik.loc[i, 'log_lik'] = tmp_lik

        # check if there is zero prob.
        if 0 in pointwise_lik.values:
            pointwise_lik['log_lik']=pointwise_lik['log_lik'].replace(0.0, pointwise_lik['log_lik'].mean())

        elif pointwise_lik['log_lik'].isnull().values.any():
            print('NAN in the likelihood, check the data !')
            break

        pointwise_lik['log_lik'] = np.log(pointwise_lik['log_lik'])
        
        if np.isinf(pointwise_lik['log_lik']).values.sum() > 0:
            print('Correction does not work!!!\n')

In [81]:
%%time
0 in pointwise_lik.values

CPU times: user 63 µs, sys: 2 µs, total: 65 µs
Wall time: 68.2 µs


False

In [78]:
%%time
np.isinf(pointwise_lik['log_lik']).values.sum() > 0

CPU times: user 533 µs, sys: 0 ns, total: 533 µs
Wall time: 551 µs


False

In [35]:
%%time
pointwise_lik.isnull().sum().sum() > 0

CPU times: user 529 µs, sys: 14 µs, total: 543 µs
Wall time: 520 µs


False

In [31]:
%%time
pointwise_lik.isnull().values.any()

CPU times: user 381 µs, sys: 10 µs, total: 391 µs
Wall time: 364 µs


False

In [38]:
%%time
pointwise_lik['log_lik'].isnull().values.any()

CPU times: user 639 µs, sys: 17 µs, total: 656 µs
Wall time: 678 µs


False

In [34]:
%%time
pointwise_lik['log_lik'].isnull().sum() > 0

CPU times: user 2.27 ms, sys: 0 ns, total: 2.27 ms
Wall time: 1.73 ms


False

In [19]:
tmp5 = df_pll_tmp2[df_pll_tmp2.isna().any(axis=1)]
tmp5

Unnamed: 0_level_0,Unnamed: 1_level_0,log_lik
draw,trial_idx,Unnamed: 2_level_1


In [18]:
np.isinf(df_pll_tmp2).values.sum()

0

In [13]:
df_pll_2 = p_map(partial(pointwise_like_gen), models['ms2'])

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

 [-----            14%                  ] 4 of 28 complete in 1.2 sec[-----            14%                  ] 4 of 28 complete in 1.2 sec[-----            14%                  ] 4 of 28 complete in 1.2 sec[-----            14%                  ] 4 of 28 complete in 1.2 sec[------           17%                  ] 5 of 28 complete in 2.5 sec[------           17%                  ] 5 of 28 complete in 2.5 sec[------           17%                  ] 5 of 28 complete in 2.5 sec[------           17%                  ] 5 of 28 complete in 2.5 sec[--------         21%                  ] 6 of 28 complete in 3.8 sec[--------         21%                  ] 6 of 28 complete in 3.9 sec[--------         21%                  ] 6 of 28 complete in 3.9 sec[--------         21%                  ] 6 of 28 complete in 3.9 sec[---------        25%                  ] 7 of 28 complete in 5.1 sec[---------        25%                  ] 7 of 28 complete in 5.1 sec[---------        25%                  ] 7 of 2

In [16]:
df_pll_2 = pd.concat(df_pll_2, names=['draw'], keys=list(range(len(df_pll_2))))
df_pll_2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,log_lik
draw,node,draw,trial_idx,Unnamed: 4_level_1
0,wfpt(HC).0,0,0,-0.775333
0,wfpt(HC).0,0,2,-0.881349
0,wfpt(HC).0,0,4,-1.246707
0,wfpt(HC).0,0,6,-1.674462
0,wfpt(HC).0,0,8,-1.373326


In each iterate, we can extract each node and check the parents,  extended_parents, and other properties of the nodes.

In [30]:
iter_data_tmp7

<generator object <genexpr> at 0x7f9ce94270d0>

In [21]:
node2_HC.value.index

Int64Index([  0,   2,   4,   6,   8,  10,  12,  13,  16,  20,
            ...
            281, 282, 285, 286, 287, 290, 291, 292, 295, 297],
           dtype='int64', length=147)

In [26]:
df_pll_2.iloc[df_pll_2.index.get_level_values('node') == "wfpt(HC).0"].index.get_level_values(3).unique()

Int64Index([  0,   2,   4,   6,   8,  10,  12,  13,  16,  20,
            ...
            281, 282, 285, 286, 287, 290, 291, 292, 295, 297],
           dtype='int64', name='trial_idx', length=147)

In [27]:
df_pll_2.iloc[df_pll_2.index.get_level_values('node') == "wfpt(HC).0"].index.get_level_values(3).unique() == node2_HC.value.index

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [15]:
index1 = node2_LC.value.index.array
index1

<PandasArray>
[  1,   3,   5,   7,   9,  11,  14,  15,  17,  18,
 ...
 275, 278, 280, 283, 284, 288, 289, 293, 294, 296]
Length: 151, dtype: int64

In [16]:
index2 = node2_HC.value.index.array
index2

<PandasArray>
[  0,   2,   4,   6,   8,  10,  12,  13,  16,  20,
 ...
 281, 282, 285, 286, 287, 290, 291, 292, 295, 297]
Length: 147, dtype: int64

In [17]:
any(x in set(index2) for x in index1)

False

In [18]:
df_ppc_2_sub0_HC = _post_pred_generate(node2_HC,samples=None, data=None, append_data=False)

Number of samples is equal to length of MCMC trace.


In [19]:
df_ppc_2_sub0_LC = _post_pred_generate(node2_LC,samples=None, data=None, append_data=False)

Number of samples is equal to length of MCMC trace.


In [20]:
df_ppc_2_sub0_HC = pd.concat(df_ppc_2_sub0_HC, names=['draw'], 
                             keys=list(range(len(df_ppc_2_sub0_HC))))  
df_ppc_2_sub0_HC.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rt,response
draw,trial_idx,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,-1.007522,0.0
0,2,0.931868,1.0
0,4,-1.206423,0.0
0,6,1.649377,1.0
0,8,-1.440006,0.0


In [21]:
df_ppc_2_sub0_LC = pd.concat(df_ppc_2_sub0_LC, names=['draw'], 
                             keys=list(range(len(df_ppc_2_sub0_LC))))  
df_ppc_2_sub0_LC.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rt,response
draw,trial_idx,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,1.179023,1.0
0,3,4.129387,1.0
0,5,-2.164839,0.0
0,7,-0.95527,0.0
0,9,1.024792,1.0


In [22]:
node2_HC.value.index

Int64Index([  0,   2,   4,   6,   8,  10,  12,  13,  16,  20,
            ...
            281, 282, 285, 286, 287, 290, 291, 292, 295, 297],
           dtype='int64', name='trial_idx', length=147)

In [23]:
df_ppc_2_sub0_HC.index

MultiIndex([(   0,   0),
            (   0,   2),
            (   0,   4),
            (   0,   6),
            (   0,   8),
            (   0,  10),
            (   0,  12),
            (   0,  13),
            (   0,  16),
            (   0,  20),
            ...
            (1499, 281),
            (1499, 282),
            (1499, 285),
            (1499, 286),
            (1499, 287),
            (1499, 290),
            (1499, 291),
            (1499, 292),
            (1499, 295),
            (1499, 297)],
           names=['draw', 'trial_idx'], length=220500)

In [24]:
node2_HC.value.index == df_ppc_2_sub0_HC.index.get_level_values(1).unique()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [25]:
node2_LC.value.index

Int64Index([  1,   3,   5,   7,   9,  11,  14,  15,  17,  18,
            ...
            275, 278, 280, 283, 284, 288, 289, 293, 294, 296],
           dtype='int64', name='trial_idx', length=151)

In [26]:
df_ppc_2_sub0_LC.index

MultiIndex([(   0,   1),
            (   0,   3),
            (   0,   5),
            (   0,   7),
            (   0,   9),
            (   0,  11),
            (   0,  14),
            (   0,  15),
            (   0,  17),
            (   0,  18),
            ...
            (1499, 275),
            (1499, 278),
            (1499, 280),
            (1499, 283),
            (1499, 284),
            (1499, 288),
            (1499, 289),
            (1499, 293),
            (1499, 294),
            (1499, 296)],
           names=['draw', 'trial_idx'], length=226500)

In [27]:
df_ppc_2_sub0_LC.index.get_level_values(1).unique()

Int64Index([  1,   3,   5,   7,   9,  11,  14,  15,  17,  18,
            ...
            275, 278, 280, 283, 284, 288, 289, 293, 294, 296],
           dtype='int64', name='trial_idx', length=151)

In [28]:
node2_LC.value.index == df_ppc_2_sub0_LC.index.get_level_values(1).unique()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [38]:
import pymc as pm
import numpy as np


datasets = []

In [39]:
for i, parent in enumerate(node2_HC.extended_parents):
    if not isinstance(parent, pm.Node): # Skip non-stochastic nodes
        continue
    else:
        mc_len = len(parent.trace())
        break

# samples=samples
if samples is None:

    samples = mc_len
    print("Number of samples is equal to length of MCMC trace.")


In [43]:
samples = mc_len
samples

1500

In [45]:
# assert samples, "Can not determine the number of samples"

# for sample in range(samples):

sample = 0

_parents_to_random_posterior_sample(node2_HC, pos = sample)

# Generate data from bottom node
sampled_data = node2_HC.random()


In [47]:
sampled_data

Unnamed: 0,rt,response
0,-2.276289,0.0
1,1.408459,1.0
2,1.482535,1.0
3,-1.013911,0.0
4,2.500801,1.0
...,...,...
142,2.714920,1.0
143,-1.305316,0.0
144,1.348555,1.0
145,-0.961198,0.0


In [48]:
node2_HC.value

Unnamed: 0,rt
0,1.210
2,1.030
4,-1.140
6,2.000
8,0.857
...,...
290,0.796
291,-1.230
292,2.600
295,-1.160


In [54]:
if not all(sampled_data.index == node2_HC.value.index):
    print("need to change index")

need to change index


In [None]:
# change the index of ppc data if it is not the same as the observed data
if not any(sampled_data.index == bottom_node.value.index): 
    sampled_data.index = bottom_node.value.index

sampled_data.index.names = ['trial_idx']

# add the "response" column for regression models
if not "response" in sampled_data.columns:
    sampled_data["response"] = np.where(sampled_data['rt'] > 0, 1,
                                        np.where(sampled_data['rt'] <=0, 0, None)) 

if append_data and data is not None:
    sampled_data = sampled_data.join(data.reset_index(), lsuffix='_sampled')
datasets.append(sampled_data)


In [13]:
for name5, data5 in iter_data_tmp5:
    print(name5)
    node5 = ms5_tmp.get_data_nodes(data5.index) # get the node corresponding to data.index.
    # node
    
    for i, parent in enumerate(node5.extended_parents):
        if name5 == 'wfpt.13':
            print("Order of extended_parent: ", i)
            print(parent)

wfpt(HC).0
wfpt(HC).1
wfpt(HC).2
wfpt(HC).3
wfpt(HC).4
wfpt(HC).5
wfpt(HC).6
wfpt(HC).7
wfpt(HC).8
wfpt(HC).9
wfpt(HC).10
wfpt(HC).11
wfpt(HC).12
wfpt(HC).13
wfpt(LC).0
wfpt(LC).1
wfpt(LC).2
wfpt(LC).3
wfpt(LC).4
wfpt(LC).5
wfpt(LC).6
wfpt(LC).7
wfpt(LC).8
wfpt(LC).9
wfpt(LC).10
wfpt(LC).11
wfpt(LC).12
wfpt(LC).13


In [14]:
node5.extended_parents

{<pymc.distributions.new_dist_class.<locals>.new_class 'a_theta:C(conf, Treatment('LC'))[HC]_subj.13' at 0x7f10e1fe3d90>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'st' at 0x7f10e27e6b50>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'sv' at 0x7f10e2b9d950>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'v_subj(LC).13' at 0x7f10e2490a50>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'a_theta:C(conf, Treatment('LC'))[LC]_subj.13' at 0x7f10e2016250>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'sz' at 0x7f10e27e6250>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'z_subj_trans.13' at 0x7f10e2076810>,
 <pymc.distributions.new_dist_class.<locals>.new_class 't_subj.13' at 0x7f10e20a6a10>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'a_Intercept_subj.13' at 0x7f10e203fe10>}

In [15]:
for name7, data7 in iter_data_tmp7:
    print(name7)
    node7 = ms7_tmp.get_data_nodes(data7.index) # get the node corresponding to data.index.
    # node
    
    for i, parent in enumerate(node7.extended_parents):
        if name7 == 'wfpt.13':
            print("Order of extended_parent: ", i)
            print(parent)

wfpt.0
wfpt.1
wfpt.2
wfpt.3
wfpt.4
wfpt.5
wfpt.6
wfpt.7
wfpt.8
wfpt.9
wfpt.10
wfpt.11
wfpt.12
wfpt.13
Order of extended_parent:  0
sz
Order of extended_parent:  1
a_theta:C(conf, Treatment('LC'))[LC]:C(dbs, Treatment('0'))[0]_subj.13
Order of extended_parent:  2
t_subj.13
Order of extended_parent:  3
a_theta:C(conf, Treatment('LC'))[LC]:C(dbs, Treatment('0'))[1]_subj.13
Order of extended_parent:  4
a_Intercept_subj.13
Order of extended_parent:  5
v_C(conf, Treatment('LC'))[T.HC]_subj.13
Order of extended_parent:  6
a_theta:C(conf, Treatment('LC'))[HC]:C(dbs, Treatment('0'))[1]_subj.13
Order of extended_parent:  7
z_subj_trans.13
Order of extended_parent:  8
v_Intercept_subj.13
Order of extended_parent:  9
a_theta:C(conf, Treatment('LC'))[HC]:C(dbs, Treatment('0'))[0]_subj.13
Order of extended_parent:  10
st
Order of extended_parent:  11
sv


In [16]:
node7

<pymc.distributions.new_dist_class.<locals>.new_class 'wfpt.13' at 0x7fa75ef29d50>

In [19]:
datasets5 = _post_pred_generate(node5, samples=300, data=None, append_data=False)
datasets5

[                 rt response
 trial_idx                   
 3715       1.409273        1
 3716       3.763312        1
 3718       1.778683        1
 3719      -1.630310        0
 3724      -2.873457        0
 ...             ...      ...
 3979       0.841206        1
 3980       1.132463        1
 3981       1.821747        1
 3984      -1.413759        0
 3985       1.917863        1
 
 [135 rows x 2 columns],
                  rt response
 trial_idx                   
 3715       5.112616        1
 3716       1.241982        1
 3718       4.117038        1
 3719       1.711777        1
 3724       0.881426        1
 ...             ...      ...
 3979       3.148814        1
 3980      -1.394265        0
 3981       0.904401        1
 3984      -1.053935        0
 3985      -2.405205        0
 
 [135 rows x 2 columns],
                  rt response
 trial_idx                   
 3715       2.745060        1
 3716      -1.217427        0
 3718       1.518758        1
 3719       0.97

In [20]:
node5.random()

Unnamed: 0,rt
3715,0.767788
3716,1.717424
3718,1.214926
3719,-6.690742
3724,1.209056
...,...
3979,-0.906328
3980,2.040207
3981,1.008237
3984,2.835378


In [21]:
node5.value

Unnamed: 0,rt
3715,0.929
3716,1.880
3718,1.810
3719,-1.920
3724,2.300
...,...
3979,1.660
3980,1.890
3981,1.590
3984,0.711


In [20]:
datasets7 = _post_pred_generate(node7, samples=None, data=None, append_data=False)
datasets7

Number of samples is equal to length of MCMC trace.


[                 rt response
 trial_idx                   
 3714      -1.057788        0
 3715       1.681301        1
 3716       0.863445        1
 3717      -0.970333        0
 3718       1.359569        1
 ...             ...      ...
 3983       0.988109        1
 3984       1.110974        1
 3985       1.787548        1
 3986       1.742085        1
 3987      -2.064669        0
 
 [274 rows x 2 columns],
                  rt response
 trial_idx                   
 3714       1.177459        1
 3715       1.610119        1
 3716       1.373421        1
 3717       2.163059        1
 3718      -1.333076        0
 ...             ...      ...
 3983       1.245924        1
 3984      -1.661632        0
 3985       4.849354        1
 3986      -2.045702        0
 3987       2.765157        1
 
 [274 rows x 2 columns],
                  rt response
 trial_idx                   
 3714       1.685802        1
 3715       1.309598        1
 3716       1.638215        1
 3717       1.59

In [37]:
len(datasets7)

1500

In [24]:
datasets1 = _post_pred_generate(node, samples=None, data=None, append_data=False)

Number of samples is equal to length of MCMC trace.


In [25]:
datasets1

[                 rt  response
 trial_idx                    
 3714      -0.974782       0.0
 3715       1.777994       1.0
 3716       1.132826       1.0
 3717       3.440303       1.0
 3718       1.873888       1.0
 ...             ...       ...
 3983       1.550933       1.0
 3984      -2.223484       0.0
 3985       1.234439       1.0
 3986      -1.175953       0.0
 3987       0.713392       1.0
 
 [274 rows x 2 columns],
                  rt  response
 trial_idx                    
 3714      -5.891114       0.0
 3715      -2.205038       0.0
 3716      -1.243729       0.0
 3717      -1.968253       0.0
 3718       2.007208       1.0
 ...             ...       ...
 3983       1.732055       1.0
 3984      -2.586954       0.0
 3985      -1.933651       0.0
 3986       1.019084       1.0
 3987       1.487821       1.0
 
 [274 rows x 2 columns],
                  rt  response
 trial_idx                    
 3714       4.262569       1.0
 3715       2.294025       1.0
 3716      -1.65

In [None]:
for name, data in iter_data:
    node = model.get_data_nodes(data.index)

    if progress_bar:
        bar_iter += 1
        bar.update(bar_iter)

    if node is None or not hasattr(node, 'random'):
        continue # Skip

    ##############################
    # Sample and generate stats
    datasets = _post_pred_generate(node, samples=samples, data=data, append_data=append_data)
    results[name] = pd.concat(datasets, names=['draw'], keys=list(range(len(datasets))))

    # add the "response" column for regression models
    if not "response" in results.columns:
        results["response"] = np.where(results['rt'] > 0, 1, 
                                       np.where(results['rt'] <=0, 0, None)) 

After the iteration, we can see that `node` is the `wfpt.13`, the last element of `observeds`.

You can also check the input data of each node.

In [13]:
isinstance(node, pm.Node) # check if the node is a pymc node.

True

In [23]:
node.value

Unnamed: 0,rt
3714,1.500
3715,0.929
3716,1.880
3717,-1.180
3718,1.810
...,...
3983,-1.450
3984,0.711
3985,0.784
3986,-2.350


Or node's name

In [15]:
node.__name__

'wfpt.13'

In [None]:
# get the subject id
data['subj_idx'].loc[node.value.index].unique()

In [16]:
node.parents

{'p_outlier': 0.05,
 'v': <pymc.distributions.new_dist_class.<locals>.new_class 'v_subj.13' at 0x7f6dc5737750>,
 'sv': <pymc.distributions.new_dist_class.<locals>.new_class 'sv' at 0x7f6dc58da8d0>,
 'a': <pymc.distributions.new_dist_class.<locals>.new_class 'a_subj.13' at 0x7f6dc57755d0>,
 'z': <pymc.CommonDeterministics.InvLogit 'z_subj.13' at 0x7f6dc56dce90>,
 'sz': <pymc.distributions.new_dist_class.<locals>.new_class 'sz' at 0x7f6dc58daf50>,
 't': <pymc.distributions.new_dist_class.<locals>.new_class 't_subj.13' at 0x7f6dc56e5ed0>,
 'st': <pymc.distributions.new_dist_class.<locals>.new_class 'st' at 0x7f6dc58e5b50>}

In [17]:
node.parents.value

{'v': array(0.19225226),
 'sv': array(0.12883454),
 'a': array(2.06162401),
 'z': array(0.52247385),
 'sz': array(0.1526342),
 't': array(0.67963306),
 'st': array(0.46497784),
 'p_outlier': 0.05}

In [18]:
node.extended_parents

{<pymc.distributions.new_dist_class.<locals>.new_class 'a_subj.13' at 0x7f6dc57755d0>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'st' at 0x7f6dc58e5b50>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'v_subj.13' at 0x7f6dc5737750>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'sz' at 0x7f6dc58daf50>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'sv' at 0x7f6dc58da8d0>,
 <pymc.distributions.new_dist_class.<locals>.new_class 't_subj.13' at 0x7f6dc56e5ed0>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'z_subj_trans.13' at 0x7f6dc56b2850>}

Note, extended_parents do not have value

And as we said, `random()` function of the node can be used to generate random values based on the **current** parents' value. You can re-run the code below for a few times, see how the value of RT changes each time.

In [23]:
sampled_data = node.random()
sampled_data

Unnamed: 0,rt,response
0,2.428942,1.0
1,2.983670,1.0
2,1.884258,1.0
3,1.753909,1.0
4,-1.048085,0.0
...,...,...
269,1.079058,1.0
270,1.983772,1.0
271,2.122878,1.0
272,4.829852,1.0


In [25]:
sample_data7 = node7.random()

In [30]:
not any(sampled_data.index == node.value.index)

True

In [29]:
any(sample_data7.index == node7.value.index)

True

In [22]:
node.random()

Unnamed: 0,rt,response
0,1.396042,1.0
1,1.154437,1.0
2,0.742758,1.0
3,3.109260,1.0
4,-1.481202,0.0
...,...,...
269,1.601240,1.0
270,-0.907861,0.0
271,1.067648,1.0
272,2.201136,1.0


Very conveniently, we can directly retriviel the traces of a node's parents and extended_parent here too.

In [20]:
node.parents['v'].trace()

array([0.15387324, 0.21610853, 0.1367186 , ..., 0.18279531, 0.19987589,
       0.19225226])

In [25]:
# we can not directly index extended_parents but can get the trace in a for loop
for i, parent in enumerate(node.extended_parents):
    print(parent)
    print(parent.trace())

st
[0.45661132 0.45625829 0.48352744 ... 0.51125925 0.52033121 0.47464767]
a_Intercept_subj.13
[2.14894803 2.04464093 2.24554997 ... 2.2302547  2.19780888 2.0290853 ]
a_theta:C(conf, Treatment('LC'))[HC]:C(dbs, Treatment('0'))[0]_subj.13
[ 0.12125852  0.19985113  0.0641674  ... -0.01330511  0.09896035
  0.09104917]
a_theta:C(conf, Treatment('LC'))[LC]:C(dbs, Treatment('0'))[0]_subj.13
[-0.06125472 -0.0779277  -0.04243819 ...  0.04367235  0.05348451
  0.10372685]
sv
[0.21481978 0.2732277  0.24124355 ... 0.25024603 0.29303582 0.25823629]
a_theta:C(conf, Treatment('LC'))[LC]:C(dbs, Treatment('0'))[1]_subj.13
[ 0.03394697 -0.11044463 -0.05594438 ...  0.03232501 -0.00100048
 -0.00782412]
v_C(conf, Treatment('LC'))[T.HC]_subj.13
[-0.51252481 -0.47892954 -0.48017478 ... -0.52627411 -0.49826299
 -0.47897617]
sz
[0.220693   0.19928462 0.14847611 ... 0.13398106 0.02876901 0.13495805]
a_theta:C(conf, Treatment('LC'))[HC]:C(dbs, Treatment('0'))[1]_subj.13
[-0.03347591 -0.01104332  0.01372463 ...  

In [33]:
def _post_pred_generate(bottom_node, samples=None, data=None, append_data=False):
    """Generate posterior predictive data from a single observed node."""
    import pymc as pm
    import numpy as np
    
    
    datasets = []

    ##############################
    # Sample and generate stats
    # If number of samples is fixed, use the original code, i.e., randomly sample one set of
    # values from extended_parents and generate random value;
    #
    # If number of samples is None, use the lenght of trace, and iterate the whole posterior.

    for i, parent in enumerate(bottom_node.extended_parents):
        if not isinstance(parent, pm.Node): # Skip non-stochastic nodes
            continue
        else:
            mc_len = len(parent.trace())
            break

    # samples=samples
    if samples is None:

        samples = mc_len
        print("Number of samples is equal to length of MCMC trace.")

    assert samples, "Can not determine the number of samples"
    
    if samples == mc_len:
        for sample in range(samples):
            _parents_to_random_posterior_sample(bottom_node, pos = sample)
            
            # Generate data from bottom node
            sampled_data = bottom_node.random()
            
            # change the index of ppc data if it is not the same as the observed data
            if not any(sampled_data.index == bottom_node.value.index): 
                sampled_data.index = bottom_node.value.index
               
            sampled_data.index.names = ['trial_idx']

            # add the "response" column for regression models
            if not "response" in sampled_data.columns:
                sampled_data["response"] = np.where(sampled_data['rt'] > 0, 1,
                                                    np.where(sampled_data['rt'] <=0, 0, None)) 
                        
            if append_data and data is not None:
                sampled_data = sampled_data.join(data.reset_index(), lsuffix='_sampled')
            datasets.append(sampled_data)
    
    else:
        for sample in range(samples):
            pos = np.random.randint(0, mc_len)
            _parents_to_random_posterior_sample(bottom_node, pos = pos)

            # Generate data from bottom node
            sampled_data = bottom_node.random()
            # change the index of ppc data if it is not the same as the observed data
            if not any(sampled_data.index == bottom_node.value.index): 
                sampled_data.index = bottom_node.value.index
            sampled_data.index.names = ['trial_idx']
            
            # add the "response" column for regression models
            if not "response" in sampled_data.columns:
                sampled_data["response"] = np.where(sampled_data['rt'] > 0, 1,
                                                    np.where(sampled_data['rt'] <=0, 0, None)) 

            if append_data and data is not None:
                sampled_data = sampled_data.join(data.reset_index(), lsuffix='_sampled')
            datasets.append(sampled_data)

    return datasets


## Make sure that change `extended_parents` do changed `parents`

It is not very intuitive that the value of a node's `parents` is dependes on the value of `extended_parents`. 

Again, let's check the extended_parents and parents

In [26]:
node.extended_parents

{<pymc.distributions.new_dist_class.<locals>.new_class 'z_subj_trans.13' at 0x7f82df6e1fd0>,
 <pymc.distributions.new_dist_class.<locals>.new_class 't_subj.13' at 0x7f82df7391d0>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'v_Intercept_subj.13' at 0x7f82df4eb710>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'a_theta:C(conf, Treatment('LC'))[HC]:C(dbs, Treatment('0'))[1]_subj.13' at 0x7f82df5e5710>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'sz' at 0x7f82e1c5acd0>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'v_C(conf, Treatment('LC'))[T.HC]_subj.13' at 0x7f82df51ce90>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'a_theta:C(conf, Treatment('LC'))[LC]:C(dbs, Treatment('0'))[1]_subj.13' at 0x7f82df615e90>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'sv' at 0x7f82e1b6fe90>,
 <pymc.distributions.new_dist_class.<locals>.new_class 'a_theta:C(conf, Treatment('LC'))[LC]:C(dbs, Treatment('0'))[0]_subj.13' at 0x7f82df67b250>,
 

In [27]:
node.parents

{'p_outlier': 0.05,
 'v': <pymc.PyMCObjects.Deterministic 'v_reg.13' at 0x7f82df495d90>,
 'sv': <pymc.distributions.new_dist_class.<locals>.new_class 'sv' at 0x7f82e1b6fe90>,
 'a': <pymc.PyMCObjects.Deterministic 'a_reg.13' at 0x7f82df542750>,
 'z': <pymc.CommonDeterministics.InvLogit 'z_subj.13' at 0x7f82df708c50>,
 'sz': <pymc.distributions.new_dist_class.<locals>.new_class 'sz' at 0x7f82e1c5acd0>,
 't': <pymc.distributions.new_dist_class.<locals>.new_class 't_subj.13' at 0x7f82df7391d0>,
 'st': <pymc.distributions.new_dist_class.<locals>.new_class 'st' at 0x7f82e1c9de10>,
 'reg_outcomes': frozenset({'a', 'v'})}

Note that `v` is a `pymc.PyMCObjects.Deterministic` with a name `v_reg.13`, and `a` is a `pymc.PyMCObjects.Deterministic` with a name `a_reg.13`.

Here the `Deterministic` means that this object's value is determined by its parents, i.e., the extended_parents here.

Also note that `sv`, `st`, and `sz` are the same in both `extended_parents` and in `parents`.

`sv` in `extended_parents`: `<pymc.distributions.new_dist_class.<locals>.new_class 'sv' at 0x7fc9c6f4c090`
`sv` in `parents`: `<pymc.distributions.new_dist_class.<locals>.new_class 'sv' at 0x7fc9c6f4c090>`

We can verify that changing `extended_parents`' value also simutanously changed the values of those that are deterministic (determined by values of extended_parents' value and the design matrix).

I will test with 5 samples from posterior, in each sample, all the extended_parents's value will be changed to their posterior at that draw. I will record all values of extended_parents and parents' value. 

Then I will calculate the parents' value by dot muptiply the extended_parents' value with their corresponding design matrix. 

Finally, I will compare the the parents' value I recorded in each iteration and the parents' value I calculated by combining extended_parents' value and design matrix.

In [28]:
for name, data in iter_data_tmp7:
    print(name)
    node = ms7_tmp.get_data_nodes(data.index) # get the node corresponding to data.index.
    node
    
    for i, parent in enumerate(node.extended_parents):
        print("Order of extended_parent: ", i)
        print(parent)

In [29]:
##### First, record the value of extended_parents and parents in 5 iterations ####

ls_10_ext_par = []
ls_10_par = []
for pos in range(5):
    print(pos)

    dicts = {}
    for i, parent in enumerate(node.extended_parents):
#         print(parent)
#         print(parent.trace()[pos])
        dicts[parent.__name__] = parent.trace()[pos]  # note how I get the node's name

        assert len(parent.trace()) >= pos, "pos larger than posterior sample size"
        parent.value = parent.trace()[pos]
    
    # record the values of extended_parents
    ls_10_ext_par.append(dicts)
    
    tmp_dict = deepcopy(node.parents.value)
    
    # record the values of parents
    del tmp_dict['reg_outcomes']
    tmp_par = pd.DataFrame.from_dict(tmp_dict)    
    
    ls_10_par.append(tmp_par)

df_ls_10_ext_par = pd.DataFrame.from_dict(ls_10_ext_par)
print(df_ls_10_ext_par.head)

0
1
2
3
4
<bound method NDFrame.head of          st  a_Intercept_subj.13  \
0  0.456611             2.148948   
1  0.456258             2.044641   
2  0.483527             2.245550   
3  0.469041             2.219220   
4  0.470805             2.227721   

   a_theta:C(conf, Treatment('LC'))[HC]:C(dbs, Treatment('0'))[0]_subj.13  \
0                                           0.121259                        
1                                           0.199851                        
2                                           0.064167                        
3                                           0.128285                        
4                                           0.092162                        

   a_theta:C(conf, Treatment('LC'))[LC]:C(dbs, Treatment('0'))[0]_subj.13  \
0                                          -0.061255                        
1                                          -0.077928                        
2                                          -0.042

In [30]:
##### Second, select parameter "a" related extended_parents' value to verify the values

filter_col = [col for col in df_ls_10_ext_par if col.startswith('a_')]
filter_col

df_a_ext_par = df_ls_10_ext_par[filter_col]
print(df_a_ext_par)


#### Get the design matrix

print(ms7_tmp.model_descrs[0]['outcome'])
design_matrix=dmatrix(ms7_tmp.model_descrs[0]['model'], 
                      data=data, return_type='dataframe', NA_action='raise')
print("Head of the design matrix:")
print(design_matrix.head())

design_matrix = design_matrix.add_prefix("a_").add_suffix("_subj.13")
print("Add parameter and participants' info to design matrix: ")
print(design_matrix.head())

# re-order the extended_parents' value's dataframe 
df_a_ext_par = df_a_ext_par[design_matrix.columns]

print(df_a_ext_par.head())

predictor_tmp = design_matrix.dot(df_a_ext_par.T)
predictor_tmp

   a_Intercept_subj.13  \
0             2.148948   
1             2.044641   
2             2.245550   
3             2.219220   
4             2.227721   

   a_theta:C(conf, Treatment('LC'))[HC]:C(dbs, Treatment('0'))[0]_subj.13  \
0                                           0.121259                        
1                                           0.199851                        
2                                           0.064167                        
3                                           0.128285                        
4                                           0.092162                        

   a_theta:C(conf, Treatment('LC'))[LC]:C(dbs, Treatment('0'))[0]_subj.13  \
0                                          -0.061255                        
1                                          -0.077928                        
2                                          -0.042438                        
3                                          -0.018326                    

Unnamed: 0,0,1,2,3,4
3714,2.165100,2.049969,2.238928,2.209395,2.228267
3715,2.199981,1.878607,2.161448,2.163257,2.157646
3716,2.136189,2.086150,2.266576,2.233211,2.245240
3717,2.177441,2.054040,2.233868,2.201888,2.228684
3718,2.162842,1.999437,2.222653,2.203983,2.208643
...,...,...,...,...,...
3983,1.998931,1.797392,2.166164,2.060510,2.113702
3984,2.172069,2.074055,2.261568,2.226137,2.247216
3985,2.191471,2.098738,2.275010,2.231941,2.263575
3986,2.082676,1.935415,2.210480,2.149107,2.177351


In [31]:
# compare the values of the first draw:
ls_10_par[0]['a']

3714    2.165100
3715    2.199981
3716    2.136189
3717    2.177441
3718    2.162842
          ...   
3983    1.998931
3984    2.172069
3985    2.191471
3986    2.082676
3987    2.240182
Name: a, Length: 274, dtype: float64

In [32]:
predictor_tmp[0]

3714    2.165100
3715    2.199981
3716    2.136189
3717    2.177441
3718    2.162842
          ...   
3983    1.998931
3984    2.172069
3985    2.191471
3986    2.082676
3987    2.240182
Name: 0, Length: 274, dtype: float64

In [33]:
ls_10_par[0]['a'] == predictor_tmp[0]

3714    True
3715    True
3716    True
3717    True
3718    True
        ... 
3983    True
3984    True
3985    True
3986    True
3987    True
Length: 274, dtype: bool