In [1]:
import sys
from platform import python_version

# Report which interpreter backs this kernel and which Python version it runs.
print('Notebook is running:', sys.executable)
print('The current Python version is', python_version())

# If conda is available, the installed packages can also be listed:
#!conda list  # list the conda packages

import hddm, IPython, kabuki, pymc
import numpy as np
import pandas as pd
import seaborn as sns

# One (message, package) pair per line of the report, printed in this order.
# The trailing comments record the versions seen when the notebook was last run.
# Warning: the `IPython.parallel` package has been deprecated since IPython 4.0.
version_report = [
    ('The current HDDM version is', hddm),        # 0.8.0
    ('The current Kabuki version is', kabuki),    # 0.6.3
    ('The current PyMC version is', pymc),        # 2.3.8
    ('The current IPython version is', IPython),
    ('The current Numpy version is', np),
    ('The current Pandas version is', pd),
    ('The current seaborn version is', sns),
]
for message, package in version_report:
    print(message, package.__version__)

Notebook is running: /opt/conda/bin/python
The current Python version is 3.7.6
The current HDDM version is 0.8.0
The current Kabuki version is 0.6.3
The current PyMC version is 2.3.8
The current IPython version is 7.15.0
The current Numpy version is 1.19.4
The current Pandas version is 1.0.5
The current seaborn version is 0.11.1




In [2]:
# Preparation
import os, hddm, time, csv
import glob
import datetime
from datetime import date

import pymc as pm
import hddm
import kabuki

import arviz as az
import numpy as np
import pandas as pd
import feather
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrix

from p_tqdm import p_map
from functools import partial

# Colour cycle used for successive lines in every plot: the single-letter
# matplotlib colours b, g, r, c, m, y, k, w, in that order.
from cycler import cycler
plt.rc('axes', prop_cycle=cycler(color='bgrcmykw'))

In [3]:
# NOTE: `post_pred_gen` is a hacked (locally modified) version;
# more details: https://groups.google.com/g/hddm-users/c/Is6AM7eN0fo
from post_pred_gen_redifined import _parents_to_random_posterior_sample
from post_pred_gen_redifined import _post_pred_generate
from post_pred_gen_redifined import post_pred_gen

from pointwise_loglik_gen import _pointwise_like_generate
from pointwise_loglik_gen import pointwise_like_gen

# import self-defined functions
from SimData import SimData
from run_models import run_m1, run_m2, run_m4, run_m5, run_m7

# Model-fitting functions, one per candidate model. `model_recov` pairs the
# i-th function with the i-th entry of `m_keys`, so the two lists must stay
# in the same order.
model_func = [
    run_m1,
    run_m2,
    run_m4,
    run_m5,
    run_m7,
]

# Identifiers of the fitted models (row labels of the confusion matrices).
m_keys = ["ms1", "ms2", "ms4", "ms5", "ms7"]

# Identifiers handed to `SimData` to pick which dataset to simulate
# (column labels of the confusion matrices).
df_keys = ["sim_df1", "sim_df2", "sim_df4", "sim_df5", "sim_df7"]


In [4]:
def _sampler_setting(name, value):
    """Return ``value``, or the notebook-level variable ``name`` when it is None."""
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        raise NameError(
            "'%s' was not passed to model_recov and is not defined as a "
            "notebook-level variable" % name) from None


def _chain_frames_to_dataset(frames):
    """Stack one DataFrame per chain into a single xarray Dataset.

    The position of each frame in ``frames`` becomes the ``chain`` index
    level; index level 1 of the stacked frame is dropped before conversion.
    """
    stacked = pd.concat(frames, names=['chain'], keys=list(range(len(frames))))
    stacked = stacked.reset_index(level=1, drop=True)
    return xr.Dataset.from_dataframe(stacked)


def model_recov(data=None, m_keys=None, model_func=None,
                samples=None, burn=None, chains=None):
    """
    Fit every candidate model to one dataset and package the results.

    For each model this fits ``chains`` chains in parallel, then collects the
    observed data, the posterior traces, the posterior predictive draws and
    the point-wise log likelihood into an ArviZ ``InferenceData`` object.

    data: input data, can be simulated data or real data
    m_keys: ids of the models; used as keys of both returned dicts
    model_func: list of model functions, in the same order as ``m_keys``.
        Each is called as ``f(chain_id, df=..., samples=..., burn=...,
        save_name=...)``.
    samples, burn: forwarded unchanged to every model function.
    chains: number of chains; each model function is called once for every
        id in ``range(chains)``.

    ``samples``, ``burn`` and ``chains`` default to the notebook-level
    variables of the same name when left as None, so existing calls keep
    working. Passing them explicitly removes the dependency on cell order.

    Returns ``(models, InfData)``: ``models[m_key]`` is the list of fitted
    chains, ``InfData[m_key]`` is the matching ``az.InferenceData``.

    Raises ValueError if ``m_keys`` and ``model_func`` differ in length, and
    NameError if a sampler setting is neither passed nor defined globally.
    """
    # Fail before any (slow) fitting rather than part-way through the loop.
    if len(m_keys) != len(model_func):
        raise ValueError(
            "m_keys and model_func must have the same length (got %d and %d)"
            % (len(m_keys), len(model_func)))

    InfData = {}
    models = {}
    if len(m_keys) == 0:
        return models, InfData

    samples = _sampler_setting("samples", samples)
    burn = _sampler_setting("burn", burn)
    chains = _sampler_setting("chains", chains)

    for m_key, fit_model in zip(m_keys, model_func):
        ### Run models: one parallel job per chain id
        save_name = "./tmp/" + m_key + "_tmp"
        print("start model fitting for ", m_key)
        ms_tmp = p_map(partial(fit_model,
                               df=data,
                               samples=samples,
                               burn=burn,
                               save_name=save_name),
                       range(chains))

        ### Observations (taken from the first chain's copy of the data)
        xdata_observed = ms_tmp[0].data.copy()
        xdata_observed.index.names = ['trial_idx']
        xdata_observed = xdata_observed[['rt', 'response']]
        xdata_observed = xr.Dataset.from_dataframe(xdata_observed)

        ### posteriors: tag every trace with its chain id and draw number
        chain_traces = []
        for chain_id, fitted in enumerate(ms_tmp):
            trace_tmp = fitted.get_traces()
            trace_tmp['chain'] = chain_id
            trace_tmp['draw'] = np.arange(len(trace_tmp), dtype=int)
            chain_traces.append(trace_tmp)
        xdata_posterior = pd.concat(chain_traces)
        xdata_posterior = xdata_posterior.set_index(["chain", "draw"])
        xdata_posterior = xr.Dataset.from_dataframe(xdata_posterior)

        ### PPC
        print("start PPC for ", m_key)
        start_time = time.time()
        post_pred_frames = p_map(post_pred_gen, ms_tmp)
        print("Running PPC for ", m_key, " costs %f seconds" % (time.time() - start_time))
        xdata_post_pred = _chain_frames_to_dataset(post_pred_frames)

        ### Point-wise log likelihood
        print("start calculating loglik for ", m_key)
        start_time = time.time()  # the start time of the processing
        loglik_frames = p_map(pointwise_like_gen, ms_tmp)
        print("Generating loglik costs %f seconds" % (time.time() - start_time))
        xdata_loglik = _chain_frames_to_dataset(loglik_frames)

        ### convert to InfData
        InfData[m_key] = az.InferenceData(posterior=xdata_posterior,
                                          observed_data=xdata_observed,
                                          posterior_predictive=xdata_post_pred,
                                          log_likelihood=xdata_loglik)
        models[m_key] = ms_tmp
    return models, InfData

In [5]:
# Sampler settings read by `model_recov` as notebook-level variables:
#   samples, burn -> forwarded to every model function as `samples=` / `burn=`
#                    (presumably total MCMC draws and burn-in draws; confirm
#                    against the functions in run_models)
#   chains        -> each model is fitted once per id in range(chains)
samples, burn, chains = 2000, 500, 4

In [None]:
%%time

# Confusion matrices, one per criterion (DIC, LOO, WAIC). Rows are the fitted
# models (`m_keys`), columns are the simulated datasets (`df_keys`); a cell
# counts how often that model ranked first on that dataset.
conf_mat_dic1, conf_mat_loo1, conf_mat_waic1 = (
    pd.DataFrame(0, index=m_keys, columns=df_keys) for _ in range(3)
)

for sim in range(4):  # four simulate-and-fit repetitions
    for df_key in df_keys:
        ### simulate data
        data = SimData(df_key)

        ### fit the sim data with every candidate model
        print("Start model recovery for ", df_key)
        models, InfData = model_recov(data=data, m_keys=m_keys, model_func=model_func)

        ### compare models; after reset_index the model ids are in column 'index'
        tmp_loo_comp = az.compare(InfData, ic="loo").reset_index()
        tmp_waic_comp = az.compare(InfData, ic="waic").reset_index()

        # DIC is read from the chains of each model merged into one model.
        dic_by_model = {
            m_key: kabuki.utils.concat_models(model).dic
            for m_key, model in models.items()
        }
        tmp_dic_comp = (
            pd.DataFrame(list(dic_by_model.values()),
                         index=list(dic_by_model.keys()),
                         columns=['dic'])
            .sort_values(by=['dic'])   # ascending: smallest DIC ends up in row 0
            .reset_index()
        )

        ### record the best models (row 0 of each comparison table)
        for conf_mat, ranking in ((conf_mat_dic1, tmp_dic_comp),
                                  (conf_mat_loo1, tmp_loo_comp),
                                  (conf_mat_waic1, tmp_waic_comp)):
            conf_mat.loc[ranking.loc[0, 'index'], df_key] += 1

        # Rewrite the CSVs after every dataset so partial results are kept
        # on disk while the loop is still running.
        for conf_mat, csv_name in ((conf_mat_dic1, 'conf_mat_dic1.csv'),
                                   (conf_mat_loo1, 'conf_mat_loo1.csv'),
                                   (conf_mat_waic1, 'conf_mat_waic1.csv')):
            conf_mat.to_csv(csv_name)

Start model recovery for  sim_df1
start model fitting for  ms1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

  tmp2 = (x - v) * (fx - fw)
  tmp2 = (x - v) * (fx - fw)
  tmp2 = (x - v) * (fx - fw)
  tmp2 = (x - v) * (fx - fw)


 [                  0%                  ] 3 of 2000 complete in 0.7 sec[                  0%                  ] 3 of 2000 complete in 0.6 sec[                  0%                  ] 3 of 2000 complete in 0.8 sec[                  0%                  ] 4 of 2000 complete in 0.9 sec[                  0%                  ] 4 of 2000 complete in 1.3 sec[                  0%                  ] 4 of 2000 complete in 1.3 sec[                  0%                  ] 4 of 2000 complete in 1.4 sec[                  0%                  ] 5 of 2000 complete in 1.4 sec

In [None]:
# Show the LOO comparison table left over from the last iteration of the recovery loop.
tmp_loo_comp

In [None]:
# Show the WAIC comparison table left over from the last iteration of the recovery loop.
tmp_waic_comp

In [None]:
# Posterior plots for model 'ms7', first chain only (index 0 of the per-chain
# list stored by model_recov), from the last dataset fitted in the loop.
models['ms7'][0].plot_posteriors()