In [1]:
import nest_asyncio
nest_asyncio.apply()

from hddCRP.modelBuilder import cdCRP
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from statannotations.Annotator import Annotator
from sklearn.metrics import roc_auc_score

import itertools

from pandas.api.types import CategoricalDtype
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [2]:
# --- Run configuration ---------------------------------------------------
# When False, a later cell reuses cached fit files instead of refitting.
overwrite_existing_results = False
results_directory = "Results/population/"

if not os.path.exists(results_directory):
    os.makedirs(results_directory)

# --- Load the data file --------------------------------------------------
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
data_filename = 'data/Data_turns_all_by_session.pkl'
with open(data_filename, 'rb') as data_file:
    data = pickle.load(data_file)

# Group names, kept in alphabetical order.
subjects = sorted(["uniform", "diverse"])
print(f"subjects = {subjects}")

# --- Model and data-selection settings -----------------------------------
context_depth = 2
nback_depth = 1
session_numbers = None  # alternatively a list such as [1]; sessions are indexed from 1
number_of_trials = 100

action_labels = [0, 1, 2]

subjects = ['diverse', 'uniform']


In [3]:
# Build the cache file names for the current configuration.
#   * session_numbers is None -> names encode the trial count
#   * otherwise                -> names encode the session range
# seed_offset is later added to the per-group random seed used for fitting.
if session_numbers is None:
    fit_file = f"{results_directory}/fits_trials_{number_of_trials}"
    fit_summary_file = f"{results_directory}/fit_summary_trials_{number_of_trials}"
    seed_offset = number_of_trials
else:
    start_session = np.min(session_numbers)
    end_session = np.max(session_numbers)
    fit_file = f"{results_directory}/fits_session_{start_session}"
    fit_summary_file = f"{results_directory}/fit_summary_session_{start_session}"
    if end_session != start_session:
        # The range suffix must name the LAST session.  It previously repeated
        # start_session, so e.g. sessions 1-3 and 1-5 mapped to the same file.
        # Caches written under the old (wrong) name will not be picked up.
        fit_file += f"_to_{end_session}"
        fit_summary_file += f"_to_{end_session}"
    seed_offset = start_session

# Only tag the file names with the model depths when they differ from the
# defaults (context depth 2, n-back depth 1).
if nback_depth != 1 or context_depth != 2:
    fit_file += f"_cd{context_depth}_nb{nback_depth}"
    fit_summary_file += f"_cd{context_depth}_nb{nback_depth}"

fit_file += ".pkl"
fit_summary_file += ".pkl"
# Fit one cdCRP model per group and cache the posterior draws and the summary
# table as pickles.  The fit is skipped when the cache file already exists,
# unless overwrite_existing_results is set.
if overwrite_existing_results or not os.path.isfile(fit_file):
    # Columns recording which slice of the data was fitted.  They are the same
    # for every group, so they are built once outside the loop.
    if session_numbers is None:
        run_metadata = {
            "number_of_trials": number_of_trials,
            "start_session_C": pd.NA,
            "end_session_C": pd.NA,
        }
    else:
        run_metadata = {
            "number_of_trials": pd.NA,
            "start_session_C": start_session,
            "end_session_C": end_session,
        }

    # Per-group frames are collected here and concatenated once at the end,
    # instead of repeatedly concatenating onto a growing (initially empty) frame.
    fit_frames = []
    summary_frames = []

    for subject_index, subject in enumerate(subjects):
        print(f"subject {subject} ")

        sequences = []
        session_types = []
        subject_labels = []
        for subject_p in data["group_definition"][subject]:
            sequences_0 = data["data"][subject_p]["data"]  # turns in each session
            session_types_0 = data["data"][subject_p]["task"]  # which maze

            # Positions of this subject's sessions whose task label is 'C'.
            c_sessions = np.where(np.array(session_types_0) == 'C')[0]

            if session_numbers is None:
                # Pool all 'C' sessions into one sequence and keep the first
                # number_of_trials entries.
                pooled = list(itertools.chain.from_iterable(
                    sequences_0[xx] for xx in c_sessions))
                sequences.append(pooled[:number_of_trials])
                session_types.append('C')
                subject_labels.append(subject_p)
            else:
                # session_numbers is 1-based, hence the "- 1".
                ii = list(c_sessions[np.array(session_numbers) - 1])
                sequences += [sequences_0[xx] for xx in ii]
                session_types += [session_types_0[xx] for xx in ii]
                subject_labels += [subject_p] * len(ii)

        # Distinct, reproducible seed per group and configuration.
        stan_seed = (subject_index + 1) * 1000 + seed_offset

        model = cdCRP(sequences, session_labels=session_types,
                      subject_labels=subject_labels,
                      possible_observations=action_labels)
        model.same_nback_depth = nback_depth
        model.context_depth = context_depth

        model.build(random_seed=stan_seed)
        model.fit_model()

        map_fit = model.get_map()
        fit_df = model.fit.to_frame()
        fit_df["subject"] = subject

        summary_df = model.fit_summary()
        summary_df["subject"] = subject
        # Assigning a Series aligns on the summary's index.
        # NOTE(review): presumably map_fit is keyed by parameter name -- confirm.
        summary_df["MAP"] = pd.Series(map_fit)

        for frame in (summary_df, fit_df):
            for column, value in run_metadata.items():
                frame[column] = value

        summary_frames.append(summary_df)
        fit_frames.append(fit_df)

    # pd.concat raises on an empty list, so fall back to an empty frame when
    # there were no groups to fit.
    data_fit_metrics = pd.concat(summary_frames) if summary_frames else pd.DataFrame()
    data_fits = pd.concat(fit_frames) if fit_frames else pd.DataFrame()

    data_fits.to_pickle(fit_file)
    data_fit_metrics.to_pickle(fit_summary_file)
else:
    print("fit file found")



fit file found


In [5]:
# Display labels for the raw parameter names.  Both session-specific time
# constants map to the same label.
parameter_name_map = {
    "alpha": "concentration",
    "context_similarity_depth_1": "context weight level 1",
    "context_similarity_depth_2": "context weight level 2",
    "repeat_bias_1_back": "repeat bias",
    "timeconstant_within_session_A": "time constant",
    "timeconstant_within_session_C": "time constant",
}

# Load the cached summary, turn its index into a "parameter" column and
# translate the raw names into display labels (names absent from the map
# become NaN under Series.map).
summary_df = (
    pd.read_pickle(fit_summary_file)
    .rename_axis("parameter")
    .reset_index()
)
summary_df["parameter"] = summary_df["parameter"].map(parameter_name_map)

# Load the cached draws and relabel the parameter columns the same way.
data_fits = pd.read_pickle(fit_file).rename(columns=parameter_name_map)

params = list(summary_df["parameter"].unique())

data_fits

parameters,lp__,accept_stat__,stepsize__,treedepth__,n_leapfrog__,divergent__,energy__,alpha,timeconstant_within_session_C,repeat_bias_1_back,context_similarity_depth_1,context_similarity_depth_2,subject,number_of_trials,start_session_C,end_session_C
draws,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,-963.691587,0.900097,0.486370,3.0,15.0,0.0,969.192488,5.074392,28.059196,0.976909,0.905401,0.078246,diverse,100,,
1,-959.389314,0.861784,0.547604,3.0,7.0,0.0,961.265363,2.376460,44.751409,0.662627,0.853212,0.482617,diverse,100,,
2,-959.979959,0.948568,0.565220,2.0,3.0,0.0,960.478307,3.744173,27.560810,0.509202,0.793253,0.355549,diverse,100,,
3,-965.536217,0.831479,0.416612,3.0,7.0,0.0,966.321926,6.026395,72.076291,0.601749,0.295470,0.774897,diverse,100,,
4,-961.581044,0.966101,0.486370,3.0,7.0,0.0,965.400025,3.563361,31.063167,0.557241,0.961190,0.431229,diverse,100,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,-1080.791075,0.816319,0.495197,2.0,7.0,0.0,1081.699834,7.372000,28.955316,0.985973,0.471936,0.872592,uniform,100,,
3996,-1078.931526,0.920430,0.594087,3.0,7.0,0.0,1081.105994,3.736188,35.375710,0.811518,0.341901,0.837610,uniform,100,,
3997,-1079.107245,1.000000,0.479113,3.0,7.0,0.0,1083.110699,4.551114,62.874407,1.185255,0.356120,0.797080,uniform,100,,
3998,-1078.117119,0.934105,0.576204,3.0,7.0,0.0,1079.490373,4.719203,26.736792,1.090076,0.681612,0.831471,uniform,100,,


draws
0         [5.074392192948843, 0.9054014629780311]
1        [2.3764598307168714, 0.8532117692493647]
2         [3.7441734403736797, 0.793252983119751]
3            [6.026394702997351, 0.2954696788896]
4         [3.563360938777622, 0.9611896338996332]
                          ...                    
3995       [7.37199955239934, 0.4719363563965383]
3996    [3.7361875438202627, 0.34190079536686196]
3997      [4.551113924387478, 0.3561201789406736]
3998      [4.719203478628664, 0.6816121090404115]
3999      [3.940338686588198, 0.2011235086309134]
Length: 8000, dtype: object

In [None]:
# Seeded generator; it is created here but not used within this cell.
pval_seed = 10
sim_rng = np.random.Generator(np.random.MT19937(pval_seed))

# Build the binary group labels directly as 0/1 arrays.  The previous version
# called Series.replace(..., inplace=True) on a column pulled out of the frame;
# that is a chained in-place update, which does not write back to the frame
# under pandas copy-on-write, so the labels could silently remain strings.
unexpected_groups = set(data_fits["subject"].unique()) - {"uniform", "diverse"}
assert not unexpected_groups, f"unexpected groups in data_fits: {unexpected_groups}"
is_diverse = (data_fits["subject"] == "diverse").to_numpy().astype(int)
is_uniform = (data_fits["subject"] == "uniform").to_numpy().astype(int)

# For each parameter, the ROC AUC with "diverse" as the positive class is the
# probability that a random "diverse" draw exceeds a random "uniform" draw
# (ties count one half).  pval2 is the same quantity with the classes swapped.
pval = np.zeros(len(params))
pval2 = np.zeros(len(params))
for ii, param in enumerate(params):
    draws = data_fits[param].to_numpy()
    pval[ii] = roc_auc_score(is_diverse, draws)
    pval2[ii] = roc_auc_score(is_uniform, draws)

    print(f"{param}: p(diverse > uniform) = {pval[ii]} (opp = {pval2[ii]})")

# Long format for faceted plotting.  var_name is given explicitly because the
# plot below facets on a column called "parameters"; without it melt falls back
# to the columns-index name (or "variable" when that is unset).
df_c2 = data_fits.melt(id_vars=["subject"], value_vars=params, var_name="parameters")

# One histogram panel per parameter, groups overlaid; each panel gets its own
# x-axis and its own bins per group.
sns.displot(
    df_c2, x="value", col="parameters", hue="subject", palette="colorblind",
    height=3, facet_kws={"margin_titles": True, "sharex": False}, common_bins=False
)

params



In [None]:

# Pairwise plots of the draws for every parameter, coloured by group.
# s=1 is forwarded to the off-diagonal plots as a very small marker size.
sns.pairplot(
    data=data_fits,
    vars=params,
    hue="subject",
    plot_kws={"s": 1},
)


In [None]:
# Features: draws of two parameters; target: the group label of each draw.
X = data_fits[["repeat bias", "context weight level 1"]]  # alternatively: data_fits[params]
y = data_fits['subject']

# Leave-one-out cross-validation: each row is held out once, so the classifier
# is refitted once per row of X (expensive for large frames).
cv = LeaveOneOut()

# Linear-kernel support-vector classifier (this is classification, not regression).
model = SVC(kernel='linear', C=1, random_state=192)

# With the default scorer each fold's score is the classification accuracy on
# the single held-out row, i.e. 0 or 1.
scores = cross_val_score(model, X, y,
                         cv=cv, n_jobs=-1)

# Mean leave-one-out accuracy.  Scores are non-negative, so no absolute value
# is needed (the earlier "mean absolute error" wording did not apply here).
np.mean(scores)

In [None]:
# Histogram of the per-fold cross-validation scores held in `scores`.
plt.hist(scores)