In [1]:
import random
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from sklearn.model_selection import KFold

from Python_Scripts import OrderedCategorySystem as OCS
from Python_Scripts import generate_plots as plots
from Python_Scripts import order_analyses as analyses
from Python_Scripts import RationalCategorySystem as RCS

In [2]:
F =  [12, 13, 15, 14, 16, 18, 17, 19, 20]
B =  [20, 19, 17, 18, 16, 14, 15, 13, 12]
M1 =  [16, 17, 15, 18, 14, 19, 13, 20, 12]
M2 = [16, 15, 17, 14, 18, 13, 19, 12, 20]

NEW = [i for i in range(9, 24)]
ALL = NEW + [1, 3, 29, 31]

SHIFT = 3
DISTRACTORS = [1, 3, 29, 31]

ITEMS = ['I09', 'I10', 'I11', 'I12', 'I13', 'I14', 'I15', 'I16', 'I17', 'I18', 'I19', 'I20', 'I21', 'I22', 'I23']

LEFT = ITEMS[:9]
CENTRE = ITEMS[3:12]
RIGHT = ITEMS[6:]

LOCS = [('L', LEFT),
        ('C', CENTRE), 
        ('R', RIGHT)]
ORDERS = [('f', 0, [1, 2, 3, 4, 5, 6, 7, 8]),
          ('m', 4, [0, 1, 2, 3, 5, 6, 7, 8]),
          ('b', 8, [0, 1, 2, 3, 4, 5, 6, 7])]

item_space = [i for i in range(1, 32)]

### Load Participant Data

In [3]:
allParticipants = pd.read_csv('Results/participant_data.csv')
allParticipants =  allParticipants[(allParticipants['ATTEMPTS'] < 3) & (allParticipants['TOTAL_ERRORS'] < 4) & (allParticipants['POOL'] == 'prolific2')]
participants = allParticipants['P_ID'].tolist()

participant_df = pd.read_csv('Results/trial_data.csv')    
participant_df = participant_df[participant_df['P_ID'].isin(participants)]

cat_assigns = ITEMS + ['I01', 'I03', 'I29', 'I31']
others = participant_df.columns.difference(cat_assigns)
sequence_df = pd.read_csv('Results/sequence_data.csv')

trial_df = (
  participant_df[others]
    .assign(ITEMS = participant_df[cat_assigns].agg(
            lambda row: {k: v for k, v in row.items() if not pd.isna(v)},
            axis=1
    )
  )
)
seq_df = (
  sequence_df[['P_ID', 'DEPTH', 'LOC', 'ORDER', 'STIMULI']]
  .assign(SEQUENCE = sequence_df[[f't{i+1:02}' for i in range(13)]].agg(
            lambda row: {k: v for k, v in row.items()},
            axis=1
    )   
  )
)

data_df = trial_df.merge(seq_df, on=['P_ID', 'DEPTH', 'LOC', 'ORDER', 'STIMULI'], how='inner')
participant_trials = list(data_df.to_dict('index').values())

nTrials = data_df.shape[0]

In [None]:
def k_fold_cross_validation_ocs(data, params, k=5, determ=False, verbose=False):
    participants = list(data.keys())
    kf = KFold(n_splits=k, shuffle=True, random_state=18)
    lls = []
    best_params = []
    ns = []
    for train_idx, test_idx in kf.split(participants):
        train_data = {participants[idx]: data[participants[idx]] for idx in train_idx}
        test_data = {participants[idx]: data[participants[idx]] for idx in test_idx}
        if determ:
            best_a, _ = OCS.find_best_params(train_data, params, determ=True)
            test_ll, n_trials = OCS.get_loglike_and_n(test_data, determ=True, alpha=best_a)
            if verbose:
                print(f'alpha = {round(best_a,2)}, loglike = {round(test_ll/n_trials,2)}')
            best_params.append((best_a))

        else:
            best_t, best_a, _ = OCS.find_best_params(train_data, params)
            test_ll, n_trials = OCS.get_loglike_and_n(test_data, temp=best_t, alpha=best_a)
            if verbose:
                print(f't = {round(best_t,1)}, alpha = {round(best_a,2)}, loglike = {round(test_ll/n_trials,2)}')
            best_params.append((best_t, best_a))
        lls.append(test_ll/n_trials)
        ns.append(n_trials)
    return np.array(lls), best_params, np.array(ns)

def k_fold_cross_validation_rcs(data, item_space, depth, params, k=5, determ=False, verbose=False):
    participants = list(data.keys())
    kf = KFold(n_splits=k, shuffle=True, random_state=18)
    lls = []
    best_params = []
    ns = []
    for train_idx, test_idx in kf.split(participants):
        train_data = {participants[idx]: data[participants[idx]] for idx in train_idx}
        test_data = {participants[idx]: data[participants[idx]] for idx in test_idx}
        if determ:
            best_c, best_a, _ = RCS.find_best_params(train_data, item_space, depth, params, determ=True)
            print(best_c, best_a)
            # test_ll, n_trials = RCS.get_loglike_and_n(test_data, item_space, depth, c=best_c, alpha=best_a, dete)
        else:
            best_c, best_a, _ = RCS.find_best_params(train_data, item_space, depth, params)
            test_ll, n_trials = RCS.get_loglike_and_n(test_data, item_space, depth, c=best_c, alpha=best_a)
            if verbose:
                print(f'c = {round(best_c,2)}, alpha = {round(best_a,2)}, loglike = {round(test_ll/n_trials,2)}')
            best_params.append((best_c, best_a))
        lls.append(test_ll/n_trials)
        ns.append(n_trials)
    return np.array(lls), best_params, np.array(ns)
    

In [5]:
ocs_data2, ocs_data3 = OCS.compute_possible_scores(participant_trials, item_space)

two_lev_data = [tr for tr in participant_trials if tr['DEPTH'] == 2]
three_lev_data = [tr for tr in participant_trials if tr['DEPTH'] == 3]

rcm_data2 = RCS.format_rcm_data(two_lev_data)
rcm_data3 = RCS.format_rcm_data(three_lev_data)

In [6]:
temps = np.linspace(1.0, 5, 41)
alphas = np.linspace(0, 0.5, 51)
c_vals = np.linspace(0.1, 0.9, 17)

item_values = (np.array(item_space) - min(item_space)) / (max(item_space) - min(item_space))
item_values = [np.array([val]) for val in item_values.tolist()]

### 2 Level Category Systems

In [16]:
ckmms2_ll, ckmms2_params,  ckmms2_n = k_fold_cross_validation_ocs(ocs_data2, (temps, alphas), k=20)
ckmmd2_ll, ckmmd2_params,  ckmmd2_n  = k_fold_cross_validation_ocs(ocs_data2, (alphas[1:]), determ=True, k=20)

In [None]:
rcmp2_lls, rcmp2_params, rcmp2_ns = k_fold_cross_validation_rcs(rcm_data2, item_values, 2, (c_vals, alphas), k=20)

c = 0.35, alpha = 0.0, loglike = -6.46
c = 0.35, alpha = 0.0, loglike = -7.05
c = 0.35, alpha = 0.0, loglike = -7.6
c = 0.35, alpha = 0.0, loglike = -7.99
c = 0.35, alpha = 0.0, loglike = -6.82
c = 0.35, alpha = 0.0, loglike = -7.37
c = 0.35, alpha = 0.0, loglike = -7.1
c = 0.35, alpha = 0.0, loglike = -6.13
c = 0.35, alpha = 0.0, loglike = -8.06
c = 0.35, alpha = 0.0, loglike = -7.73
c = 0.35, alpha = 0.0, loglike = -8.59
c = 0.35, alpha = 0.0, loglike = -7.43
c = 0.35, alpha = 0.0, loglike = -7.15
c = 0.35, alpha = 0.0, loglike = -6.61
c = 0.35, alpha = 0.0, loglike = -5.47


In [7]:
rcmm2_lls, rcmm2_params, rcmm2_ns = k_fold_cross_validation_rcs(rcm_data2, item_values, 2, (c_vals, alphas), k=20, determ=True, verbose=True)


UnboundLocalError: cannot access local variable 'test_ll' where it is not associated with a value

In [24]:
avg_ll_2s = np.average(ckmms2_ll, weights=ckmms2_n)
dev_2s = np.sqrt(np.average((ckmms2_ll-avg_ll_2s)**2, weights=ckmms2_n))
t_unique_2s, t_freqs_2s = np.unique([t for t, _ in ckmms2_params], return_counts=True)
a_unique_2s, a_freqs_2s = np.unique([a for _, a in ckmms2_params], return_counts=True)



avg_ll_2d = np.average(ckmmd2_ll, weights=ckmmd2_n)
dev_2d = np.sqrt(np.average((ckmmd2_ll-avg_ll_2d)**2, weights=ckmmd2_n))
a_unique_2d, a_freqs_2d = np.unique(ckmmd2_params, return_counts=True)

avg_ll_2p = np.average(rcmp2_lls, weights=rcmp2_ns)
dev_2p = np.sqrt(np.average((rcmp2_lls-avg_ll_2p)**2, weights=rcmp2_ns))
c_unique_2p, c_freqs_2p = np.unique([c for c, _ in rcmp2_params], return_counts=True)
a_unique_2p, a_freqs_2p = np.unique([a for _, a in rcmp2_params], return_counts=True)


rand_total_ll, n = OCS.get_loglike_and_n(ocs_data2, alpha=1.0) 


print(f'CKMM:\n\tSoftmax:{round(avg_ll_2s, 2)} +/- {round(dev_2s,2)}, T = {round(t_unique_2s[np.argmax(t_freqs_2s)],2)} ({t_freqs_2s[np.argmax(t_freqs_2s)]}/20), alpha= {a_unique_2s[np.argmax(a_freqs_2s)]} ({a_freqs_2s[np.argmax(a_freqs_2s)]}/20)')
print(f'\tGreedy:{round(avg_ll_2d, 2)} +/- {round(dev_2d,2)}, alpha= {a_unique_2d[np.argmax(a_freqs_2d)]} ({a_freqs_2d[np.argmax(a_freqs_2d)]}/20)')
print(f'RCM:\n\tParticle:{round(avg_ll_2p, 2)} +/- {round(dev_2p,2)}, c = {round(c_unique_2p[np.argmax(c_freqs_2p)],2)} ({c_freqs_2p[np.argmax(c_freqs_2p)]}/20), alpha= {a_unique_2p[np.argmax(a_freqs_2p)]} ({a_freqs_2p[np.argmax(a_freqs_2p)]}/20)')

print(f'Baseline: {round(rand_total_ll/n,2)}')


CKMM:
	Softmax:-5.82 +/- 0.96, T = 3.9 (12/20), alpha= 0.05 (18/20)
	Greedy:-7.31 +/- 0.86, alpha= 0.25 (18/20)
RCM:
	Particle:-7.31 +/- 0.74, c = 0.35 (20/20), alpha= 0.0 (20/20)
Baseline: -14.28


In [22]:
print(avg_ll_2p, avg_ll_2d)

-7.305735106169985 -7.313237879874856


### 3 Level Category Systems

In [37]:
ckmms3_ll, ckmms3_params,  ckmms3_n = k_fold_cross_validation_ocs(ocs_data3, (temps, alphas), k=20)
ckmmd3_ll, ckmmd3_params,  ckmmd3_n  = k_fold_cross_validation_ocs(ocs_data3, (alphas[1:]), determ=True, k=20)

In [35]:
rcmp3_lls, rcmp3_params, rcmp3_ns = k_fold_cross_validation_rcs(rcm_data3, item_values, 3, (c_vals, alphas), k=20)


In [39]:
avg_ll_3s = np.average(ckmms3_ll, weights=ckmms3_n)
dev_3s = np.sqrt(np.average((ckmms3_ll-avg_ll_3s)**2, weights=ckmms3_n))
t_unique_3s, t_freqs_3s = np.unique([t for t, _ in ckmms3_params], return_counts=True)
a_unique_3s, a_freqs_3s = np.unique([a for _, a in ckmms3_params], return_counts=True)



avg_ll_3d = np.average(ckmmd3_ll, weights=ckmmd3_n)
dev_3d = np.sqrt(np.average((ckmmd3_ll-avg_ll_2d)**2, weights=ckmmd3_n))
a_unique_3d, a_freqs_3d = np.unique(ckmmd3_params, return_counts=True)

avg_ll_3p = np.average(rcmp3_lls, weights=rcmp3_ns)
dev_3p = np.sqrt(np.average((rcmp3_lls-avg_ll_3p)**2, weights=rcmp3_ns))
c_unique_3p, c_freqs_3p = np.unique([c for c, _ in rcmp3_params], return_counts=True)
a_unique_3p, a_freqs_3p = np.unique([a for _, a in rcmp3_params], return_counts=True)


rand_total_ll, n = OCS.get_loglike_and_n(ocs_data3, alpha=1.0) 


print(f'CKMM:\n\tSoftmax:{round(avg_ll_3s, 2)} +/- {round(dev_3s,2)}, T = {round(t_unique_3s[np.argmax(t_freqs_3s)],2)} ({t_freqs_3s[np.argmax(t_freqs_3s)]}/20), alpha= {a_unique_3s[np.argmax(a_freqs_3s)]} ({a_freqs_3s[np.argmax(a_freqs_3s)]}/20)')
print(f'\tGreedy:{round(avg_ll_3d, 2)} +/- {round(dev_3d,2)}, alpha= {a_unique_3d[np.argmax(a_freqs_3d)]} ({a_freqs_3d[np.argmax(a_freqs_3d)]}/20)')
print(f'RCM:\n\tParticle:{round(avg_ll_3p, 2)} +/- {round(dev_3p,2)}, c = {round(c_unique_3p[np.argmax(c_freqs_3p)],2)} ({c_freqs_3p[np.argmax(c_freqs_3p)]}/20), alpha= {a_unique_3p[np.argmax(a_freqs_3p)]} ({a_freqs_3p[np.argmax(a_freqs_3p)]}/20)')

print(f'Baseline: {round(rand_total_ll/n,2)}')


CKMM:
	Softmax:-11.97 +/- 1.66, T = 1.0 (20/20), alpha= 0.14 (18/20)
	Greedy:-13.04 +/- 5.91, alpha= 0.29 (18/20)
RCM:
	Particle:-14.82 +/- 0.95, c = 0.3 (20/20), alpha= 0.0 (20/20)
Baseline: -25.3
