## Analyses

In [1]:
# imports
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', None)
import cmlreaders as cml
from analyses import analyses
import warnings; warnings.filterwarnings("ignore")

[nltk_data] Downloading package cmudict to
[nltk_data]     /home1/hherrema/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [2]:
# Load the index of all 'ltp' sessions, then keep the ltpFR2 rows whose
# session number is not 23.
# NOTE(review): the reason for excluding session 23 is not stated here — confirm.
df = cml.get_data_index('ltp')
df_select = df.loc[
    (df.experiment == 'ltpFR2')
    & (df.session != 23)
]

#### Storing Results

In [3]:
def store_results_v1(data_tr_lst, data_sess, data_sub, nom, tr_lst_toggle, idx=''):
    """Write the three aggregation levels of an analysis to CSV.

    All files go to ``analyses/dataframes/`` and share the stem
    ``{nom}_H{idx}``. The row index is not written.

    Parameters
    ----------
    data_tr_lst : object with ``to_csv``
        Finest-grained data. Saved as ``..._data_tr.csv`` when
        `tr_lst_toggle` is truthy (trial level), otherwise as
        ``..._data_lst.csv`` (list level).
    data_sess : object with ``to_csv``
        Session-level data, saved as ``..._data.csv``.
    data_sub : object with ``to_csv``
        Between-subject average, saved as ``..._data_bsa.csv``.
    nom : str
        Analysis name used at the start of every file name.
    tr_lst_toggle : bool
        Selects the ``tr`` (truthy) or ``lst`` (falsy) suffix for the
        finest-grained file.
    idx : str, optional
        Extra label appended directly after the ``H`` in the stem.
    """
    stem = f'analyses/dataframes/{nom}_H{idx}'
    finest_suffix = 'tr' if tr_lst_toggle else 'lst'

    # Written in the same order as the levels are listed: finest, session, subject.
    targets = (
        (data_tr_lst, f'{stem}_data_{finest_suffix}.csv'),
        (data_sess, f'{stem}_data.csv'),
        (data_sub, f'{stem}_data_bsa.csv'),
    )
    for frame, path in targets:
        frame.to_csv(path, index=False)

In [4]:
def store_results_v2(data_tr_lst, data_sess, data_sub, nom, idx, tr_lst_toggle):
    """Write the three aggregation levels of an analysis to CSV.

    All files go to ``analyses/dataframes/`` and share the stem
    ``{nom}_{idx}``. The row index is not written.

    Parameters
    ----------
    data_tr_lst : object with ``to_csv``
        Finest-grained data. Saved as ``..._data_tr.csv`` when
        `tr_lst_toggle` is truthy (trial level), otherwise as
        ``..._data_lst.csv`` (list level).
    data_sess : object with ``to_csv``
        Session-level data, saved as ``..._data.csv``.
    data_sub : object with ``to_csv``
        Between-subject average, saved as ``..._data_bsa.csv``.
    nom : str
        Analysis name used at the start of every file name.
    idx : str
        Metric label; must be ``'J'`` or ``'JFL'``.
    tr_lst_toggle : bool
        Selects the ``tr`` (truthy) or ``lst`` (falsy) suffix for the
        finest-grained file.

    Raises
    ------
    ValueError
        If `idx` is not one of the accepted labels. Nothing is written
        in that case.
    """
    accepted = ('J', 'JFL')
    if idx not in accepted:
        raise ValueError(f'{idx} not a valid index.')

    stem = f'analyses/dataframes/{nom}_{idx}'
    finest_suffix = 'tr' if tr_lst_toggle else 'lst'

    # Written in the same order as the levels are listed: finest, session, subject.
    targets = (
        (data_tr_lst, f'{stem}_data_{finest_suffix}.csv'),
        (data_sess, f'{stem}_data.csv'),
        (data_sub, f'{stem}_data_bsa.csv'),
    )
    for frame, path in targets:
        frame.to_csv(path, index=False)

#### Parallel Computing

In [5]:
# Create the client that every run_parallel_sessions call below receives.
# NOTE(review): the three positional arguments look like a job name, a
# per-job memory request and a job cap — confirm against analyses.create_client.
client = analyses.create_client(
    'pcs',
    '15GB',
    200,
)

Unique port for hherrema is 51465
{'dashboard_address': ':51465'}
To view the dashboard, run: 
`ssh -fN hherrema@rhino2.psych.upenn.edu -L 8000:192.168.86.107:51465` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser


In [6]:
# Unpack the three parallel argument sequences built from the selected
# sessions; they are passed together to each run_parallel_sessions call.
(
    sub_iter,
    exp_iter,
    sess_iter,
) = analyses.build_iterables(df_select)

In [20]:
# NOTE(review): this only references the bound method (the cell echoes
# `<bound method Client.shutdown ...>`); it is never called, so the client is
# not shut down. If releasing the client is intended, run `client.shutdown()`
# once all parallel analyses have finished — this cell sits above the analysis
# cells, so calling it here during a top-to-bottom run would stop them.
client.shutdown

<bound method Client.shutdown of <Client: 'tcp://192.168.86.107:33083' processes=0 threads=0, memory=0 B>>

#### Phonetic Clustering Score

In [7]:
# sim start or rhyme: one 'both' label per entry of sub_iter
method_intr = ['both'] * len(sub_iter)
errors, results = analyses.run_parallel_sessions(
    client, analyses.pcs_parallel_v1,
    sub_iter, exp_iter, sess_iter, method_intr,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [8]:
# list level -> session average -> between-subject average
pcs_H_data_lst = pd.concat(results, ignore_index=True)
pcs_H_data = analyses.pcs_sess_avg(pcs_H_data_lst)
pcs_H_data_bsa = analyses.pcs_btwn_subj_avg(pcs_H_data)

# store results (list level, default idx -> files start with 'pcs_H')
store_results_v1(
    data_tr_lst=pcs_H_data_lst,
    data_sess=pcs_H_data,
    data_sub=pcs_H_data_bsa,
    nom='pcs',
    tr_lst_toggle=False,
)

In [9]:
# sim start: one 'sim_start' label per entry of sub_iter
method_intr = ['sim_start'] * len(sub_iter)
errors, results = analyses.run_parallel_sessions(
    client, analyses.pcs_parallel_v1,
    sub_iter, exp_iter, sess_iter, method_intr,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [10]:
# list level -> session average -> between-subject average
pcs_HS_data_lst = pd.concat(results, ignore_index=True)
pcs_HS_data = analyses.pcs_sess_avg(pcs_HS_data_lst)
pcs_HS_data_bsa = analyses.pcs_btwn_subj_avg(pcs_HS_data)

# store results (list level, idx 'S' -> files start with 'pcs_HS')
store_results_v1(
    data_tr_lst=pcs_HS_data_lst,
    data_sess=pcs_HS_data,
    data_sub=pcs_HS_data_bsa,
    nom='pcs',
    tr_lst_toggle=False,
    idx='S',
)

In [11]:
# rhyme: one 'rhyme' label per entry of sub_iter
method_intr = ['rhyme'] * len(sub_iter)
errors, results = analyses.run_parallel_sessions(
    client, analyses.pcs_parallel_v1,
    sub_iter, exp_iter, sess_iter, method_intr,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [12]:
# list level -> session average -> between-subject average
pcs_HR_data_lst = pd.concat(results, ignore_index=True)
pcs_HR_data = analyses.pcs_sess_avg(pcs_HR_data_lst)
pcs_HR_data_bsa = analyses.pcs_btwn_subj_avg(pcs_HR_data)

# store results (list level, idx 'R' -> files start with 'pcs_HR')
store_results_v1(
    data_tr_lst=pcs_HR_data_lst,
    data_sess=pcs_HR_data,
    data_sub=pcs_HR_data_bsa,
    nom='pcs',
    tr_lst_toggle=False,
    idx='R',
)

In [9]:
# Jaccard index: the same similarity function repeated once per entry of sub_iter
psim_fxn_iter = [analyses.phonetic_sim_J] * len(sub_iter)
errors, results = analyses.run_parallel_sessions(
    client, analyses.pcs_parallel_v2,
    sub_iter, exp_iter, sess_iter, psim_fxn_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [10]:
# list level -> session average -> between-subject average
pcs_J_data_lst = pd.concat(results, ignore_index=True)
pcs_J_data = analyses.pcs_sess_avg(pcs_J_data_lst)
pcs_J_data_bsa = analyses.pcs_btwn_subj_avg(pcs_J_data)

# store results (list level -> files start with 'pcs_J')
store_results_v2(
    data_tr_lst=pcs_J_data_lst,
    data_sess=pcs_J_data,
    data_sub=pcs_J_data_bsa,
    nom='pcs',
    idx='J',
    tr_lst_toggle=False,
)

In [15]:
# Jaccard index (first and last phoneme): same function repeated once per entry of sub_iter
psim_fxn_iter = [analyses.phonetic_sim_JFL] * len(sub_iter)
errors, results = analyses.run_parallel_sessions(
    client, analyses.pcs_parallel_v2,
    sub_iter, exp_iter, sess_iter, psim_fxn_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [16]:
# list level -> session average -> between-subject average
pcs_JFL_data_lst = pd.concat(results, ignore_index=True)
pcs_JFL_data = analyses.pcs_sess_avg(pcs_JFL_data_lst)
pcs_JFL_data_bsa = analyses.pcs_btwn_subj_avg(pcs_JFL_data)

# store results (list level -> files start with 'pcs_JFL')
store_results_v2(
    data_tr_lst=pcs_JFL_data_lst,
    data_sess=pcs_JFL_data,
    data_sub=pcs_JFL_data_bsa,
    nom='pcs',
    idx='JFL',
    tr_lst_toggle=False,
)

#### Temporal Clustering Score

In [11]:
# Run analyses.tcs_parallel across the session iterables on the client.
errors, results = analyses.run_parallel_sessions(
    client, analyses.tcs_parallel,
    sub_iter, exp_iter, sess_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [12]:
# Each level is written to CSV as soon as it is computed.

# list level
tcs_data_lst = pd.concat(results, ignore_index=True)
tcs_data_lst.to_csv(
    'analyses/dataframes/tcs_data_lst.csv',
    index=False,
)

# session level (averaged from the list-level frame)
tcs_data = analyses.tcs_sess_avg(tcs_data_lst)
tcs_data.to_csv(
    'analyses/dataframes/tcs_data.csv',
    index=False,
)

# subject level (between-subject average of the session-level frame)
tcs_data_bsa = analyses.tcs_btwn_subj_avg(tcs_data)
tcs_data_bsa.to_csv(
    'analyses/dataframes/tcs_data_bsa.csv',
    index=False,
)

#### Semantic Clustering Score

In [13]:
# Run analyses.scs_parallel across the session iterables on the client.
errors, results = analyses.run_parallel_sessions(
    client, analyses.scs_parallel,
    sub_iter, exp_iter, sess_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [14]:
# Each level is written to CSV as soon as it is computed.

# list level
scs_data_lst = pd.concat(results, ignore_index=True)
scs_data_lst.to_csv(
    'analyses/dataframes/scs_data_lst.csv',
    index=False,
)

# session level (averaged from the list-level frame)
scs_data = analyses.scs_sess_avg(scs_data_lst)
scs_data.to_csv(
    'analyses/dataframes/scs_data.csv',
    index=False,
)

# subject level (between-subject average of the session-level frame)
scs_data_bsa = analyses.scs_btwn_subj_avg(scs_data)
scs_data_bsa.to_csv(
    'analyses/dataframes/scs_data_bsa.csv',
    index=False,
)

#### Correlation of Clustering Scores

In [21]:
# reliability of phonetic similarity metrics:
# correlate the list-level scores of the H, J and JFL variants,
# then save the between-subject average of those correlations
pcs_corrs = analyses.pcs_correlations(
    pcs_H_data_lst,
    pcs_J_data_lst,
    pcs_JFL_data_lst,
)
pcs_corrs_bsa = analyses.pcs_corr_btwn_subj_avg(pcs_corrs)
pcs_corrs_bsa.to_csv(
    'analyses/dataframes/pcs_corrs_bsa.csv',
    index=False,
)

  0%|          | 0/2374 [00:00<?, ?it/s]

#### Phonetic Intrusions

In [22]:
# binary metric, encoding list
# same similarity function for every entry; seeds are the integers 0..len(sub_iter)-1
psim_fxn_iter = [analyses.phonetic_sim_H] * len(sub_iter)
seed_iter = list(range(len(sub_iter)))
errors, results = analyses.run_parallel_sessions(
    client, analyses.psim_intr_parallel_l,
    sub_iter, exp_iter, sess_iter, psim_fxn_iter, seed_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [23]:
# trial level -> session averages -> between-subject average
psim_intr_l_H_data_tr = pd.concat(results, ignore_index=True)
psim_intr_l_H_data = analyses.psim_intr_sess_avg_l(psim_intr_l_H_data_tr)
psim_intr_l_H_data_bsa = analyses.psim_intr_btwn_subj_avg_l(psim_intr_l_H_data)

# store results (trial level, default idx -> files start with 'psim_intr_l_H')
store_results_v1(
    data_tr_lst=psim_intr_l_H_data_tr,
    data_sess=psim_intr_l_H_data,
    data_sub=psim_intr_l_H_data_bsa,
    nom='psim_intr_l',
    tr_lst_toggle=True,
)

In [24]:
# Jaccard index, encoding list
# same similarity function for every entry; seeds are the integers 0..len(sub_iter)-1
psim_fxn_iter = [analyses.phonetic_sim_J] * len(sub_iter)
seed_iter = list(range(len(sub_iter)))
errors, results = analyses.run_parallel_sessions(
    client, analyses.psim_intr_parallel_l,
    sub_iter, exp_iter, sess_iter, psim_fxn_iter, seed_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [25]:
# trial level -> session averages -> between-subject average
psim_intr_l_J_data_tr = pd.concat(results, ignore_index=True)
psim_intr_l_J_data = analyses.psim_intr_sess_avg_l(psim_intr_l_J_data_tr)
psim_intr_l_J_data_bsa = analyses.psim_intr_btwn_subj_avg_l(psim_intr_l_J_data)

# store results (trial level -> files start with 'psim_intr_l_J')
store_results_v2(
    data_tr_lst=psim_intr_l_J_data_tr,
    data_sess=psim_intr_l_J_data,
    data_sub=psim_intr_l_J_data_bsa,
    nom='psim_intr_l',
    idx='J',
    tr_lst_toggle=True,
)

In [26]:
# binary metric, preceding recall: same function repeated once per entry of sub_iter
psim_fxn_iter = [analyses.phonetic_sim_H] * len(sub_iter)
errors, results = analyses.run_parallel_sessions(
    client, analyses.psim_intr_parallel_r,
    sub_iter, exp_iter, sess_iter, psim_fxn_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [27]:
# trial level -> session averages -> between-subject average
psim_intr_r_H_data_tr = pd.concat(results, ignore_index=True)
psim_intr_r_H_data = analyses.psim_intr_sess_avg_r(psim_intr_r_H_data_tr)
psim_intr_r_H_data_bsa = analyses.psim_intr_btwn_subj_avg_r(psim_intr_r_H_data)

# store results (trial level, default idx -> files start with 'psim_intr_r_H')
store_results_v1(
    data_tr_lst=psim_intr_r_H_data_tr,
    data_sess=psim_intr_r_H_data,
    data_sub=psim_intr_r_H_data_bsa,
    nom='psim_intr_r',
    tr_lst_toggle=True,
)

In [28]:
# Jaccard index, preceding recall: same function repeated once per entry of sub_iter
psim_fxn_iter = [analyses.phonetic_sim_J] * len(sub_iter)
errors, results = analyses.run_parallel_sessions(
    client, analyses.psim_intr_parallel_r,
    sub_iter, exp_iter, sess_iter, psim_fxn_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [29]:
# trial level -> session averages -> between-subject average
psim_intr_r_J_data_tr = pd.concat(results, ignore_index=True)
psim_intr_r_J_data = analyses.psim_intr_sess_avg_r(psim_intr_r_J_data_tr)
psim_intr_r_J_data_bsa = analyses.psim_intr_btwn_subj_avg_r(psim_intr_r_J_data)

# store results (trial level -> files start with 'psim_intr_r_J')
store_results_v2(
    data_tr_lst=psim_intr_r_J_data_tr,
    data_sess=psim_intr_r_J_data,
    data_sub=psim_intr_r_J_data_bsa,
    nom='psim_intr_r',
    idx='J',
    tr_lst_toggle=True,
)

#### Recall Probability

In [17]:
# Run analyses.p_recall_parallel across the session iterables on the client.
errors, results = analyses.run_parallel_sessions(
    client, analyses.p_recall_parallel,
    sub_iter, exp_iter, sess_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [18]:
# Each level is written to CSV as soon as it is computed.

# list level
p_recall_data_lst = pd.concat(results, ignore_index=True)
p_recall_data_lst.to_csv(
    'analyses/dataframes/p_recall_data_lst.csv',
    index=False,
)

# session level (averaged from the list-level frame)
p_recall_data = analyses.p_recall_sess_avg(p_recall_data_lst)
p_recall_data.to_csv(
    'analyses/dataframes/p_recall_data.csv',
    index=False,
)

# subject level (between-subject average of the session-level frame)
p_recall_data_bsa = analyses.p_recall_btwn_subj_avg(p_recall_data)
p_recall_data_bsa.to_csv(
    'analyses/dataframes/p_recall_data_bsa.csv',
    index=False,
)

#### Correlations with Recall Probability

In [19]:
# Combine the list-level clustering scores (phonetic H and J, temporal,
# semantic) with list-level recall probability, and save the combined table.
df_beh = analyses.aggregate_data_beh(
    pcs_H_data_lst,
    pcs_J_data_lst,
    tcs_data_lst,
    scs_data_lst,
    p_recall_data_lst,
)
df_beh.to_csv(
    'analyses/dataframes/cl_pr.csv',
    index=False,
)

# Correlations computed from the combined table, then their between-subject average.
p_recall_corrs = analyses.p_recall_correlations(df_beh)
p_recall_corrs_bsa = analyses.p_recall_corr_btwn_subj_avg(p_recall_corrs)
p_recall_corrs_bsa.to_csv(
    'analyses/dataframes/p_recall_corrs_bsa.csv',
    index=False,
)

  0%|          | 0/2374 [00:00<?, ?it/s]

#### Phonetic-CRL/IRT

In [35]:
# Run analyses.psim_crl_parallel_v1 across the session iterables on the client.
errors, results = analyses.run_parallel_sessions(
    client, analyses.psim_crl_parallel_v1,
    sub_iter, exp_iter, sess_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [36]:
# trial level -> session average -> between-subject average
psim_crl_H_data_tr = pd.concat(results, ignore_index=True)
psim_crl_H_data = analyses.psim_crl_sess_avg_v1(psim_crl_H_data_tr)
psim_crl_H_data_bsa = analyses.psim_crl_btwn_subj_avg_v1(psim_crl_H_data)

# store results (trial level, default idx -> files start with 'psim_crl_H')
store_results_v1(
    data_tr_lst=psim_crl_H_data_tr,
    data_sess=psim_crl_H_data,
    data_sub=psim_crl_H_data_bsa,
    nom='psim_crl',
    tr_lst_toggle=True,
)

In [7]:
# Jaccard index: the same similarity function repeated once per entry of sub_iter
psim_fxn_iter = [analyses.phonetic_sim_J] * len(sub_iter)
errors, results = analyses.run_parallel_sessions(
    client, analyses.psim_irt_parallel_v2,
    sub_iter, exp_iter, sess_iter, psim_fxn_iter,
)
# NOTE(review): `errors` is never checked in the visible cells — inspect it before trusting `results`.

In [8]:
# trial level: stack the parallel results, then replace the frame with the
# output of bin_phonetic_similarities (the name is rebound, so the un-binned
# frame is no longer reachable after this cell)
psim_irt_J_data_tr = pd.concat(results, ignore_index=True)
psim_irt_J_data_tr = analyses.bin_phonetic_similarities(psim_irt_J_data_tr)

# session averages, then between-subject average
psim_irt_J_data = analyses.psim_irt_sess_avg_v2(psim_irt_J_data_tr)
psim_irt_J_data_bsa = analyses.psim_irt_btwn_subj_avg_v2(psim_irt_J_data)

# store results (trial level -> files start with 'psim_irt_J')
store_results_v2(
    data_tr_lst=psim_irt_J_data_tr,
    data_sess=psim_irt_J_data,
    data_sub=psim_irt_J_data_bsa,
    nom='psim_irt',
    idx='J',
    tr_lst_toggle=True,
)