In [None]:
%config InlineBackend.figure_format = 'retina'

In [None]:
%load_ext autoreload
%autoreload 2

# imports

In [None]:
import os
import numpy as np
import pandas as pd
from natsort import natsorted
from scipy.stats import pearsonr 

In [None]:
import decision_margin_consistency.analyses.self_consistency as analysis

# load the data

In [None]:
exp_name = 'snr-edges-v1'
df = analysis.load_data(exp_name, nTrials=160)

In [None]:
len(df.workerID.unique())

In [None]:
df.iloc[0]

In [None]:
df.groupby(by=['workerID']).responseCorrect.mean()

# check for outliers

Trim any subjects more than 3 STD from the mean (there were none in this dataset)

In [None]:
# Flag subjects whose overall accuracy lies outside mean +/- 3 standard deviations.
import seaborn as sns
subjects = df.workerID.unique()
# Mean of responseCorrect per workerID, i.e. one accuracy value per subject.
accuracy = df.groupby(by=['workerID']).responseCorrect.mean()
M = accuracy.mean()
STD = accuracy.std()
lower = M - 3*STD 
# The upper bound is additionally capped at .99, so any subject scoring above
# .99 is flagged even when that score is within 3 STD of the mean.
# NOTE(review): presumably a deliberate ceiling check -- confirm.
upper = min(.99, M + 3*STD)
outliers = (accuracy < lower) | (accuracy > upper)
# True if at least one subject was flagged.
any(outliers)

In [None]:
g = sns.displot(accuracy)
g.set(xlim=(.50, 1.00));

# self-consistency analysis

We computed self-consistency by comparing subjects to themselves (1st vs. 2nd trial across images). TL;DR: subjects do not respond consistently across trials.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
results = analysis.compute_summary(df)
results

In [None]:
condNames = sorted(results.condName.unique())
subjects = results.subject.unique()
condNames, subjects, len(subjects)

For each individual, compute the mean accuracy for the 1st and 2nd trial, and the correlation across items for trial1 accuracy vs. trial2 accuracy.

In [None]:
# For every subject: mean accuracy on the first and on the second presentation,
# plus the across-item Pearson correlation between the two (and its square).
trial1_acc = []
trial2_acc = []
corrs = []
r2s = []
N = len(subjects)
for subject in subjects:
    subset = results[results.subject==subject]
    corr = pearsonr(subset.correct1, subset.correct2)[0]
    r2 = corr**2
    corrs.append(corr)
    r2s.append(r2)
    trial1_acc.append(subset.correct1.mean())
    trial2_acc.append(subset.correct2.mean())

# Averages across subjects. Note that avg_r2 is the mean of the per-subject
# r**2 values, which is not the same number as avg_corr**2.
avg_trial1_acc = np.mean(trial1_acc)
avg_trial2_acc = np.mean(trial2_acc)
avg_corr = np.mean(corrs)
avg_r2 = np.mean(r2s)
print(f"Summary of first vs. second response performance (N={N})")
print(f"Mean proportion correct first trial = {avg_trial1_acc:3.3f}, vs. second trial = {avg_trial2_acc:3.3f}")
# Fixed-point formatting for r2 as well (the format spec previously lacked the
# trailing "f", which printed r2 with 3 significant digits instead of 3 decimals).
print(f"Correlation across items first vs. second response, r={avg_corr:3.3f}, r2={avg_r2:3.3f}")

Next we compute Cohen's kappa (error consistency) between the first and second responses.

In [None]:
from scipy import stats

print("\nError consistency of first vs. second response (within a subject) ==>")

# Collect one Cohen's kappa and one Pearson r per subject, each comparing that
# subject's first-presentation correctness with the second-presentation one.
kappas, corrs = [], []
for subject in subjects:
    subset = results[results.subject == subject]
    condName = subset.iloc[0].condName  # only needed by the optional debug print below
    assert len(subset) == 80
    err_con = analysis.compute_error_consistency(
        subset.correct1.values,
        subset.correct2.values,
    )
    r = pearsonr(subset.correct1, subset.correct2)[0]
    kappas.append(err_con['k'])
    corrs.append(r)
    # print(f"{subject[0:5]}... ({condName}): c_exp={err_con['c_exp']:2.3f}, c_obs={err_con['c_obs']:2.3f} kappa={err_con['k']:2.3f}, r={r:2.3f}")

# Kappa: mean, standard error of the mean, and t-based 95% confidence interval.
kappas = np.asarray(kappas)
mean_kappa = kappas.mean()
sem_kappa = stats.sem(kappas)
ci_kappa = stats.t.interval(0.95, kappas.size - 1, loc=mean_kappa, scale=sem_kappa)

print(f"Cohen's kappa (average): {mean_kappa:3.2f} (95% CI: [{ci_kappa[0]:3.2f},{ci_kappa[1]:3.2f}])")

# Pearson r: same three summary statistics.
corrs = np.asarray(corrs)
mean_corr = corrs.mean()
sem_corr = stats.sem(corrs)
ci_corr = stats.t.interval(0.95, corrs.size - 1, loc=mean_corr, scale=sem_corr)

print(f"Pearson's r (average): {mean_corr:3.2f} (95% CI: [{ci_corr[0]:3.2f},{ci_corr[1]:3.2f}]), r\u00B2={mean_corr**2:3.2f}")

# Cohen's Kappa Scores are nearly Identical to Pearson R over response accuracy

While Cohen's kappa is well-justified for the binary correct/incorrect scores, the resulting kappa values are very similar to Pearson's r computed over the same scores.

In [None]:
stats.sem(kappas)

In [None]:
len(kappas)

In [None]:
ax = sns.scatterplot(x=kappas, y=corrs)
ax.axis('square');
ax.set_xlim([.3,1.0]);
ax.set_ylim([.3,1.0]);
ax.set_xlabel("Cohen's Kappa")
ax.set_ylabel("pearson r")
ax.plot([0, 1], [0, 1], transform=ax.transAxes)

# between-subject error consistency



In [None]:
import math
from pdb import set_trace
from fastprogress import master_bar, progress_bar
from collections import defaultdict
from scipy import stats
from functools import partial

def compute_between_subject_error_consistency(df, n_items=80):
    """Compare every pair of subjects within each subject group, item by item.

    Subjects are only paired with other subjects from the same ``condName``
    group. For each pair, Pearson correlations and error-consistency kappas
    (``analysis.compute_error_consistency(...)['k']``) are computed between the
    two subjects' per-item correctness vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``condName``, ``subject``, ``item``,
        ``correct1``, ``correct2`` and ``correctAvg``.
    n_items : int, default 80
        Number of rows every paired subject is required to have.

    Returns
    -------
    pandas.DataFrame
        One row per subject pair. In the column names, ``S1``/``S2`` are the
        first/second subject of the pair and ``R1``/``R2`` refer to the
        ``correct1``/``correct2`` columns. The column ``errcon_S2R1_S2R1``
        actually holds the S1-R2 vs. S2-R1 kappa; the mislabelled name is kept
        so existing readers of that column keep working. The column set is
        the same even when there are no pairs.

    Raises
    ------
    AssertionError
        If a paired subject does not have exactly ``n_items`` rows, if a pair
        consists of the same subject, or if the two subjects' items are not
        row-aligned.
    """
    columns = [
        'subj_group', 'sub1', 'sub2',
        'corr_S1Avg_S2Avg', 'corr_S1R1_S2R1', 'corr_S1R2_S2R2',
        'corr_S1R1_S2R2', 'corr_S1R2_S2R1', 'corr_R1_R2_Avg',
        'errcon_S1R1_S2R1', 'errcon_S1R2_S2R2', 'errcon_S1R1_S2R2',
        'errcon_S2R1_S2R1',
    ]
    records = []

    # Two groups of subjects; each group saw a different set of images, but saw each image twice
    # so we can only compare responses within a group
    subject_groups = sorted(df.condName.unique())

    mb = master_bar(subject_groups)
    for condName in mb:
        df_ = df[df.condName == condName]
        subjects = df_.subject.unique()
        num_subj = len(subjects)
        # Slice each subject's rows once per group rather than once per pair.
        frames = [df_[df_.subject == sub].reset_index() for sub in subjects]

        for idx1 in progress_bar(range(0, num_subj - 1), parent=mb):
            sub1, dat1 = subjects[idx1], frames[idx1]
            assert len(dat1) == n_items, f"{sub1}: expected {n_items} rows, got {len(dat1)}"
            for idx2 in range(idx1 + 1, num_subj):
                sub2, dat2 = subjects[idx2], frames[idx2]
                # Check the *second* subject here (this used to re-check dat1).
                assert len(dat2) == n_items, f"{sub2}: expected {n_items} rows, got {len(dat2)}"
                assert not (dat1.subject == dat2.subject).any()
                # Rows must line up item-for-item for the comparisons below.
                # Bracket access avoids any clash between the "item" column
                # and a pandas attribute of the same name.
                assert (dat1['item'] == dat2['item']).all()

                corr_S1R1_S2R2 = pearsonr(dat1.correct1, dat2.correct2)[0]
                corr_S1R2_S2R1 = pearsonr(dat1.correct2, dat2.correct1)[0]

                s1r1, s1r2 = dat1.correct1.values, dat1.correct2.values
                s2r1, s2r2 = dat2.correct1.values, dat2.correct2.values

                records.append({
                    'subj_group': condName,
                    'sub1': sub1,
                    'sub2': sub2,
                    'corr_S1Avg_S2Avg': pearsonr(dat1.correctAvg, dat2.correctAvg)[0],
                    'corr_S1R1_S2R1': pearsonr(dat1.correct1, dat2.correct1)[0],
                    'corr_S1R2_S2R2': pearsonr(dat1.correct2, dat2.correct2)[0],
                    'corr_S1R1_S2R2': corr_S1R1_S2R2,
                    'corr_S1R2_S2R1': corr_S1R2_S2R1,
                    'corr_R1_R2_Avg': (corr_S1R1_S2R2 + corr_S1R2_S2R1) / 2,
                    'errcon_S1R1_S2R1': analysis.compute_error_consistency(s1r1, s2r1)['k'],
                    'errcon_S1R2_S2R2': analysis.compute_error_consistency(s1r2, s2r2)['k'],
                    'errcon_S1R1_S2R2': analysis.compute_error_consistency(s1r1, s2r2)['k'],
                    # Historical key name; the value is S1-R2 vs. S2-R1.
                    'errcon_S2R1_S2R1': analysis.compute_error_consistency(s1r2, s2r1)['k'],
                })

    return pd.DataFrame(records, columns=columns)

    

In [None]:
summary = analysis.compute_summary(df)
summary

In [None]:
avg = summary.groupby('item').mean(numeric_only=True).reset_index()
avg

In [None]:
sums = summary.groupby('item').sum(numeric_only=True).reset_index()
sums

In [None]:
print(pearsonr(avg.correct1, avg.correct2)[0])
sns.scatterplot(x=avg.correct1,y=avg.correct2)

In [None]:
between = compute_between_subject_error_consistency(summary)
between

In [None]:
def compute_summary_stats(df, startswith=('corr', 'errcon', 'dmc')):
    """Summarise every score column of ``df`` whose name starts with a given prefix.

    For each selected column this computes the mean, the standard error of the
    mean, and a t-based 95% confidence interval, prints a one-line summary, and
    collects the numbers into a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one score per row in each selected column.
    startswith : str or iterable of str, default ('corr', 'errcon', 'dmc')
        Column-name prefix(es) to select. A single string is treated as one
        prefix (not as a sequence of characters).

    Returns
    -------
    pandas.DataFrame
        One row per selected column with the columns ``score``, ``N``,
        ``mean``, ``sem``, ``ci_lower`` and ``ci_upper``. The columns are
        present even when no column of ``df`` matches.
    """
    # str.startswith accepts a tuple of prefixes; normalise the argument to one.
    prefixes = (startswith,) if isinstance(startswith, str) else tuple(startswith)
    columns = [name for name in df.columns.values if name.startswith(prefixes)]

    records = []
    for col in columns:
        scores = df[col].values
        mean = np.mean(scores)
        sem = stats.sem(scores)
        ci = stats.t.interval(0.95, len(scores) - 1, loc=mean, scale=sem)

        records.append({
            'score': col,
            'N': len(scores),
            'mean': mean,
            'sem': sem,
            'ci_lower': ci[0],
            'ci_upper': ci[1],
        })

        print(f"{col}={mean:3.3f} (95% CI: [{ci[0]:3.3f},{ci[1]:3.3f}])")

    return pd.DataFrame(
        records,
        columns=['score', 'N', 'mean', 'sem', 'ci_lower', 'ci_upper'],
    )

In [None]:
compute_summary_stats(between)

In [None]:
between.errcon_S1R1_S2R1.mean(), between.errcon_S1R2_S2R2.mean(), between.errcon_S1R1_S2R2.mean(), between.errcon_S2R1_S2R1.mean()

In [None]:
between.corr_S1Avg_S2Avg.mean(), between.corr_S1R1_S2R1.mean(), between.corr_S1R2_S2R2.mean(), between.corr_R1_R2_Avg.mean()

In [None]:
import pandas as pd
from collections import defaultdict

def get_group_avg_accuracy(df):
    """Average first- and second-presentation correctness over subjects, per item.

    Rows are produced group by group (sorted ``condName`` values taken from
    ``df`` itself, so the function does not depend on any notebook-level
    variable) and, within a group, in natural-sort order of the item names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``condName``, ``subject``, ``item``,
        ``correct1`` and ``correct2``.

    Returns
    -------
    pandas.DataFrame
        One row per (condName, item) with the mean of ``correct1`` and
        ``correct2`` across that item's rows and the number of rows that went
        into each mean (``count1``, ``count2``). The columns are present even
        when ``df`` is empty.

    Raises
    ------
    AssertionError
        If an item does not have exactly one row per subject of its group.
    """
    columns = ['condName', 'item', 'correct1', 'correct2', 'count1', 'count2']
    records = []
    for condName in sorted(df.condName.unique()):
        df_ = df[df.condName == condName]
        subjects = df_.subject.unique()

        # Bracket access avoids any clash between the "item" column and a
        # pandas attribute of the same name.
        for item in natsorted(df_['item'].unique()):
            subset = df_[df_['item'] == item]
            assert len(subset) == len(subjects)
            records.append({
                'condName': condName,
                'item': item,
                'correct1': subset.correct1.mean(),
                'correct2': subset.correct2.mean(),
                'count1': len(subset.correct1),
                'count2': len(subset.correct2),
            })

    return pd.DataFrame(records, columns=columns)

avg_acc = get_group_avg_accuracy(summary)
avg_acc

In [None]:
for condName in condNames:
    subset = avg_acc[avg_acc.condName==condName]
    assert len(subset)==80  
    r = pearsonr(subset.correct1, subset.correct2)[0]
    print(f"{condName}: r={r:3.3f}, r\u00B2={r*r:3.3f}")
r = pearsonr(avg_acc.correct1, avg_acc.correct2)[0]    
print(f"overall: r={r:3.3f}, r\u00B2={r*r:3.3f}")

In [None]:
import matplotlib.pyplot as plt 

for condName in condNames:
    subset = avg_acc[avg_acc.condName==condName]
    assert len(subset)==80  
    r = pearsonr(subset.correct1, subset.correct2)[0]
    print(f"{condName}: r={r:3.3f}")

    ax = sns.scatterplot(x=subset.correct1, y=subset.correct2)
    ax.axis('square');
    ax.set_xlim([0,1.2]);
    ax.set_ylim([0,1.2]);
    ax.set_xlabel("accuracy first presentation")
    ax.set_ylabel("accuracy second presentation")
    plt.show()

# dprime

Question: Hey, we're using ideas from signal detection theory, why not calculate d'?

Answer: OK, but then we have to "adjust" scores for floor/ceiling effects (d' is undefined for Pc=1.0 or 0.0). There are standard adjustments for that, but using these adjusted scores affects pearsonr, and so we should therefore use spearmanr for analyses using these adjusted scores.

In [None]:
import numpy as np
from decision_margin_consistency.helpers.dprime import dprime_mAFC, adjusted_pc, adjusted_pc_edge_cases
from scipy.stats import spearmanr

In [None]:
avg_acc = get_group_avg_accuracy(summary)
avg_acc

Pearson's correlation between trial1 and trial2 is slightly degraded by converting to dprime.

In [None]:
for condName in condNames:
    subset = avg_acc[avg_acc.condName==condName]
    assert len(subset)==80  
    dprime1 = dprime_mAFC(subset.correct1, N=subset.count1, m=16)
    dprime2 = dprime_mAFC(subset.correct2, N=subset.count2, m=16)
    
    r = pearsonr(dprime1, dprime2)[0]
    print(f"{condName}: r={r:3.3f}, r\u00B2={r*r:3.3f}")
dprime1 = dprime_mAFC(avg_acc.correct1, N=avg_acc.count1, m=16)
dprime2 = dprime_mAFC(avg_acc.correct2, N=avg_acc.count2, m=16)    
r = pearsonr(dprime1, dprime2)[0]    
print(f"overall: r={r:3.3f}, r\u00B2={r*r:3.3f}")

In [None]:
ax = sns.scatterplot(x=dprime1, y=dprime2)
ax.axis('square');
ax.set_xlabel("dprime first presentation")
ax.set_ylabel("dprime second presentation")
plt.show()

There's literally no difference in spearmanr between the dprime and original percent correct scores, so the only thing we buy is interpreting the scores as signal-to-noise measures of distance from the decision margin (aka snr signal strength or sensitivity).

In [None]:
for condName in condNames:
    subset = avg_acc[avg_acc.condName==condName]
    assert len(subset)==80  
    dprime1 = dprime_mAFC(subset.correct1, N=subset.count1, m=16)
    dprime2 = dprime_mAFC(subset.correct2, N=subset.count2, m=16)
    
    r = spearmanr(dprime1, dprime2)[0]
    print(f"{condName}: r={r:3.3f}, r\u00B2={r*r:3.3f}")
dprime1 = dprime_mAFC(avg_acc.correct1, N=avg_acc.count1, m=16)
dprime2 = dprime_mAFC(avg_acc.correct2, N=avg_acc.count2, m=16)    
r = spearmanr(dprime1, dprime2)[0]    
print(f"overall: r={r:3.3f}, r\u00B2={r*r:3.3f}")

In [None]:
for condName in condNames:
    subset = avg_acc[avg_acc.condName==condName]
    assert len(subset)==80  
    r = spearmanr(subset.correct1, subset.correct2)[0]
    print(f"{condName}: r={r:3.3f}, r\u00B2={r*r:3.3f}")
r = spearmanr(avg_acc.correct1, avg_acc.correct2)[0]    
print(f"overall: r={r:3.3f}, r\u00B2={r*r:3.3f}")