In [19]:
import pandas as pd
import numpy as np
import altair as alt
from joblib import Parallel, delayed
from numpy.random import Generator, PCG64
rng = np.random.default_rng()

# Data Preprocessing

In [40]:
aoa_df = pd.read_csv("../data/word_age_of_acquisition.csv")
aoa_df = aoa_df.replace({"comb (object)": "comb"})
df = pd.read_csv("../data/kisumu vocab data.csv")

In [41]:
word_classification = pd.read_csv("../data/word_classification_df.csv")
word_classification['Target'] = word_classification['Target'].apply(lambda w : w.lower())
word_classification['Near Distractor'] = word_classification['Near Distractor'].apply(lambda w : w.lower())
word_classification['Random 1'] = word_classification['Random 1'].apply(lambda w : w.lower())
word_classification['Random 2'] = word_classification['Random 2'].apply(lambda w : w.lower())

In [42]:
def find_word(r):
    target = r['target_word'].title()
    word_response = subject_responses[
        (subject_responses["Subject Number"] == r["child"])
    ].iloc[0][target]

    return word_response.lower()

def classify_response(r):
    res = r['response']
    if res == r['Target']:
        return 'Target'
    elif res == r['Near Distractor']:
        return 'Near Distractor'
    elif res == r['Random 1'] or r['Random 2']:
        return 'Random'
    else:
        return ''

subject_responses = pd.read_csv("../data/Kisumu 2024 Vocabulary - Vocab.csv").dropna(subset="Subject Number")
all_df = pd.merge(df, word_classification, left_on="target_word", right_on="Target", how="left")
all_df = pd.merge(all_df, aoa_df, on='target_word', how='left')
all_df['response'] = all_df.apply(find_word, axis=1)
all_df['response_type'] = all_df.apply(classify_response, axis=1)

# Error Plot

In [43]:
filtered_df = all_df.copy()
filtered_df = filtered_df[filtered_df['accuracy'] == 0]
filtered_df['is_near_distractor'] = (filtered_df['response_type'] == 'Near Distractor').astype(int)
filtered_df['is_random'] = (filtered_df['response_type'] == 'Random').astype(int)
filtered_df['condition'] = filtered_df['condition'].replace({
    'bw': 'black_white',
    'obj': 'object',
})

In [44]:
def bootstrap_ci(
        data,
        measure,
        id_col,
        n_iterations=10000,
        statistic=np.mean):
    
    items = list(data[id_col].unique())
    n_size = len(items)
    df = data.copy()

    def bootstrap_iteration(data, chosen_items):
        filter_df = data[data[id_col].isin(chosen_items)] # Filter based on chosen questions
        bs_mean = statistic(filter_df[measure]) 
        return (bs_mean, list(chosen_items))

    qset_means = Parallel(n_jobs=-1)(
        delayed(bootstrap_iteration)(
            df.copy(),
            rng.choice(items, n_size,  replace=True)
        ) for _ in range(n_iterations)
    )
    
    means = []
    qs_used = []
    means = [bs_mean for bs_mean, chosen_qs in qset_means]
 
    # 95% confidence interval
    lower = np.percentile(means, 2.5)
    upper = np.percentile(means, 97.5)
    
    return lower, upper


def create_confidence_interval_df(
    data,
    measure, 
    id_col,
    condition_col,
    statistic=np.mean
):
    data_list = []

    for condition in data[condition_col].unique():
        condition_data = data[data[condition_col] == condition]

        lower, upper = bootstrap_ci(condition_data, measure=measure, statistic=statistic, id_col=id_col)

        data_list.append({
            "condition": condition,
            "ci_upper": upper, 
            "ci_lower": lower,
        })

    return pd.DataFrame(data_list)


In [45]:
def get_error_dfs(df, measure):
    agg_data = df.groupby(['condition']).agg(
        mean_prop=(measure, 'mean')
    ).reset_index()
    
    word_level_data = df.groupby(['condition', 'target_word']).agg(
        mean_prop=(measure, 'mean')
    ).reset_index()
    word_level_data['item_id'] = word_level_data['condition'] + word_level_data['target_word']
    
    ci_df = create_confidence_interval_df(
        data=df,
        measure=measure,
        id_col='target_word',
        condition_col='condition'
    )

    error_df = pd.merge(agg_data, ci_df, on=['condition'])

    return error_df, word_level_data

error_dfs, item_dfs = get_error_dfs(
    filtered_df, 
    measure='is_random'
)

In [49]:
def create_error_plot(error_df, item_level_df):
    ci_plot = alt.Chart(error_df).mark_errorbar().encode(
        x=alt.X("condition:N", title=None),
        y=alt.Y("ci_upper", title="Proportion Far Distractors"),
        y2=alt.Y2("ci_lower"),
        strokeWidth=alt.value(2),
        color=alt.Color('condition').legend(None)
    )

    mean_points = alt.Chart(error_df).mark_point(filled=True, size=75, opacity=1).encode(
        x=alt.X('condition:N', scale=alt.Scale(domain=["black_white", 'cartoon', 'photo', 'object'])),
        y=alt.Y('mean_prop:Q', scale=alt.Scale(domain=[0,1])),
        color='condition:N'
    )
    
    scatter_plot = alt.Chart(item_level_df).mark_circle(size=16,opacity=0.5).encode(
        x=alt.X("condition:N", title=None),
        y=alt.Y("mean_prop:Q", scale=alt.Scale(domain=[0,1])),
        xOffset="jitter:Q",
        color=alt.Color('condition:N').legend(None),
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())" 
    )
    
    dashed_line = alt.Chart(error_df).mark_rule(strokeDash=[5, 10], color='black').encode(
        y=alt.datum(0.66),
        opacity=alt.value(0.5)
    )
    
    return mean_points + scatter_plot + ci_plot + dashed_line

plot = create_error_plot(error_dfs, item_dfs)
plot = plot.properties(width=150, title="Proportion of Incorrect Responses")
plot.save("proportion_incorrect_plot.pdf")
plot

In [50]:
error_dfs

Unnamed: 0,condition,mean_prop,ci_upper,ci_lower
0,black_white,0.733333,0.841463,0.589037
1,cartoon,0.66879,0.777778,0.5
2,object,0.533333,0.684211,0.340397
3,photo,0.713333,0.8,0.550995


# Model Comparisons

In [31]:
all_selection_df = all_df.copy()
all_selection_df = all_selection_df[all_selection_df['accuracy'] == 0]
all_selection_df['is_near_distractor'] = (all_selection_df['response_type'] == 'Near Distractor').astype(int)
all_selection_df['is_random'] = (all_selection_df['response_type'] == 'Random').astype(int)
all_selection_df = all_selection_df[['child', 'condition', 'age', 'target_word', 'accuracy', 'response_type', 'is_near_distractor', 'is_random']]
all_selection_df.to_csv("error_trials_only.csv")

## Loading R

In [32]:
%load_ext rpy2.ipython

In [33]:
%%R
install.packages("lme4")
library(lme4)

--- Please select a CRAN mirror for use in this session ---
Secure CRAN mirrors 

 1: 0-Cloud [https]
 2: Australia (Canberra) [https]
 3: Australia (Melbourne 1) [https]
 4: Australia (Melbourne 2) [https]
 5: Austria (Wien 1) [https]
 6: Belgium (Brussels) [https]
 7: Brazil (PR) [https]
 8: Brazil (SP 1) [https]
 9: Brazil (SP 2) [https]
10: Bulgaria [https]
11: Canada (MB) [https]
12: Canada (ON 1) [https]
13: Canada (ON 2) [https]
14: Chile (Santiago) [https]
15: China (Beijing 2) [https]
16: China (Beijing 3) [https]
17: China (Hefei) [https]
18: China (Hong Kong) [https]
19: China (Jinan) [https]
20: China (Lanzhou) [https]
21: China (Nanjing) [https]
22: China (Shanghai 2) [https]
23: China (Shenzhen) [https]
24: China (Wuhan) [https]
25: Colombia (Cali) [https]
26: Costa Rica [https]
27: Cyprus [https]
28: Czech Republic [https]
29: Denmark [https]
30: East Asia [https]
31: Ecuador (Cuenca) [https]
32: France (Lyon 1) [https]
33: France (Lyon 2) [https]
34: France (Marseille) 

Selection:  68



The downloaded binary packages are in
	/var/folders/v8/3zpbxkws53b3x6m8509jyml80000gn/T//RtmpAhZ5Zf/downloaded_packages


trying URL 'https://ftp.osuosl.org/pub/cran/bin/macosx/big-sur-arm64/contrib/4.4/lme4_1.1-36.tgz'
Content type 'application/x-gzip' length 7079300 bytes (6.8 MB)
downloaded 6.8 MB

Loading required package: Matrix
In doTryCatch(return(expr), name, parentenv, handler) :
  unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
  dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
  Referenced from: <34C5A480-1AC4-30DF-83C9-30A913FC042E> /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/modules/R_X11.so
  Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/usr/local/lib/libSM.6.dylib' (no such file), '/usr/lib/libSM.6.dylib' (no such file, not in dyld cache)


## Model Comparison

The below is implemented in the `picture-perception-error-analysis.Rmd` file

In [34]:
%%R -i all_selection_df

base_model <- glmer(
    is_random ~ 1  + (condition | child) + (condition * age | target_word), 
    data=all_selection_df, 
    family = "binomial"
)

condition_model <- glmer(
    is_random ~ condition * age + (condition | child) + (condition * age | target_word), 
    data=all_selection_df, 
    family = "binomial"
)

anova(base_model, condition_model, test = "Chisq")

1: In commonArgs(par, fn, control, environment()) :
  maxfun < 10 * length(par)^2 is not recommended.
2: In optwrap(optimizer, devfun, start, rho$lower, control = control,  :
  convergence code 1 from bobyqa: bobyqa -- maximum number of function evaluations exceeded
3: In (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf,  :
  failure to converge in 10000 evaluations
4: In optwrap(optimizer, devfun, start, rho$lower, control = control,  :
  convergence code 4 from Nelder_Mead: failure to converge in 10000 evaluations
5: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv,  :
  unable to evaluate scaled gradient
6: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv,  :
  Model failed to converge: degenerate  Hessian with 1 negative eigenvalues
7: In commonArgs(par, fn, control, environment()) :
  maxfun < 10 * length(par)^2 is not recommended.
8: In (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf,  :
  failure to converg

In [None]:
%%R
base_model <- glmer(
    is_random ~ 1  + (condition | child), 
    data=all_selection_df, 
    family = "binomial"
)

condition_model <- glmer(
    is_random ~ condition + (condition | child), 
    data=all_selection_df, 
    family = "binomial"
)

anova(base_model, condition_model, test = "Chisq")