## 1. Native Speakers

### 1.1 Load immediate RCT results from the control and treatment groups

In [104]:
import pandas as pd

# Immediate post-test scores for native speakers. The story file is reported
# below as the treatment group and the definition file as the control group.
df_story = pd.read_csv("./rct_results/10_concepts_native_story_scores.tsv", sep="\t")
df_def = pd.read_csv("./rct_results/10_concepts_native_definition_scores.tsv", sep="\t")

# Report (rows, columns) for each table as a quick load check.
for frame_label, frame in (("df_story - native", df_story), ("df_def - native", df_def)):
    print(frame_label, frame.shape)

df_story - native (160, 20)
df_def - native (165, 19)


In [105]:
# Sanity check: no (batch_id, PROLIFIC_PID) pair may appear in both groups,
# i.e. no participant is in both groups within the same batch.
story_group_tuple_set = set(df_story[['batch_id', 'PROLIFIC_PID']].itertuples(index=False, name=None))
def_group_tuple_set = set(df_def[['batch_id', 'PROLIFIC_PID']].itertuples(index=False, name=None))
assert story_group_tuple_set.isdisjoint(def_group_tuple_set), "there is some overlap between two groups"

In [106]:
# Control group (definition condition) for the native-speaker cohort: df_def
# was loaded from the native definition-scores file, so the heading must name
# the native speakers.
print("RCT results for the control group among the native speakers")

# A response counts as correct when it equals the answer key of its question.
df_def['q1_pred'] = (df_def['conceptQ'] == df_def['concept_question_answer'])
df_def['q2_pred'] = (df_def['predictionQ'] == df_def['prediction_question_answer'])
df_def['q3_pred'] = (df_def['limitationQ'] == df_def['limitation_question_answer'])

pred_cols = ['q1_pred', 'q2_pred', 'q3_pred']

# Percentage of rows answered correctly, per question.
print((df_def[pred_cols].sum()*100 / df_def.shape[0]).round(2))
# Standard deviation, across participants, of each participant's mean accuracy.
print(df_def.groupby("PROLIFIC_PID")[pred_cols].mean().astype(float).std().round(4))

# Mean and standard deviation of each score column over all rows.
familiarity = df_def['familiarity_concept'].astype(float)
relevance = df_def['relevance'].astype(float)
interest = df_def['law_interest'].astype(float)
difficulty = df_def['perceived_difficulty'].astype(float)
print("Familiarity Score: mean: {}, std: {}".format(familiarity.mean(), familiarity.std()))
print("Relevance Score: mean: {}, std: {}".format(relevance.mean(), relevance.std()))
print("Interest Score: mean: {}, std: {}".format(interest.mean(), interest.std()))
print("perceived_difficulty: mean: {}, std: {}".format(difficulty.mean(), difficulty.std()))


RCT results for the control group among the native speakers
q1_pred    93.33
q2_pred    78.79
q3_pred    77.58
dtype: float64
q1_pred    0.1203
q2_pred    0.2127
q3_pred    0.2258
dtype: float64
Familiarity Score: mean: 1.8848484848484848, std: 1.226764834273666
Relevance Score: mean: 2.6303030303030304, std: 1.3076894650034483
Interest Score: mean: 3.6666666666666665, std: 1.1225696422164713
perceived_difficulty: mean: 3.096969696969697, std: 1.2058002222198316


In [107]:
print("RCT results for the treatment group among the native speakers")

# Each correctness flag is paired with its (response, answer key) columns.
answer_columns = {
    'q1_pred': ('conceptQ', 'concept_question_answer'),
    'q2_pred': ('predictionQ', 'prediction_question_answer'),
    'q3_pred': ('limitationQ', 'limitation_question_answer'),
}
for flag_col, (response_col, key_col) in answer_columns.items():
    df_story[flag_col] = df_story[response_col] == df_story[key_col]
flag_cols = list(answer_columns)

# Percentage of rows answered correctly, per question.
print((df_story[flag_cols].sum() * 100 / len(df_story)).round(2))
# Standard deviation, across participants, of each participant's mean accuracy.
print(df_story.groupby("PROLIFIC_PID")[flag_cols].mean().astype(float).std().round(4))

# Mean and standard deviation of each score column over all rows.
rating_reports = (
    ("Familiarity Score: mean: {}, std: {}", 'familiarity_concept'),
    ("Relevance Score: mean: {}, std: {}", 'relevance'),
    ("Interest Score: mean: {}, std: {}", 'law_interest'),
    ("perceived_difficulty: mean: {}, std: {}", 'perceived_difficulty'),
)
for template, rating_col in rating_reports:
    ratings = df_story[rating_col].astype(float)
    print(template.format(ratings.mean(), ratings.std()))



RCT results for the treatment group among the native speakers
q1_pred    90.62
q2_pred    74.38
q3_pred    84.38
dtype: float64
q1_pred    0.1900
q2_pred    0.2651
q3_pred    0.2422
dtype: float64
Familiarity Score: mean: 1.74375, std: 1.1505911797389556
Relevance Score: mean: 3.20625, std: 1.3274881704811141
Interest Score: mean: 3.78125, std: 0.9947780321927768
perceived_difficulty: mean: 3.14375, std: 1.2226729919659707


### 1.2 Load follow-up RCT results from the control and treatment groups

In [108]:
def compute_group_accuracy(df_):
    """Add per-question correctness flags to a score table.

    The response and answer-key columns are first cast to float, then each
    flag column is set to True where the response equals its answer key:
    ``q1_pred`` (concept), ``q2_pred`` (prediction), ``q3_pred`` (limitation).

    Args:
        df_: DataFrame containing ``conceptQ``, ``concept_question_answer``,
            ``predictionQ``, ``prediction_question_answer``, ``limitationQ``
            and ``limitation_question_answer``.

    Returns:
        The same DataFrame object that was passed in; the casts and the three
        new columns are written onto it directly.
    """
    question_columns = (
        ('q1_pred', 'conceptQ', 'concept_question_answer'),
        ('q2_pred', 'predictionQ', 'prediction_question_answer'),
        ('q3_pred', 'limitationQ', 'limitation_question_answer'),
    )

    # Cast every input column before any comparison so both sides are float.
    for _, response_col, key_col in question_columns:
        df_[response_col] = df_[response_col].astype(float)
        df_[key_col] = df_[key_col].astype(float)

    for flag_col, response_col, key_col in question_columns:
        df_[flag_col] = df_[response_col] == df_[key_col]

    return df_

def compute_group_retention_accuracy(df0, df1):
    """Print follow-up accuracy for items that were answered correctly in ``df0``.

    For each flag column (``q1_pred``, ``q2_pred``, ``q3_pred``), the rows of
    ``df0`` where the flag is True are matched to ``df1`` on
    ``(PROLIFIC_PID, q_id)``. The percentage of matched ``df1`` rows whose flag
    is also True is printed, rounded to two decimals.

    Args:
        df0: Frame with ``PROLIFIC_PID``, ``q_id`` and the three boolean flag
            columns (the callers in this notebook pass the immediate results).
        df1: Frame with the same columns (the callers pass the follow-up
            results).

    Returns:
        None. One single-entry Series is printed per flag column.
    """
    match_keys = ['PROLIFIC_PID', 'q_id']

    for questionPrediction in ['q1_pred', 'q2_pred', 'q3_pred']:
        # Row order cannot affect a sum, so the frames are not sorted; this also
        # means neither frame needs a 'concept' column.
        initially_correct = df0.loc[df0[questionPrediction], match_keys]
        retained = df1.merge(initially_correct, on=match_keys)
        print((retained[[questionPrediction]].sum()*100 / retained.shape[0]).round(2))


# Reload the immediate native-speaker scores, then load the follow-up scores.
df_def = pd.read_csv("./rct_results/10_concepts_native_definition_scores.tsv", sep="\t")
df_story = pd.read_csv("./rct_results/10_concepts_native_story_scores.tsv", sep="\t")
df_def_followup = pd.read_csv("./rct_results/10_concepts_native_definition_scores_followup.tsv", sep="\t")
df_story_followup = pd.read_csv("./rct_results/10_concepts_native_story_scores_followup.tsv", sep="\t")

# Inner-join each immediate table with the (PROLIFIC_PID, q_id) pairs present
# in its follow-up table, keeping the immediate rows that have a follow-up.
df_def_both = pd.merge(df_def, df_def_followup[['PROLIFIC_PID', 'q_id']], on=['PROLIFIC_PID', 'q_id'])
df_story_both = pd.merge(df_story, df_story_followup[['PROLIFIC_PID', 'q_id']], on=['PROLIFIC_PID', 'q_id'])

# One line per stage (immediate, follow-up, merged): control shape, then treatment shape.
for def_frame, story_frame in (
    (df_def, df_story),
    (df_def_followup, df_story_followup),
    (df_def_both, df_story_both),
):
    print(def_frame.shape, story_frame.shape)


(165, 19) (160, 20)
(100, 11) (105, 11)
(100, 19) (105, 20)


In [117]:
# Flag correct answers in the merged immediate table and in the follow-up table.
df_def_both, df_def_followup = [
    compute_group_accuracy(frame) for frame in (df_def_both, df_def_followup)
]

print("Rentention Scores for the Control Group -- Native Speakers")
# Prints, per question, the follow-up accuracy on items answered correctly at first.
compute_group_retention_accuracy(df_def_both, df_def_followup)

Rentention Scores for the Control Group -- Native Speakers
q1_pred    92.55
dtype: float64
q2_pred    88.89
dtype: float64
q3_pred    91.03
dtype: float64


In [118]:
# Flag correct answers in the merged immediate table and in the follow-up table.
df_story_both, df_story_followup = [
    compute_group_accuracy(frame) for frame in (df_story_both, df_story_followup)
]

print("Rentention Scores for the Treatment Group -- Native Speakers")
# Prints, per question, the follow-up accuracy on items answered correctly at first.
compute_group_retention_accuracy(df_story_both, df_story_followup)

Rentention Scores for the Treatment Group -- Native Speakers
q1_pred    91.58
dtype: float64
q2_pred    86.84
dtype: float64
q3_pred    91.01
dtype: float64


## 2. Non-native Speakers

### 2.1 Load immediate RCT results from the control and treatment groups

In [119]:
import pandas as pd

# Immediate post-test scores for non-native speakers. The story file is reported
# below as the treatment group and the definition file as the control group.
df_story = pd.read_csv("./rct_results/10_concepts_non_native_story_scores.tsv", sep="\t")
df_def = pd.read_csv("./rct_results/10_concepts_non_native_definition_scores.tsv", sep="\t")

# Report (rows, columns) for each table as a quick load check.
for frame_label, frame in (("df_story - non-native", df_story), ("df_def - non-native", df_def)):
    print(frame_label, frame.shape)

df_story - non-native (170, 20)
df_def - non-native (185, 19)


In [120]:
print("RCT results for the control group among the non-native speakers")

# Each correctness flag is paired with its (response, answer key) columns.
answer_columns = {
    'q1_pred': ('conceptQ', 'concept_question_answer'),
    'q2_pred': ('predictionQ', 'prediction_question_answer'),
    'q3_pred': ('limitationQ', 'limitation_question_answer'),
}
for flag_col, (response_col, key_col) in answer_columns.items():
    df_def[flag_col] = df_def[response_col] == df_def[key_col]
flag_cols = list(answer_columns)

# Percentage of rows answered correctly, per question.
print((df_def[flag_cols].sum() * 100 / len(df_def)).round(2))
# Standard deviation, across participants, of each participant's mean accuracy.
print(df_def.groupby("PROLIFIC_PID")[flag_cols].mean().astype(float).std().round(4))

# Mean and standard deviation of each score column over all rows.
rating_reports = (
    ("Familiarity Score: mean: {}, std: {}", 'familiarity_concept'),
    ("Relevance Score: mean: {}, std: {}", 'relevance'),
    ("Interest Score: mean: {}, std: {}", 'law_interest'),
    ("perceived_difficulty: mean: {}, std: {}", 'perceived_difficulty'),
)
for template, rating_col in rating_reports:
    ratings = df_def[rating_col].astype(float)
    print(template.format(ratings.mean(), ratings.std()))


RCT results for the control group among the non-native speakers
q1_pred    89.19
q2_pred    71.89
q3_pred    68.65
dtype: float64
q1_pred    0.1455
q2_pred    0.2434
q3_pred    0.2627
dtype: float64
Familiarity Score: mean: 1.8108108108108107, std: 0.9901097281592252
Relevance Score: mean: 2.4702702702702704, std: 1.1704850501582447
Interest Score: mean: 3.8378378378378377, std: 0.9182187835315079
perceived_difficulty: mean: 3.2, std: 1.1315629821661257


In [121]:
print("RCT results for the treatment group among the non-native speakers")

# Each correctness flag is paired with its (response, answer key) columns.
answer_columns = {
    'q1_pred': ('conceptQ', 'concept_question_answer'),
    'q2_pred': ('predictionQ', 'prediction_question_answer'),
    'q3_pred': ('limitationQ', 'limitation_question_answer'),
}
for flag_col, (response_col, key_col) in answer_columns.items():
    df_story[flag_col] = df_story[response_col] == df_story[key_col]
flag_cols = list(answer_columns)

# Percentage of rows answered correctly, per question.
print((df_story[flag_cols].sum() * 100 / len(df_story)).round(2))
# Standard deviation, across participants, of each participant's mean accuracy.
print(df_story.groupby("PROLIFIC_PID")[flag_cols].mean().astype(float).std().round(4))

# Mean and standard deviation of each score column over all rows.
rating_reports = (
    ("Familiarity Score: mean: {}, std: {}", 'familiarity_concept'),
    ("Relevance Score: mean: {}, std: {}", 'relevance'),
    ("Interest Score: mean: {}, std: {}", 'law_interest'),
    ("perceived_difficulty: mean: {}, std: {}", 'perceived_difficulty'),
)
for template, rating_col in rating_reports:
    ratings = df_story[rating_col].astype(float)
    print(template.format(ratings.mean(), ratings.std()))


RCT results for the treatment group among the non-native speakers
q1_pred    91.18
q2_pred    81.76
q3_pred    84.71
dtype: float64
q1_pred    0.1235
q2_pred    0.1819
q3_pred    0.1844
dtype: float64
Familiarity Score: mean: 1.6294117647058823, std: 1.0310359619772527
Relevance Score: mean: 3.1882352941176473, std: 1.171597984720104
Interest Score: mean: 4.029411764705882, std: 1.2036751935046288
perceived_difficulty: mean: 3.052941176470588, std: 1.2929566351678787


In [122]:
# sanity check -- no intersection of participants in the same batch
# Collect the (batch_id, PROLIFIC_PID) pairs of each group; the two sets must not share any pair.
story_group_tuple_set = set(df_story[['batch_id', 'PROLIFIC_PID']].itertuples(index=False, name=None))
def_group_tuple_set = set(df_def[['batch_id', 'PROLIFIC_PID']].itertuples(index=False, name=None))
assert story_group_tuple_set.isdisjoint(def_group_tuple_set), "there is some overlap between two groups"

### 2.2 Load follow-up RCT results from the control and treatment groups

In [124]:
# Reload the immediate non-native scores, then load the follow-up scores.
df_def = pd.read_csv("./rct_results/10_concepts_non_native_definition_scores.tsv", sep="\t")
df_story = pd.read_csv("./rct_results/10_concepts_non_native_story_scores.tsv", sep="\t")
df_def_followup = pd.read_csv("./rct_results/10_concepts_non_native_definition_scores_followup.tsv", sep="\t")
df_story_followup = pd.read_csv("./rct_results/10_concepts_non_native_story_scores_followup.tsv", sep="\t")

# Inner-join each immediate table with the (PROLIFIC_PID, q_id) pairs present
# in its follow-up table, keeping the immediate rows that have a follow-up.
df_def_both = pd.merge(df_def, df_def_followup[['PROLIFIC_PID', 'q_id']], on=['PROLIFIC_PID', 'q_id'])
df_story_both = pd.merge(df_story, df_story_followup[['PROLIFIC_PID', 'q_id']], on=['PROLIFIC_PID', 'q_id'])

# One line per stage (immediate, follow-up, merged): control shape, then treatment shape.
for def_frame, story_frame in (
    (df_def, df_story),
    (df_def_followup, df_story_followup),
    (df_def_both, df_story_both),
):
    print(def_frame.shape, story_frame.shape)

(185, 19) (170, 20)
(125, 11) (150, 11)
(125, 19) (150, 20)


In [125]:
# Flag correct answers in the merged immediate table and in the follow-up table.
df_def_both, df_def_followup = [
    compute_group_accuracy(frame) for frame in (df_def_both, df_def_followup)
]

print("Rentention Scores for the Control Group -- Non-native Speakers")
# Prints, per question, the follow-up accuracy on items answered correctly at first.
compute_group_retention_accuracy(df_def_both, df_def_followup)

Rentention Scores for the Control Group -- Non-native Speakers
q1_pred    86.32
dtype: float64
q2_pred    82.8
dtype: float64
q3_pred    91.01
dtype: float64


In [126]:
# Flag correct answers in the merged immediate table and in the follow-up table
# of the treatment (story) group.
df_story_both = compute_group_accuracy(df_story_both)
df_story_followup = compute_group_accuracy(df_story_followup)

# These frames were loaded from the non-native story files, so the heading names
# the non-native cohort.
print("Rentention Scores for the Treatment Group -- Non-native Speakers")
# Prints, per question, the follow-up accuracy on items answered correctly at first.
compute_group_retention_accuracy(df_story_both, df_story_followup)

Rentention Scores for the Treatment Group -- Non-native Speakers
q1_pred    98.56
dtype: float64
q2_pred    89.6
dtype: float64
q3_pred    92.31
dtype: float64
