Data analysis for the binary vs. continuous Likert experiment


First, import all the modules and libraries necessary

In [172]:
from zipfile import ZipFile
import json
from pathlib import Path
from matplotlib import pyplot as plt
import pandas as pd
import warnings 
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from matplotlib import pyplot as plt

Import data

In [173]:
# Archive that holds the exported CSV files.
zip_file_name = "bin-cont-likert-12-16-2025.zip"

# Collects one DataFrame per CSV member of the archive, in archive order.
indv_data_frames = []

with ZipFile(zip_file_name, 'r') as zip_file:
    for file_name in zip_file.namelist():
        if file_name.endswith('.csv'):
            # pandas does not close a handle it did not open itself, so open
            # each member in a context manager to release it right after parsing.
            with zip_file.open(file_name) as csv_file:
                df = pd.read_csv(csv_file)
            indv_data_frames.append(df)



Clean the data

In [174]:
# Stack every per-file frame into a single table; ignore_index renumbers the
# rows 0..n-1 instead of keeping each file's own row labels.
combined_data_frame = pd.concat(objs=indv_data_frames, axis=0, ignore_index=True)

print(combined_data_frame)

                 trial_type  trial_index plugin_version  time_elapsed  \
0                  pavlovia            0            NaN          1317   
1             external-html            1            NaN          4365   
2              instructions            2            NaN          6102   
3              instructions            3            NaN          6980   
4                  pavlovia            0            NaN          1327   
..                      ...          ...            ...           ...   
757  html-keyboard-response            5          2.1.0         36143   
758                pavlovia            0            NaN          2430   
759            instructions            1            NaN          5359   
760  html-keyboard-response            2          2.1.0          6495   
761  html-keyboard-response            3          2.1.0          7098   

                           subject_id                          plot condition  \
0    gme91lqhsrowpuznxv9z6c9wdawc1h1k  bin

Make data frame for only survey-likert trials

In [175]:
# Boolean mask: True on rows whose trial_type is exactly 'survey-likert'.
is_likert_mask = combined_data_frame['trial_type'].eq('survey-likert')

# Row selection by mask keeps the original row labels from the combined frame.
likert_df = combined_data_frame.loc[is_likert_mask]

print(likert_df)

        trial_type  trial_index plugin_version  time_elapsed  \
17   survey-likert            6            NaN         11922   
26   survey-likert            6            NaN         11110   
35   survey-likert            6            NaN         29598   
44   survey-likert            6            NaN         14520   
53   survey-likert            6            NaN         11243   
..             ...          ...            ...           ...   
708  survey-likert            6            NaN         13581   
717  survey-likert            6            NaN         12113   
726  survey-likert            6            NaN         10637   
738  survey-likert            6            NaN         26441   
750  survey-likert            7            NaN        350329   

                           subject_id                               plot  \
17   7vf8suazjq0q9kapzo416ql0qvr77n4n   continuous_0.43906666276199_.png   
26   fzu0u9g4lmsb1r56ooh9131axx0of4l7      binary_0.616832475402624_.png   
35 

Drop columns

In [176]:
# Columns that are removed before the analysis.
columns_to_drop = [
    'trial_index',
    'plugin_version',
    'url',
    'view_history',
    'stimulus',
    'study_id',
    'session_id',
    'success',
]

# drop() returns a new frame; the name is rebound to the trimmed result.
likert_df = likert_df.drop(columns=columns_to_drop)

print(likert_df)

        trial_type  time_elapsed                        subject_id  \
17   survey-likert         11922  7vf8suazjq0q9kapzo416ql0qvr77n4n   
26   survey-likert         11110  fzu0u9g4lmsb1r56ooh9131axx0of4l7   
35   survey-likert         29598  php7jbppld04td37qskxm3tfndnvarzn   
44   survey-likert         14520  lp1qau1d9r1lg9w0ehuy327tg7zay5jn   
53   survey-likert         11243  5alzuncws9dpqpmfh3t863x9rahgzrf5   
..             ...           ...                               ...   
708  survey-likert         13581  z4rhmwadjutsdoqu5c7dttdtzto58azk   
717  survey-likert         12113  og65wu97pqa0lkq4yhawagjrtl783y3o   
726  survey-likert         10637  yg8jco4vs8sgqfqv51jd5o6wp9xsh224   
738  survey-likert         26441  ccpyvto1l54hmar65ygjy7cw9vgbtqjj   
750  survey-likert        350329  4w46ru5tblfeo09mamswm3bkps58vb1a   

                                  plot condition       rt  \
17    continuous_0.43906666276199_.png       [2]   2760.0   
26       binary_0.616832475402624_.pn

Drop rows with missing data for plot

In [177]:
# Discard every row that has no value in the 'plot' column.
likert_df = likert_df.dropna(axis='index', subset=['plot'])

print(likert_df)


        trial_type  time_elapsed                        subject_id  \
17   survey-likert         11922  7vf8suazjq0q9kapzo416ql0qvr77n4n   
26   survey-likert         11110  fzu0u9g4lmsb1r56ooh9131axx0of4l7   
35   survey-likert         29598  php7jbppld04td37qskxm3tfndnvarzn   
44   survey-likert         14520  lp1qau1d9r1lg9w0ehuy327tg7zay5jn   
53   survey-likert         11243  5alzuncws9dpqpmfh3t863x9rahgzrf5   
62   survey-likert         13793  w3r6m8tqzsps9x88l84j8qg36pwkq5rf   
85   survey-likert         11980  dyu643r5gbwoo9lde9hshcx5e2nd4qeh   
95   survey-likert         12852  70o1maay8o1tf2jzxlkez8hdwgduuvzf   
118  survey-likert        136068  oxmt4j60e801a4v66psu747zdf2szsr4   
127  survey-likert         10294  3xz6xj9jcpfvd2ggaq4dfqtjamd3ej08   
155  survey-likert        330158  ktoyalq5gn8a4lyg38hfb72mqsf2h8ak   
164  survey-likert         15264  onmxv7rzbj872356zmhqbu9rscd7etwj   
176  survey-likert         14023  lvem6pk9n4xdrgb756h3toykap57qhog   
200  survey-likert  

Add value for correlation level

In [178]:
# Plot file names look like "<kind>_<number>_.png"; the second
# underscore-separated piece is the number.
plot_name_parts = likert_df['plot'].str.split('_')
correlation_token = plot_name_parts.str.get(1)

# Keep only the first five characters (e.g. "0.439") as a string label;
# this truncates the text, it does not round the number.
likert_df['correlation'] = correlation_token.str.slice(stop=5)


Make the likert scale responses into columns

In [179]:
# Step 1: decode the 'response' column. Cells that are strings are parsed as
# JSON; anything else is passed through unchanged.
decoded_responses = likert_df['response'].apply(
    lambda raw: json.loads(raw) if isinstance(raw, str) else raw
)

# Expanding each decoded value into a Series gives one column per key
# (here: 'Attention' and 'Amount').
response_columns = decoded_responses.apply(pd.Series)

# Step 2: swap the original 'response' column for the expanded columns,
# aligned on the row index.
likert_df = pd.concat(
    [likert_df.drop(columns='response'), response_columns],
    axis=1,
)

# Step 3: force both rating columns to numeric; unparseable values become NaN.
for rating_column in ('Attention', 'Amount'):
    likert_df[rating_column] = pd.to_numeric(likert_df[rating_column], errors='coerce')

print(len(likert_df))
print(likert_df)

46
        trial_type  time_elapsed                        subject_id  \
17   survey-likert         11922  7vf8suazjq0q9kapzo416ql0qvr77n4n   
26   survey-likert         11110  fzu0u9g4lmsb1r56ooh9131axx0of4l7   
35   survey-likert         29598  php7jbppld04td37qskxm3tfndnvarzn   
44   survey-likert         14520  lp1qau1d9r1lg9w0ehuy327tg7zay5jn   
53   survey-likert         11243  5alzuncws9dpqpmfh3t863x9rahgzrf5   
62   survey-likert         13793  w3r6m8tqzsps9x88l84j8qg36pwkq5rf   
85   survey-likert         11980  dyu643r5gbwoo9lde9hshcx5e2nd4qeh   
95   survey-likert         12852  70o1maay8o1tf2jzxlkez8hdwgduuvzf   
118  survey-likert        136068  oxmt4j60e801a4v66psu747zdf2szsr4   
127  survey-likert         10294  3xz6xj9jcpfvd2ggaq4dfqtjamd3ej08   
155  survey-likert        330158  ktoyalq5gn8a4lyg38hfb72mqsf2h8ak   
164  survey-likert         15264  onmxv7rzbj872356zmhqbu9rscd7etwj   
176  survey-likert         14023  lvem6pk9n4xdrgb756h3toykap57qhog   
200  survey-liker

Drop rows that failed the attention check, i.e. rows where the "Attention" response is not 10. Pavlovia codes responses on a 0-indexed scale, so a response of 10 is stored as 9; only rows with Attention == 9 are kept.

In [180]:
# Rows pass the attention check when the stored 'Attention' value is 9.
passed_attention_check = likert_df['Attention'].eq(9)

# .copy() detaches the filtered frame so later column assignments act on an
# independent object.
likert_df = likert_df.loc[passed_attention_check].copy()

print(len(likert_df))
print(likert_df.columns)
print(likert_df)

41
Index(['trial_type', 'time_elapsed', 'subject_id', 'plot', 'condition', 'rt',
       'question_order', 'correlation', 'Amount', 'Attention'],
      dtype='object')
        trial_type  time_elapsed                        subject_id  \
17   survey-likert         11922  7vf8suazjq0q9kapzo416ql0qvr77n4n   
26   survey-likert         11110  fzu0u9g4lmsb1r56ooh9131axx0of4l7   
35   survey-likert         29598  php7jbppld04td37qskxm3tfndnvarzn   
44   survey-likert         14520  lp1qau1d9r1lg9w0ehuy327tg7zay5jn   
53   survey-likert         11243  5alzuncws9dpqpmfh3t863x9rahgzrf5   
62   survey-likert         13793  w3r6m8tqzsps9x88l84j8qg36pwkq5rf   
85   survey-likert         11980  dyu643r5gbwoo9lde9hshcx5e2nd4qeh   
95   survey-likert         12852  70o1maay8o1tf2jzxlkez8hdwgduuvzf   
118  survey-likert        136068  oxmt4j60e801a4v66psu747zdf2szsr4   
127  survey-likert         10294  3xz6xj9jcpfvd2ggaq4dfqtjamd3ej08   
155  survey-likert        330158  ktoyalq5gn8a4lyg38hfb72mqsf2h

Make data frames for each of the 6 conditions
("binary_0.30230805510781_.png",
        "binary_0.43906666276199_.png",
        "binary_0.616832475402624_.png",
        "continuous_0.30230805510781_.png",
        "continuous_0.43906666276199_.png",
        "continuous_0.616832475402624_.png")

In [181]:
#grouped_df = likert_df.groupby("plot")

In [182]:
#separate_dfs = {condition: plot for condition, plot in grouped_df}

In [183]:
#for value in separate_dfs:
    #print(separate_dfs[value])
    #print('*'*60)

Compute stats

Group by plot

In [184]:
# Step 1: make sure the dependent variable is numeric (guards against string values).
likert_df['Amount'] = pd.to_numeric(likert_df['Amount'])

# Step 2: one-way ANOVA via OLS. C(plot) makes each distinct plot file name
# a level of a categorical factor.
ols_plot = ols(formula='Amount ~ C(plot)', data=likert_df)
model_plot = ols_plot.fit()

# Step 3: Type II ANOVA table for the fitted model.
anova_table_plot = sm.stats.anova_lm(model_plot, typ=2)

print("One-Way ANOVA: Comparing Amount by Plot Type")
print(anova_table_plot)



One-Way ANOVA: Comparing Amount by Plot Type
              sum_sq    df         F    PR(>F)
C(plot)    56.464015   5.0  2.732295  0.034696
Residual  144.657937  35.0       NaN       NaN


In [185]:
df_plot = likert_df

# Per-plot sample size and spread of 'Amount'.
amount_by_plot = df_plot.groupby('plot')['Amount']
stats_check_plot = amount_by_plot.agg(['count', 'std', 'var'])
print("Group Stats Check:")
print(stats_check_plot)

# Groups that can distort the ANOVA: no spread at all, or at most one observation.
zero_var = stats_check_plot.loc[stats_check_plot['std'].eq(0)]
single_obs = stats_check_plot.loc[stats_check_plot['count'].le(1)]

# Report each kind of problem group only when at least one exists.
for warning_text, flagged_groups in (
    ("\nWarning: These groups have ZERO variance (all values are same):", zero_var),
    ("\nWarning: These groups have only 1 observation:", single_obs),
):
    if not flagged_groups.empty:
        print(warning_text)
        print(flagged_groups.index.tolist())


Group Stats Check:
                                   count       std       var
plot                                                        
binary_0.30230805510781_.png           7  1.889822  3.571429
binary_0.43906666276199_.png          10  1.776388  3.155556
binary_0.616832475402624_.png          4  0.577350  0.333333
continuous_0.30230805510781_.png       7  2.115701  4.476190
continuous_0.43906666276199_.png       4  1.707825  2.916667
continuous_0.616832475402624_.png      9  2.697736  7.277778


Group by condition: binary [1] vs. continuous [2]

In [186]:


# One-way ANOVA of 'Amount' across the 'condition' column.
# C(condition) treats each distinct condition value as a level of a
# categorical factor.
model_bin_cont = ols('Amount ~ C(condition)', data=likert_df).fit()

# Type II ANOVA table for the fitted model.
anova_table_bin_cont = sm.stats.anova_lm(model_bin_cont, typ=2)

# The title names the factor actually tested here (condition, not plot).
print("One-Way ANOVA: Comparing Amount by Condition (binary vs. continuous)")
print(anova_table_bin_cont)



One-Way ANOVA: Comparing Amount by Plot Type
                  sum_sq    df         F    PR(>F)
C(condition)    0.000523   1.0  0.000101  0.992019
Residual      201.121429  39.0       NaN       NaN


In [187]:
df_bin_cont = likert_df

# Per-condition sample size and spread of 'Amount'.
amount_by_condition = df_bin_cont.groupby('condition')['Amount']
stats_check_bin_cont = amount_by_condition.agg(['count', 'std', 'var'])
print("Group Stats Check:")
print(stats_check_bin_cont)

# Groups that can distort the ANOVA: no spread at all, or at most one observation.
zero_var = stats_check_bin_cont.loc[stats_check_bin_cont['std'].eq(0)]
single_obs = stats_check_bin_cont.loc[stats_check_bin_cont['count'].le(1)]

# Report each kind of problem group only when at least one exists.
for warning_text, flagged_groups in (
    ("\nWarning: These groups have ZERO variance (all values are same):", zero_var),
    ("\nWarning: These groups have only 1 observation:", single_obs),
):
    if not flagged_groups.empty:
        print(warning_text)
        print(flagged_groups.index.tolist())

Group Stats Check:
           count       std       var
condition                           
[1]           21  2.104417  4.428571
[2]           20  2.433862  5.923684


Compare correlation levels

In [188]:

# One-way ANOVA of 'Amount' across the 'correlation' column.
# C(correlation) treats each distinct correlation label as a level of a
# categorical factor.
model_correlation = ols('Amount ~ C(correlation)', data=likert_df).fit()

# Type II ANOVA table for the fitted model.
anova_table_correlation = sm.stats.anova_lm(model_correlation, typ=2)

# The title names the factor actually tested here (correlation level, not plot).
print("One-Way ANOVA: Comparing Amount by Correlation Level")
print(anova_table_correlation)

One-Way ANOVA: Comparing Amount by Plot Type
                    sum_sq    df         F    PR(>F)
C(correlation)   18.605468   2.0  1.936833  0.158128
Residual        182.516484  38.0       NaN       NaN


In [189]:
df_correlation = likert_df

# 1. Per-correlation-level sample size and spread of 'Amount'.
stats_check_correlation = df_correlation.groupby('correlation')['Amount'].agg(['count', 'std', 'var'])
print("Group Stats Check:")
print(stats_check_correlation)

# 2. Identify the 'problem' groups.
# These filters must read the correlation-level stats computed above; the
# earlier version filtered stats_check_plot, so any warning would have
# described the per-plot grouping instead of this one.
zero_var = stats_check_correlation[stats_check_correlation['std'] == 0]
single_obs = stats_check_correlation[stats_check_correlation['count'] <= 1]

if not zero_var.empty:
    print("\nWarning: These groups have ZERO variance (all values are same):")
    print(zero_var.index.tolist())

if not single_obs.empty:
    print("\nWarning: These groups have only 1 observation:")
    print(single_obs.index.tolist())

Group Stats Check:
             count       std       var
correlation                           
0.302           14  1.940035  3.763736
0.439           14  1.968153  3.873626
0.616           13  2.633609  6.935897
