<h1>Grid Game Study Graphs</h1>

In [45]:


! pip install scipy



In [46]:
import subprocess
import sys

import numpy as np
import pandas as pd
import pingouin as pg
import scipy.stats as stats

from util.survey_structure import FSS_SURVEY_STATEMENTS, SURVEY_DIMENSIONS

In [47]:
# Directory holding the CSV exports used throughout this notebook.
DATA_DIR = "./data/for-analysis"

# NOTE(review): TRIAL_DATA, TEST_DATA and SEQUENCE_TEST_DATA are all read from
# the same file (test_data.csv), so every analysis below that uses them
# operates on identical rows. Confirm whether separate trial / sequence-test
# exports were intended; the file names are left unchanged here because the
# correct ones are not visible in this notebook.
TRIAL_DATA = pd.read_csv(f"{DATA_DIR}/test_data.csv")
TEST_DATA = pd.read_csv(f"{DATA_DIR}/test_data.csv")
SEQUENCE_TEST_DATA = pd.read_csv(f"{DATA_DIR}/test_data.csv")
SURVEY_DATA = pd.read_csv(f"{DATA_DIR}/survey.csv")
TRIAL_DATA_WITH_NUM_EXPOSURES = pd.read_csv(f"{DATA_DIR}/targets_time_per_click.csv")

Notes to make reusable notebooks:
- outlier removal should be in a function

<h2>Outlier Removal - moved to graphs notebook</h2>

In [48]:
# Distinct values of the 'condition' and 'breakpoint' columns of TRIAL_DATA;
# `breakpoints` drives the per-block loops in the cells below.
conditions, breakpoints = (
    TRIAL_DATA[column].unique() for column in ('condition', 'breakpoint')
)

<h3>For Trial data: remove all trials whose completion time is 3 or more SD from the mean.</h3>

In [49]:
# Process data for each condition and breakpoint
# for condition in conditions:
#     for breakpoint in breakpoints:
#         # Filter data for current condition and breakpoint
#         mask = (TRIAL_DATA['condition'] == condition) & (TRIAL_DATA['breakpoint'] == breakpoint)
#         subset = TRIAL_DATA[mask]
        
#         # Calculate mean and standard deviation for ct
#         ct_mean = subset['ct'].mean()
#         ct_std = subset['ct'].std()
        
#         # Remove rows where ct is 3 or more standard deviations outside the mean
#         outlier_mask = np.abs(subset['ct'] - ct_mean) >= 3 * ct_std
#         TRIAL_DATA = TRIAL_DATA.drop(subset[outlier_mask].index)



<h3>For Test data: again, remove outliers based on completion time.</h3>

In [50]:
# for condition in conditions:
#     for breakpoint in breakpoints:
#         # Filter data for current condition and breakpoint
#         mask = (TEST_DATA['condition'] == condition) & (TEST_DATA['breakpoint'] == breakpoint)
#         subset = TEST_DATA[mask]
        
#         # Calculate mean and standard deviation for ct
#         ct_mean = subset['ct'].mean()
#         ct_std = subset['ct'].std()
        
#         # Remove rows where ct is 3 or more standard deviations outside the mean
#         outlier_mask = np.abs(subset['ct'] - ct_mean) >= 3 * ct_std
#         TEST_DATA = TEST_DATA.drop(subset[outlier_mask].index)


<h3>Sequence test data: more removals</h3>

In [51]:
# for condition in conditions:
#     for breakpoint in breakpoints:
#         # Filter data for current condition and breakpoint
#         mask = (SEQUENCE_TEST_DATA['condition'] == condition) & (SEQUENCE_TEST_DATA['breakpoint'] == breakpoint)
#         subset = SEQUENCE_TEST_DATA[mask]
        
#         # Calculate mean and standard deviation for ct
#         ct_mean = subset['ct'].mean()
#         ct_std = subset['ct'].std()
        
#         # Remove rows where ct is 3 or more standard deviations outside the mean
#         outlier_mask = np.abs(subset['ct'] - ct_mean) >= 3 * ct_std
#         SEQUENCE_TEST_DATA = SEQUENCE_TEST_DATA.drop(subset[outlier_mask].index)


<h2>Kruskal Wallis</h2>

In [52]:
def get_kruskal_wallis(one, two, dimensions, alpha=0.05):
    """Run a Kruskal-Wallis H-test per dimension and print the significant ones.

    For every name in ``dimensions`` the values of that column in ``one`` and
    ``two`` are compared with ``stats.mstats.kruskalwallis``. A line is
    printed when the p-value is below ``alpha``; non-significant results
    print nothing.

    Args:
        one: First group. Must be indexable by column name, with each column
            exposing ``.values`` (e.g. a pandas DataFrame).
        two: Second group, same layout as ``one``.
        dimensions: Iterable of column names to test.
        alpha: Significance threshold used for reporting. Defaults to 0.05,
            the value that was previously hard-coded.

    Returns:
        None. Results are reported on stdout only.

    Raises:
        Lookup errors for a dimension that is missing from ``one`` or ``two``
        (``KeyError`` for a DataFrame) propagate to the caller instead of
        being reported as "all numbers are identical".
    """
    for dimension in dimensions:
        first = one[dimension].values
        second = two[dimension].values

        # An empty group cannot be ranked; say so explicitly rather than
        # letting the test fail with an unrelated arithmetic error.
        if len(first) == 0 or len(second) == 0:
            print(f"Cannot compute Kruskal-Wallis for '{dimension}', at least one group is empty.")
            continue

        try:
            _, pval = stats.mstats.kruskalwallis(first, second)
        except ValueError as err:
            # scipy rejects input in which every observation is identical
            # with a ValueError; show its own message so the cause is accurate.
            print(f"Cannot compute Kruskal-Wallis for '{dimension}': {err}")
            continue

        if pval < alpha:
            print(f"Significant differences exist for '{dimension}', with a p-value of {pval} \n")

<h3>Survey Data</h3>

In [53]:
dimensions = SURVEY_DIMENSIONS

# Pairwise comparisons, in the order they are reported.
condition_pairs = [
    ('Simon', 'Baseline'),
    ('Simon', 'Searchlight'),
    ('Searchlight', 'Baseline'),
]

# One loop replaces three copy-pasted blocks; the printed output is unchanged.
# The loop variable is named `block` so it does not shadow the builtin breakpoint().
for first_condition, second_condition in condition_pairs:
    print(f"\n Between {first_condition} and {second_condition}...\n")
    for block in breakpoints:
        print(f"For block: {block} \n")
        block_survey = SURVEY_DATA[SURVEY_DATA['breakpoint'] == block]
        get_kruskal_wallis(
            block_survey[block_survey['condition'] == first_condition],
            block_survey[block_survey['condition'] == second_condition],
            dimensions,
        )


 Between Simon and Baseline...

For block: 1 

Significant differences exist for 'imi_value_usefulness', with a p-value of 0.0389569107871065 

Significant differences exist for 'fss_unambiguous_feedback', with a p-value of 0.0011143811487273486 

Significant differences exist for 'fss_concentration', with a p-value of 0.003398897329656172 

Significant differences exist for 'fss_merging_action_and_awareness', with a p-value of 0.019444202470781855 

For block: 2 

Significant differences exist for 'imi_value_usefulness', with a p-value of 0.03379704682642183 

Significant differences exist for 'fss_unambiguous_feedback', with a p-value of 0.004691134178451135 

For block: 3 

Significant differences exist for 'fss_unambiguous_feedback', with a p-value of 0.01694207337968228 

For block: 4 

Significant differences exist for 'fss_unambiguous_feedback', with a p-value of 0.04302959477729861 


 Between Simon and Searchlight...

For block: 1 

Significant differences exist for 'imi_inte

<h3>Test Confidence</h3>

In [54]:
dimensions = ['confidence']

# Pairwise comparisons, in the order they are reported.
condition_pairs = [
    ('Simon', 'Baseline'),
    ('Simon', 'Searchlight'),
    ('Searchlight', 'Baseline'),
]

# One loop replaces three copy-pasted blocks; the printed output is unchanged.
# The loop variable is named `block` so it does not shadow the builtin breakpoint().
for first_condition, second_condition in condition_pairs:
    print(f"\n Between {first_condition} and {second_condition}...\n")
    for block in breakpoints:
        print(f"For block: {block} \n")
        block_tests = TEST_DATA[TEST_DATA['breakpoint'] == block]
        get_kruskal_wallis(
            block_tests[block_tests['condition'] == first_condition],
            block_tests[block_tests['condition'] == second_condition],
            dimensions,
        )


 Between Simon and Baseline...

For block: 1 

For block: 2 

Significant differences exist for 'confidence', with a p-value of 0.004325258902895407 

For block: 3 

Significant differences exist for 'confidence', with a p-value of 2.914996688222503e-09 

For block: 4 

Significant differences exist for 'confidence', with a p-value of 2.987255581944233e-09 


 Between Simon and Searchlight...

For block: 1 

Significant differences exist for 'confidence', with a p-value of 0.0004334308738591274 

For block: 2 

For block: 3 

Significant differences exist for 'confidence', with a p-value of 0.002548122323450805 

For block: 4 

Significant differences exist for 'confidence', with a p-value of 0.007833011252586057 


 Between Searchlight and Baseline...

For block: 1 

Significant differences exist for 'confidence', with a p-value of 0.015069205415962994 

For block: 2 

Significant differences exist for 'confidence', with a p-value of 0.004340404925286443 

For block: 3 

Significant 

<h3>Sequence Test Confidence</h3>

In [55]:
# Set explicitly so this cell does not depend on `dimensions` left over from
# an earlier cell.
dimensions = ['confidence']

# Pairwise comparisons, in the order they are reported.
condition_pairs = [
    ('Simon', 'Baseline'),
    ('Simon', 'Searchlight'),
    ('Searchlight', 'Baseline'),
]

# NOTE(review): SEQUENCE_TEST_DATA is read from the same CSV as TEST_DATA, and
# the saved output of this cell matches the "Test Confidence" output exactly.
# Confirm the intended input file.

# One loop replaces three copy-pasted blocks; the printed output is unchanged.
# The loop variable is named `block` so it does not shadow the builtin breakpoint().
for first_condition, second_condition in condition_pairs:
    print(f"\n Between {first_condition} and {second_condition}...\n")
    for block in breakpoints:
        print(f"For block: {block} \n")
        block_sequence_tests = SEQUENCE_TEST_DATA[SEQUENCE_TEST_DATA['breakpoint'] == block]
        get_kruskal_wallis(
            block_sequence_tests[block_sequence_tests['condition'] == first_condition],
            block_sequence_tests[block_sequence_tests['condition'] == second_condition],
            dimensions,
        )


 Between Simon and Baseline...

For block: 1 

For block: 2 

Significant differences exist for 'confidence', with a p-value of 0.004325258902895407 

For block: 3 

Significant differences exist for 'confidence', with a p-value of 2.914996688222503e-09 

For block: 4 

Significant differences exist for 'confidence', with a p-value of 2.987255581944233e-09 


 Between Simon and Searchlight...

For block: 1 

Significant differences exist for 'confidence', with a p-value of 0.0004334308738591274 

For block: 2 

For block: 3 

Significant differences exist for 'confidence', with a p-value of 0.002548122323450805 

For block: 4 

Significant differences exist for 'confidence', with a p-value of 0.007833011252586057 


 Between Searchlight and Baseline...

For block: 1 

Significant differences exist for 'confidence', with a p-value of 0.015069205415962994 

For block: 2 

Significant differences exist for 'confidence', with a p-value of 0.004340404925286443 

For block: 3 

Significant 

<h2>ART ANOVA Analysis</h2>

<h3>Trial Data: Completion Time (per block)</h3>

In [56]:
# List the columns available in the per-click exposure data.
TRIAL_DATA_WITH_NUM_EXPOSURES.columns

Index(['Unnamed: 0', 'condition', 'targetID', 'ct', 'block', 'timeSubmitted',
       'exposure_count', 'trial', 'trial_order', 'participantID',
       'locationInSequence', 'distanceFromEndOfSequence', 'sequence'],
      dtype='str')

In [57]:
# Keep a four-column view (condition, exposure count, completion time 'ct',
# target) and preview its first rows.
exposure_columns = ['condition', 'exposure_count', 'ct', 'targetID']
exposures_data = TRIAL_DATA_WITH_NUM_EXPOSURES[exposure_columns]
exposures_data.head()

Unnamed: 0,condition,exposure_count,ct,targetID
0,Searchlight,1,41.3473,81
1,Searchlight,1,18.3856,138
2,Searchlight,1,14.3606,71
3,Searchlight,1,16.96,105
4,Searchlight,1,7.9511,108


In [65]:
# Two-way repeated-measures ANOVA on completion time ('ct') with 'condition'
# and 'breakpoint' as within-subject factors, computed with pingouin
# (imported as `pg` in the notebook's import cell).
# NOTE(review): this section is titled "ART ANOVA", but no aligned-rank
# transform is applied in this cell; pg.rm_anova is called on the raw 'ct'
# values. Confirm which analysis is intended.

# TODO: analyze by separate breakpoint

aov = pg.rm_anova(
    data=TRIAL_DATA,
    dv='ct',
    subject="participantID",
    within=['condition', 'breakpoint'],
    detailed=True,
    correction=True,
)
pg.print_table(aov)




ANOVA SUMMARY

Source                      SS    ddof1    ddof2      MS      F    p-unc    p-GG-corr    ng2    eps
----------------------  ------  -------  -------  ------  -----  -------  -----------  -----  -----
condition               23.797        2       40  11.899  1.104    0.341        0.334  0.021  0.844
breakpoint              34.221        3       60  11.407  4.286    0.008        0.013  0.029  0.848
condition * breakpoint  72.253        6      120  12.042  4.082    0.001        0.025  0.060  0.328





<h3>Test Data: Manhattan Distance per block</h3>

In [66]:
# Repeated-measures ANOVA on the 'manhattan' column of TEST_DATA, with
# condition and breakpoint as within-subject factors.
within_factors = ['condition', 'breakpoint']
aov = pg.rm_anova(
    data=TEST_DATA,
    dv='manhattan',
    within=within_factors,
    subject="participantID",
    correction=True,
    detailed=True,
)
pg.print_table(aov)



ANOVA SUMMARY

Source                       SS    ddof1    ddof2      MS       F    p-unc    p-GG-corr    ng2    eps
----------------------  -------  -------  -------  ------  ------  -------  -----------  -----  -----
condition                 1.099        2       40   0.550   0.110    0.896        0.854  0.003  0.804
breakpoint              119.327        3       60  39.776  77.787    0.000        0.000  0.230  0.769
condition * breakpoint    6.363        6      120   1.060   1.226    0.298        0.302  0.016  0.300





In [67]:
# List the columns available in SEQUENCE_TEST_DATA.
SEQUENCE_TEST_DATA.columns

Index(['Unnamed: 0', 'trialLogID', 'participantID', 'timeSubmitted', 'pid',
       'condition', 'prolificID', 'day', 'phase', 'block', 'trial', 'targetID',
       'distEdge', 'distLand', 'nearLandmark', 'isLandmark', 'targetX',
       'targetY', 'clickX', 'clickY', 'euclidean', 'manhattan', 'ct',
       'underGuide', 'errors', 'movement', 'confidence', 'errorObjects',
       'sequence', 'cumulativeManhattanDistance',
       'cumulativeEuclideanDistance', 'manhattanDistanceLog',
       'euclideanDistanceLog', 'treatment', 'test_order', 'breakpoint',
       'gameDistributionIndex', 'cursorWidth', 'ctForTargetInOrder',
       'targetErrorArray', 'errorInTrial'],
      dtype='str')

In [68]:
# Repeated-measures ANOVA on 'cumulativeManhattanDistance' in
# SEQUENCE_TEST_DATA, with condition and breakpoint as within-subject factors.
# NOTE(review): the saved output of this cell shows SS = 0 and NaN F / p
# values for every effect, followed by division warnings. Check that the
# input frame actually holds varying cumulative distances.
within_factors = ['condition', 'breakpoint']
aov = pg.rm_anova(
    data=SEQUENCE_TEST_DATA,
    dv='cumulativeManhattanDistance',
    within=within_factors,
    subject="participantID",
    correction=True,
    detailed=True,
)
pg.print_table(aov)


ANOVA SUMMARY

Source                     SS    ddof1    ddof2     MS    F    p-unc    p-GG-corr    ng2    eps
----------------------  -----  -------  -------  -----  ---  -------  -----------  -----  -----
condition               0.000        2       40  0.000  nan      nan          nan    nan    nan
breakpoint              0.000        3       60  0.000  nan      nan          nan    nan    nan
condition * breakpoint  0.000        6      120  0.000  nan      nan          nan    nan    nan



  f_a = ms_a / ms_as
  f_b = ms_b / ms_bs
  f_ab = ms_ab / ms_abs
  ef_a = ss_a / (ss_a + ss_s + ss_as + ss_bs + ss_abs)
  ef_b = ss_b / (ss_b + ss_s + ss_as + ss_bs + ss_abs)
  ef_ab = ss_ab / (ss_ab + ss_s + ss_as + ss_bs + ss_abs)
  eps = np.min([num / den, 1])
  eps = np.min([num / den, 1])
  eps = np.min([num / den, 1])
