# Question to be answered:

- Is the accuracy of model/human significantly better? In both force and mass questions?
- Is the distribution of responses significantly different?

In [1]:
from torch import nn
from isaac.utils import get_cuda_device_if_available
import joblib

from isaac.dataset import read_dataset, prepare_dataset
from isaac.models import MultiBranchModel
from isaac.constants import BASIC_TRAINING_COLS, MASS_CLASS_COLS, FORCE_CLASS_COLS
from isaac.evaluation import evaluate_saved_model
from isaac.statistical_tests import z_test

import torch
from torch.autograd import Variable
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
device = get_cuda_device_if_available()
print(device)

cpu


In [3]:
def get_question_accuracy(question_type):
    normalise_data = True
    scaler_path = "dissertation_results/scalers/passive_dual_scaler.sk"
    network_dims = (len(BASIC_TRAINING_COLS), 25, 3, 0.5)
    model_path = "dissertation_results/models/passive_"+question_type+"_dual_model.pt"
    dataset_path = "data/passive_trials_exp1.h5"
    class_columns = [list(MASS_CLASS_COLS), list(FORCE_CLASS_COLS)]
    multiclass = True
    seq_end = 1800
    step_size = 3

    accuracies, predicted = evaluate_saved_model(model_path, network_dims, dataset_path, 
                                                 training_columns=BASIC_TRAINING_COLS, class_columns=class_columns, 
                                                 step_size=step_size, seq_end=seq_end, scaler_path=scaler_path,
                                                 arch=MultiBranchModel, multiclass=multiclass)

    mass_accuracy, force_accuracy = accuracies
    mass_predicted = predicted[:, 0]
    force_predicted = predicted[:, 1]
    
    if question_type == "mass":
        return mass_accuracy, mass_predicted
    
    return force_accuracy, force_predicted

def get_participant_accuracy(passive_responses, answer_column, question_type_answer):
    return [(df[answer_column] == df[question_type_answer]).sum() / len(df) 
            for participant_id, df in passive_responses.groupby("participant")]


def get_participant_accuracy_filtering_by_answer(passive_responses, answer_column, question_type_answer, filter_by_class):
    
    passive_responses = passive_responses.copy().query(question_type_answer+" == "+filter_by_class)
    
    return [(df[answer_column] == df[question_type_answer]).sum() / len(df) 
            for participant_id, df in passive_responses.groupby("participant")]

# T-test for MASS questions

## Load model's predictions

In [4]:
question_type = "mass"
acc, model_mass_predicted = get_question_accuracy(question_type)
question_type = "force"
acc2, model_force_predicted = get_question_accuracy(question_type)

100%|██████████| 216/216 [00:01<00:00, 132.27it/s]
100%|██████████| 216/216 [00:00<00:00, 667.65it/s]
  7%|▋         | 16/216 [00:00<00:01, 151.61it/s]

Model's accuracy on test set: [54.62962963 62.5       ]


100%|██████████| 216/216 [00:01<00:00, 157.05it/s]
100%|██████████| 216/216 [00:00<00:00, 743.54it/s]


Model's accuracy on test set: [56.94444444 59.25925926]


In [5]:
acc, acc2

(54.629629629629626, 59.25925925925926)

In [6]:
model_mass_answers = [MASS_CLASS_COLS[i] for i in model_mass_predicted]
model_force_answers = [FORCE_CLASS_COLS[i] for i in model_force_predicted]

## Load human results

In [7]:
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()


 



In [8]:
rdata_path = "for_hector_small/data/exp1.rdata"
r['load'](rdata_path)

is_passive = (r['dfc.l'].condition == '0') & (r['dfc.l'].practice == 0.)  & (r['dfc.l'].exclude == 0)
responses = r['dfc.l'][["participant", "mass", "relationship", "trueMass", "trueRelationship", 'post_ent', 'post_ent_rel.rtheta', 'post_ent_mass.rtheta', 'corMass', 'corRel']]
passive_responses = responses[is_passive]

  res = PandasDataFrame.from_items(items)


In [9]:
passive_responses["model_mass"] = model_mass_answers
passive_responses["model_relationship"] = model_force_answers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
human_mass_accuracy_list = get_participant_accuracy(passive_responses, "mass", "trueMass")
human_force_accuracy_list = get_participant_accuracy(passive_responses, "relationship", "trueRelationship")

model_mass_accuracy_list = get_participant_accuracy(passive_responses, "model_mass", "trueMass")
model_force_accuracy_list = get_participant_accuracy(passive_responses, "model_relationship", "trueRelationship")

In [11]:
print(np.mean(human_mass_accuracy_list), "±", np.std(human_mass_accuracy_list))
print(np.mean(model_mass_accuracy_list), "±", np.std(model_mass_accuracy_list))

0.4537037037037037 ± 0.1922272175187657
0.5462962962962963 ± 0.1694722705344734


In [12]:
print(np.mean(model_force_accuracy_list), "±", np.std(model_force_accuracy_list))
print(np.mean(human_force_accuracy_list), "±", np.std(human_force_accuracy_list))

0.5925925925925926 ± 0.18332398030762342
0.611111111111111 ± 0.2151657414559676


# Perform t-test on overall accuracy

In [13]:
# Null hypothesis (opposite of what we want to prove): accuracies are not significantly different
# Alternative hypothesis: accuracies are significantly different

In [14]:
from scipy.stats import ttest_ind, ttest_rel, f_oneway

#### Is the model significantly better answering mass questions than humans?

In [15]:
ttest_rel(human_mass_accuracy_list, model_mass_accuracy_list)

Ttest_relResult(statistic=-1.635364975976666, pvalue=0.11558604179754998)

#### Is the model significantly better answering force questions than humans?

In [16]:
ttest_rel(human_force_accuracy_list, model_force_accuracy_list)

Ttest_relResult(statistic=0.2924070086985936, pvalue=0.7725969994259952)

#### Is the model significantly better answering force questions than mass questions?

In [17]:
ttest_ind(model_mass_accuracy_list, model_force_accuracy_list)

Ttest_indResult(statistic=-0.8893377286488802, pvalue=0.378449006026212)

# Perform z-test on overall accuracy

In [18]:
# Null hypothesis (opposite of what we want to prove): accuracies are not significantly different
# Alternative hypothesis: classifier 2 is significantly more accurate than the first

z_zero_point_five = 1.645

Z = z_test(passive_responses["trueMass"], passive_responses["mass"], passive_responses["model_mass"])

print(Z)
if Z < -z_zero_point_five:
    print("Classifier two significantly better than classifier one.")
else:
    print("Classifier two not significantly better.")

Z = z_test(passive_responses["trueRelationship"], passive_responses["relationship"], passive_responses["model_relationship"])

print(Z)
if Z < -z_zero_point_five:
    print("Classifier two significantly better than classifier one.")
else:
    print("Classifier two not significantly better than classifier one.")

# Are correct guesses / errors correlated between humans and model?

In [19]:
from scipy.stats import chisquare

In [20]:
passive_responses["model_mass_correct_guesses"] = (passive_responses["model_mass"] == passive_responses["trueMass"])
passive_responses["model_force_correct_guesses"] = (passive_responses["model_relationship"] == passive_responses["trueRelationship"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
mass_coincidence = get_participant_accuracy(passive_responses, "mass", "model_mass")
force_coincidence = get_participant_accuracy(passive_responses, "relationship", "model_relationship")

print(np.mean(mass_coincidence), np.std(mass_coincidence))
print(np.mean(force_coincidence), np.std(force_coincidence))

0.4212962962962963 0.18970186341248035
0.48148148148148157 0.12283795519834814


In [22]:
def test_for_independence(first_answers, second_answers):
    both_correct = (first_answers & second_answers).sum()
    both_wrong = (~first_answers & ~second_answers).sum()
    first_correct_second_wrong = (~first_answers & second_answers).sum()
    first_wrong_second_correct = (first_answers & ~second_answers).sum()
    
    matrix = [[both_correct, first_correct_second_wrong], [first_wrong_second_correct, both_wrong]]
    chisquare_results = chisquare(matrix, axis=None)
    
    return matrix, chisquare_results

In [23]:
test_for_independence(passive_responses["corRel"], passive_responses["model_force_correct_guesses"])

([[76, 52], [56, 32]],
 Power_divergenceResult(statistic=18.074074074074076, pvalue=0.00042464459551150644))

In [24]:
test_for_independence(passive_responses["corMass"], passive_responses["model_mass_correct_guesses"])

([[54, 64], [44, 54]],
 Power_divergenceResult(statistic=3.7037037037037037, pvalue=0.2952874424445788))

In [25]:
def test_for_coincidence(first_answers, second_answers):
    
    matrix = []
    all_classes = first_answers.unique()
    
    for class_name in all_classes:
        this_class_coincidences = []
        for second_class_name in all_classes:
            n_coincidences = ((first_answers == class_name) & (second_answers == second_class_name)).sum()
            this_class_coincidences.append(n_coincidences)
            
        matrix.append(this_class_coincidences)
    
    chisquare_results = chisquare(matrix, axis=None)
    
    return matrix, chisquare_results, all_classes

In [27]:
test_for_coincidence(passive_responses["mass"], passive_responses["model_mass"])

([[18, 20, 31], [22, 35, 15], [19, 18, 38]],
 Power_divergenceResult(statistic=23.5, pvalue=0.0027782545010619926),
 array(['same', 'B', 'A'], dtype=object))

In [28]:
test_for_coincidence(passive_responses["relationship"], passive_responses["model_relationship"])

([[29, 27, 6], [16, 42, 37], [7, 19, 33]],
 Power_divergenceResult(statistic=54.583333333333336, pvalue=5.319453707071536e-09),
 array(['attract', 'none', 'repel'], dtype=object))

# Are correct guesses / errors correlated to informativeness?

### Mass responses

In [29]:
not_na_passive_responses = passive_responses[passive_responses["post_ent"].notna()]

post_mass_correct_guesses = not_na_passive_responses.query("model_mass_correct_guesses")["post_ent_mass.rtheta"]
post_mass_wrong_guesses = not_na_passive_responses.query("not model_mass_correct_guesses")["post_ent_mass.rtheta"]

f_oneway(post_mass_correct_guesses, post_mass_wrong_guesses)

F_onewayResult(statistic=0.8374126847061395, pvalue=0.36151486791288434)

In [30]:
post_mass_correct_guesses = not_na_passive_responses.query("corMass == 1")["post_ent_mass.rtheta"]
post_mass_wrong_guesses = not_na_passive_responses.query("corMass == 0")["post_ent_mass.rtheta"]

f_oneway(post_mass_correct_guesses, post_mass_wrong_guesses)

F_onewayResult(statistic=0.03857733827167141, pvalue=0.8445376237361524)

### Force responses

In [34]:
post_force_correct_guesses = not_na_passive_responses[not_na_passive_responses.model_force_correct_guesses]['post_ent_rel.rtheta']
post_force_wrong_guesses = not_na_passive_responses[~not_na_passive_responses.model_force_correct_guesses]['post_ent_rel.rtheta']

f_oneway(post_force_correct_guesses, post_force_wrong_guesses)

F_onewayResult(statistic=6.148647108193671, pvalue=0.014187349415838359)

In [35]:
post_force_correct_guesses = not_na_passive_responses[(not_na_passive_responses.corRel).astype(bool)]['post_ent_rel.rtheta']
post_force_wrong_guesses = not_na_passive_responses[~(not_na_passive_responses.corRel).astype(bool)]['post_ent_rel.rtheta']

f_oneway(post_force_correct_guesses, post_force_wrong_guesses)

F_onewayResult(statistic=5.982515021800705, pvalue=0.015531865919044263)

#### Same statistic calculated a different way

import statsmodels.api as sm
from statsmodels.formula.api import ols

df_copy = not_na_passive_responses.copy()
df_copy = df_copy.rename({'post_ent_rel.rtheta': 'post_ent_rel_rtheta',
                          'post_ent_mass.rtheta': 'post_ent_mass_rtheta'}, axis=1)

moore_lm = ols('post_ent_mass_rtheta ~ corMass', data=df_copy).fit()
table = sm.stats.anova_lm(moore_lm, typ=2)
print(table)

moore_lm = ols('post_ent_rel_rtheta ~ corRel', data=df_copy).fit()
table = sm.stats.anova_lm(moore_lm, typ=2)
print(table)

# Is the model better at predicting any force class?

In [36]:
repel_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'repel'")

none_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'none'")

attract_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'attract'")

In [37]:
print(np.mean(repel_accuracy_list), np.std(repel_accuracy_list))
print(np.mean(none_accuracy_list), np.std(none_accuracy_list))
print(np.mean(attract_accuracy_list), np.std(attract_accuracy_list))

print()

print(ttest_ind(repel_accuracy_list, none_accuracy_list))
print(ttest_ind(repel_accuracy_list, attract_accuracy_list))
print(ttest_ind(attract_accuracy_list, none_accuracy_list))

0.5972222222222222 0.2718109137608462
0.5520833333333334 0.2793143266206651
0.6666666666666666 0.31180478223116176

Ttest_indResult(statistic=0.555442660346191, pvalue=0.5812833338681618)
Ttest_indResult(statistic=-0.8051413147021834, pvalue=0.4248803705043045)
Ttest_indResult(statistic=1.3127146735835187, pvalue=0.1957909408342765)


# Is the model better at predicting any mass class?

In [38]:
a_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'A'")

same_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'same'")

b_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'B'")

In [39]:
print(np.mean(a_accuracy_list), np.std(a_accuracy_list))
print(np.mean(same_accuracy_list), np.std(same_accuracy_list))
print(np.mean(b_accuracy_list), np.std(b_accuracy_list))

print()

print(ttest_ind(a_accuracy_list, same_accuracy_list))
print(ttest_ind(a_accuracy_list, b_accuracy_list))
print(ttest_ind(b_accuracy_list, same_accuracy_list))

0.6388888888888888 0.2133651596630169
0.375 0.3090082702956526
0.625 0.26020824993326663

Ttest_indResult(statistic=3.3702244929225316, pvalue=0.0015288739737107353)
Ttest_indResult(statistic=0.19794515097336143, pvalue=0.8439601534671033)
Ttest_indResult(statistic=2.9679135159734478, pvalue=0.004746588205160193)


# Is the model better at predicting any force class than humans?

In [40]:
human_repel_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                         "trueRelationship", "'repel'")

human_none_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                        "trueRelationship", "'none'")

human_attract_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                   "trueRelationship", "'attract'")

print(np.mean(human_repel_accuracy_list), np.std(human_repel_accuracy_list))
print(np.mean(human_none_accuracy_list), np.std(human_none_accuracy_list))
print(np.mean(human_attract_accuracy_list), np.std(human_attract_accuracy_list))

0.5277777777777778 0.3458305443885759
0.6041666666666666 0.25937290829143195
0.75 0.3227486121839514


In [41]:
print(ttest_rel(repel_accuracy_list, human_repel_accuracy_list))
print(ttest_rel(none_accuracy_list, human_none_accuracy_list))
print(ttest_rel(attract_accuracy_list, human_attract_accuracy_list))

Ttest_relResult(statistic=0.6790483458299109, pvalue=0.5038838059771544)
Ttest_relResult(statistic=-0.6321258132527124, pvalue=0.5335384931823728)
Ttest_relResult(statistic=-0.8477912478906585, pvalue=0.40529075365725387)


# Is the model better at predicting any mass class than humans?

In [42]:
human_a_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                     "trueMass", "'A'")

human_same_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                        "trueMass", "'same'")

human_b_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                     "trueMass", "'B'")

print(np.mean(human_a_accuracy_list), np.std(human_a_accuracy_list))
print(np.mean(human_same_accuracy_list), np.std(human_same_accuracy_list))
print(np.mean(human_b_accuracy_list), np.std(human_b_accuracy_list))

print()

0.5 0.2545875386086578
0.375 0.3236438943814179
0.4861111111111111 0.33304385578560547



In [43]:
print(ttest_rel(a_accuracy_list, human_a_accuracy_list))
print(ttest_rel(same_accuracy_list, human_same_accuracy_list))
print(ttest_rel(b_accuracy_list, human_b_accuracy_list))

Ttest_relResult(statistic=1.926052288842346, pvalue=0.06654610983905936)
Ttest_relResult(statistic=4.9154518179959675e-17, pvalue=1.0)
Ttest_relResult(statistic=1.6830069266853707, pvalue=0.10590104861701705)
