# Questions to be answered:

- Is the model significantly more accurate than humans (or vice versa)? Does this hold for both force and mass questions?
- Is the distribution of responses significantly different between the model and humans?

In [1]:
from torch import nn
from isaac.utils import get_cuda_device_if_available
import joblib

from isaac.dataset import read_dataset, prepare_dataset
from isaac.models import MultiBranchModel
from isaac.constants import BASIC_TRAINING_COLS, MASS_CLASS_COLS, FORCE_CLASS_COLS
from isaac.training import evaluate_saved_model
from isaac.statistical_tests import z_test

import torch
from torch.autograd import Variable
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Ask the project helper for the compute device (presumably CUDA when
# available, judging by its name) and echo the choice.
# NOTE(review): `device` is not referenced again in the visible cells.
device = get_cuda_device_if_available()
print(device)

cpu


In [3]:
def get_question_accuracy(question_type):
    """Evaluate the saved passive dual model for one question type.

    Loads ``dissertation_results/models/passive_<question_type>_dual_model.pt``
    through ``evaluate_saved_model`` on ``data/passive_trials_exp1.h5`` and
    returns the accuracy and predictions of the matching output.

    Args:
        question_type: ``"mass"`` selects the mass outputs; any other value
            (in practice ``"force"``) selects the force outputs. The value is
            also spliced into the model file name.

    Returns:
        ``(accuracy, predicted)``: ``accuracy`` is the first (mass) or second
        (force) entry of the accuracies returned by ``evaluate_saved_model``;
        ``predicted`` is column 0 (mass) or column 1 (force) of its
        predictions.
    """
    scaler_path = "dissertation_results/scalers/passive_dual_scaler.sk"
    model_path = "dissertation_results/models/passive_"+question_type+"_dual_model.pt"
    dataset_path = "data/passive_trials_exp1.h5"

    # NOTE(review): presumably (input size, hidden size, layers, dropout) — confirm
    # against MultiBranchModel.
    network_dims = (len(BASIC_TRAINING_COLS), 25, 3, 0.5)

    # Mass first, force second: the unpacking and column indexing below rely
    # on this order.
    class_columns = [list(MASS_CLASS_COLS), list(FORCE_CLASS_COLS)]

    accuracies, predicted = evaluate_saved_model(
        model_path, network_dims, dataset_path,
        training_columns=BASIC_TRAINING_COLS, class_columns=class_columns,
        step_size=3, seq_end=1800, scaler_path=scaler_path,
        arch=MultiBranchModel, multiclass=True)

    mass_accuracy, force_accuracy = accuracies

    if question_type == "mass":
        return mass_accuracy, predicted[:, 0]

    return force_accuracy, predicted[:, 1]

def get_participant_accuracy(passive_responses, answer_column, question_type_answer):
    """Return, for each participant, the fraction of rows where two columns agree.

    Rows are grouped by the ``"participant"`` column; within each group the
    number of rows with ``answer_column == question_type_answer`` is divided by
    the group size. The result is a list with one value per participant, in
    the order produced by ``groupby``.
    """
    accuracies = []
    for _, trials in passive_responses.groupby("participant"):
        matches = trials[answer_column] == trials[question_type_answer]
        accuracies.append(matches.sum() / len(trials))
    return accuracies


def get_participant_accuracy_filtering_by_answer(passive_responses, answer_column, question_type_answer, filter_by_class):
    """Per-participant agreement rate restricted to rows of one true class.

    Only rows satisfying ``<question_type_answer> == <filter_by_class>`` are
    kept. ``filter_by_class`` is spliced verbatim into a ``DataFrame.query``
    expression, so string classes must carry their own quotes (e.g.
    ``"'repel'"``). The remaining rows are grouped by ``"participant"`` and,
    for each group, the share of rows where ``answer_column`` equals
    ``question_type_answer`` is returned. Participants with no rows of the
    requested class produce no entry. The input frame is not modified.
    """
    class_filter = question_type_answer+" == "+filter_by_class
    selected_trials = passive_responses.copy().query(class_filter)

    per_participant = []
    for _, trials in selected_trials.groupby("participant"):
        n_matches = (trials[answer_column] == trials[question_type_answer]).sum()
        per_participant.append(n_matches / len(trials))
    return per_participant

# T-tests for mass and force questions

## Load model's predictions

In [4]:
# Evaluate the mass-specific saved model; keep its mass accuracy and mass predictions.
question_type = "mass"
acc, model_mass_predicted = get_question_accuracy(question_type)
# Evaluate the force-specific saved model; keep its force accuracy and force predictions.
question_type = "force"
acc2, model_force_predicted = get_question_accuracy(question_type)

100%|██████████| 264/264 [00:01<00:00, 138.76it/s]
100%|██████████| 264/264 [00:00<00:00, 622.65it/s]
  5%|▍         | 13/264 [00:00<00:01, 128.11it/s]

Model's accuracy on test set: [56.06060606 62.87878788]


100%|██████████| 264/264 [00:01<00:00, 152.39it/s]
100%|██████████| 264/264 [00:00<00:00, 671.13it/s]


Model's accuracy on test set: [56.43939394 59.46969697]


In [5]:
# acc: mass accuracy of the mass model; acc2: force accuracy of the force model.
acc, acc2

(56.06060606060606, 59.46969696969697)

In [6]:
# Translate each predicted class index into the corresponding class-column name.
# NOTE(review): assumes the entries of MASS_CLASS_COLS / FORCE_CLASS_COLS are the
# same label strings used in the human response columns — confirm.
model_mass_answers = [MASS_CLASS_COLS[i] for i in model_mass_predicted]
model_force_answers = [FORCE_CLASS_COLS[i] for i in model_force_predicted]

## Load human results

In [7]:
# rpy2 bridge used below to load the experiment's .rdata file from Python.
from rpy2.robjects import r, pandas2ri
# Enable rpy2's pandas conversion so R data frames can be indexed like pandas objects.
pandas2ri.activate()


 



In [8]:
# Load the R workspace; it provides the `df.l` data frame read below.
rdata_path = "data/experiment1.rdata"
r['load'](rdata_path)

# Rows of the passive condition (condition is compared as the string '0').
is_passive = (r['df.l'].condition == '0')
responses = r['df.l'][["participant", "mass", "relationship", "trueMass", "trueRelationship", 'post_ent_rel', 'post_ent_mass']]
# Take an explicit copy: later cells add columns to `passive_responses`, and
# doing that on a boolean-mask slice triggers pandas' SettingWithCopyWarning.
passive_responses = responses[is_passive].copy()

  res = PandasDataFrame.from_items(items)


In [9]:
# Attach the model's answers as new columns next to the human answers.
# NOTE(review): the lists are assigned by position, so this assumes the model's
# predictions come in the same order as the rows of `passive_responses` — confirm.
passive_responses["model_mass"] = model_mass_answers
passive_responses["model_relationship"] = model_force_answers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
# Participant ids present in the passive-condition responses.
passive_responses.participant.unique()

array([ 2,  3,  6,  7, 11, 13, 15, 19, 23, 25, 27, 29, 31, 32, 34, 39, 40,
       41, 47, 48, 54, 56, 58, 60])

In [11]:
# Per-participant accuracy of the human answers against the ground-truth columns.
human_mass_accuracy_list = get_participant_accuracy(passive_responses, "mass", "trueMass")
human_force_accuracy_list = get_participant_accuracy(passive_responses, "relationship", "trueRelationship")

# Same grouping by participant, but scoring the model answers stored on those rows,
# so each model entry lines up with the human entry at the same list position.
model_mass_accuracy_list = get_participant_accuracy(passive_responses, "model_mass", "trueMass")
model_force_accuracy_list = get_participant_accuracy(passive_responses, "model_relationship", "trueRelationship")

In [12]:
# Mass questions, mean ± std over participants: humans first, then the model.
print(np.mean(human_mass_accuracy_list), "±", np.std(human_mass_accuracy_list))
print(np.mean(model_mass_accuracy_list), "±", np.std(model_mass_accuracy_list))

0.43560606060606055 ± 0.14840559109211918
0.5606060606060606 ± 0.1499923475244192


In [13]:
# Force questions, mean ± std over participants.
# Note the order is reversed relative to the mass cell: the model first, then humans.
print(np.mean(model_force_accuracy_list), "±", np.std(model_force_accuracy_list))
print(np.mean(human_force_accuracy_list), "±", np.std(human_force_accuracy_list))

0.5946969696969697 ± 0.15741359663015264
0.6022727272727272 ± 0.21450901719345106


# Perform t-test on overall accuracy

In [14]:
# Null hypothesis: the mean per-participant accuracies being compared are equal
# Alternative hypothesis: the mean accuracies differ (two-sided test)

In [75]:
from scipy.stats import ttest_ind, ttest_rel, f_oneway

#### Is the model significantly better at answering mass questions than humans?

In [16]:
# Paired t-test: both lists hold one accuracy per participant, in the same order.
ttest_rel(human_mass_accuracy_list, model_mass_accuracy_list)

Ttest_relResult(statistic=-2.5410778355110044, pvalue=0.018257011838468833)

#### Is the model significantly better at answering force questions than humans?

In [17]:
# Paired t-test: both lists hold one accuracy per participant, in the same order.
ttest_rel(human_force_accuracy_list, model_force_accuracy_list)

Ttest_relResult(statistic=0.14062873191761607, pvalue=0.8893876174537672)

#### Is the model significantly better at answering force questions than mass questions?

In [90]:
# Independent-samples t-test between the model's per-participant mass and force accuracies.
# NOTE(review): both lists come from the same participant grouping, and the
# human-vs-model cells use ttest_rel — confirm that an unpaired test is intended here.
ttest_ind(model_mass_accuracy_list, model_force_accuracy_list)

Ttest_indResult(statistic=-0.7519322605334596, pvalue=0.4559221487681704)

# Perform z-test on overall accuracy

In [19]:
# Null hypothesis: the two classifiers (first = human, second = model) are equally accurate
# Alternative hypothesis: classifier 2 is significantly more accurate than the first

# Critical z value for a one-tailed test at the 5% level. The original name is
# kept so that any other cell referring to it keeps working.
z_zero_point_five = 1.645


def report_z_test(true_answers, first_answers, second_answers, critical_value=z_zero_point_five):
    """Run ``z_test`` on two sets of answers, print Z and the verdict, and return Z.

    NOTE(review): the decision rule treats a strongly negative Z as "second
    classifier better"; confirm this matches the sign convention of
    ``isaac.statistical_tests.z_test``.
    """
    Z = z_test(true_answers, first_answers, second_answers)

    print(Z)
    if Z < -critical_value:
        print("Classifier two significantly better than classifier one.")
    else:
        print("Classifier two not significantly better than classifier one.")

    return Z


# Mass questions: humans (classifier one) vs. model (classifier two).
Z = report_z_test(passive_responses["trueMass"], passive_responses["mass"], passive_responses["model_mass"])

# Force questions: humans (classifier one) vs. model (classifier two).
Z = report_z_test(passive_responses["trueRelationship"], passive_responses["relationship"], passive_responses["model_relationship"])

# Are correct guesses / errors correlated between humans and model?

In [None]:
from scipy.stats import chisquare

In [50]:
# Row-wise boolean flags: did the human answer match the ground truth?
passive_responses["human_mass_correct_guesses"] = (passive_responses["mass"] == passive_responses["trueMass"])
passive_responses["human_force_correct_guesses"] = (passive_responses["relationship"] == passive_responses["trueRelationship"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [51]:
# Row-wise boolean flags: did the model answer match the ground truth?
passive_responses["model_mass_correct_guesses"] = (passive_responses["model_mass"] == passive_responses["trueMass"])
passive_responses["model_force_correct_guesses"] = (passive_responses["model_relationship"] == passive_responses["trueRelationship"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [54]:
# Reuse the accuracy helper to measure agreement instead of accuracy: the
# per-participant share of rows where the human answer equals the model answer.
mass_coincidence = get_participant_accuracy(passive_responses, "mass", "model_mass")
force_coincidence = get_participant_accuracy(passive_responses, "relationship", "model_relationship")

# Mean and std over participants: mass agreement first, then force agreement.
print(np.mean(mass_coincidence), np.std(mass_coincidence))
print(np.mean(force_coincidence), np.std(force_coincidence))

In [69]:
def test_for_independence(first_answers, second_answers):
    both_correct = (first_answers & second_answers).sum()
    both_wrong = (~first_answers & ~second_answers).sum()
    first_correct_second_wrong = (~first_answers & second_answers).sum()
    first_wrong_second_correct = (first_answers & ~second_answers).sum()
    
    matrix = [[both_correct, model_correct_human_wrong], [model_wrong_human_correct, both_wrong]]
    chisquare_results = chisquare(matrix, axis=None)
    
    return matrix, chisquare_results

In [70]:
# Force questions: human correctness (first) against model correctness (second).
test_for_independence(passive_responses["human_force_correct_guesses"], passive_responses["model_force_correct_guesses"])

([[94, 63], [65, 42]],
 Power_divergenceResult(statistic=20.757575757575758, pvalue=0.00011821240002149164))

In [71]:
# Mass questions: human correctness (first) against model correctness (second).
test_for_independence(passive_responses["human_mass_correct_guesses"], passive_responses["model_mass_correct_guesses"])

([[65, 63], [65, 66]],
 Power_divergenceResult(statistic=0.07335907335907335, pvalue=0.99483034234225))

In [86]:
def test_for_coincidence(first_answers, second_answers):
    
    matrix = []
    all_classes = first_answers.unique()
    
    for class_name in all_classes:
        this_class_coincidences = []
        for second_class_name in all_classes:
            n_coincidences = ((first_answers == class_name) & (second_answers == second_class_name)).sum()
            this_class_coincidences.append(n_coincidences)
            
        matrix.append(this_class_coincidences)
    
    chisquare_results = chisquare(matrix, axis=None)
    
    return matrix, chisquare_results 

In [87]:
# Mass answers: rows follow the human answer, columns follow the model answer.
test_for_coincidence(passive_responses["mass"], passive_responses["model_mass"])

([[23, 23, 39], [27, 39, 20], [24, 21, 48]],
 Power_divergenceResult(statistic=27.47727272727273, pvalue=0.0005845625172034362))

In [88]:
# Force answers: rows follow the human answer, columns follow the model answer.
test_for_coincidence(passive_responses["relationship"], passive_responses["model_relationship"])

([[56, 25, 40], [32, 37, 7], [24, 9, 34]],
 Power_divergenceResult(statistic=63.81818181818182, pvalue=8.261900559100757e-11))

# Are correct guesses / errors correlated to informativeness?

### Mass responses

In [77]:
# Keep only the rows that have a post_ent_mass value.
is_not_na = ~(passive_responses["post_ent_mass"].isna())

not_na_passive_responses = passive_responses[is_not_na]
# Split post_ent_mass by whether the model answered the mass question correctly.
post_mass_correct_guesses = not_na_passive_responses[not_na_passive_responses.model_mass_correct_guesses].post_ent_mass
post_mass_wrong_guesses = not_na_passive_responses[~not_na_passive_responses.model_mass_correct_guesses].post_ent_mass

# One-way ANOVA comparing post_ent_mass between the correct and wrong groups.
f_oneway(post_mass_correct_guesses, post_mass_wrong_guesses)

F_onewayResult(statistic=0.01279904562116854, pvalue=0.910067041455921)

In [81]:
# Same split as for the model, but by whether the human answered the mass question correctly.
# Relies on `not_na_passive_responses` defined in the previous cell.
post_mass_correct_guesses = not_na_passive_responses[not_na_passive_responses.human_mass_correct_guesses].post_ent_mass
post_mass_wrong_guesses = not_na_passive_responses[~not_na_passive_responses.human_mass_correct_guesses].post_ent_mass

f_oneway(post_mass_correct_guesses, post_mass_wrong_guesses)

F_onewayResult(statistic=0.006737936877802744, pvalue=0.934681767921192)

### Force responses

In [79]:
# Filter on the force-entropy column itself: `not_na_passive_responses` from the
# mass section only drops rows whose post_ent_mass is missing, which is the wrong
# mask for post_ent_rel (and made this cell depend on the mass cell having run).
force_not_na_responses = passive_responses[passive_responses["post_ent_rel"].notna()]
# Split post_ent_rel by whether the model answered the force question correctly.
post_force_correct_guesses = force_not_na_responses[force_not_na_responses.model_force_correct_guesses].post_ent_rel
post_force_wrong_guesses = force_not_na_responses[~force_not_na_responses.model_force_correct_guesses].post_ent_rel

# One-way ANOVA comparing post_ent_rel between the correct and wrong groups.
f_oneway(post_force_correct_guesses, post_force_wrong_guesses)

F_onewayResult(statistic=5.518325262971857, pvalue=0.020039887109630675)

In [80]:
# Filter on the force-entropy column itself: `not_na_passive_responses` from the
# mass section only drops rows whose post_ent_mass is missing, which is the wrong
# mask for post_ent_rel (and made this cell depend on the mass cell having run).
force_not_na_responses = passive_responses[passive_responses["post_ent_rel"].notna()]
# Split post_ent_rel by whether the human answered the force question correctly.
post_force_correct_guesses = force_not_na_responses[force_not_na_responses.human_force_correct_guesses].post_ent_rel
post_force_wrong_guesses = force_not_na_responses[~force_not_na_responses.human_force_correct_guesses].post_ent_rel

# One-way ANOVA comparing post_ent_rel between the correct and wrong groups.
f_oneway(post_force_correct_guesses, post_force_wrong_guesses)

F_onewayResult(statistic=6.152748062725771, pvalue=0.014155734767664217)

# Is the model better at predicting any force class?

In [26]:
# Model accuracy per participant, restricted to trials of one true force class.
# The class literal carries its own quotes because the helper splices it into a
# DataFrame.query expression.
repel_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'repel'")

none_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'none'")

attract_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'attract'")

In [27]:
# Mean and std over participants, in the order repel, none, attract.
print(np.mean(repel_accuracy_list), np.std(repel_accuracy_list))
print(np.mean(none_accuracy_list), np.std(none_accuracy_list))
print(np.mean(attract_accuracy_list), np.std(attract_accuracy_list))

print()

# Pairwise independent-samples t-tests between the three force classes.
# NOTE(review): three comparisons are run with no multiple-comparison correction.
print(ttest_ind(repel_accuracy_list, none_accuracy_list))
print(ttest_ind(repel_accuracy_list, attract_accuracy_list))
print(ttest_ind(attract_accuracy_list, none_accuracy_list))

0.5972222222222222 0.2718109137608462
0.5666666666666668 0.23570226039551584
0.6388888888888888 0.2873355675774611

Ttest_indResult(statistic=0.40731024949571437, pvalue=0.685669440130222)
Ttest_indResult(statistic=-0.5052132657565751, pvalue=0.6158211993432529)
Ttest_indResult(statistic=0.9319889226569723, pvalue=0.3562082784594377)


# Is the model better at predicting any mass class?

In [28]:
# Model accuracy per participant, restricted to trials of one true mass class.
# The class literal carries its own quotes because the helper splices it into a
# DataFrame.query expression.
a_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'A'")

same_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'same'")

b_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'B'")

In [29]:
# Mean and std over participants, in the order A, same, B.
print(np.mean(a_accuracy_list), np.std(a_accuracy_list))
print(np.mean(same_accuracy_list), np.std(same_accuracy_list))
print(np.mean(b_accuracy_list), np.std(b_accuracy_list))

print()

# Pairwise independent-samples t-tests between the three mass classes.
# NOTE(review): three comparisons are run with no multiple-comparison correction.
print(ttest_ind(a_accuracy_list, same_accuracy_list))
print(ttest_ind(a_accuracy_list, b_accuracy_list))
print(ttest_ind(b_accuracy_list, same_accuracy_list))

0.6666666666666666 0.1717960677340692
0.40625 0.26821652478299446
0.625 0.26020824993326663

Ttest_indResult(statistic=3.921011546884571, pvalue=0.00029150727723554646)
Ttest_indResult(statistic=0.6408699444616551, pvalue=0.5247872789047803)
Ttest_indResult(statistic=2.8073330548972777, pvalue=0.007303420602440454)


# Is the model better at predicting any force class than humans?

In [30]:
# Human accuracy per participant, restricted to trials of one true force class.
# Uses the same class filter and participant grouping as the model lists.
human_repel_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                         "trueRelationship", "'repel'")

human_none_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                        "trueRelationship", "'none'")

human_attract_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                   "trueRelationship", "'attract'")

# Mean and std over participants, in the order repel, none, attract.
print(np.mean(human_repel_accuracy_list), np.std(human_repel_accuracy_list))
print(np.mean(human_none_accuracy_list), np.std(human_none_accuracy_list))
print(np.mean(human_attract_accuracy_list), np.std(human_attract_accuracy_list))

0.5277777777777778 0.3458305443885759
0.6166666666666667 0.2576604138956718
0.6527777777777778 0.29626012510696587


In [31]:
# Paired t-tests, model vs. human, one per true force class. Both lists of a pair
# are built from the same class filter and participant grouping.
print(ttest_rel(repel_accuracy_list, human_repel_accuracy_list))
print(ttest_rel(none_accuracy_list, human_none_accuracy_list))
print(ttest_rel(attract_accuracy_list, human_attract_accuracy_list))

Ttest_relResult(statistic=0.6790483458299109, pvalue=0.5038838059771544)
Ttest_relResult(statistic=-0.7429670248402688, pvalue=0.46502329616012295)
Ttest_relResult(statistic=-0.17052013832006113, pvalue=0.8660922491049519)


# Is the model better at predicting any mass class than humans?

In [32]:
# Human accuracy per participant, restricted to trials of one true mass class.
# Uses the same class filter and participant grouping as the model lists.
human_a_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                     "trueMass", "'A'")

human_same_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                        "trueMass", "'same'")

human_b_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                     "trueMass", "'B'")

# Mean and std over participants, in the order A, same, B.
print(np.mean(human_a_accuracy_list), np.std(human_a_accuracy_list))
print(np.mean(human_same_accuracy_list), np.std(human_same_accuracy_list))
print(np.mean(human_b_accuracy_list), np.std(human_b_accuracy_list))

# Prints only a trailing blank line.
print()

0.4791666666666667 0.21550167568309583
0.3541666666666667 0.3054766312211496
0.4861111111111111 0.33304385578560547



In [33]:
# Paired t-tests, model vs. human, one per true mass class. Both lists of a pair
# are built from the same class filter and participant grouping.
print(ttest_rel(a_accuracy_list, human_a_accuracy_list))
print(ttest_rel(same_accuracy_list, human_same_accuracy_list))
print(ttest_rel(b_accuracy_list, human_b_accuracy_list))

Ttest_relResult(statistic=3.1906626065525105, pvalue=0.0040684049186777696)
Ttest_relResult(statistic=0.6026892774303364, pvalue=0.5526120908127179)
Ttest_relResult(statistic=1.6830069266853707, pvalue=0.10590104861701705)
