# Question to be answered:

- Is the accuracy of model/human significantly better? In both force and mass questions?
- Is the distribution of responses significantly different?

In [1]:
import isaac.constants
isaac.constants.TQDM_DISABLE = True

from torch import nn
from isaac.utils import get_cuda_device_if_available
import joblib

from isaac.dataset import read_dataset, prepare_dataset
from isaac.models import MultiBranchModel
from isaac.constants import BASIC_TRAINING_COLS, MASS_CLASS_COLS, FORCE_CLASS_COLS
from isaac.evaluation import evaluate_saved_model
from isaac.statistical_tests import z_test

import torch
import glob
from torch.autograd import Variable
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
device = get_cuda_device_if_available()
print(device)

cpu


In [3]:
def get_question_accuracy_for_group_of_models(question_type):
    normalise_data = True
    scaler_path = "scalers/passive_dual_scaler.sk"
    network_dims = (len(BASIC_TRAINING_COLS), 25, 3, 0.5)
    dataset_path = "data/passive_trials_exp1.h5"
    class_columns = [list(MASS_CLASS_COLS), list(FORCE_CLASS_COLS)]
    multiclass = True
    seq_end = 1800
    step_size = 3
    
    models = sorted(glob.glob("models/best_"+question_type+"_model_seed_*.pt"))

    group_accuracy = []
    group_predictions = []

    for model_path in tqdm(models):
        accuracies, predicted = evaluate_saved_model(model_path, network_dims, dataset_path, 
                                                     training_columns=BASIC_TRAINING_COLS, class_columns=class_columns, 
                                                     step_size=step_size, seq_end=seq_end, scaler_path=scaler_path,
                                                     arch=MultiBranchModel, multiclass=multiclass)

        if question_type == "mass":
            accuracy = accuracies[0]
            predicted = predicted[:, 0]
        else:
            accuracy = accuracies[1]
            predicted = predicted[:, 1]

        group_accuracy.append(accuracy)
        group_predictions.append(predicted.numpy())

    return group_accuracy, group_predictions

def get_participant_accuracy(passive_responses, answer_column, question_type_answer):
    return [(df[answer_column] == df[question_type_answer]).sum() / len(df) 
            for participant_id, df in passive_responses.groupby("participant")]


def get_participant_accuracy_filtering_by_answer(passive_responses, answer_column, question_type_answer, filter_by_class):
    
    passive_responses = passive_responses.copy().query(question_type_answer+" == "+filter_by_class)
    
    return [(df[answer_column] == df[question_type_answer]).sum() / len(df) 
            for participant_id, df in passive_responses.groupby("participant")]

# T-test for MASS questions

## Load model's predictions

In [4]:
question_type = "mass"
group_mass_acc, group_mass_prediction = get_question_accuracy_for_group_of_models(question_type)
     
question_type = "force"
group_force_acc, group_force_prediction = get_question_accuracy_for_group_of_models(question_type)

 20%|██        | 1/5 [00:03<00:12,  3.12s/it]

Model's accuracy on test set: [31.01851852 36.11111111]


 40%|████      | 2/5 [00:05<00:08,  2.95s/it]

Model's accuracy on test set: [32.87037037 44.44444444]


 60%|██████    | 3/5 [00:08<00:05,  2.80s/it]

Model's accuracy on test set: [27.77777778 22.22222222]


 80%|████████  | 4/5 [00:10<00:02,  2.69s/it]

Model's accuracy on test set: [32.40740741 39.35185185]


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

Model's accuracy on test set: [33.33333333 43.51851852]


 20%|██        | 1/5 [00:02<00:10,  2.54s/it]

Model's accuracy on test set: [31.01851852 36.11111111]


 40%|████      | 2/5 [00:04<00:07,  2.51s/it]

Model's accuracy on test set: [32.87037037 44.44444444]


 60%|██████    | 3/5 [00:07<00:04,  2.49s/it]

Model's accuracy on test set: [29.16666667 34.25925926]


 80%|████████  | 4/5 [00:09<00:02,  2.50s/it]

Model's accuracy on test set: [32.40740741 39.35185185]


100%|██████████| 5/5 [00:12<00:00,  2.48s/it]

Model's accuracy on test set: [33.33333333 43.51851852]





In [5]:
np.mean(group_mass_acc), np.mean(group_force_acc)

(31.481481481481485, 39.53703703703703)

In [6]:
from scipy.stats import mode

group_mass_prediction = np.array(group_mass_prediction)
group_force_prediction = np.array(group_force_prediction)

majority_mass_predictions, _ = mode(group_mass_prediction, axis=0)
majority_force_predictions, _ = mode(group_force_prediction, axis=0)

In [7]:
ensemble_mass_answers = [MASS_CLASS_COLS[i] for i in majority_mass_predictions[0]]
ensemble_force_answers = [FORCE_CLASS_COLS[i] for i in majority_force_predictions[0]]

## Load human results

In [8]:
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()


 



In [9]:
rdata_path = "for_hector_small/data/exp1.rdata"
r['load'](rdata_path)

is_passive = (r['dfc.l'].condition == '0') & (r['dfc.l'].practice == 0.) & (r['dfc.l'].exclude == 0)
responses = r['dfc.l'][["participant", "mass", "relationship", "trueMass", "trueRelationship", 'post_ent', 'post_ent_rel.rtheta', 'post_ent_mass.rtheta', 'corMass', 'corRel']]
passive_responses = responses[is_passive]

  res = PandasDataFrame.from_items(items)


In [11]:
passive_responses["model_mass"] = ensemble_mass_answers
passive_responses["model_relationship"] = ensemble_force_answers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
human_mass_accuracy_list = get_participant_accuracy(passive_responses, "mass", "trueMass")
human_force_accuracy_list = get_participant_accuracy(passive_responses, "relationship", "trueRelationship")

model_mass_accuracy_list = get_participant_accuracy(passive_responses, "model_mass", "trueMass")
model_force_accuracy_list = get_participant_accuracy(passive_responses, "model_relationship", "trueRelationship")

In [13]:
print(np.mean(human_mass_accuracy_list), "±", np.std(human_mass_accuracy_list))
print(np.mean(model_mass_accuracy_list), "±", np.std(model_mass_accuracy_list))

0.4537037037037037 ± 0.1922272175187657
0.3287037037037037 ± 0.02220292371904037


In [14]:
print(np.mean(model_force_accuracy_list), "±", np.std(model_force_accuracy_list))
print(np.mean(human_force_accuracy_list), "±", np.std(human_force_accuracy_list))

0.4166666666666667 ± 0.05782405554072591
0.611111111111111 ± 0.2151657414559676


# Perform t-test on overall accuracy

In [15]:
# Null hypothesis (opposite of what we want to prove): accuracies are not significantly different
# Alternative hypothesis: accuracies are significantly different

In [16]:
from scipy.stats import ttest_ind, ttest_rel, f_oneway

#### Is the model significantly better answering mass questions than humans?

In [17]:
ttest_rel(human_mass_accuracy_list, model_mass_accuracy_list)

Ttest_relResult(statistic=3.1906626065525105, pvalue=0.0040684049186777696)

#### Is the model significantly better answering force questions than humans?

In [18]:
ttest_rel(human_force_accuracy_list, model_force_accuracy_list)

Ttest_relResult(statistic=3.7455367630225616, pvalue=0.0010559813084842161)

#### Is the model significantly better answering force questions than mass questions?

In [19]:
ttest_ind(model_mass_accuracy_list, model_force_accuracy_list)

Ttest_indResult(statistic=-6.810688280954811, pvalue=1.7530289028889897e-08)

# Perform z-test on overall accuracy

In [20]:
# Null hypothesis (opposite of what we want to prove): accuracies are not significantly different
# Alternative hypothesis: classifier 2 is significantly more accurate than the first

z_zero_point_five = 1.645

Z = z_test(passive_responses["trueMass"], passive_responses["mass"], passive_responses["model_mass"])

print(Z)
if Z < -z_zero_point_five:
    print("Classifier two significantly better than classifier one.")
else:
    print("Classifier two not significantly better.")

Z = z_test(passive_responses["trueRelationship"], passive_responses["relationship"], passive_responses["model_relationship"])

print(Z)
if Z < -z_zero_point_five:
    print("Classifier two significantly better than classifier one.")
else:
    print("Classifier two not significantly better than classifier one.")

# Are correct guesses / errors correlated between humans and model?

In [21]:
from scipy.stats import chisquare

In [22]:
passive_responses["model_mass_correct_guesses"] = (passive_responses["model_mass"] == passive_responses["trueMass"])
passive_responses["model_force_correct_guesses"] = (passive_responses["model_relationship"] == passive_responses["trueRelationship"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
mass_coincidence = get_participant_accuracy(passive_responses, "mass", "model_mass")
force_coincidence = get_participant_accuracy(passive_responses, "relationship", "model_relationship")

print(np.mean(mass_coincidence), np.std(mass_coincidence))
print(np.mean(force_coincidence), np.std(force_coincidence))

0.34259259259259256 0.14669425479402648
0.40277777777777785 0.1353721436779023


In [24]:
def test_for_independence(first_answers, second_answers):
    both_correct = (first_answers & second_answers).sum()
    both_wrong = (~first_answers & ~second_answers).sum()
    first_correct_second_wrong = (~first_answers & second_answers).sum()
    first_wrong_second_correct = (first_answers & ~second_answers).sum()
    
    matrix = [[both_correct, first_correct_second_wrong], [first_wrong_second_correct, both_wrong]]
    chisquare_results = chisquare(matrix, axis=None)
    
    return matrix, chisquare_results

In [25]:
test_for_independence(passive_responses["corRel"], passive_responses["model_force_correct_guesses"])

([[51, 39], [81, 45]],
 Power_divergenceResult(statistic=19.333333333333336, pvalue=0.00023326399901124658))

In [26]:
test_for_independence(passive_responses["corMass"], passive_responses["model_mass_correct_guesses"])

([[35, 36], [63, 82]],
 Power_divergenceResult(statistic=28.703703703703702, pvalue=2.5845665408193226e-06))

In [27]:
def test_for_coincidence(first_answers, second_answers):
    
    matrix = []
    all_classes = first_answers.unique()
    
    for class_name in all_classes:
        this_class_coincidences = []
        for second_class_name in all_classes:
            n_coincidences = ((first_answers == class_name) & (second_answers == second_class_name)).sum()
            this_class_coincidences.append(n_coincidences)
            
        matrix.append(this_class_coincidences)
    
    chisquare_results = chisquare(matrix, axis=None)
    
    return matrix, chisquare_results, all_classes

In [28]:
test_for_coincidence(passive_responses["mass"], passive_responses["model_mass"])

([[0, 0, 69], [0, 0, 72], [0, 1, 74]],
 Power_divergenceResult(statistic=426.58333333333337, pvalue=3.8326502365740223e-87),
 array(['same', 'B', 'A'], dtype=object))

In [29]:
test_for_coincidence(passive_responses["relationship"], passive_responses["model_relationship"])

([[2, 60, 0], [10, 85, 0], [2, 57, 0]],
 Power_divergenceResult(statistic=374.91666666666663, pvalue=4.3192848529991275e-76),
 array(['attract', 'none', 'repel'], dtype=object))

# Are correct guesses / errors correlated to informativeness?

### Mass responses

In [30]:
not_na_passive_responses = passive_responses[passive_responses["post_ent"].notna()]

post_mass_correct_guesses = not_na_passive_responses.query("model_mass_correct_guesses")["post_ent_mass.rtheta"]
post_mass_wrong_guesses = not_na_passive_responses.query("not model_mass_correct_guesses")["post_ent_mass.rtheta"]

f_oneway(post_mass_correct_guesses, post_mass_wrong_guesses)

F_onewayResult(statistic=4.297642802150152, pvalue=0.03976847389477966)

In [31]:
post_mass_correct_guesses = not_na_passive_responses.query("corMass == 1")["post_ent_mass.rtheta"]
post_mass_wrong_guesses = not_na_passive_responses.query("corMass == 0")["post_ent_mass.rtheta"]

f_oneway(post_mass_correct_guesses, post_mass_wrong_guesses)

F_onewayResult(statistic=0.03857733827167141, pvalue=0.8445376237361524)

### Force responses

In [32]:
post_force_correct_guesses = not_na_passive_responses[not_na_passive_responses.model_force_correct_guesses]['post_ent_rel.rtheta']
post_force_wrong_guesses = not_na_passive_responses[~not_na_passive_responses.model_force_correct_guesses]['post_ent_rel.rtheta']

f_oneway(post_force_correct_guesses, post_force_wrong_guesses)

F_onewayResult(statistic=0.02512060144015596, pvalue=0.8742667881234977)

In [33]:
post_force_correct_guesses = not_na_passive_responses[(not_na_passive_responses.corRel).astype(bool)]['post_ent_rel.rtheta']
post_force_wrong_guesses = not_na_passive_responses[~(not_na_passive_responses.corRel).astype(bool)]['post_ent_rel.rtheta']

f_oneway(post_force_correct_guesses, post_force_wrong_guesses)

F_onewayResult(statistic=5.982515021800705, pvalue=0.015531865919044263)

#### Same statistic calculated a different way

import statsmodels.api as sm
from statsmodels.formula.api import ols

df_copy = not_na_passive_responses.copy()
df_copy = df_copy.rename({'post_ent_rel.rtheta': 'post_ent_rel_rtheta',
                          'post_ent_mass.rtheta': 'post_ent_mass_rtheta'}, axis=1)

moore_lm = ols('post_ent_mass_rtheta ~ corMass', data=df_copy).fit()
table = sm.stats.anova_lm(moore_lm, typ=2)
print(table)

moore_lm = ols('post_ent_rel_rtheta ~ corRel', data=df_copy).fit()
table = sm.stats.anova_lm(moore_lm, typ=2)
print(table)

# Is the model better at predicting any force class?

In [34]:
repel_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'repel'")

none_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'none'")

attract_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_relationship", 
                                                                   "trueRelationship", "'attract'")

In [35]:
print(np.mean(repel_accuracy_list), np.std(repel_accuracy_list))
print(np.mean(none_accuracy_list), np.std(none_accuracy_list))
print(np.mean(attract_accuracy_list), np.std(attract_accuracy_list))

print()

print(ttest_ind(repel_accuracy_list, none_accuracy_list))
print(ttest_ind(repel_accuracy_list, attract_accuracy_list))
print(ttest_ind(attract_accuracy_list, none_accuracy_list))

0.0 0.0
0.90625 0.14091405477571545
0.0625 0.16535945694153692

Ttest_indResult(statistic=-30.843071863340935, pvalue=2.2310244708265363e-32)
Ttest_indResult(statistic=-1.8126539343499317, pvalue=0.07641426001850131)
Ttest_indResult(statistic=-18.625344245835084, pvalue=4.665556838046147e-23)


# Is the model better at predicting any mass class?

In [36]:
a_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'A'")

same_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'same'")

b_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "model_mass", 
                                                                   "trueMass", "'B'")

In [37]:
print(np.mean(a_accuracy_list), np.std(a_accuracy_list))
print(np.mean(same_accuracy_list), np.std(same_accuracy_list))
print(np.mean(b_accuracy_list), np.std(b_accuracy_list))

print()

print(ttest_ind(a_accuracy_list, same_accuracy_list))
print(ttest_ind(a_accuracy_list, b_accuracy_list))
print(ttest_ind(b_accuracy_list, same_accuracy_list))

0.986111111111111 0.0666087711571211
0.0 0.0
0.0 0.0

Ttest_indResult(statistic=70.99999999999999, pvalue=1.1611844013922297e-48)
Ttest_indResult(statistic=70.99999999999999, pvalue=1.1611844013922297e-48)
Ttest_indResult(statistic=nan, pvalue=nan)


# Is the model better at predicting any force class than humans?

In [38]:
human_repel_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                         "trueRelationship", "'repel'")

human_none_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                        "trueRelationship", "'none'")

human_attract_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "relationship", 
                                                                   "trueRelationship", "'attract'")

print(np.mean(human_repel_accuracy_list), np.std(human_repel_accuracy_list))
print(np.mean(human_none_accuracy_list), np.std(human_none_accuracy_list))
print(np.mean(human_attract_accuracy_list), np.std(human_attract_accuracy_list))

0.5277777777777778 0.3458305443885759
0.6041666666666666 0.25937290829143195
0.75 0.3227486121839514


In [39]:
print(ttest_rel(repel_accuracy_list, human_repel_accuracy_list))
print(ttest_rel(none_accuracy_list, human_none_accuracy_list))
print(ttest_rel(attract_accuracy_list, human_attract_accuracy_list))

Ttest_relResult(statistic=-7.318998697600914, pvalue=1.9058341561166007e-07)
Ttest_relResult(statistic=4.2818917730832595, pvalue=0.00027888703835986467)
Ttest_relResult(statistic=-8.75193294508301, pvalue=8.879042557701807e-09)


# Is the model better at predicting any mass class than humans?

In [40]:
human_a_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                     "trueMass", "'A'")

human_same_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                        "trueMass", "'same'")

human_b_accuracy_list = get_participant_accuracy_filtering_by_answer(passive_responses, "mass", 
                                                                     "trueMass", "'B'")

print(np.mean(human_a_accuracy_list), np.std(human_a_accuracy_list))
print(np.mean(human_same_accuracy_list), np.std(human_same_accuracy_list))
print(np.mean(human_b_accuracy_list), np.std(human_b_accuracy_list))

print()

0.5 0.2545875386086578
0.375 0.3236438943814179
0.4861111111111111 0.33304385578560547



In [41]:
print(ttest_rel(a_accuracy_list, human_a_accuracy_list))
print(ttest_rel(same_accuracy_list, human_same_accuracy_list))
print(ttest_rel(b_accuracy_list, human_b_accuracy_list))

Ttest_relResult(statistic=8.57694401686331, pvalue=1.2724602303790703e-08)
Ttest_relResult(statistic=-5.556838403145626, pvalue=1.184889244136472e-05)
Ttest_relResult(statistic=-7.000000000000001, pvalue=3.9141861589268866e-07)
