# Questions to be answered:

- Is the accuracy of the model significantly different from (better or worse than) that of humans? Does this hold for both force and mass questions?
- Is the distribution of responses significantly different?

In [1]:
import isaac.constants
isaac.constants.TQDM_DISABLE = True

from torch import nn
from isaac.utils import get_cuda_device_if_available
import joblib

from isaac.dataset import read_dataset, prepare_dataset
from isaac.models import MultiBranchModel
from isaac.constants import BASIC_TRAINING_COLS, MASS_CLASS_COLS, FORCE_CLASS_COLS
from isaac.evaluation import evaluate_saved_model
from isaac.statistical_tests import z_test
from isaac.visualization import plot_confusion_matrix

from scipy.stats import ttest_ind, ttest_rel, f_oneway

import torch
import glob
from torch.autograd import Variable
import numpy as np
import pandas as pd
from tqdm import tqdm

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Pick the torch device for this session and show which one was selected
# (presumably CUDA when available, otherwise CPU, going by the helper's name).
device = get_cuda_device_if_available()
print(device)

cpu


In [4]:
def get_question_accuracy_for_group_of_models(question_type):
    """Evaluate every saved multi-branch checkpoint for one question type.

    Globs ``models/train_25_mb/best_<question_type>_model_seed_*.pt``, evaluates
    each checkpoint (in sorted path order) on ``data/test_passive_trials.h5``
    through ``evaluate_saved_model`` and keeps only the output branch that
    belongs to ``question_type``.

    Parameters
    ----------
    question_type : str
        ``"mass"`` selects branch 0; any other value (in practice ``"force"``)
        selects branch 1. The value is also spliced into the checkpoint glob
        pattern, so it decides which checkpoints are loaded.

    Returns
    -------
    group_accuracy : list
        One entry per checkpoint: the selected element of the accuracies
        returned by ``evaluate_saved_model``.
    group_predictions : list of numpy.ndarray
        One entry per checkpoint: the selected column of the returned
        predictions, converted with ``.numpy()``.

    Both lists are empty when no checkpoint matches the glob pattern.
    """
    # Evaluation settings shared by every checkpoint in the group.
    scaler_path = "scalers/passive_dual_scaler.sk"
    # NOTE(review): the meaning of each tuple position is defined by
    # evaluate_saved_model / MultiBranchModel, not here -- confirm before editing.
    network_dims = (len(BASIC_TRAINING_COLS), 25, 3, 0.5)
    dataset_path = "data/test_passive_trials.h5"
    # Mass columns are listed first and force columns second; the branch index
    # below assumes evaluate_saved_model reports results in this same order.
    class_columns = [list(MASS_CLASS_COLS), list(FORCE_CLASS_COLS)]
    seq_end = 2700
    step_size = 3

    # The branch to keep depends only on the question type, so pick it once
    # instead of re-testing it for every checkpoint.
    branch = 0 if question_type == "mass" else 1

    models = sorted(glob.glob("models/train_25_mb/best_" + question_type + "_model_seed_*.pt"))

    group_accuracy = []
    group_predictions = []

    for model_path in tqdm(models):
        accuracies, predicted = evaluate_saved_model(model_path, network_dims, dataset_path,
                                                     training_columns=BASIC_TRAINING_COLS, class_columns=class_columns,
                                                     step_size=step_size, seq_end=seq_end, scaler_path=scaler_path,
                                                     arch=MultiBranchModel, multiclass=True, trials=None)

        group_accuracy.append(accuracies[branch])
        group_predictions.append(predicted[:, branch].numpy())

    return group_accuracy, group_predictions


def get_participant_accuracy_filtering_by_answer(passive_responses, answer_column, question_type_answer, filter_by_class):
    """Return one column of the responses, restricted to rows of a given class.

    Parameters
    ----------
    passive_responses : pandas.DataFrame
        Table of responses; it is not modified.
    answer_column : str
        Name of the column to return.
    question_type_answer : str
        Name of the column compared against ``filter_by_class``.
    filter_by_class : str or number
        Right-hand side of the ``==`` comparison. It is inserted verbatim into
        a ``DataFrame.query`` expression, so a string label must already carry
        its own quotes (an unquoted word is resolved as a column name).

    Returns
    -------
    pandas.Series
        ``answer_column`` for the rows where
        ``question_type_answer == filter_by_class``, with the original index.
    """
    # query() already returns a new frame, so no defensive copy of the whole
    # input is needed. format() also accepts non-string class values (e.g. ints),
    # which plain string concatenation would reject with a TypeError.
    expression = "{} == {}".format(question_type_answer, filter_by_class)
    return passive_responses.query(expression)[answer_column]

# T-test for MASS questions

## Load model's predictions

In [5]:
# Evaluate the saved checkpoints once per question type. Each call returns a
# pair: (per-checkpoint accuracies, per-checkpoint predictions) for that question.
print("MASS")
question_type = "mass"
group_mass_acc, group_mass_prediction = get_question_accuracy_for_group_of_models(question_type)
     
print("\nFORCE")
question_type = "force"
group_force_acc, group_force_prediction = get_question_accuracy_for_group_of_models(question_type)

  0%|          | 0/25 [00:00<?, ?it/s]

MASS


  4%|▍         | 1/25 [00:15<06:18, 15.76s/it]

Model's accuracy on test set: [64.9 60.6]


  8%|▊         | 2/25 [00:29<05:45, 15.04s/it]

Model's accuracy on test set: [62.5 61.7]


 12%|█▏        | 3/25 [00:42<05:17, 14.43s/it]

Model's accuracy on test set: [67.1 68.2]


 16%|█▌        | 4/25 [00:55<04:54, 14.04s/it]

Model's accuracy on test set: [64.2 64.3]


 20%|██        | 5/25 [01:09<04:39, 13.98s/it]

Model's accuracy on test set: [64.  65.4]


 24%|██▍       | 6/25 [01:22<04:22, 13.81s/it]

Model's accuracy on test set: [62.4 61.1]


 28%|██▊       | 7/25 [01:35<04:06, 13.69s/it]

Model's accuracy on test set: [64.2 63.4]


 32%|███▏      | 8/25 [01:49<03:51, 13.63s/it]

Model's accuracy on test set: [60.3 61.3]


 36%|███▌      | 9/25 [02:02<03:36, 13.55s/it]

Model's accuracy on test set: [59.6 56.5]


 40%|████      | 10/25 [02:16<03:22, 13.49s/it]

Model's accuracy on test set: [59.5 58.6]


 44%|████▍     | 11/25 [02:29<03:07, 13.42s/it]

Model's accuracy on test set: [63.  62.4]


 48%|████▊     | 12/25 [02:42<02:54, 13.41s/it]

Model's accuracy on test set: [63.6 63.9]


 52%|█████▏    | 13/25 [02:56<02:40, 13.41s/it]

Model's accuracy on test set: [62.6 58.3]


 56%|█████▌    | 14/25 [03:09<02:27, 13.45s/it]

Model's accuracy on test set: [69.1 68.7]


 60%|██████    | 15/25 [03:23<02:14, 13.46s/it]

Model's accuracy on test set: [61.5 54. ]


 64%|██████▍   | 16/25 [03:36<02:00, 13.40s/it]

Model's accuracy on test set: [64.4 62.9]


 68%|██████▊   | 17/25 [03:49<01:46, 13.31s/it]

Model's accuracy on test set: [68.2 63. ]


 72%|███████▏  | 18/25 [04:02<01:32, 13.28s/it]

Model's accuracy on test set: [63.  63.7]


 76%|███████▌  | 19/25 [04:15<01:19, 13.23s/it]

Model's accuracy on test set: [62.9 57.1]


 80%|████████  | 20/25 [04:29<01:06, 13.20s/it]

Model's accuracy on test set: [63.  57.6]


 84%|████████▍ | 21/25 [04:42<00:52, 13.19s/it]

Model's accuracy on test set: [67.  61.3]


 88%|████████▊ | 22/25 [04:55<00:39, 13.12s/it]

Model's accuracy on test set: [64.7 61.8]


 92%|█████████▏| 23/25 [05:08<00:26, 13.08s/it]

Model's accuracy on test set: [62.  60.8]


 96%|█████████▌| 24/25 [05:21<00:13, 13.06s/it]

Model's accuracy on test set: [63.2 59. ]


100%|██████████| 25/25 [05:34<00:00, 13.14s/it]
  0%|          | 0/25 [00:00<?, ?it/s]

Model's accuracy on test set: [63.4 64. ]

FORCE


  4%|▍         | 1/25 [00:13<05:34, 13.92s/it]

Model's accuracy on test set: [63.9 63. ]


  8%|▊         | 2/25 [00:27<05:18, 13.85s/it]

Model's accuracy on test set: [61.7 65.1]


 12%|█▏        | 3/25 [00:40<05:01, 13.69s/it]

Model's accuracy on test set: [67.1 68.2]


 16%|█▌        | 4/25 [00:57<05:04, 14.49s/it]

Model's accuracy on test set: [62.5 64.8]


 20%|██        | 5/25 [01:11<04:47, 14.36s/it]

Model's accuracy on test set: [65.1 61.9]


 24%|██▍       | 6/25 [01:26<04:39, 14.70s/it]

Model's accuracy on test set: [59.5 59.4]


 28%|██▊       | 7/25 [01:40<04:17, 14.32s/it]

Model's accuracy on test set: [63.3 63.8]


 32%|███▏      | 8/25 [01:55<04:06, 14.49s/it]

Model's accuracy on test set: [60.5 62.4]


 36%|███▌      | 9/25 [02:08<03:46, 14.18s/it]

Model's accuracy on test set: [58.7 61.3]


 40%|████      | 10/25 [02:22<03:32, 14.16s/it]

Model's accuracy on test set: [58.6 60.4]


 44%|████▍     | 11/25 [02:36<03:16, 14.00s/it]

Model's accuracy on test set: [63.2 64.8]


 48%|████▊     | 12/25 [02:49<02:59, 13.84s/it]

Model's accuracy on test set: [60.2 63.8]


 52%|█████▏    | 13/25 [03:03<02:44, 13.69s/it]

Model's accuracy on test set: [63.6 61. ]


 56%|█████▌    | 14/25 [03:17<02:31, 13.78s/it]

Model's accuracy on test set: [68.3 71.6]


 60%|██████    | 15/25 [03:32<02:21, 14.16s/it]

Model's accuracy on test set: [59.9 56.5]


 64%|██████▍   | 16/25 [03:46<02:08, 14.27s/it]

Model's accuracy on test set: [62.5 66.9]


 68%|██████▊   | 17/25 [04:02<01:56, 14.59s/it]

Model's accuracy on test set: [67.7 61.8]


 72%|███████▏  | 18/25 [04:16<01:41, 14.47s/it]

Model's accuracy on test set: [63.  63.7]


 76%|███████▌  | 19/25 [04:29<01:25, 14.25s/it]

Model's accuracy on test set: [61.5 58.5]


 80%|████████  | 20/25 [04:43<01:10, 14.11s/it]

Model's accuracy on test set: [63.  57.6]


 84%|████████▍ | 21/25 [04:57<00:56, 14.02s/it]

Model's accuracy on test set: [63.9 62.7]


 88%|████████▊ | 22/25 [05:11<00:42, 14.09s/it]

Model's accuracy on test set: [64.8 62.3]


 92%|█████████▏| 23/25 [05:25<00:28, 14.07s/it]

Model's accuracy on test set: [64.8 60.5]


 96%|█████████▌| 24/25 [05:41<00:14, 14.39s/it]

Model's accuracy on test set: [59.4 62.5]


100%|██████████| 25/25 [05:57<00:00, 14.90s/it]

Model's accuracy on test set: [63.4 64. ]





In [9]:
# Mean and standard deviation of the per-checkpoint mass accuracies.
# np.std uses ddof=0 by default, i.e. this is the population standard deviation.
np.mean(group_mass_acc), np.std(group_mass_acc)

(63.61200000000001, 2.330033476154366)

In [10]:
# Mean and standard deviation of the per-checkpoint force accuracies.
# np.std uses ddof=0 by default, i.e. this is the population standard deviation.
np.mean(group_force_acc), np.std(group_force_acc)

(62.74, 3.203373222089489)

#### Is the model significantly better at answering force questions than mass questions? What about humans?

In [26]:
# Independent two-sample t-tests comparing mass vs. force accuracy,
# first for the models and then for the human participants.
# NOTE(review): none of the four *_accuracy_list names is defined in the cells
# visible here (the cells above build group_mass_acc / group_force_acc), and the
# execution count jumps from 10 to 26 -- confirm these lists are created before
# this cell on a fresh Restart & Run All.
print(ttest_ind(model_mass_accuracy_list, model_force_accuracy_list))
print(ttest_ind(human_mass_accuracy_list, human_force_accuracy_list))

Ttest_indResult(statistic=-1.9991187485802633, pvalue=0.049479134569990736)
Ttest_indResult(statistic=-4.3304952743613745, pvalue=4.86608749152777e-05)
