# Question to be answered:

- Is the accuracy of model/human significantly better? In both force and mass questions?
- Is the distribution of responses significantly different?

In [20]:
import isaac.constants
isaac.constants.TQDM_DISABLE = True

from torch import nn
from isaac.utils import get_cuda_device_if_available
import joblib

from isaac.dataset import read_dataset, prepare_dataset
from isaac.models import MultiBranchModel
from isaac.constants import BASIC_TRAINING_COLS, MASS_CLASS_COLS, FORCE_CLASS_COLS
from isaac.evaluation import evaluate_saved_model
from isaac.statistical_tests import z_test
from isaac.visualization import plot_confusion_matrix

from scipy.stats import ttest_ind, ttest_rel, f_oneway

import torch
import glob
from torch.autograd import Variable
import numpy as np
import pandas as pd
from tqdm import tqdm

In [21]:
device = get_cuda_device_if_available()
print(device)

cuda:0


In [22]:
normalise_data = True
scaler_path = "scalers_js/passive_dual_scaler.sk"
network_dims = (len(BASIC_TRAINING_COLS), 25, 3, 0.5)
dataset_path = "data/js_test_passive_trials.h5"
class_columns = [list(MASS_CLASS_COLS), list(FORCE_CLASS_COLS)]
multiclass = True
seq_end = 2700
step_size = 3
N_MODELS = 25
TRIALS = read_dataset(dataset_path)

def get_question_accuracy_for_group_of_models(question_type):    

    model_paths = sorted(glob.glob("models/train_25_mb_with_js_data/best_"+question_type+"_model_seed_*.pt"))
    accuracies, predicted = evaluate_saved_model(tqdm(model_paths), network_dims, None, 
                                                 training_columns=BASIC_TRAINING_COLS, class_columns=class_columns, 
                                                 step_size=step_size, seq_end=seq_end, scaler_path=scaler_path,
                                                 arch=MultiBranchModel, multiclass=multiclass, trials=TRIALS)
    
    
    if question_type == "mass":
        question_index = 0
    else:
        question_index = 1

    accuracies = np.stack(accuracies)[:, question_index]
    
    predicted = [x[:, question_index].numpy() for x in predicted]

    return accuracies, predicted

def get_participant_accuracy_filtering_by_answer(passive_responses, answer_column, question_type_answer, filter_by_class):
    
    passive_responses = passive_responses.copy().query(question_type_answer+" == "+filter_by_class)
    return passive_responses[answer_column]

# T-test for MASS questions

## Load model's predictions

In [4]:
print("MASS")
question_type = "mass"
group_mass_acc, group_mass_prediction = get_question_accuracy_for_group_of_models(question_type)
     
print("\nFORCE")
question_type = "force"
group_force_acc, group_force_prediction = get_question_accuracy_for_group_of_models(question_type)

  0%|          | 0/25 [00:00<?, ?it/s]

MASS


100%|██████████| 25/25 [01:49<00:00,  4.22s/it]
  0%|          | 0/25 [00:00<?, ?it/s]


FORCE


100%|██████████| 25/25 [01:41<00:00,  3.97s/it]


In [9]:
np.mean(group_mass_acc), np.std(group_mass_acc)

(62.928000000000004, 2.893650981027256)

In [25]:
np.argmax(group_mass_acc)

2

In [10]:
np.mean(group_force_acc), np.std(group_force_acc)

(62.087999999999994, 3.4886467290340537)

In [38]:
sorted(zip(group_force_acc, range(len(group_force_acc))), key=lambda x: x[0], reverse=True)[:5]

[(69.7, 2), (68.0, 6), (66.5, 14), (65.2, 12), (65.1, 9)]

In [39]:
sorted(zip(group_mass_acc, range(len(group_mass_acc))), key=lambda x: x[0], reverse=True)[:5]

[(68.7, 2), (66.6, 16), (65.6, 14), (65.4, 21), (65.3, 12)]

In [None]:
[2, 16, 14, 21, 12]
[2, 6, 14, 12, 9]

#### Is the model significantly better answering force questions than mass questions? And humans?

In [7]:
print(ttest_ind(group_mass_acc, group_force_acc))

Ttest_indResult(statistic=0.9079120008416578, pvalue=0.36846059086537775)


In [12]:
from isaac.visualization import make_frame_curried
import moviepy.editor as mpy

def make_clip(trial_data):

    duration = len(trial_data)

    n_bodies = sum(["o"+str(i)+".x" in list(trial_data.columns) for i in range(1, 5)])
    
    while (len(trial_data) + 1) % 60 != 0:
        trial_data = trial_data.append(trial_data.iloc[-1], ignore_index=True)
    make_frame = make_frame_curried(trial_data, n_bodies)
    clip = mpy.VideoClip(make_frame, duration=duration / 60)
    return clip, trial_data

clips = [make_clip(replay)[0] for replay in TRIALS[:25]]

In [17]:
a = clips[0]

In [19]:
a.write_videofile("here.mp4")

AttributeError: No 'fps' (frames per second) attribute specified for function write_videofile and the clip has no 'fps' attribute. Either provide e.g. fps=24 in the arguments of the function, or define the clip's fps with `clip.fps=24`