# Eyettention

In [None]:
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
import model
import torch
from torch.utils import model_zoo
import pandas as pd
from utils import *
from sklearn.model_selection import StratifiedKFold, KFold
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam, RMSprop
from transformers import BertTokenizerFast, BertTokenizer
from model import Eyettention
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from torch.nn.functional import cross_entropy, softmax
from collections import deque
import pickle
import json
import matplotlib.pyplot as plt
import argparse
import random
from scasim import *
from evaluate_e_z_reader_model import *
from uniform_model import *

In [None]:
# Notebook-wide run settings.

# Export TOKENIZERS_PARALLELISM=false for this process (presumably to keep the
# HuggingFace tokenizers library from using its own parallelism — TODO confirm).
os.environ.update(TOKENIZERS_PARALLELISM="false")

# DEVICE: device string for the run; use 'cuda' instead of 'cpu' to target a GPU.
# scanpath_gen_flag: when truthy, evaluation also generates scan paths and
#   scores them against the baselines.
# atten_type: attention variant label.
DEVICE, scanpath_gen_flag, atten_type = 'cpu', True, "local_g"

# Folder that receives model checkpoints and pickled result dictionaries.
save_data_folder = "./drive/MyDrive/results/BSC/NRS"

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
# NOTE(review): save_data_folder is the relative path ./drive/MyDrive/...,
# so this presumably relies on /content being the working directory — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Training and evaluation loop**

In [None]:
if __name__ == '__main__':
	# Index of the CUDA device to use when a GPU is available.
	gpu = 0

	# torch.set_default_tensor_type is deprecated (the recorded output of this
	# cell shows a warning raised from it); requesting float32 as the default
	# dtype is the supported replacement for 'torch.FloatTensor'.
	torch.set_default_dtype(torch.float32)
	availbl = torch.cuda.is_available()
	device = f'cuda:{gpu}' if availbl else 'cpu'

	# Hyper-parameters and data settings shared by the dataset, the model and
	# the training/evaluation code below.
	cf = {"model_pretrained": "bert-base-chinese",
			"lr": 1e-3,
			"max_grad_norm": 10,
			"n_epochs": 150,  # 1000
			"n_folds": 5,
			"dataset": 'BSC',
			"atten_type": 'local-g',
			"batch_size": 256,
			"max_sn_len": 27, #include start token and end token
			"max_sp_len": 40, #include start token and end token
			"norm_type": "z-score",
			"earlystop_patience": 150,
			"max_pred_len": 60
			}

	# Encode the labels as integer categories. The classes are the integers
	# from -max_sn_len+3 up to max_sn_len-2, plus the exclusive category
	# max_sn_len-1, which serves as the end sign.
	le = LabelEncoder()
	le.fit(np.append(np.arange(-cf["max_sn_len"]+3, cf["max_sn_len"]-1), cf["max_sn_len"]-1))
	#le.classes_

	#load corpus
	word_info_df, pos_info_df, eyemovement_df = load_corpus(cf["dataset"])
	# Unique sentence ids (np.unique returns them sorted)
	sn_list = np.unique(eyemovement_df.sn.values).tolist()
	# Unique reader ids (np.unique returns them sorted)
	reader_list = np.unique(eyemovement_df.id.values).tolist()

	# Seeds Python's `random` module, which draws the reader/sentence splits.
	# NOTE(review): numpy and torch RNGs are not seeded here.
	random.seed(0)
	# Index used in checkpoint and result file names.
	fold_indx = 0
	# Scan paths collected across all repetitions (for scanpath generation)
	sp_dnn_list = []
	sp_human_list = []
	for i in range(5):
		print('Sampling time:', i)
		# Result container for this repetition: every metric starts out as an
		# empty list (key order is kept as in the pickled results).
		metric_keys = (
			'val_loss', 'train_loss',
			'test_ll', 'test_ll_SE',
			'test_mse_dur', 'test_mse_dur_SE',
			'test_mse_land_pos', 'test_mse_land_pos_SE',
			'central_scasim_dnn', 'central_scasim_dnn_SE',
			'central_scasim_human', 'central_scasim_human_SE',
			'scasim_dnn', 'scasim_dnn_SE',
			'scasim_human', 'scasim_human_SE',
			'uniform_scasim', 'uniform_scasim_SE',
			'uniform_central_scasim', 'uniform_central_scasim_SE',
			'uniform_nll_SE', 'uniform_nll',
			'uniform_mse_dur_SE', 'uniform_mse_dur',
			'uniform_mse_land_pos_SE', 'uniform_mse_land_pos',
			'ez_reader_scasim', 'ez_reader_scasim_SE',
			'ez_reader_central_scasim', 'ez_reader_central_scasim_SE',
			'ez_reader_mse_dur_SE', 'ez_reader_mse_dur',
			'ez_reader_mse_land_pos_SE', 'ez_reader_mse_land_pos',
		)
		loss_dict = {metric_key: [] for metric_key in metric_keys}

		# Reader split: 30% of the readers (rounded up) are held out; half of
		# the held-out readers (rounded up) validate, the rest test, and all
		# remaining readers train.
		readers_test = random.sample(reader_list, int(np.ceil(len(reader_list)*0.3)))
		reader_list_val = random.sample(readers_test, int(np.ceil(len(readers_test)*0.5)))
		reader_list_test = list(set(readers_test) - set(reader_list_val))
		reader_list_train = list(set(reader_list) - set(reader_list_test) - set(reader_list_val))

		# Sentence split, drawn the same way as the reader split.
		sn_test = random.sample(sn_list, int(np.ceil(len(sn_list)*0.3)))
		sn_list_val = random.sample(sn_test, int(np.ceil(len(sn_test)*0.5)))
		sn_list_test = list(set(sn_test) - set(sn_list_val))
		sn_list_train = list(set(sn_list) - set(sn_list_test) - set(sn_list_val))

		# Tokenizer of the pretrained language model named in the config
		tokenizer = BertTokenizer.from_pretrained(cf['model_pretrained'])

		# Datasets and loaders. Only the training loader shuffles; the training
		# and validation loaders drop a trailing incomplete batch, the test
		# loader keeps it.
		dataset_train = BSCdataset(word_info_df, eyemovement_df, cf, reader_list_train, sn_list_train, tokenizer)
		train_dataloaderr = DataLoader(dataset_train, batch_size=cf["batch_size"], shuffle=True, drop_last=True)

		dataset_val = BSCdataset(word_info_df, eyemovement_df, cf, reader_list_val, sn_list_val, tokenizer)
		val_dataloaderr = DataLoader(dataset_val, batch_size=cf["batch_size"], shuffle=False, drop_last=True)

		dataset_test = BSCdataset(word_info_df, eyemovement_df, cf, reader_list_test, sn_list_test, tokenizer)
		test_dataloaderr = DataLoader(dataset_test, batch_size=cf["batch_size"], shuffle=False, drop_last=False)

		# Statistics for z-score normalisation, taken from the training loader only
		fix_dur_mean, fix_dur_std = calculate_mean_std(
			dataloader=train_dataloaderr, feat_key="sp_fix_dur", padding_value=0, scale=1000)
		landing_pos_mean, landing_pos_std = calculate_mean_std(
			dataloader=train_dataloaderr, feat_key="sp_landing_pos", padding_value=0)
		sn_word_len_mean, sn_word_len_std = calculate_mean_std(
			dataloader=train_dataloaderr, feat_key="sn_word_len")

		# Model and optimiser for this repetition
		dnn = Eyettention(cf)

		episode = 0
		optimizer = Adam(dnn.parameters(), lr=cf["lr"])
		dnn.train()
		dnn.to(device)
		# Rolling windows holding the most recent 100 batch losses
		av_score, av_location_score, av_duration_score, av_land_pos_score = (
			deque(maxlen=100) for _ in range(4))
		# Best score so far and the epoch at which it was reached
		old_score = 1e10
		save_ep_couter = 0
		print('Start training')
		print("fold_indx", fold_indx)
		# Loss modules hold no state, so they are built once here instead of
		# once per batch; this also guarantees they exist for validation even
		# if the training loader yields no batch.
		loss = nn.CrossEntropyLoss(reduction="none")
		dur_loss = nn.MSELoss(reduction="none")
		land_pos_loss = nn.MSELoss(reduction="none")
		# Regression labels are cut to max_sp_len - 1 steps to line up with the
		# duration / landing-position predictions (the shape notes of the
		# original code give [256, 39, 1] for max_sp_len = 40).
		n_pred_steps = cf["max_sp_len"] - 1

		def _batch_losses(batch):
			"""Run the model on one batch and return its three mean losses.

			Returns the tuple (location_error, duration_error, land_pos_error)
			of scalar tensors. Gradients are tracked unless the caller wraps
			the call in torch.no_grad().
			"""
			sn_input_ids = batch["sn_input_ids"].to(device)
			sn_attention_mask = batch["sn_attention_mask"].to(device)
			sp_input_ids = batch["sp_input_ids"].to(device)
			sp_pos = batch["sp_pos"].to(device)
			sp_landing_pos = batch["sp_landing_pos"].to(device)
			# Durations are divided by 1000, matching scale=1000 used for the statistics
			sp_fix_dur = (batch["sp_fix_dur"]/1000).to(device)
			sn_word_len = batch["sn_word_len"].to(device)

			# z-score normalisation of the gaze features; positions whose raw
			# duration is 0 (the padding value) are forced back to 0 afterwards
			fix_mask = ~torch.eq(sp_fix_dur, 0)
			sp_fix_dur = torch.nan_to_num((sp_fix_dur-fix_dur_mean)/fix_dur_std * fix_mask)
			sp_landing_pos = torch.nan_to_num((sp_landing_pos - landing_pos_mean)/landing_pos_std * fix_mask)
			sn_word_len = torch.nan_to_num((sn_word_len - sn_word_len_mean)/sn_word_len_std)

			location_preds, duration_preds, landing_pos_preds, _ = dnn(sn_emd=sn_input_ids,
										sn_mask=sn_attention_mask,
										sp_emd=sp_input_ids,
										sp_pos=sp_pos,
										word_ids_sn=None,
										word_ids_sp=None,
										sp_fix_dur=sp_fix_dur,
										sp_landing_pos=sp_landing_pos,
										sn_word_len=sn_word_len,
										sn_pred=None,
										sn_word_freq=None)  #[batch, step, dec_o_dim]
			location_preds = location_preds.permute(0,2,1)  #[batch, dec_o_dim, step]

			# Fixation location: cross-entropy averaged over non-padded steps
			pad_mask, label = load_label(sp_pos, cf, le, device)
			location_error = torch.mean(torch.masked_select(loss(location_preds, label), ~pad_mask))

			# Fixation duration and landing position: MSE over all steps
			duration_error = torch.mean(
				dur_loss(duration_preds.squeeze(-1), sp_fix_dur[:, :n_pred_steps]))
			land_pos_error = torch.mean(
				land_pos_loss(landing_pos_preds.squeeze(-1), sp_landing_pos[:, :n_pred_steps]))
			return location_error, duration_error, land_pos_error

		for episode_i in range(episode, cf["n_epochs"]+1):
			dnn.train()
			print('episode:', episode_i)
			counter = 0
			for batchh in train_dataloaderr:
				counter += 1

				# zero old gradients
				optimizer.zero_grad()
				batch_location_error, batch_duration_error, batch_land_pos_error = _batch_losses(batchh)

				# Combined loss over location, duration and landing position
				batch_error = batch_location_error + batch_duration_error + batch_land_pos_error

				# backpropagate loss
				batch_error.backward()
				# clip gradients
				gradient_clipping(dnn, cf["max_grad_norm"])

				#learn
				optimizer.step()
				av_location_score.append(batch_location_error.to('cpu').detach().numpy())
				av_duration_score.append(batch_duration_error.to('cpu').detach().numpy())
				av_land_pos_score.append(batch_land_pos_error.to('cpu').detach().numpy())
				av_score.append(batch_error.to('cpu').detach().numpy())
				print('\rSample {}\tLocation Loss: {:.10f}\tDuration Loss: {:.10f}\tLanding position Loss: {:.10f}'.format(
					counter, np.mean(av_location_score), np.mean(av_duration_score), np.mean(av_land_pos_score)), end=" ")
			# The training loss is only logged. Checkpointing and early stopping
			# use the validation loss below; also comparing the training loss
			# against `old_score` would let it decide which model is kept.
			loss_dict['train_loss'].append(np.mean(av_score))

			location_val_loss = []
			duration_val_loss = []
			land_pos_val_loss = []
			val_loss = []
			dnn.eval()
			with torch.no_grad():
				for batchh in val_dataloaderr:
					location_error_val, duration_error_val, land_pos_error_val = _batch_losses(batchh)
					location_val_loss.append(location_error_val.detach().to('cpu').numpy())
					duration_val_loss.append(duration_error_val.detach().to('cpu').numpy())
					land_pos_val_loss.append(land_pos_error_val.detach().to('cpu').numpy())

					combined_loss = location_error_val + duration_error_val + land_pos_error_val
					val_loss.append(combined_loss.detach().to('cpu').numpy())

			print('\nValidation loss for locations {} \n'.format(np.mean(location_val_loss)))
			print('\nValidation loss for duration {} \n'.format(np.mean(duration_val_loss)))
			print('\nValidation loss for landing position {} \n'.format(np.mean(land_pos_val_loss)))
			epoch_val_loss = np.mean(val_loss)
			loss_dict['val_loss'].append(epoch_val_loss)

			if epoch_val_loss < old_score:
				# save model if val loss is smallest
				torch.save(dnn.state_dict(), '{}/BSC_3head_arch_NRS_{}.pth'.format(save_data_folder, fold_indx))
				old_score = epoch_val_loss
				print('\nsaved model state dict\n')
				save_ep_couter = episode_i
			elif episode_i - save_ep_couter >= cf["earlystop_patience"]:
				#early stopping
				break
		# fold_indx is deliberately NOT advanced here: the evaluation code that
		# follows loads the checkpoint by the same index and the index is
		# incremented once, after the results of this repetition are saved.

		#evaluation
		# Fresh model instance that receives the checkpoint stored under the
		# current fold index.
		dnn = Eyettention(cf)
		dnn.eval()
		# Per-batch score containers; they are concatenated after the loop.
		res_llh=[]
		res_mse_dur = []
		res_mse_land_pos = []
		res_central_scasim_human = []
		res_central_scasim_dnn = []
		res_scasim_human = []
		res_scasim_dnn = []
		uniform_central_scasim_scores = []
		uniform_scasim_scores = []
		uniform_nll_scores = []
		uniform_mse_dur_scores = []
		uniform_mse_land_pos_scores = []
		ez_reader_central_scasim_scores = []
		ez_reader_scasim_scores = []
		ez_reader_mse_dur_scores = []
		ez_reader_mse_land_pos_scores = []
		dnn.load_state_dict(torch.load(os.path.join(save_data_folder, f'BSC_3head_arch_NRS_{fold_indx}.pth'), map_location='cpu'))
		dnn.to(device)
		batch_indx = 0
		print("Evaluating for fold", fold_indx)

		# Regression labels are cut to max_sp_len - 1 steps to line up with the
		# duration / landing-position predictions.
		n_pred_steps = cf["max_sp_len"] - 1
		m = nn.Softmax(dim=2)

		if scanpath_gen_flag:
			# Baseline statistics are read from fixed result files, so they are
			# computed once instead of once per test batch.
			# NOTE(review): assumes both helpers depend only on the given path.
			mean_dur_uniform, std_dur_uniform, mean_land_pos_uniform, std_land_pos_uniform = compute_mean_std_uniform("baseline/uniform/BSC_uniform_results.csv")
			mean_dur_ez_reader, std_dur_ez_reader, mean_land_pos_ez_reader, std_land_pos_ez_reader = compute_mean_std_ez_reader("baseline/E-Z_Reader/BSCSimulationResults.txt")

		with torch.no_grad():
			for batchh in test_dataloaderr:
				sn_ids_test = batchh["sn_ids"].to(device)
				sn_input_ids_test = batchh["sn_input_ids"].to(device)
				sn_attention_mask_test = batchh["sn_attention_mask"].to(device)
				sp_input_ids_test = batchh["sp_input_ids"].to(device)
				sp_pos_test = batchh["sp_pos"].to(device) # 28: '<Sep>', 29: '<'Pad'>'
				sp_landing_pos_test = batchh["sp_landing_pos"].to(device)
				sp_fix_dur_test = (batchh["sp_fix_dur"]/1000).to(device)
				sn_word_len_test = batchh["sn_word_len"].to(device)

				# Normalise gaze features with the training statistics;
				# positions with raw duration 0 are forced back to 0.
				mask = ~torch.eq(sp_fix_dur_test, 0)
				sp_fix_dur_test = torch.nan_to_num((sp_fix_dur_test-fix_dur_mean)/fix_dur_std * mask)
				sp_landing_pos_test = torch.nan_to_num((sp_landing_pos_test - landing_pos_mean)/landing_pos_std * mask)
				sn_word_len_test = torch.nan_to_num((sn_word_len_test - sn_word_len_mean)/sn_word_len_std)

				location_preds_test, duration_preds_test, landing_pos_preds_test, _ = dnn(sn_emd=sn_input_ids_test,
														sn_mask=sn_attention_mask_test,
														sp_emd=sp_input_ids_test,
														sp_pos=sp_pos_test,
														word_ids_sn=None,
														word_ids_sp=None,
														sp_fix_dur=sp_fix_dur_test,
														sp_landing_pos=sp_landing_pos_test,
														sn_word_len=sn_word_len_test,
														sn_pred=None,
														sn_word_freq=None) #[batch, step, dec_o_dim]

				########## Evaluate location predictions ##########
				location_preds_test = m(location_preds_test).detach().to('cpu').numpy()

				#prepare label and mask
				pad_mask_test, label_test = load_label(sp_pos_test, cf, le, 'cpu')
				#compute log likelihood for the batch samples
				res_batch = eval_log_llh(location_preds_test, label_test, pad_mask_test)
				res_llh.append(np.array(res_batch))

				# Same likelihood evaluation for a uniform output built from the predictions
				uniform_output = construct_uniform_tensor(location_preds_test)
				uniform_nll = eval_log_llh(uniform_output, label_test, pad_mask_test)
				uniform_nll_scores.append(np.array(uniform_nll))
				print("Uniform_nll", np.mean(uniform_nll), uniform_nll)

				print("######### Eyettention 2.0 model evaluation ##########")
				duration_preds_test = duration_preds_test.squeeze(-1)
				duration_labels_test = sp_fix_dur_test[:, :n_pred_steps]
				test_mask = mask[:, :n_pred_steps]
				mse_dur = eval_mse(duration_preds_test, duration_labels_test, test_mask)
				print("MSE for durations", np.mean(mse_dur))
				res_mse_dur.append(np.array(mse_dur))

				landing_pos_preds_test = landing_pos_preds_test.squeeze(-1)
				landing_pos_labels_test = sp_landing_pos_test[:, :n_pred_steps]
				mse_landing_pos = eval_mse(landing_pos_preds_test, landing_pos_labels_test, test_mask)
				print("MSE for landing positions", np.mean(mse_landing_pos))
				res_mse_land_pos.append(np.array(mse_landing_pos))

				if scanpath_gen_flag:
					# Attention-mask sum per sentence minus 2
					sn_len = (torch.sum(sn_attention_mask_test, axis=1) - 2).detach().to('cpu').numpy()
					# compute the scan path generated from the model when the first CLS token is given
					sp_dnn, _, dur_dnn, land_pos_dnn = dnn.scanpath_generation(sn_emd=sn_input_ids_test,
														 sn_mask=sn_attention_mask_test,
														 word_ids_sn=None,
														 sn_word_len=sn_word_len_test,
														 le=le,
														 sn_word_freq=None,
														 sn_pred=None,
														 sp_fix_dur=sp_fix_dur_test,
														 sp_landing_pos=sp_landing_pos_test,
														 max_pred_len=cf['max_pred_len'])

					sp_dnn, sp_human = prepare_scanpath(
						sp_dnn.detach().to('cpu').numpy(),
						dur_dnn.detach().to('cpu').numpy(),
						land_pos_dnn.detach().to('cpu').numpy(),
						sn_len, sp_pos_test,
						sp_fix_dur_test, sp_landing_pos_test, cf, sn_ids_test,
						fix_dur_mean, fix_dur_std, landing_pos_mean, landing_pos_std)

					# Keep the scan paths of every batch and repetition
					sp_dnn_list.extend(sp_dnn)
					sp_human_list.extend(sp_human)

					sp_dnn = convert_sp_to_lists(sp_dnn)
					sp_human = convert_sp_to_lists(sp_human)
					sp_human = modify_landing_pos(sp_human.copy())
					sp_dnn = modify_landing_pos(sp_dnn.copy())
					random_sp = sample_random_sp("BSC", sp_human)
					random_sp = convert_sp_to_lists(random_sp)
					random_sp = modify_landing_pos(random_sp.copy())

					# Model vs. human, and human vs. randomly sampled scan paths
					scasim_scores_dnn = compute_scasim(sp_dnn, sp_human)
					res_scasim_dnn.append(scasim_scores_dnn)
					print("Mean scasim dnn", np.mean(scasim_scores_dnn))
					scasim_scores_human = compute_scasim(sp_human, random_sp)
					res_scasim_human.append(scasim_scores_human)
					print("Mean scasim human", np.mean(scasim_scores_human))

					central_scasim_scores_dnn = compute_central_scasim("BSC_most_central_sp.txt", sp_dnn)
					central_scasim_scores_human = compute_central_scasim("BSC_most_central_sp.txt", sp_human)
					res_central_scasim_dnn.append(np.array(central_scasim_scores_dnn))
					res_central_scasim_human.append(np.array(central_scasim_scores_human))
					print("Mean central scasim dnn", np.mean(central_scasim_scores_dnn))
					print("Mean central scasim human", np.mean(central_scasim_scores_human))

					print("######### Uniform baseline model evaluation ##########")
					uniform_central_scasim, uniform_scasim, dur_mse_scores, land_pos_mse_scores = evaluate_uniform_model("BSC", sp_human, landing_pos_mean, landing_pos_std, fix_dur_mean, fix_dur_std, mean_land_pos_uniform, std_land_pos_uniform, mean_dur_uniform, std_dur_uniform)
					uniform_central_scasim_scores.append(np.array(uniform_central_scasim))
					uniform_scasim_scores.append(np.array(uniform_scasim))
					uniform_mse_dur_scores.append(np.array(dur_mse_scores))
					uniform_mse_land_pos_scores.append(np.array(land_pos_mse_scores))
					print("Uniform mean central Scasim score:", np.mean(uniform_central_scasim))
					print("Uniform mean Scasim score:", np.mean(uniform_scasim))
					print("MSE for uniform durations", np.mean(dur_mse_scores))
					print("MSE for uniform landing pos", np.mean(land_pos_mse_scores))

					print("######### E-Z Reader model evaluation ##########")
					central_scasim_ez_reader, scasim_ez_reader, dur_mse_ez_reader, land_pos_mse_ez_reader = evaluate_ez_reader("BSC", "baseline/E-Z_Reader/BSCSimulationResults.txt", sp_human, landing_pos_mean, landing_pos_std, fix_dur_mean, fix_dur_std, mean_land_pos_ez_reader, std_land_pos_ez_reader, mean_dur_ez_reader, std_dur_ez_reader)
					ez_reader_central_scasim_scores.append(np.array(central_scasim_ez_reader))
					print("Central scasim E-Z Reader", np.mean(central_scasim_ez_reader))
					ez_reader_scasim_scores.append(np.array(scasim_ez_reader))
					print("Mean scasim score E-Z Reader", np.mean(scasim_ez_reader))
					ez_reader_mse_dur_scores.append(np.array(dur_mse_ez_reader))
					print("MSE for durations E-Z Reader", np.mean(dur_mse_ez_reader))
					ez_reader_mse_land_pos_scores.append(np.array(land_pos_mse_ez_reader))
					print("MSE for landing pos E-Z Reader", np.mean(land_pos_mse_ez_reader))

				batch_indx +=1

		def _record_metric(key, batch_scores, mean_msg, se_msg):
			"""Aggregate one metric over all test batches, store it and report it.

			key: name of the metric list in loss_dict; its standard error goes
				to key + '_SE'.
			batch_scores: list with one sequence of scores per test batch.
			mean_msg: format string with one '{}' placeholder for the mean.
			se_msg: label printed in front of the standard error.

			Returns the flattened score array. The standard error is always
			computed over the scores of ALL batches. A metric without any
			collected batch (e.g. the scan-path metrics when scan-path
			generation is switched off) is skipped instead of letting
			np.concatenate fail on an empty list.
			"""
			if not batch_scores:
				print('No scores collected for {}, skipping'.format(key))
				return np.array([])
			scores = np.concatenate(batch_scores).ravel()
			loss_dict[key].append(scores)
			print(mean_msg.format(np.mean(scores)))
			standard_error = np.std(scores) / np.sqrt(len(scores))
			loss_dict[key + '_SE'].append(standard_error)
			print(se_msg, standard_error)
			return scores

		# Normalisation statistics are stored with the results.
		loss_dict['fix_dur_mean'] = fix_dur_mean
		loss_dict['fix_dur_std'] = fix_dur_std
		loss_dict['landing_pos_mean'] = landing_pos_mean
		loss_dict['landing_pos_std'] = landing_pos_std
		loss_dict['sn_word_len_mean'] = sn_word_len_mean
		loss_dict['sn_word_len_std'] = sn_word_len_std

		# Model metrics
		res_llh = _record_metric('test_ll', res_llh,
			'Test likelihood is {}', "Standard error for NLL")
		res_mse_dur = _record_metric('test_mse_dur', res_mse_dur,
			'Test MSE for durations is {}', "Standard error for MSE dur")
		res_mse_land_pos = _record_metric('test_mse_land_pos', res_mse_land_pos,
			'Test MSE for landing positions is {}', "Standard error for MSE land pos")
		res_central_scasim_dnn = _record_metric('central_scasim_dnn', res_central_scasim_dnn,
			'Central Scasim dnn {}', "Standard error for Central scasim DNN")
		res_central_scasim_human = _record_metric('central_scasim_human', res_central_scasim_human,
			'Central Scasim human {}', "Standard error for Central scasim human")
		res_scasim_dnn = _record_metric('scasim_dnn', res_scasim_dnn,
			'Scasim dnn {}', "Standard error for scasim dnn")
		res_scasim_human = _record_metric('scasim_human', res_scasim_human,
			'Scasim human {}', "Standard error for scasim human")

		# Uniform baseline. The MSE standard errors use the scores of every
		# batch, not only those of the last evaluated batch.
		uniform_central_scasim_scores = _record_metric('uniform_central_scasim', uniform_central_scasim_scores,
			'Uniform central scasim {}', "Standard error for uniform central scasim")
		uniform_scasim_scores = _record_metric('uniform_scasim', uniform_scasim_scores,
			'Uniform scasim {}', "Standard error for uniform scasim")
		uniform_mse_dur_scores = _record_metric('uniform_mse_dur', uniform_mse_dur_scores,
			'Uniform MSE durations {}', "Standard error for uniform MSE durations")
		uniform_mse_land_pos_scores = _record_metric('uniform_mse_land_pos', uniform_mse_land_pos_scores,
			'Uniform MSE landing pos {}', "Standard error for uniform MSE landing pos")
		uniform_nll_scores = _record_metric('uniform_nll', uniform_nll_scores,
			'Uniform NLL {}', "Standard error for uniform NLL")

		# E-Z Reader baseline
		ez_reader_central_scasim_scores = _record_metric('ez_reader_central_scasim', ez_reader_central_scasim_scores,
			'E-Z Reader central scasim {}', "Standard error for E-Z Reader central scasim")
		ez_reader_scasim_scores = _record_metric('ez_reader_scasim', ez_reader_scasim_scores,
			'E-Z Reader scasim {}', "Standard error for E-Z Reader scasim")
		ez_reader_mse_dur_scores = _record_metric('ez_reader_mse_dur', ez_reader_mse_dur_scores,
			'E-Z Reader MSE durations {}', "Standard error for E-Z Reader MSE durations")
		ez_reader_mse_land_pos_scores = _record_metric('ez_reader_mse_land_pos', ez_reader_mse_land_pos_scores,
			'E-Z Reader MSE landing pos {}', "Standard error for E-Z Reader MSE landing pos")

		#save results
		with open('{}/res_BSC_eyettention_NRS_Fold{}.pickle'.format(save_data_folder, fold_indx), 'wb') as handle:
			pickle.dump(loss_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
		# Move on to the next index for the following sampling repetition.
		fold_indx += 1


  _C._set_default_tensor_type(t)


Sampling time: 0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

keeping Bert with pre-trained weights


model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

keeping Bert with pre-trained weights
Evaluating for fold 0
Uniform_nll -5.672425270080566 [-5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.67242

  item = torch.tensor(item)


Mean scasim dnn 1780.889534883721
Mean scasim human 2116.2906976744184
Mean central scasim dnn 1257.9244186046512
Mean central scasim human 1532.110465116279
######### Uniform baseline model evaluation ##########
Uniform mean central Scasim score: 5033.609756097561
Uniform mean Scasim score: 5236.713414634146
MSE for uniform durations 5.110812267152275
MSE for uniform landing pos 2.3145506856281584
######### E-Z Reader model evaluation ##########
Central scasim E-Z Reader 2772.245283018868
Mean scasim score E-Z Reader 3029.893081761006
MSE for durations E-Z Reader 4.642984780125648
MSE for landing pos E-Z Reader 1.9243550503966194
Test likelihood is -2.246588616417714
Standard error for NLL 0.05594379226176701
Test MSE for durations is 0.11300576960148159
Standard error for MSE dur 0.008855434761624472
Test MSE for landing positions is 0.06184603899803966
Standard error for MSE land pos 0.006565777454650925
Central Scasim dnn 1257.9244186046512
Standard error for Central scasim DNN 36.



keeping Bert with pre-trained weights
keeping Bert with pre-trained weights
Evaluating for fold 1
Uniform_nll -5.672425270080566 [-5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.6724252

  item = torch.tensor(item)


Mean scasim dnn 1656.338888888889
Mean scasim human 1934.9055555555556
Mean central scasim dnn 1281.2444444444445
Mean central scasim human 1357.6444444444444
######### Uniform baseline model evaluation ##########
Uniform mean central Scasim score: 4966.687116564417
Uniform mean Scasim score: 5227.2822085889575
MSE for uniform durations 4.128231971541797
MSE for uniform landing pos 2.130409403995502
######### E-Z Reader model evaluation ##########
Central scasim E-Z Reader 2859.425287356322
Mean scasim score E-Z Reader 3084.074712643678
MSE for durations E-Z Reader 3.6804281932184066
MSE for landing pos E-Z Reader 1.9828712241402988
Test likelihood is -1.6551523864988817
Standard error for NLL 0.04678864015243141
Test MSE for durations is 0.1048761857725266
Standard error for MSE dur 0.004229556745943615
Test MSE for landing positions is 0.0738675419965552
Standard error for MSE land pos 0.002407708541131622
Central Scasim dnn 1281.2444444444445
Standard error for Central scasim DNN 43



keeping Bert with pre-trained weights
keeping Bert with pre-trained weights
Evaluating for fold 2
Uniform_nll -5.672425270080566 [-5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.6724252

  item = torch.tensor(item)


Mean scasim dnn 1644.685393258427
Mean scasim human 2040.8258426966293
Mean central scasim dnn 1226.2078651685392
Mean central scasim human 1437.4831460674156
######### Uniform baseline model evaluation ##########
Uniform mean central Scasim score: 5131.165644171779
Uniform mean Scasim score: 5366.0245398773
MSE for uniform durations 4.57638650364671
MSE for uniform landing pos 2.1182986611237555
######### E-Z Reader model evaluation ##########
Central scasim E-Z Reader 2668.5
Mean scasim score E-Z Reader 2956.964705882353
MSE for durations E-Z Reader 4.145270971690907
MSE for landing pos E-Z Reader 2.059164444926907
Test likelihood is -1.898527887747617
Standard error for NLL 0.04910642263183652
Test MSE for durations is 0.08501806628649657
Standard error for MSE dur 0.004361873446870742
Test MSE for landing positions is 0.056541759782376584
Standard error for MSE land pos 0.0032627735574550527
Central Scasim dnn 1226.2078651685392
Standard error for Central scasim DNN 36.462428460154



keeping Bert with pre-trained weights
keeping Bert with pre-trained weights
Evaluating for fold 3
Uniform_nll -5.672425270080566 [-5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.6724252

  item = torch.tensor(item)


Mean scasim dnn 2058.3260869565215
Mean scasim human 2006.3097826086957
Mean central scasim dnn 1561.7065217391305
Mean central scasim human 1544.7065217391305
######### Uniform baseline model evaluation ##########
Uniform mean central Scasim score: 4707.327272727272
Uniform mean Scasim score: 5020.945454545455
MSE for uniform durations 4.979956564758763
MSE for uniform landing pos 2.1347024697245973
######### E-Z Reader model evaluation ##########
Central scasim E-Z Reader 2825.389534883721
Mean scasim score E-Z Reader 3045.046511627907
MSE for durations E-Z Reader 4.483156663040782
MSE for landing pos E-Z Reader 1.9496448804992577
Test likelihood is -1.6777374311411368
Standard error for NLL 0.03938261174857972
Test MSE for durations is 0.16343743771152652
Standard error for MSE dur 0.018862010100109367
Test MSE for landing positions is 0.09194021879533387
Standard error for MSE land pos 0.004387542932350861
Central Scasim dnn 1561.7065217391305
Standard error for Central scasim DNN 



keeping Bert with pre-trained weights
keeping Bert with pre-trained weights
Evaluating for fold 4
Uniform_nll -5.672425270080566 [-5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.672425270080566, -5.6724252

  item = torch.tensor(item)


Mean scasim dnn 1665.1256830601094
Mean scasim human 1937.0601092896175
Mean central scasim dnn 1306.0765027322404
Mean central scasim human 1372.6612021857923
######### Uniform baseline model evaluation ##########
Uniform mean central Scasim score: 4953.5329341317365
Uniform mean Scasim score: 5245.275449101797
MSE for uniform durations 4.196741593098212
MSE for uniform landing pos 2.3050443080371013
######### E-Z Reader model evaluation ##########
Central scasim E-Z Reader 2678.8685714285716
Mean scasim score E-Z Reader 2964.7771428571427
MSE for durations E-Z Reader 3.7516402074268886
MSE for landing pos E-Z Reader 2.0054101381983074
Test likelihood is -1.8826899130879218
Standard error for NLL 0.04633558885906301
Test MSE for durations is 0.10902763693466212
Standard error for MSE dur 0.006675348386091778
Test MSE for landing positions is 0.08430517738674834
Standard error for MSE land pos 0.0034502814668112168
Central Scasim dnn 1306.0765027322404
Standard error for Central scasim

### Statistical tests ###

In [None]:
from scipy import stats

# Per-fold significance tests: for every cross-validation fold, compare the
# Eyettention 2.0 scores against the original Eyettention model, the uniform
# baseline, the E-Z Reader model and the human scanpaths.

N_FOLDS = 5
SIGNIFICANCE_LEVEL = 0.05
NRS_RESULTS_DIR = 'drive/MyDrive/results/BSC/NRS'

# (key of the first sample, key of the second sample,
#  message printed when p < SIGNIFICANCE_LEVEL, message printed otherwise).
# Both keys index the same fold result dict; the order is the print order.
FOLD_COMPARISONS = [
    # NLL
    ('test_ll', 'uniform_nll',
     "Reject null hypothesis: Statistically significant difference in NLL scores between predicted and uniformly generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between the predicted and uniform NLL scores."),
    # Scasim
    ('scasim_dnn', 'ez_reader_scasim',
     "Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and E-Z Reader generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and E-Z Reader generated Scasim scores."),
    ('scasim_dnn', 'uniform_scasim',
     "Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and the uniform generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and uniform Scasim scores."),
    ('scasim_dnn', 'scasim_human',
     "Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and the human scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and human Scasim scores."),
    # Central Scasim
    ('central_scasim_dnn', 'ez_reader_central_scasim',
     "Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and E-Z Reader generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and E-Z Reader Central Scasim scores."),
    ('central_scasim_dnn', 'uniform_central_scasim',
     "Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and the uniform generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and uniform Central Scasim scores."),
    ('central_scasim_dnn', 'central_scasim_human',
     "Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and the human scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and human Central Scasim scores."),
    # MSE durations
    ('test_mse_dur', 'ez_reader_mse_dur',
     "Reject null hypothesis: Statistically significant difference in MSE scores for durations between predicted and E-Z Reader generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and E-Z Reader MSE scores for durations."),
    ('test_mse_dur', 'uniform_mse_dur',
     "Reject null hypothesis: Statistically significant difference in MSE scores for durations between predicted and the uniform generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and uniform MSE scores for durations."),
    # MSE landing pos
    ('test_mse_land_pos', 'ez_reader_mse_land_pos',
     "Reject null hypothesis: Statistically significant difference in MSE scores for landing pos between predicted and E-Z Reader generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and E-Z Reader MSE scores for landing pos."),
    ('test_mse_land_pos', 'uniform_mse_land_pos',
     "Reject null hypothesis: Statistically significant difference in MSE scores for landing pos between predicted and the uniform generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and uniform MSE scores for landing pos."),
]


def report_ttest(sample_a, sample_b, reject_msg, fail_msg, alpha=SIGNIFICANCE_LEVEL):
    """Run ``stats.ttest_ind`` on two samples and print the matching verdict.

    Args:
        sample_a: First sample of scores.
        sample_b: Second sample of scores.
        reject_msg: Text printed when the p-value is below ``alpha``.
        fail_msg: Text printed otherwise.
        alpha: Significance threshold the p-value is compared against.

    Returns:
        The p-value reported by ``stats.ttest_ind``.
    """
    _, p_value = stats.ttest_ind(sample_a, sample_b)
    # A NaN p-value compares False against alpha and therefore prints fail_msg.
    print(reject_msg if p_value < alpha else fail_msg)
    return p_value


for fold_index in range(N_FOLDS):
    # NOTE: pickle.load executes arbitrary code stored in the file; only load
    # result files that this project wrote itself.
    with open(f'{NRS_RESULTS_DIR}/res_BSC_eyettention_NRS_Fold{fold_index}.pickle', 'rb') as handle:
        print(fold_index)
        fold_results = pickle.load(handle)

    with open(f'{NRS_RESULTS_DIR}/res_BSC_original_eyettention_NRS_Fold{fold_index}.pickle', 'rb') as original:
        original_results = pickle.load(original)

    # NLL against the original Eyettention model: the only comparison whose
    # second sample comes from the other result file.
    report_ttest(
        fold_results['test_ll'][0],
        original_results['test_ll'][0],
        "Reject null hypothesis: Statistically significant difference in NLL scores between Eyettention 2.0 and original Eyettention scanpaths.",
        "Fail to reject null hypothesis: No statistically significant difference found between Eyettention 2.0 and original Eyettention NLL scores.",
    )

    # Every remaining comparison pairs two entries of the same fold result.
    for first_key, second_key, reject_msg, fail_msg in FOLD_COMPARISONS:
        report_ttest(
            fold_results[first_key][0],
            fold_results[second_key][0],
            reject_msg,
            fail_msg,
        )


0
Fail to reject null hypothesis: No statistically significant difference found between Eyettention 2.0 and original Eyettention NLL scores.
Reject null hypothesis: Statistically significant difference in NLL scores between predicted and uniformly generated scanpaths.
Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and E-Z Reader generated scanpaths.
Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and the uniform generated scanpaths.
Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and the human scanpaths.
Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and E-Z Reader generated scanpaths.
Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and the uniform generated scanpaths.
Reject null hypothesis: Statistically significant difference in 

  res = hypotest_fun_out(*samples, **kwds)


Fail to reject null hypothesis: No statistically significant difference found between Eyettention 2.0 and original Eyettention NLL scores.
Reject null hypothesis: Statistically significant difference in NLL scores between predicted and uniformly generated scanpaths.
Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and E-Z Reader generated scanpaths.
Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and the uniform generated scanpaths.
Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and the human scanpaths.
Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and E-Z Reader generated scanpaths.
Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and the uniform generated scanpaths.
Fail to reject null hypothesis: No statistically significant diffe

In [None]:
import pickle
import numpy as np

# Analysing the results: pool the scores of all folds into one flat list per
# metric so that overall statistics can be computed across folds.

# Eyettention 2.0 predictions and human scanpaths
test_ll_all_folds = []
test_mse_dur_all_folds = []
test_mse_land_pos_all_folds = []
scasim_dnn_all_folds = []
scasim_human_all_folds = []
central_scasim_dnn_all_folds = []
central_scasim_human_all_folds = []

# Uniform baseline
uniform_ll_all_folds = []
uniform_mse_dur_all_folds = []
uniform_mse_land_pos_all_folds = []
uniform_scasim_all_folds = []
uniform_central_scasim_all_folds = []

# E-Z Reader model
ez_reader_mse_dur_all_folds = []
ez_reader_mse_land_pos_all_folds = []
ez_reader_scasim_all_folds = []
ez_reader_central_scasim_all_folds = []

# Maps each key of a fold's result dict to the pooled list it feeds.
pooled_by_key = {
    'test_ll': test_ll_all_folds,
    'test_mse_dur': test_mse_dur_all_folds,
    'test_mse_land_pos': test_mse_land_pos_all_folds,
    'scasim_dnn': scasim_dnn_all_folds,
    'scasim_human': scasim_human_all_folds,
    'central_scasim_dnn': central_scasim_dnn_all_folds,
    'central_scasim_human': central_scasim_human_all_folds,
    'uniform_nll': uniform_ll_all_folds,
    'uniform_mse_dur': uniform_mse_dur_all_folds,
    'uniform_mse_land_pos': uniform_mse_land_pos_all_folds,
    'uniform_scasim': uniform_scasim_all_folds,
    'uniform_central_scasim': uniform_central_scasim_all_folds,
    'ez_reader_mse_dur': ez_reader_mse_dur_all_folds,
    'ez_reader_mse_land_pos': ez_reader_mse_land_pos_all_folds,
    'ez_reader_scasim': ez_reader_scasim_all_folds,
    'ez_reader_central_scasim': ez_reader_central_scasim_all_folds,
}

for fold_index in range(5):
    # NOTE: pickle.load executes arbitrary code stored in the file; only load
    # result files that this project wrote itself.
    with open(f'drive/MyDrive/results/BSC/NRS/res_BSC_eyettention_NRS_Fold{fold_index}.pickle', 'rb') as handle:
        fold_results = pickle.load(handle)

    # Element 0 of every entry is the sequence that gets pooled
    # (presumably the per-sample scores of that fold -- TODO confirm).
    for result_key, pooled_scores in pooled_by_key.items():
        pooled_scores.extend(fold_results[result_key][0])

# Calculate overall statistics
def calculate_mean_and_se(values, ddof=0):
    """Return the mean and the standard error of the mean of *values*.

    The standard error is ``np.std(values, ddof=ddof) / sqrt(len(values))``.

    Args:
        values: Sized sequence of numeric scores.
        ddof: Delta degrees of freedom forwarded to ``np.std``. The default
            of 0 (population standard deviation) keeps the original
            behaviour; pass 1 for the sample-based estimate.

    Returns:
        Tuple ``(mean, standard_error)``. Both are NaN for empty input.
    """
    n_values = len(values)
    if n_values == 0:
        # NumPy would produce NaN here as well, but only after emitting
        # RuntimeWarnings for the empty mean and the division by zero.
        return np.nan, np.nan
    mean_val = np.mean(values)
    se_val = np.std(values, ddof=ddof) / np.sqrt(n_values)
    return mean_val, se_val

# Overall mean and standard error of every metric, pooled over all folds.

# Eyettention 2.0 predictions and human scanpaths
mean_nll, se_nll = calculate_mean_and_se(test_ll_all_folds)
mean_mse_dur, se_mse_dur = calculate_mean_and_se(test_mse_dur_all_folds)
mean_mse_land_pos, se_mse_land_pos = calculate_mean_and_se(test_mse_land_pos_all_folds)
mean_scasim, se_scasim = calculate_mean_and_se(scasim_dnn_all_folds)
mean_central_scasim, se_central_scasim = calculate_mean_and_se(central_scasim_dnn_all_folds)
mean_scasim_human, se_scasim_human = calculate_mean_and_se(scasim_human_all_folds)
mean_central_scasim_human, se_central_scasim_human = calculate_mean_and_se(central_scasim_human_all_folds)

# Uniform baseline
mean_nll_uniform, se_nll_uniform = calculate_mean_and_se(uniform_ll_all_folds)
mean_mse_dur_uniform, se_mse_dur_uniform = calculate_mean_and_se(uniform_mse_dur_all_folds)
mean_mse_land_pos_uniform, se_mse_land_pos_uniform = calculate_mean_and_se(uniform_mse_land_pos_all_folds)
mean_scasim_uniform, se_scasim_uniform = calculate_mean_and_se(uniform_scasim_all_folds)
mean_central_scasim_uniform, se_central_scasim_uniform = calculate_mean_and_se(uniform_central_scasim_all_folds)

# E-Z Reader model. No NLL statistic is computed for it: the earlier
# "mean_nll_ez_reader" was derived from the MSE-duration list (copy-paste
# error), was never reported, and has been removed.
mean_mse_dur_ez_reader, se_mse_dur_ez_reader = calculate_mean_and_se(ez_reader_mse_dur_all_folds)
mean_mse_land_pos_ez_reader, se_mse_land_pos_ez_reader = calculate_mean_and_se(ez_reader_mse_land_pos_all_folds)
mean_scasim_ez_reader, se_scasim_ez_reader = calculate_mean_and_se(ez_reader_scasim_all_folds)
mean_central_scasim_ez_reader, se_central_scasim_ez_reader = calculate_mean_and_se(ez_reader_central_scasim_all_folds)

# (label of the mean, label of the standard error, mean, standard error).
# Every standard-error label names its model so that no two lines of the
# report share the same label.
summary_rows = [
    ("Overall Mean NLL:", "Overall Standard error NLL:", mean_nll, se_nll),
    ("Overall Mean MSE dur:", "Overall Standard error MSE dur:", mean_mse_dur, se_mse_dur),
    ("Overall Mean MSE landing pos:", "Overall Standard error MSE landing pos:", mean_mse_land_pos, se_mse_land_pos),
    ("Overall Mean scasim:", "Overall Standard error scasim:", mean_scasim, se_scasim),
    ("Overall Mean Central scasim:", "Overall Standard error Central scasim:", mean_central_scasim, se_central_scasim),
    ("Human Mean scasim:", "Human Standard error scasim:", mean_scasim_human, se_scasim_human),
    ("Human Mean central scasim:", "Human Standard error central scasim:", mean_central_scasim_human, se_central_scasim_human),
    ("Overall Mean NLL Uniform:", "Overall Standard error NLL Uniform:", mean_nll_uniform, se_nll_uniform),
    ("Overall Mean MSE dur Uniform:", "Overall Standard error MSE dur Uniform:", mean_mse_dur_uniform, se_mse_dur_uniform),
    ("Overall Mean MSE landing pos Uniform:", "Overall Standard error MSE landing pos Uniform:", mean_mse_land_pos_uniform, se_mse_land_pos_uniform),
    ("Overall Mean scasim Uniform:", "Overall Standard error scasim Uniform:", mean_scasim_uniform, se_scasim_uniform),
    ("Overall Mean Central scasim Uniform:", "Overall Standard error Central scasim Uniform:", mean_central_scasim_uniform, se_central_scasim_uniform),
    ("Overall Mean MSE dur E-Z Reader:", "Overall Standard error MSE dur E-Z Reader:", mean_mse_dur_ez_reader, se_mse_dur_ez_reader),
    ("Overall Mean MSE landing pos E-Z Reader:", "Overall Standard error MSE landing pos E-Z Reader:", mean_mse_land_pos_ez_reader, se_mse_land_pos_ez_reader),
    ("Overall Mean scasim E-Z Reader:", "Overall Standard error scasim E-Z Reader:", mean_scasim_ez_reader, se_scasim_ez_reader),
    ("Overall Mean Central scasim E-Z Reader:", "Overall Standard error Central scasim E-Z Reader:", mean_central_scasim_ez_reader, se_central_scasim_ez_reader),
]

for mean_label, se_label, mean_value, se_value in summary_rows:
    print(mean_label, mean_value)
    print(se_label, se_value)

Overall Mean NLL: -1.8679092274668676
Overall Standard error NLL: 0.02240794317320321
Overall Mean MSE dur: 0.11535392163597553
Overall Standard error MSE dur: 0.004684533963432554
Overall Mean MSE landing pos: 0.07396093336501913
Overall Standard error MSE landing pos: 0.001933896809380796
Overall Mean scasim: 1762.160535117057
Overall Standard error scasim: 27.684055325045975
Overall Mean Central scasim: 1328.448160535117
Overall Standard error Central scasim: 21.89805050923021
Human Mean scasim: 2005.7915273132664
Human Standard error scasim: 25.726573687899954
Human Mean central scasim: 1448.376811594203
Human Standard error central scasim: 20.415768197837465
Overall Mean NLL Uniform: -5.672425270080566
Overall Standard error NLL: 0.0
Overall Mean MSE dur Uniform: 4.598022752434668
Overall Standard error MSE dur Uniform: 0.08533188589178006
Overall Mean MSE landing pos Uniform: 2.20108763260853
Overall Standard error MSE landing pos: 0.03093059094746841
Overall Mean scasim Uniform: