# Eyettention

In [None]:
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
import model
import torch
from torch.utils import model_zoo
import pandas as pd
from utils import *
from sklearn.model_selection import StratifiedKFold, KFold
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam, RMSprop
from transformers import BertTokenizerFast
from model import Eyettention
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from torch.nn.functional import cross_entropy, softmax
from collections import deque, Counter
import pickle
from transformers import BertTokenizer
from evaluate_e_z_reader_model import *
import json
import matplotlib.pyplot as plt
import argparse
import random
from scasim import *
from uniform_model import *

In [None]:
# Notebook-wide run configuration read by the training/evaluation cell.

# Set the tokenizers parallelism flag to "false" before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): DEVICE is not read by the visible training cell, which derives
# its own `device` from torch.cuda.is_available() — confirm whether it is still needed.
#DEVICE = 'cuda'
DEVICE = 'cpu'
# Cross-validation split unit: 'text' holds out sentences, 'subject' holds out readers.
test_mode = 'text'
#test_mode = 'subject'
# When true, the evaluation also generates scanpaths and scores them with Scasim
# against the human, uniform and E-Z Reader scanpaths.
scanpath_gen_flag = True
# NOTE(review): not read by the visible cells; the model config uses
# cf["atten_type"] = 'local-g' (hyphen, not underscore) — confirm which one is intended.
atten_type = "local_g"
# Folder that receives the model checkpoints and the per-fold result pickles.
save_data_folder = "./drive/MyDrive/results/BSC/New_Sentence"

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the relative "./drive/MyDrive/..." paths used elsewhere presumably
# rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

**Training loop**

In [None]:
def _standard_error(values):
	"""Return the standard error of the mean of *values* (population std / sqrt(n))."""
	return np.std(values) / np.sqrt(len(values))


def _prepare_batch(batch, norm_stats, device):
	"""Move one dataloader batch to *device* and z-score normalise its gaze features.

	norm_stats maps "fix_dur", "landing_pos" and "sn_word_len" to the
	(mean, std) pairs computed on the training set of the current fold.
	Returns a dict with the tensors needed by the model, plus "mask", the
	boolean mask of scanpath positions whose fixation duration is non-zero.
	"""
	fix_dur_mean, fix_dur_std = norm_stats["fix_dur"]
	landing_pos_mean, landing_pos_std = norm_stats["landing_pos"]
	sn_word_len_mean, sn_word_len_std = norm_stats["sn_word_len"]

	# Durations are divided by 1000, matching scale=1000 used for the statistics.
	sp_fix_dur = (batch["sp_fix_dur"] / 1000).to(device)
	sp_landing_pos = batch["sp_landing_pos"].to(device)
	sn_word_len = batch["sn_word_len"].to(device)

	# A duration of 0 is the padding value; multiplying by the mask keeps
	# padded positions at exactly 0 after normalisation.
	mask = ~torch.eq(sp_fix_dur, 0)
	sp_fix_dur = torch.nan_to_num((sp_fix_dur - fix_dur_mean) / fix_dur_std * mask)
	sp_landing_pos = torch.nan_to_num((sp_landing_pos - landing_pos_mean) / landing_pos_std * mask)
	sn_word_len = torch.nan_to_num((sn_word_len - sn_word_len_mean) / sn_word_len_std)

	return {
		"sn_ids": batch["sn_ids"].to(device),
		"sn_input_ids": batch["sn_input_ids"].to(device),
		"sn_attention_mask": batch["sn_attention_mask"].to(device),
		"sp_input_ids": batch["sp_input_ids"].to(device),
		"sp_pos": batch["sp_pos"].to(device),
		"sp_fix_dur": sp_fix_dur,
		"sp_landing_pos": sp_landing_pos,
		"sn_word_len": sn_word_len,
		"mask": mask,
	}


def _forward(dnn, inputs):
	"""Run the model on a prepared batch.

	Returns (location_preds, duration_preds, landing_pos_preds, atten_weights).
	"""
	return dnn(sn_emd=inputs["sn_input_ids"],
				sn_mask=inputs["sn_attention_mask"],
				sp_emd=inputs["sp_input_ids"],
				sp_pos=inputs["sp_pos"],
				word_ids_sn=None,
				word_ids_sp=None,
				sp_fix_dur=inputs["sp_fix_dur"],
				sp_landing_pos=inputs["sp_landing_pos"],
				sn_word_len=inputs["sn_word_len"],
				sn_pred=None,
				sn_word_freq=None)  # location_preds: [batch, step, dec_o_dim]


def _batch_losses(dnn, inputs, cf, le, device):
	"""Return the (location, duration, landing position) losses for one prepared batch.

	Location uses cross-entropy averaged over non-padded steps; duration and
	landing position use the mean squared error over all steps.
	"""
	location_preds, duration_preds, landing_pos_preds, _ = _forward(dnn, inputs)
	location_preds = location_preds.permute(0, 2, 1)  # [batch, dec_o_dim, step]

	# Fixation location: cross-entropy on the positions that are not padding.
	pad_mask, label = load_label(inputs["sp_pos"], cf, le, device)
	location_loss = nn.CrossEntropyLoss(reduction="none")
	location_error = torch.mean(torch.masked_select(location_loss(location_preds, label), ~pad_mask))

	# Regression heads: drop the trailing singleton dimension and truncate the
	# labels to the number of predicted steps (39 with the current configuration).
	regression_loss = nn.MSELoss(reduction="none")
	duration_preds = duration_preds.squeeze(-1)
	n_steps = duration_preds.shape[1]
	duration_error = torch.mean(regression_loss(duration_preds, inputs["sp_fix_dur"][:, :n_steps]))

	landing_pos_preds = landing_pos_preds.squeeze(-1)
	land_pos_error = torch.mean(regression_loss(landing_pos_preds, inputs["sp_landing_pos"][:, :landing_pos_preds.shape[1]]))

	return location_error, duration_error, land_pos_error


if __name__ == '__main__':
	# Train and evaluate the three-head Eyettention model (fixation location,
	# fixation duration, landing position) on the BSC corpus with K-fold
	# cross-validation. For every fold the checkpoint with the lowest validation
	# loss is saved, reloaded and evaluated on the held-out split, and the
	# results are pickled to save_data_folder.
	gpu = 0

	torch.set_default_tensor_type('torch.FloatTensor')
	# Use the selected GPU when CUDA is available, otherwise the CPU.
	device = f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu'

	cf = {"model_pretrained": "bert-base-chinese",
			"lr": 1e-3,
			"max_grad_norm": 10,
			"n_epochs": 150,  # 1000
			"n_folds": 5,
			"dataset": 'BSC',
			"atten_type": 'local-g',
			"batch_size": 256,
			"max_sn_len": 27, #include start token and end token
			"max_sp_len": 40, #include start token and end token
			"norm_type": "z-score",
			"earlystop_patience": 20,
			"max_pred_len": 60
			}

	#Encode the label into integer categories, setting the exclusive category 'cf["max_sn_len"]-1' as the end sign
	le = LabelEncoder()
	le.fit(np.append(np.arange(-cf["max_sn_len"]+3, cf["max_sn_len"]-1), cf["max_sn_len"]-1))

	#load corpus
	word_info_df, pos_info_df, eyemovement_df = load_corpus(cf["dataset"])
	#Make list with sentence index
	sn_list = np.unique(eyemovement_df.sn.values).tolist()
	#Make list with reader index
	reader_list = np.unique(eyemovement_df.id.values).tolist()

	#Split training&test sets by text or reader, depending on configuration
	if test_mode == 'text':
		print('Start evaluating on new sentences.')
		split_list = sn_list
	elif test_mode == 'subject':
		print('Start evaluating on new readers.')
		split_list = reader_list
	else:
		# Fail early instead of hitting a NameError on split_list further down.
		raise ValueError(f"Unknown test_mode {test_mode!r}; expected 'text' or 'subject'.")

	# torch.save does not create missing directories.
	os.makedirs(save_data_folder, exist_ok=True)

	# Result keys stored per fold; each metric has a matching '<metric>_SE' entry.
	loss_keys = ('val_loss', 'train_loss', 'test_ll', 'test_ll_SE', 'test_mse_dur', 'test_mse_dur_SE',
				'test_mse_land_pos', 'test_mse_land_pos_SE', 'central_scasim_dnn', 'central_scasim_dnn_SE',
				'central_scasim_human', 'central_scasim_human_SE', 'scasim_dnn', 'scasim_dnn_SE',
				'scasim_human', 'scasim_human_SE', 'uniform_scasim', 'uniform_scasim_SE',
				'uniform_central_scasim', 'uniform_central_scasim_SE', 'uniform_nll_SE', 'uniform_nll',
				'uniform_mse_dur_SE', 'uniform_mse_dur', 'uniform_mse_land_pos_SE', 'uniform_mse_land_pos',
				'ez_reader_scasim', 'ez_reader_scasim_SE', 'ez_reader_central_scasim', 'ez_reader_central_scasim_SE',
				'ez_reader_mse_dur_SE', 'ez_reader_mse_dur', 'ez_reader_mse_land_pos_SE', 'ez_reader_mse_land_pos')

	n_folds = cf["n_folds"]
	kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
	#for scanpath generation
	sp_dnn_list = []
	sp_human_list = []
	# The fold index comes from enumerate so that the checkpoint saved during
	# training, the checkpoint reloaded for evaluation and the result pickle all
	# use the same number (0 .. n_folds-1).
	for fold_indx, (train_idx, test_idx) in enumerate(kf.split(split_list)):

		loss_dict = {key: [] for key in loss_keys}
		list_train = [split_list[i] for i in train_idx]
		list_test = [split_list[i] for i in test_idx]

		# create train validation split for training the models:
		# only the first split of the inner K-fold is used
		kf_val = KFold(n_splits=n_folds, shuffle=True, random_state=0)
		train_index, val_index = next(iter(kf_val.split(list_train)))
		list_train_net = [list_train[i] for i in train_index]
		list_val_net = [list_train[i] for i in val_index]

		if test_mode == 'text':
			sn_list_train = list_train_net
			sn_list_val = list_val_net
			sn_list_test = list_test
			reader_list_train, reader_list_val, reader_list_test = reader_list, reader_list, reader_list
		else:  # test_mode == 'subject'
			reader_list_train = list_train_net
			reader_list_val = list_val_net
			reader_list_test = list_test
			sn_list_train, sn_list_val, sn_list_test = sn_list, sn_list, sn_list

		#initialize tokenizer
		tokenizer = BertTokenizer.from_pretrained(cf['model_pretrained'])
		#Preparing batch data
		dataset_train = BSCdataset(word_info_df, eyemovement_df, cf, reader_list_train, sn_list_train, tokenizer)
		train_dataloaderr = DataLoader(dataset_train, batch_size = cf["batch_size"], shuffle = True, drop_last=True)

		# NOTE(review): drop_last=True discards the final partial validation batch;
		# a validation set smaller than one batch would yield no validation loss
		# and therefore no saved checkpoint. Kept as in the original.
		dataset_val = BSCdataset(word_info_df, eyemovement_df, cf, reader_list_val, sn_list_val, tokenizer)
		val_dataloaderr = DataLoader(dataset_val, batch_size = cf["batch_size"], shuffle = False, drop_last=True)

		dataset_test = BSCdataset(word_info_df, eyemovement_df, cf, reader_list_test, sn_list_test, tokenizer)
		test_dataloaderr = DataLoader(dataset_test, batch_size = cf["batch_size"], shuffle = False, drop_last=False)

		#z-score normalization for gaze features (statistics from the training set only)
		fix_dur_mean, fix_dur_std = calculate_mean_std(dataloader=train_dataloaderr, feat_key="sp_fix_dur", padding_value=0, scale=1000)
		landing_pos_mean, landing_pos_std = calculate_mean_std(dataloader=train_dataloaderr, feat_key="sp_landing_pos", padding_value=0)
		sn_word_len_mean, sn_word_len_std = calculate_mean_std(dataloader=train_dataloaderr, feat_key="sn_word_len")
		norm_stats = {
			"fix_dur": (fix_dur_mean, fix_dur_std),
			"landing_pos": (landing_pos_mean, landing_pos_std),
			"sn_word_len": (sn_word_len_mean, sn_word_len_std),
		}

		# Single checkpoint location for this fold, used for both saving and loading.
		checkpoint_path = os.path.join(save_data_folder, f'BSC_3head_arch_new_sentence_{fold_indx}.pth')

		# load model
		dnn = Eyettention(cf)
		dnn.to(device)

		#training
		episode = 0
		optimizer = Adam(dnn.parameters(), lr=cf["lr"])
		dnn.train()
		# Running averages over the most recent 100 training batches.
		av_score = deque(maxlen=100)
		av_location_score = deque(maxlen=100)
		av_duration_score = deque(maxlen=100)
		av_land_pos_score = deque(maxlen=100)
		old_score = 1e10
		save_ep_couter = 0
		print('Start training')
		print("fold_indx", fold_indx)
		for episode_i in range(episode, cf["n_epochs"]+1):
			dnn.train()
			print('episode:', episode_i)
			for counter, batchh in enumerate(train_dataloaderr, start=1):
				inputs = _prepare_batch(batchh, norm_stats, device)

				# zero old gradients
				optimizer.zero_grad()
				# predict output with DNN and compute the three task losses
				batch_location_error, batch_duration_error, batch_land_pos_error = _batch_losses(dnn, inputs, cf, le, device)

				# Combined loss for location, duration and landing position
				batch_error = batch_location_error + batch_duration_error + batch_land_pos_error

				# backpropagate loss
				batch_error.backward()
				# clip gradients
				gradient_clipping(dnn, cf["max_grad_norm"])

				#learn
				optimizer.step()
				av_location_score.append(batch_location_error.to('cpu').detach().numpy())
				av_duration_score.append(batch_duration_error.to('cpu').detach().numpy())
				av_land_pos_score.append(batch_land_pos_error.to('cpu').detach().numpy())
				av_score.append(batch_error.to('cpu').detach().numpy())
				print('counter:',counter)
				print('\rSample {}\tLocation Loss: {:.10f}\tDuration Loss: {:.10f}\tLanding position Loss: {:.10f}'.format(
					counter, np.mean(av_location_score), np.mean(av_duration_score), np.mean(av_land_pos_score)), end=" ")
			loss_dict['train_loss'].append(np.mean(av_score))

			location_val_loss = []
			duration_val_loss = []
			land_pos_val_loss = []
			val_loss = []
			dnn.eval()
			with torch.no_grad():
				for batchh in val_dataloaderr:
					inputs = _prepare_batch(batchh, norm_stats, device)
					location_error_val, duration_error_val, land_pos_error_val = _batch_losses(dnn, inputs, cf, le, device)
					location_val_loss.append(location_error_val.detach().to('cpu').numpy())
					duration_val_loss.append(duration_error_val.detach().to('cpu').numpy())
					land_pos_val_loss.append(land_pos_error_val.detach().to('cpu').numpy())

					combined_loss = location_error_val + duration_error_val + land_pos_error_val
					val_loss.append(combined_loss.detach().to('cpu').numpy())

			print('\nValidation loss for locations {} \n'.format(np.mean(location_val_loss)))
			print('\nValidation loss for duration {} \n'.format(np.mean(duration_val_loss)))
			print('\nValidation loss for landing position {} \n'.format(np.mean(land_pos_val_loss)))
			mean_val_loss = np.mean(val_loss)
			loss_dict['val_loss'].append(mean_val_loss)

			if mean_val_loss < old_score:
				# save model if val loss is smallest
				torch.save(dnn.state_dict(), checkpoint_path)
				old_score = mean_val_loss
				print('\nsaved model state dict\n')
				save_ep_couter = episode_i
			elif episode_i - save_ep_couter >= cf["earlystop_patience"]:
				#early stopping
				break

		#evaluation
		dnn.eval()
		res_llh=[]
		res_mse_dur = []
		res_mse_land_pos = []
		res_central_scasim_human = []
		res_central_scasim_dnn = []
		res_scasim_human = []
		res_scasim_dnn = []
		uniform_central_scasim_scores = []
		uniform_scasim_scores = []
		uniform_nll_scores = []
		uniform_mse_dur_scores = []
		uniform_mse_land_pos_scores = []
		ez_reader_central_scasim_scores = []
		ez_reader_scasim_scores = []
		ez_reader_mse_dur_scores = []
		ez_reader_mse_land_pos_scores = []
		# Reload the best checkpoint of this fold (lowest validation loss).
		dnn.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
		dnn.to(device)
		print("Evaluating for fold", fold_indx)
		for batchh in test_dataloaderr:
			with torch.no_grad():
				inputs = _prepare_batch(batchh, norm_stats, device)
				sn_ids_test = inputs["sn_ids"]
				sn_input_ids_test = inputs["sn_input_ids"]
				sn_attention_mask_test = inputs["sn_attention_mask"]
				sp_pos_test = inputs["sp_pos"] # 28: '<Sep>', 29: '<'Pad'>'
				sp_landing_pos_test = inputs["sp_landing_pos"]
				sp_fix_dur_test = inputs["sp_fix_dur"]
				sn_word_len_test = inputs["sn_word_len"]

				location_preds_test, duration_preds_test, landing_pos_preds_test, _ = _forward(dnn, inputs) #[batch, step, dec_o_dim]

				########## Evaluate location predictions ##########
				m = nn.Softmax(dim=2)
				location_preds_test = m(location_preds_test).detach().to('cpu').numpy()

				#prepare label and mask
				pad_mask_test, label_test = load_label(sp_pos_test, cf, le, 'cpu')
				#compute log likelihood for the batch samples
				res_batch = eval_log_llh(location_preds_test, label_test, pad_mask_test)
				res_llh.append(np.array(res_batch))

				uniform_output = construct_uniform_tensor(location_preds_test)
				uniform_nll = eval_log_llh(uniform_output, label_test, pad_mask_test)
				uniform_nll_scores.append(np.array(uniform_nll))
				print("uniform_nll", np.mean(uniform_nll), uniform_nll)

				print("######### Eyettention 2.0 model evaluation ##########")
				duration_preds_test = duration_preds_test.squeeze(-1)
				# Labels and mask are truncated to the number of predicted steps.
				n_steps = duration_preds_test.shape[1]
				duration_labels_test = sp_fix_dur_test[:, :n_steps]
				test_mask = inputs["mask"][:, :n_steps]
				mse_dur = eval_mse(duration_preds_test, duration_labels_test, test_mask)
				print("MSE for durations", np.mean(mse_dur))
				res_mse_dur.append(np.array(mse_dur))

				landing_pos_preds_test = landing_pos_preds_test.squeeze(-1)
				landing_pos_labels_test = sp_landing_pos_test[:, :n_steps]
				mse_landing_pos = eval_mse(landing_pos_preds_test, landing_pos_labels_test, test_mask)
				print("MSE for landing positions", np.mean(mse_landing_pos))
				res_mse_land_pos.append(np.array(mse_landing_pos))

				if scanpath_gen_flag:
					# Sentence length without the two special tokens counted by the attention mask.
					sn_len = (torch.sum(sn_attention_mask_test, axis=1) - 2).detach().to('cpu').numpy()
					# compute the scan path generated from the model when the first CLS token is given
					sp_dnn, _, dur_dnn, land_pos_dnn = dnn.scanpath_generation(sn_emd=sn_input_ids_test,
														 sn_mask=sn_attention_mask_test,
														 word_ids_sn=None,
														 sn_word_len = sn_word_len_test,
														 le=le,
														 sn_word_freq = None,
														 sn_pred = None,
														 sp_fix_dur=sp_fix_dur_test,
														 sp_landing_pos = sp_landing_pos_test,
														 max_pred_len=cf['max_pred_len'])

					sp_dnn, sp_human = prepare_scanpath(sp_dnn.detach().to('cpu').numpy(),
														dur_dnn.detach().to('cpu').numpy(),
														land_pos_dnn.detach().to('cpu').numpy(),
														sn_len, sp_pos_test,
														sp_fix_dur_test, sp_landing_pos_test, cf, sn_ids_test,
														fix_dur_mean, fix_dur_std, landing_pos_mean, landing_pos_std)

					sp_dnn_list.extend(sp_dnn)
					sp_human_list.extend(sp_human)

					sp_dnn = convert_sp_to_lists(sp_dnn)
					sp_human = convert_sp_to_lists(sp_human)
					sp_human = modify_landing_pos(sp_human.copy())
					sp_dnn = modify_landing_pos(sp_dnn.copy())
					random_sp = sample_random_sp("BSC", sp_human)
					random_sp = convert_sp_to_lists(random_sp)
					random_sp = modify_landing_pos(random_sp.copy())

					scasim_scores_dnn = compute_scasim(sp_dnn, sp_human)
					res_scasim_dnn.append(scasim_scores_dnn)
					print("Mean scasim dnn", np.mean(scasim_scores_dnn))
					scasim_scores_human = compute_scasim(sp_human, random_sp)
					res_scasim_human.append(scasim_scores_human)
					print("Mean scasim human", np.mean(scasim_scores_human))

					central_scasim_scores_dnn = compute_central_scasim("BSC_most_central_sp.txt", sp_dnn)
					central_scasim_scores_human = compute_central_scasim("BSC_most_central_sp.txt", sp_human)
					res_central_scasim_dnn.append(np.array(central_scasim_scores_dnn))
					res_central_scasim_human.append(np.array(central_scasim_scores_human))
					print("Mean central scasim dnn", np.mean(central_scasim_scores_dnn))
					print("Mean central scasim human", np.mean(central_scasim_scores_human))

					print("######### Uniform baseline model evaluation ##########")
					mean_dur_uniform, std_dur_uniform, mean_land_pos_uniform, std_land_pos_uniform = compute_mean_std_uniform("baseline/uniform/BSC_uniform_results.csv")
					uniform_central_scasim, uniform_scasim, dur_mse_scores, land_pos_mse_scores = evaluate_uniform_model("BSC", sp_human, landing_pos_mean, landing_pos_std, fix_dur_mean, fix_dur_std, mean_land_pos_uniform, std_land_pos_uniform, mean_dur_uniform, std_dur_uniform)
					uniform_central_scasim_scores.append(np.array(uniform_central_scasim))
					uniform_scasim_scores.append(np.array(uniform_scasim))
					uniform_mse_dur_scores.append(np.array(dur_mse_scores))
					uniform_mse_land_pos_scores.append(np.array(land_pos_mse_scores))
					print("Uniform mean central Scasim score:", np.mean(uniform_central_scasim))
					print("Uniform mean Scasim score:", np.mean(uniform_scasim))
					print("MSE for uniform durations", np.mean(dur_mse_scores))
					print("MSE for uniform landing pos", np.mean(land_pos_mse_scores))

					print("######### E-Z Reader model evaluation ##########")
					mean_dur_ez_reader, std_dur_ez_reader, mean_land_pos_ez_reader, std_land_pos_ez_reader = compute_mean_std_ez_reader("baseline/E-Z_Reader/BSCSimulationResults.txt")
					central_scasim_ez_reader, scasim_ez_reader, dur_mse_ez_reader, land_pos_mse_ez_reader = evaluate_ez_reader("BSC", "baseline/E-Z_Reader/BSCSimulationResults.txt", sp_human, landing_pos_mean, landing_pos_std, fix_dur_mean, fix_dur_std, mean_land_pos_ez_reader, std_land_pos_ez_reader, mean_dur_ez_reader, std_dur_ez_reader)
					ez_reader_central_scasim_scores.append(np.array(central_scasim_ez_reader))
					print("Central scasim E-Z Reader", np.mean(central_scasim_ez_reader))
					ez_reader_scasim_scores.append(np.array(scasim_ez_reader))
					print("Mean scasim score E-Z Reader", np.mean(scasim_ez_reader))
					ez_reader_mse_dur_scores.append(np.array(dur_mse_ez_reader))
					print("MSE for durations E-Z Reader", np.mean(dur_mse_ez_reader))
					ez_reader_mse_land_pos_scores.append(np.array(land_pos_mse_ez_reader))
					print("MSE for landing pos E-Z Reader", np.mean(land_pos_mse_ez_reader))

		# Per-fold summary. Each entry is
		# (label for the mean, loss_dict key, per-batch results, label for the standard error).
		# Scanpath-based metrics are only included when they were collected,
		# because np.concatenate raises on an empty list.
		report = [
			('Test likelihood is', 'test_ll', res_llh, "Standard error for NLL"),
			('Test MSE for durations is', 'test_mse_dur', res_mse_dur, "Standard error for MSE dur"),
			('Test MSE for landing positions is', 'test_mse_land_pos', res_mse_land_pos, "Standard error for MSE land pos"),
		]
		if scanpath_gen_flag:
			report += [
				("Central Scasim dnn", 'central_scasim_dnn', res_central_scasim_dnn, "Standard error for Central scasim DNN"),
				("Central Scasim human", 'central_scasim_human', res_central_scasim_human, "Standard error for Central scasim human"),
				("Scasim dnn", 'scasim_dnn', res_scasim_dnn, "Standard error for scasim dnn"),
				("Scasim human", 'scasim_human', res_scasim_human, "Standard error for scasim human"),
				("Uniform central scasim", 'uniform_central_scasim', uniform_central_scasim_scores, "Standard error for uniform central scasim"),
				("Uniform scasim", 'uniform_scasim', uniform_scasim_scores, "Standard error for uniform scasim"),
				("Uniform MSE durations", 'uniform_mse_dur', uniform_mse_dur_scores, "Standard error for uniform MSE durations"),
				("Uniform MSE landing pos", 'uniform_mse_land_pos', uniform_mse_land_pos_scores, "Standard error for uniform MSE landing pos"),
			]
		report.append(("Uniform NLL", 'uniform_nll', uniform_nll_scores, "Standard error for uniform NLL"))
		if scanpath_gen_flag:
			report += [
				("E-Z Reader central scasim", 'ez_reader_central_scasim', ez_reader_central_scasim_scores, "Standard error for E-Z Reader central scasim"),
				("E-Z Reader scasim", 'ez_reader_scasim', ez_reader_scasim_scores, "Standard error for E-Z Reader scasim"),
				("E-Z Reader MSE durations", 'ez_reader_mse_dur', ez_reader_mse_dur_scores, "Standard error for E-Z Reader MSE durations"),
				("E-Z Reader MSE landing pos", 'ez_reader_mse_land_pos', ez_reader_mse_land_pos_scores, "Standard error for E-Z Reader MSE landing pos"),
			]

		for mean_label, key, batch_scores, se_label in report:
			# The standard error is computed over all test samples of the fold,
			# not over the last batch only.
			scores = np.concatenate(batch_scores).ravel()
			loss_dict[key].append(scores)
			print(mean_label, np.mean(scores))
			score_se = _standard_error(scores)
			loss_dict[key + '_SE'].append(score_se)
			print(se_label, score_se)

		# Keep the normalisation statistics with the results.
		loss_dict['fix_dur_mean'] = fix_dur_mean
		loss_dict['fix_dur_std'] = fix_dur_std
		loss_dict['landing_pos_mean'] = landing_pos_mean
		loss_dict['landing_pos_std'] = landing_pos_std
		loss_dict['sn_word_len_mean'] = sn_word_len_mean
		loss_dict['sn_word_len_std'] = sn_word_len_std

		#save results
		with open('{}/res_BSC_{}_eyettention_Fold{}.pickle'.format(save_data_folder, test_mode, fold_indx), 'wb') as handle:
			pickle.dump(loss_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


**Statistical tests**

In [None]:
from scipy import stats

def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is closed as soon as loading finishes.
    NOTE: unpickling can execute arbitrary code, so only use this on result
    files that were written by this notebook.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _report_ttest(sample_a, sample_b, reject_msg, keep_msg, alpha=0.05):
    """Run ``scipy.stats.ttest_ind`` on two samples and print the verdict.

    Args:
        sample_a: First sequence of scores.
        sample_b: Second sequence of scores.
        reject_msg: Text printed when the p-value is below ``alpha``.
        keep_msg: Text printed otherwise (including a NaN p-value).
        alpha: Significance threshold.

    Returns:
        The ``(t_statistic, p_value)`` pair produced by the test.
    """
    t_statistic, p_value = stats.ttest_ind(sample_a, sample_b)
    print(reject_msg if p_value < alpha else keep_msg)
    return t_statistic, p_value


# Comparisons whose two samples both come from the same fold's result dict:
# (key of first sample, key of second sample, message if significant, message otherwise).
# The order of the entries is the order in which the verdicts are printed.
_WITHIN_FOLD_COMPARISONS = (
    # NLL
    ('test_ll', 'uniform_nll',
     "Reject null hypothesis: Statistically significant difference in NLL scores between predicted and uniformly generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between the predicted and uniform NLL scores."),
    # Scasim
    ('scasim_dnn', 'ez_reader_scasim',
     "Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and E-Z Reader generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and E-Z Reader generated Scasim scores."),
    ('scasim_dnn', 'uniform_scasim',
     "Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and the uniform generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and uniform Scasim scores."),
    ('scasim_dnn', 'scasim_human',
     "Reject null hypothesis: Statistically significant difference in Scasim scores between predicted and the human scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and human Scasim scores."),
    # Central Scasim
    ('central_scasim_dnn', 'ez_reader_central_scasim',
     "Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and E-Z Reader generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and E-Z Reader Central Scasim scores."),
    ('central_scasim_dnn', 'uniform_central_scasim',
     "Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and the uniform generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and uniform Central Scasim scores."),
    ('central_scasim_dnn', 'central_scasim_human',
     "Reject null hypothesis: Statistically significant difference in Central Scasim scores between predicted and the human scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and human Central Scasim scores."),
    # MSE durations
    ('test_mse_dur', 'ez_reader_mse_dur',
     "Reject null hypothesis: Statistically significant difference in MSE scores for durations between predicted and E-Z Reader generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and E-Z Reader MSE scores for durations."),
    ('test_mse_dur', 'uniform_mse_dur',
     "Reject null hypothesis: Statistically significant difference in MSE scores for durations between predicted and the uniform generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and uniform MSE scores for durations."),
    # MSE landing pos
    ('test_mse_land_pos', 'ez_reader_mse_land_pos',
     "Reject null hypothesis: Statistically significant difference in MSE scores for landing pos between predicted and E-Z Reader generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and E-Z Reader MSE scores for landing pos."),
    ('test_mse_land_pos', 'uniform_mse_land_pos',
     "Reject null hypothesis: Statistically significant difference in MSE scores for landing pos between predicted and the uniform generated scanpaths.",
     "Fail to reject null hypothesis: No statistically significant difference between predicted and uniform MSE scores for landing pos."),
)

for fold_index in range(5):
    fold_results = _load_pickle(f'drive/MyDrive/results/BSC/New_Reader/res_BSC_subject_eyettention_Fold{fold_index}.pickle')
    print(fold_index)

    # NLL against the original Eyettention run, which lives in a separate file.
    # NOTE(review): this filename spells "fold" in lowercase, unlike the other
    # result files -- confirm it matches the file actually stored on disk.
    original_results = _load_pickle(f'drive/MyDrive/results/BSC/New_Reader/res_BSC_original_eyettention_fold{fold_index}.pickle')
    # Entry 0 of each result list is used throughout; presumably it is the
    # list of per-item scores stored by the evaluation loop -- TODO confirm.
    _report_ttest(
        fold_results['test_ll'][0],
        original_results['test_ll'][0],
        "Reject null hypothesis: Statistically significant difference in NLL scores between Eyettention 2.0 and original Eyettention scanpaths.",
        "Fail to reject null hypothesis: No statistically significant difference found between Eyettention 2.0 and original Eyettention NLL scores.",
    )

    for first_key, second_key, reject_msg, keep_msg in _WITHIN_FOLD_COMPARISONS:
        _report_ttest(
            fold_results[first_key][0],
            fold_results[second_key][0],
            reject_msg,
            keep_msg,
        )


### Statistical tests for Eyettention Reader model ###

In [None]:
from scipy import stats

def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is closed as soon as loading finishes.
    NOTE: unpickling can execute arbitrary code, so only use this on result
    files that were written by this notebook.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _report_ttest(sample_a, sample_b, reject_msg, keep_msg, alpha=0.05):
    """Run ``scipy.stats.ttest_ind`` on two samples and print the verdict.

    Args:
        sample_a: First sequence of scores.
        sample_b: Second sequence of scores.
        reject_msg: Text printed when the p-value is below ``alpha``.
        keep_msg: Text printed otherwise (including a NaN p-value).
        alpha: Significance threshold.

    Returns:
        The ``(t_statistic, p_value)`` pair produced by the test.
    """
    t_statistic, p_value = stats.ttest_ind(sample_a, sample_b)
    print(reject_msg if p_value < alpha else keep_msg)
    return t_statistic, p_value


# (result-dict key, wording used for that metric in the printed verdict).
# The same key is read from both result files; entries are tested in order.
_READER_COMPARISONS = (
    ('test_ll', "NLL scores"),
    ('scasim_dnn', "Scasim scores"),
    ('central_scasim_dnn', "Central Scasim scores"),
    ('test_mse_dur', "MSE scores for dur"),
    ('test_mse_land_pos', "MSE scores for land pos"),
)

for fold_index in range(5):
    fold_results = _load_pickle(f'drive/MyDrive/results/BSC/New_Sentence/res_BSC_text_eyettention_Fold{fold_index}.pickle')
    print(fold_index)

    # The Eyettention Reader results for this fold are needed by every
    # comparison below, so the file is unpickled once rather than per metric.
    reader_results = _load_pickle(f'drive/MyDrive/results/BSC/Eyettention_Reader/emb_size_64/res_BSC_eyettention_reader_Fold{fold_index}.pickle')

    for key, label in _READER_COMPARISONS:
        # Entry 0 of each result list is compared; presumably it is the list
        # of per-item scores stored by the evaluation loop -- TODO confirm.
        _report_ttest(
            fold_results[key][0],
            reader_results[key][0],
            f"Reject null hypothesis: Statistically significant difference in {label} between Eyettention 2.0 and Eyettention Reader scanpaths.",
            f"Fail to reject null hypothesis: No statistically significant difference found between Eyettention 2.0 and Eyettention Reader {label}.",
        )

In [None]:
# Print, for every Eyettention Reader fold, the mean and the standard error
# (np.std with its default ddof, divided by sqrt(n)) of each stored metric.

# (label used in the printed line, key in the pickled result dict).
_READER_METRICS = (
    ("NLL", 'test_ll'),
    ("MSE dur", 'test_mse_dur'),
    ("MSE landing pos", 'test_mse_land_pos'),
    ("scasim", 'scasim_dnn'),
    ("Central scasim", 'central_scasim_dnn'),
)

for fold_index in range(5):
    with open(f'drive/MyDrive/results/BSC/Eyettention_Reader/emb_size_64/res_BSC_eyettention_reader_Fold{fold_index}.pickle', 'rb') as original:
        print(fold_index)
        reader_results = pickle.load(original)

    for label, key in _READER_METRICS:
        # Entry 0 of each result list is summarised; presumably it is the
        # list of per-item scores for this fold -- TODO confirm.
        scores = reader_results[key][0]
        print(f"Mean {label}", np.mean(scores))
        print(f"Standard error {label}", np.std(scores) / np.sqrt(len(scores)))


In [None]:
import pickle
import numpy as np

# Analysing the results: pool the scores of all five NRS folds into one list
# per metric so that overall statistics can be computed afterwards.
test_ll_all_folds = []
test_mse_dur_all_folds = []
test_mse_land_pos_all_folds = []
scasim_dnn_all_folds = []
scasim_human_all_folds = []
central_scasim_dnn_all_folds = []
central_scasim_human_all_folds = []

uniform_ll_all_folds = []
uniform_mse_dur_all_folds = []
uniform_mse_land_pos_all_folds = []
uniform_scasim_all_folds = []
uniform_central_scasim_all_folds = []

ez_reader_mse_dur_all_folds = []
ez_reader_mse_land_pos_all_folds = []
ez_reader_scasim_all_folds = []
ez_reader_central_scasim_all_folds = []

# Key in the pickled result dict -> list that pools that metric across folds.
_pooled_by_key = {
    'test_ll': test_ll_all_folds,
    'test_mse_dur': test_mse_dur_all_folds,
    'test_mse_land_pos': test_mse_land_pos_all_folds,
    'scasim_dnn': scasim_dnn_all_folds,
    'scasim_human': scasim_human_all_folds,
    'central_scasim_dnn': central_scasim_dnn_all_folds,
    'central_scasim_human': central_scasim_human_all_folds,
    'uniform_nll': uniform_ll_all_folds,
    'uniform_mse_dur': uniform_mse_dur_all_folds,
    'uniform_mse_land_pos': uniform_mse_land_pos_all_folds,
    'uniform_scasim': uniform_scasim_all_folds,
    'uniform_central_scasim': uniform_central_scasim_all_folds,
    'ez_reader_mse_dur': ez_reader_mse_dur_all_folds,
    'ez_reader_mse_land_pos': ez_reader_mse_land_pos_all_folds,
    'ez_reader_scasim': ez_reader_scasim_all_folds,
    'ez_reader_central_scasim': ez_reader_central_scasim_all_folds,
}

for fold_index in range(5):
    # Only the unpickling needs the open file; pooling happens after it closes.
    with open(f'drive/MyDrive/results/BSC/NRS/res_BSC_eyettention_NRS_Fold{fold_index}.pickle', 'rb') as original:
        reader_results = pickle.load(original)

    for key, pooled in _pooled_by_key.items():
        # Entry 0 of each result list is pooled; presumably it is the list of
        # per-item scores for this fold -- TODO confirm.
        pooled.extend(reader_results[key][0])

# Calculate overall statistics
def calculate_mean_and_se(values, ddof=0):
    """Return the mean and the standard error of the mean of ``values``.

    The standard error is ``np.std(values, ddof=ddof) / sqrt(len(values))``.

    Args:
        values: Sequence of numeric scores.
        ddof: Delta degrees of freedom forwarded to ``np.std``.  The default
            ``0`` keeps the population estimator this notebook has always
            used; pass ``1`` for the sample-based standard error.

    Returns:
        A ``(mean, standard_error)`` tuple.  For an empty input both entries
        are NaN (NumPy emits a RuntimeWarning).
    """
    mean_val = np.mean(values)
    se_val = np.std(values, ddof=ddof) / np.sqrt(len(values))
    return mean_val, se_val

# Pooled mean / standard error of the model scores and the human reference scores.
mean_nll, se_nll = calculate_mean_and_se(test_ll_all_folds)
mean_mse_dur, se_mse_dur = calculate_mean_and_se(test_mse_dur_all_folds)
mean_mse_land_pos, se_mse_land_pos = calculate_mean_and_se(test_mse_land_pos_all_folds)
mean_scasim, se_scasim = calculate_mean_and_se(scasim_dnn_all_folds)
mean_central_scasim, se_central_scasim = calculate_mean_and_se(central_scasim_dnn_all_folds)
mean_scasim_human, se_scasim_human = calculate_mean_and_se(scasim_human_all_folds)
mean_central_scasim_human, se_central_scasim_human = calculate_mean_and_se(central_scasim_human_all_folds)

# Same statistics for the uniform baseline.
mean_nll_uniform, se_nll_uniform = calculate_mean_and_se(uniform_ll_all_folds)
mean_mse_dur_uniform, se_mse_dur_uniform = calculate_mean_and_se(uniform_mse_dur_all_folds)
mean_mse_land_pos_uniform, se_mse_land_pos_uniform = calculate_mean_and_se(uniform_mse_land_pos_all_folds)
mean_scasim_uniform, se_scasim_uniform = calculate_mean_and_se(uniform_scasim_all_folds)
mean_central_scasim_uniform, se_central_scasim_uniform = calculate_mean_and_se(uniform_central_scasim_all_folds)

# Same statistics for the E-Z Reader baseline.  There is deliberately no NLL
# entry here: this cell pools no E-Z Reader NLL scores, so an "NLL" figure
# could only be derived from one of the other metric lists.
mean_mse_dur_ez_reader, se_mse_dur_ez_reader = calculate_mean_and_se(ez_reader_mse_dur_all_folds)
mean_mse_land_pos_ez_reader, se_mse_land_pos_ez_reader = calculate_mean_and_se(ez_reader_mse_land_pos_all_folds)
mean_scasim_ez_reader, se_scasim_ez_reader = calculate_mean_and_se(ez_reader_scasim_all_folds)
mean_central_scasim_ez_reader, se_central_scasim_ez_reader = calculate_mean_and_se(ez_reader_central_scasim_all_folds)


print("Overall Mean NLL:", mean_nll)
print("Overall Standard error NLL:", se_nll)

print("Overall Mean MSE dur:", mean_mse_dur)
print("Overall Standard error MSE dur:", se_mse_dur)

print("Overall Mean MSE landing pos:", mean_mse_land_pos)
print("Overall Standard error MSE landing pos:", se_mse_land_pos)

print("Overall Mean scasim:", mean_scasim)
print("Overall Standard error scasim:", se_scasim)

print("Overall Mean Central scasim:", mean_central_scasim)
print("Overall Standard error Central scasim:", se_central_scasim)

print("Human Mean scasim:", mean_scasim_human)
print("Human Standard error scasim:", se_scasim_human)

print("Human Mean central scasim:", mean_central_scasim_human)
print("Human Standard error central scasim:", se_central_scasim_human)


# Every baseline label names its model so that no two printed lines share
# the same label while reporting different quantities.
print("Overall Mean NLL Uniform:", mean_nll_uniform)
print("Overall Standard error NLL Uniform:", se_nll_uniform)

print("Overall Mean MSE dur Uniform:", mean_mse_dur_uniform)
print("Overall Standard error MSE dur Uniform:", se_mse_dur_uniform)

print("Overall Mean MSE landing pos Uniform:", mean_mse_land_pos_uniform)
print("Overall Standard error MSE landing pos Uniform:", se_mse_land_pos_uniform)

print("Overall Mean scasim Uniform:", mean_scasim_uniform)
print("Overall Standard error scasim Uniform:", se_scasim_uniform)

print("Overall Mean Central scasim Uniform:", mean_central_scasim_uniform)
print("Overall Standard error Central scasim Uniform:", se_central_scasim_uniform)


print("Overall Mean MSE dur E-Z Reader:", mean_mse_dur_ez_reader)
print("Overall Standard error MSE dur E-Z Reader:", se_mse_dur_ez_reader)

print("Overall Mean MSE landing pos E-Z Reader:", mean_mse_land_pos_ez_reader)
print("Overall Standard error MSE landing pos E-Z Reader:", se_mse_land_pos_ez_reader)

print("Overall Mean scasim E-Z Reader:", mean_scasim_ez_reader)
print("Overall Standard error scasim E-Z Reader:", se_scasim_ez_reader)

print("Overall Mean Central scasim E-Z Reader:", mean_central_scasim_ez_reader)
print("Overall Standard error Central scasim E-Z Reader:", se_central_scasim_ez_reader)


In [None]:
import pickle
import numpy as np

# Analysing the results: pool the scores of all five Eyettention Reader folds
# into one list per metric so that overall statistics can be computed afterwards.
test_ll_all_folds = []
test_mse_dur_all_folds = []
test_mse_land_pos_all_folds = []
scasim_dnn_all_folds = []
scasim_human_all_folds = []
central_scasim_dnn_all_folds = []
central_scasim_human_all_folds = []

# Key in the pickled result dict -> list that pools that metric across folds.
_pooled_by_key = {
    'test_ll': test_ll_all_folds,
    'test_mse_dur': test_mse_dur_all_folds,
    'test_mse_land_pos': test_mse_land_pos_all_folds,
    'scasim_dnn': scasim_dnn_all_folds,
    'scasim_human': scasim_human_all_folds,
    'central_scasim_dnn': central_scasim_dnn_all_folds,
    'central_scasim_human': central_scasim_human_all_folds,
}

for fold_index in range(5):
    # Only the unpickling needs the open file; pooling happens after it closes.
    with open(f'drive/MyDrive/results/BSC/Eyettention_Reader/emb_size_64/res_BSC_eyettention_reader_Fold{fold_index}.pickle', 'rb') as original:
        reader_results = pickle.load(original)

    for key, pooled in _pooled_by_key.items():
        # Entry 0 of each result list is pooled; presumably it is the list of
        # per-item scores for this fold -- TODO confirm.
        pooled.extend(reader_results[key][0])

# Calculate overall statistics
def calculate_mean_and_se(values, ddof=0):
    """Return the mean and the standard error of the mean of ``values``.

    The standard error is ``np.std(values, ddof=ddof) / sqrt(len(values))``.

    Args:
        values: Sequence of numeric scores.
        ddof: Delta degrees of freedom forwarded to ``np.std``.  The default
            ``0`` keeps the population estimator this notebook has always
            used; pass ``1`` for the sample-based standard error.

    Returns:
        A ``(mean, standard_error)`` tuple.  For an empty input both entries
        are NaN (NumPy emits a RuntimeWarning).
    """
    mean_val = np.mean(values)
    se_val = np.std(values, ddof=ddof) / np.sqrt(len(values))
    return mean_val, se_val

# Overall (all folds pooled) mean and standard error per metric; each pair is
# computed right before it is reported.

# Negative log-likelihood
mean_nll, se_nll = calculate_mean_and_se(test_ll_all_folds)
print("Overall Mean NLL:", mean_nll)
print("Overall Standard error NLL:", se_nll)

# MSE of fixation durations
mean_mse_dur, se_mse_dur = calculate_mean_and_se(test_mse_dur_all_folds)
print()
print("Overall Mean MSE dur:", mean_mse_dur)
print("Overall Standard error MSE dur:", se_mse_dur)