# TüEyeQ dataset validation
Extracting data for comparison with other datasets.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score, roc_auc_score, plot_roc_curve, accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import Normalizer

### Loading the participant features
The table below holds the info we have on each participant. Each participant has a unique subject ID. The info about the tasks was removed, as it was deemed unnecessary. Our target value should be age / gender.

In [2]:
# Load the CFT table, using the second CSV column as the row index
# (the subject ID that later cells look participants up by).
participant_features = pd.read_csv('TuEyeQ/cft_full.csv', index_col=1)
# Task-level columns are not needed for participant-level targets.
participant_features = participant_features.drop(columns=['task_id', 'cft_task'])
# Keep exactly one row per subject. DataFrame.drop_duplicates() ignores the
# index, so it would merge two different subjects whose remaining columns
# happen to be equal, and would keep several rows for one subject if any
# remaining column still varied between that subject's rows.
participant_features = participant_features[
    ~participant_features.index.duplicated(keep='first')
]
participant_features.head()

FileNotFoundError: [Errno 2] No such file or directory: 'TuEyeQ/cft_full.csv'

### Reading eye tracking features
Here is what the eye tracking features look like for one participant. Some of the participants and some readings have been removed due to too much noise.

In [None]:
# Root folder of the eye tracking data; each participant has a sub-folder
# named after their subject ID.
eye_tracking_features_path = 'TuEyeQ/EyeMovementData/split'

# Peek at a single task of a single participant, tagged with that
# participant's gender from the participant table.
arbitrary_eye_tracking_features = pd.read_csv(
    f'{eye_tracking_features_path}/ABT22/task_01.csv',
    index_col=0,
)
arbitrary_eye_tracking_features['gender'] = participant_features.loc['ABT22', 'gender']
arbitrary_eye_tracking_features.head(6)

### Appending target values to feature vectors
We want each feature vector to consist of the eye tracking features followed by the target values at the end, looked up by subject ID.

In [None]:
# Distinct subject IDs found in the participant table (order is arbitrary).
participants = [*set(participant_features.index)]

def load_participant(participant_id):
    """Load every task recording of one participant, tagged with the targets.

    Parameters
    ----------
    participant_id : str
        Subject ID. Used as the folder name under
        ``eye_tracking_features_path`` and as the row label in
        ``participant_features``.

    Returns
    -------
    list of pandas.DataFrame or None
        One frame per file in the participant's folder, in sorted file-name
        order, each with constant ``gender`` and ``age`` columns added.
        ``None`` when the participant has no folder (no readings).
    """
    participant_dir = os.path.join(eye_tracking_features_path, participant_id)
    if not os.path.isdir(participant_dir):
        return None

    # The targets are identical for every task, so look them up only once.
    gender = participant_features.loc[participant_id, 'gender']
    age = participant_features.loc[participant_id, 'age']

    tasks = []
    # os.listdir() returns entries in arbitrary order; sorting makes the task
    # order (and everything derived from it) reproducible between runs.
    for task in sorted(os.listdir(participant_dir)):
        df = pd.read_csv(os.path.join(participant_dir, task), index_col=0)
        df['gender'] = gender
        df['age'] = age
        tasks.append(df)

    return tasks

def drop_nulls(lst):
    """Return a new list holding only the truthy items of ``lst``.

    Despite the name, every falsy item is removed: ``None`` as well as,
    for example, empty lists.
    """
    return [item for item in lst if item]

In [None]:
# Smoke test: load all task recordings of a single participant.
abt22 = load_participant(participant_id='ABT22')

### Making a heat map
It might be a good idea to visualise the data before attacking it. Perhaps we can even see a difference. Here I make a heatmap of the locations the male and female participants look at, together with their scanpaths.

In [None]:
# Subject IDs per gender code; going by the variable names, code 1 is used
# for male and code 2 for female participants.
all_males = participant_features.loc[participant_features['gender'].eq(1)].index
all_females = participant_features.loc[participant_features['gender'].eq(2)].index

# Load the recordings of every subject and discard subjects without readings.
all_male_readings = drop_nulls(list(map(load_participant, tqdm(all_males))))
all_female_readings = drop_nulls(list(map(load_participant, tqdm(all_females))))

In [None]:
def make_heatmap_scanpath(readings, verbose=True, bins=(20, 40)):
    """Plot a fixation heatmap and the scanpaths for a group of participants.

    Parameters
    ----------
    readings : list of list of pandas.DataFrame
        One inner list per participant, one frame per reading. Each frame
        must have ``eventType``, ``meanX`` and ``meanY`` columns.
    verbose : bool, default True
        Print a message for every skipped reading and before drawing.
    bins : (int, int), default (20, 40)
        Number of histogram bins along x and along y.

    Raises
    ------
    ValueError
        If no reading contains at least two fixations.
    """
    all_x, all_y = [], []

    for subject_ind, subject in enumerate(tqdm(readings)):
        for reading_ind, reading in enumerate(subject):
            fixations = reading[reading['eventType'] == 'fixation']

            if len(fixations) < 2:
                if verbose:
                    print(f"Reading {reading_ind} on subject {subject_ind} too small to work with.")
                # Skip only this reading; the subject's remaining readings
                # are still usable.
                continue
            all_x.append(fixations['meanX'])
            all_y.append(fixations['meanY'])

    if not all_x:
        raise ValueError("No reading with at least two fixations to plot.")

    # Bin all fixations in one call so every reading lands on the same grid.
    # Summing per-reading histograms with auto-detected ranges would add up
    # cells that cover different screen regions.
    combined_heatmap, x_edges, y_edges = np.histogram2d(
        np.concatenate(all_x), np.concatenate(all_y), bins=bins
    )

    if verbose:
        print("Drawing...")
    plt.figure(figsize=(10, 8))
    plt.subplot(2, 1, 1)
    # histogram2d puts x along the first (row) axis, whereas imshow draws rows
    # vertically. Transpose and use a lower-left origin with the bin edges as
    # extent so the heatmap shares the orientation of the scanpath plot below.
    plt.imshow(
        combined_heatmap.T,
        origin='lower',
        extent=[x_edges[0], x_edges[-1], y_edges[0], y_edges[-1]],
        aspect='auto',
    )

    plt.subplot(2, 1, 2)
    # Thin lines so that overlapping scanpaths stay readable.
    for x, y in zip(all_x, all_y):
        plt.plot(x, y, linewidth=0.1/len(readings), c='blue')

In [None]:
# Heatmap and scanpaths for the male participants, without progress messages.
make_heatmap_scanpath(readings=all_male_readings, verbose=False)

In [None]:
# Heatmap and scanpaths for the female participants, without progress messages.
make_heatmap_scanpath(readings=all_female_readings, verbose=False)

### Random Forest Classification

In [None]:
# Model under evaluation; random_state is fixed so repeated fits agree.
rfc = RandomForestClassifier(
    n_estimators=1000, criterion='entropy', min_samples_split=5,
    min_samples_leaf=1, random_state=42, max_features='sqrt'
)

# Baseline classifier to compare the forest against.
dummy = DummyClassifier()

# The features available in all entries - except start time.
feats = ['duration', 'meanPupilDiameter', 'eventIdxLeft', 'eventIdxRight', 'meanX', 'meanY',
         'startSaccadeX', 'startSaccadeY', 'endSaccadeX', 'endSaccadeY', 'microsaccadeCount',
        'microsaccadeAmplitude', 'microsaccadePeakVelocity']

# Flatten to one entry per reading: all male participants plus the first 52
# female participants.
# NOTE(review): 52 presumably balances the two classes -- confirm it still
# matches the number of male participants.
X_prepared = [i for p in all_male_readings+all_female_readings[:52] for i in p]
# Collapse every reading to the mean of its numeric columns, giving a single
# feature vector. numeric_only=True skips text columns such as eventType,
# which make DataFrame.mean() raise in pandas >= 2.0.
X_prepared = [d.mean(numeric_only=True) for d in X_prepared]
# Drop readings without a gender label.
X_prepared = [x for x in X_prepared if not np.isnan(x.loc['gender'])]
# Features that are missing for a reading are set to 0.
X_prepared = [x.fillna(0) for x in X_prepared]
# Seeded shuffle so the unshuffled K-fold split in the next cell is
# reproducible between runs.
np.random.default_rng(42).shuffle(X_prepared)
X = [x.loc[feats] for x in X_prepared]  # The selected feature columns only
y = [x.loc['gender'] for x in X_prepared]  # Target: gender code

In [None]:
# K-fold cross-validation of the random forest.
# NOTE(review): if X holds several rows per participant, plain KFold can put
# the same participant in both the training and the test split; consider
# GroupKFold with the subject ID as group -- confirm.
acc_score = []
f1 = []
auc = []

X = np.array(X)
y = np.array(y)

k = 5
kf = KFold(n_splits=k, shuffle=False)
for train_index, test_index in tqdm(kf.split(X), total=k):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    rfc.fit(X_train, y_train)
    pred_values = rfc.predict(X_test)
    # ROC AUC has to be computed from scores, not from hard labels. Column 1
    # is the probability of rfc.classes_[1], the greater label, which is the
    # class roc_auc_score treats as positive.
    pred_scores = rfc.predict_proba(X_test)[:, 1]

    acc_score.append(accuracy_score(y_test, pred_values))
    f1.append(f1_score(y_test, pred_values))
    auc.append(roc_auc_score(y_test, pred_scores))

# Fit the baseline on the training split of the last fold and draw both ROC
# curves for the test split of that last fold.
dummy.fit(X_train, y_train)
ax = plt.gca()
rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=ax, alpha=0.8)
dummy_disp = plot_roc_curve(dummy, X_test, y_test, ax=ax, alpha=0.8)

# Average each metric over the folds.
rfc_avg_acc_score = np.mean(acc_score)
rfc_avg_f1_score = np.mean(f1)
rfc_avg_auc_score = np.mean(auc)

print('Average Accuracy:', rfc_avg_acc_score)
print('Average F1:', rfc_avg_f1_score)
print('Average AUC:', rfc_avg_auc_score)

In [None]:
# Count male/female samples: cast the gender codes to integers and tally
# how often each code occurs.
y = np.asarray(y).astype(int)
np.bincount(y)

In [None]:
# Score the baseline classifier on the test split left over from the last
# cross-validation fold: accuracy, F1 and ROC AUC, one per line.
preds = dummy.predict(X_test)
for metric in (accuracy_score, f1_score, roc_auc_score):
    print(metric(y_test, preds))

In [None]:
# Show the baseline predictions next to the true labels.
(preds, y_test)

In [None]:
# Number of female participants actually used: at most the first 52.
min(52, len(all_female_readings))

In [None]:
# Normalised pupil means: min-max scale the per-reading mean pupil diameter
# to the range [0, 1].
pups = np.array([sample['meanPupilDiameter'] for sample in X_prepared])
pups = (pups - pups.min()) / (pups.max() - pups.min())

In [None]:
# Sanity check: the smallest normalised value.
np.min(pups)

In [None]:
# Range of the per-reading mean vertical gaze position.
x = np.array([sample['meanY'] for sample in X_prepared])
x.min(), x.max()