# Classification of eye tracking data from /data folder into one of four categories

The file consists of four parts:
1. Loading data
2. Extracting sequences
3. Decision Tree classification
4. CNN classification

@author: pawel@kasprowski.pl

In [1]:
import os
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, cohen_kappa_score
from sklearn.model_selection._split import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Activation, Flatten, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization

## Load samples from /data

- take only columns 2 and 3
- convert position to velocity
- label = first two letter in file name

In [2]:
def load_files(indir):
    samples = []
    names = []
    labels = []
    for file in os.listdir(indir):
        sample = np.genfromtxt(os.path.join(indir, file), delimiter='\t')
        sample = sample[:,2:4] ##omit irrelevant columns
        
        ## convert position to velocity
        vsample = np.zeros((sample.shape[0],sample.shape[1]))
        for i in range(1,sample.shape[0]):
            vsample[i] = sample[i]-sample[i-1]
        sample = vsample    
        samples.append(sample)
        names.append(file)
        labels.append(file[0:3])
    samples = np.array(samples,dtype=object)
    labels = np.array(labels,dtype=object)
    return samples,labels,names
    
samples,labels,_ = load_files("data")
print("Loaded {} samples".format(samples.shape[0]))

Loaded 72 samples


## Convert samples into chunks of *sequence_dim* length with lag = *sequence_lag*

In [3]:
def make_sequences(samples, labels, sequence_dim = 100, sequence_lag = 1, sequence_attributes = 2):
    nsamples = []
    nlabels = []
    for s in range(samples.shape[0]):
    #for sample in samples:
        sample = samples[s]
        for i in range(0,len(sample)-sequence_dim,sequence_lag):
            nsample = np.zeros((sequence_dim,sequence_attributes))
            for j in range(i,i+sequence_dim):
                for k in range(sequence_attributes):
                    nsample[j-i,k] = sample[j,k]
            nsamples.append(nsample)
            nlabels.append(labels[s])
        
    samples = np.array(nsamples)
    labels = np.array(nlabels)
    return samples,labels
   
sequence_dim = 100
print("Samples shape before sequencing",samples.shape)

print("Converting to sequences of length {}".format(sequence_dim))
samples, labels = make_sequences(samples, labels, sequence_dim)
print("Samples shape after sequencing: {}".format(samples.shape))


Samples shape before sequencing (72,)
Converting to sequences of length 100
Samples shape after sequencing: (19535, 100, 2)


## Convert labels to one-hot

In [4]:
lb = LabelBinarizer()
labels = lb.fit_transform(labels)

## Decision Tree classification - flatten samples and fit model

In [5]:
# flatten samples for Decision Tree
flatSamples = samples.reshape(samples.shape[0],-1) #tree!

(trainSamples, testSamples, trainLabels, testLabels) = train_test_split(flatSamples, labels, test_size=0.25, random_state=42)
model = DecisionTreeClassifier()
model.fit(trainSamples, trainLabels)    
treeResults = model.predict(testSamples)
print(confusion_matrix(testLabels.argmax(axis=1), treeResults.argmax(axis=1)))
print(classification_report(testLabels.argmax(axis=1), treeResults.argmax(axis=1)))
treeAcc = accuracy_score(testLabels.argmax(axis=1), treeResults.argmax(axis=1)) 
print("Accuracy Tree: {:.2f}".format(treeAcc))
print("Cohen's Kappa {:.2f}".format(cohen_kappa_score(testLabels.argmax(axis=1), treeResults.argmax(axis=1))))

[[ 106   60   77  286]
 [  94  122  105  261]
 [ 128  139  470  530]
 [ 311  272  606 1317]]
              precision    recall  f1-score   support

           0       0.17      0.20      0.18       529
           1       0.21      0.21      0.21       582
           2       0.37      0.37      0.37      1267
           3       0.55      0.53      0.54      2506

    accuracy                           0.41      4884
   macro avg       0.32      0.33      0.32      4884
weighted avg       0.42      0.41      0.42      4884

Accuracy Tree: 0.41
Cohen's Kappa 0.10


## Build CNN 1D model

In [6]:
inputShape = (samples.shape[1],samples.shape[2])
print('inputShape:',inputShape)
model = Sequential()
model.add(Conv1D(32, 10, padding="same",input_shape=inputShape))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))
    
model.add(Conv1D(64, 10, padding="same"))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Conv1D(128, 10, padding="same"))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten(input_shape=inputShape))
model.add(Dense(128, activation='sigmoid'))
model.add(Dense(64, activation='sigmoid'))
model.add(Dense(labels.shape[1], activation='softmax'))
model.summary()

inputShape: (100, 2)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 100, 32)           672       
_________________________________________________________________
activation (Activation)      (None, 100, 32)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 100, 32)           128       
_________________________________________________________________
dropout (Dropout)            (None, 100, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           20544     
_________________________________________________________________
activation_1 (Activation)    (None, 100, 64)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None,

## Classify using the CNN Model

In [7]:
(trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels, test_size=0.25, random_state=42)

model.compile(loss='categorical_crossentropy', optimizer="adam",metrics=['accuracy'])
    
EPOCHS=10
BATCH=128
model.fit(trainSamples, trainLabels, batch_size=BATCH, epochs=EPOCHS
              ,validation_data=(testSamples,testLabels)
              )
    
cnnResults = model.predict(testSamples)
    
print(confusion_matrix(testLabels.argmax(axis=1), cnnResults.argmax(axis=1)))
print(classification_report(testLabels.argmax(axis=1), cnnResults.argmax(axis=1),target_names=lb.classes_))
print("CNN Accuracy: {:.2f}".format(accuracy_score(testLabels.argmax(axis=1), cnnResults.argmax(axis=1))))
print("Cohen's Kappa {:.2f}".format(cohen_kappa_score(testLabels.argmax(axis=1), cnnResults.argmax(axis=1))))

Train on 14651 samples, validate on 4884 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[ 525    0    0    4]
 [  22  556    0    4]
 [   2    1 1258    6]
 [   0    2    1 2503]]
              precision    recall  f1-score   support

         bus       0.96      0.99      0.97       529
         kot       0.99      0.96      0.97       582
         nap       1.00      0.99      1.00      1267
         rab       0.99      1.00      1.00      2506

    accuracy                           0.99      4884
   macro avg       0.99      0.98      0.99      4884
weighted avg       0.99      0.99      0.99      4884

CNN Accuracy: 0.99
Cohen's Kappa 0.99
