In [108]:
"""
This file implements a hybrid of GA and LSTM for the image manipulation-eye gaze timeseries data set.
It is designed to solve the classification problem about predicting if a picture is manipulated based on a sequence of 
data on their eye gaze.
"""

import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

In [109]:
"""
Step 1: load data
"""
# load all data
data = pd.read_excel('Caldwell_ImageManipulation-EyeGaze_DataSetCombined.xlsx',
                        sheet_name='data')

data = data[["participant", "image", "image manipulated", "vote"]]

data_extended = pd.read_csv('Caldwell_Manip_Images_10-14_TimeSeries.csv')
# rename columns to make them align
data_extended = data_extended.rename(index=str, columns={'Participant_ID': 'participant', 'Image_ID': 'image'})
data = pd.merge(data_extended, data, how="left", on=["participant", "image"])    # join the dataframes
data = data.sort_values(by=["Start Time"])

# Min-Max scaling normalization
for column in range(data.shape[1] - 2):
    temp = data.iloc[:, column]  
    ma = temp.max()
    mi = temp.min()
    data.iloc[:, column] = data.iloc[:, column].apply(lambda x: (x - mi) / (ma - mi))

In [110]:
"""
Step 2: Define the LSTM model
"""
class LSTM(nn.Module):
    """reference https://github.com/jessicayung/blog-code-snippets/blob/master/lstm-pytorch/lstm-baseline.py"""
    def __init__(self, input_dim, hidden_dim, batch, seq_len, output_dim, num_layers):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch = batch
        self.num_layers = num_layers
        self.seq_len = seq_length
    
        # the LSTM layer
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers, batch_first = True)
        
        # the output layer
        self.linear = nn.Linear(self.hidden_dim, output_dim)
        
    def init_hidden(self):
        """This is called each time a sequence is fully learned, then the hidden state has to be reinitialized"""
        return (torch.zeros(self.num_layers, self.batch, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch, self.hidden_dim))
    
    def forward(self, input):
        """Forward pass through LSTM layer"""
        # shape of self.hidden: (a, b), where a and b both have shape (num_layers, batch_size, hidden_dim).
        # input has size batch * seq length * input_dim
        lstm_out, self.hidden = self.lstm(input.view(self.batch, self.seq_len, -1))
        
        # Only take the output from the final timetep
        y_pred = self.linear(lstm_out[:, -1, :]) 
        return y_pred

In [111]:
"""
Step 3: Define class for batching
"""
#use minibatch to preprocess data
class PrepareData(Dataset):

    def __init__(self, X, y):
        if not torch.is_tensor(X):
            self.X = torch.from_numpy(X)
        if not torch.is_tensor(y):
            self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [112]:
output_dim = 2  # no. of output classes
hidden_dim = 6  # no. of units in hidden state
num_layers = 2  # number of LSTM layers
batch = 10      # batch size
seq_length = 10 # sequence length
learning_rate = 0.01
num_epochs = 200

"""
Step 4: Define function for training
"""
def train(train_input, train_target, X_validate, Y_validate):
    # hyperparameters
    input_dim = train_input.shape[1]   # no. of input features

    # create batches with size = batch_size*seq_length, build sequences later
    train_batchs = PrepareData(X=np.array(train_input), y=np.array(train_target))
    train_batchs = DataLoader(train_batchs, batch_size=batch * seq_length, shuffle=False)

    model = LSTM(input_dim= input_dim, hidden_dim= hidden_dim, batch= batch, seq_len= seq_length, output_dim= output_dim, num_layers= num_layers)

    loss_f = nn.CrossEntropyLoss()  # classification

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)   # minibatch gradient descent

    losses = np.array([])
    accuracies = np.array([])
    previous_error = float("inf")

    """Train and validate the model"""
    for e in range(num_epochs):
    
        losses = np.array([])        # reinitialize each time to calculate average loss
        accuracies = np.array([])    # reinitialize each time to calculate average accuracy

        model.batch = batch          # restate batch size as it may has been changed by validation
        model.seq_len = seq_length   # restate sequence length

        counter = 0                  # count the number of batchs

        # idea from https://conorsdatablog.wordpress.com/2018/05/03/up-and-running-with-pytorch-minibatching-dataloading-and-model-building/
        for ix, (x, y) in enumerate(train_batchs):
            # only learn from full batchs(size = batchsize *seqlength)
            if x.shape[0] < batch * seq_length:     
                continue

            counter += 1
            model.hidden = model.init_hidden()    # reinitialize hidden state each sequence

            y = y.view(batch, seq_length)[:,-1]   # resize target into sequence

            _X = Variable(x).float().view(batch, seq_length, -1)
            _Y = Variable(y).long()

            y_pred = model(_X)
            loss = loss_f(y_pred, _Y)
            losses = np.append(losses, (loss.item()))

            # find the class from the max values in each row
            _, predicted = torch.max(y_pred, dim = 1)

            # calculate and print accuracy
            total = predicted.size(0)
            correct = predicted.data.numpy() == _Y.data.numpy()
            accuracy = 100 * sum(correct)/total
            accuracies = np.append(accuracies,accuracy)

            optimizer.zero_grad()            # zero the gradients on each pass before the update
            loss.backward()                  # backpropagate the loss through the model
            optimizer.step()                 # update the gradients w.r.t the loss

        if e % 10 == 0:                      # validate every 10 epochs  
            print("Epoch: ", e, " counter: ", counter)
            print("loss", sum(losses)/counter)
            print("accuracies", sum(accuracies)/counter)

            # find the validation error, here sequence length is 1, batch size is the length of validation data
            model.batch = X_validate.shape[0]
            model.seq_len = 1
            validate_y_pred = model(X_validate)
            validate_loss = loss_f(validate_y_pred, Y_validate)
            _, predicted = torch.max(validate_y_pred, dim = 1)
            total = predicted.size(0)
            correct = predicted.data.numpy() == Y_validate.data.numpy()
            accuracy = 100 * sum(correct)/total
            print("Validate", validate_loss.item(),accuracy)

            # terminate if validation loss is higher than previous two runs
            if validate_loss > previous_error:
                print("terminated: at epoch ", e)
                break
            previous_error = validate_loss
            
    return previous_error

In [113]:
"""
Step 5: Using Genetic Algorithm to select features, reference: lab8, only up to the validation stage
"""
# define GA settings
DNA_SIZE = 9             # number of bits in DNA
POP_SIZE = 10             # population size
CROSS_RATE = 0.8         # DNA crossover probability
MUTATION_RATE = 0.002    # mutation probability
N_GENERATIONS = 5        # generation size
features = data.columns[:9]
print(features)

# define population select function based on fitness value
# population with higher fitness value has higher chance to be selected, from lab8
def select(pop, fitness):
    idx = np.random.choice(np.arange(POP_SIZE+1), size=POP_SIZE + 1, replace=True,
                           p=fitness/(sum(fitness)))
    return pop[idx]

# define mutation function, from lab8
def mutate(child):
    for point in range(DNA_SIZE):
        if np.random.rand() < MUTATION_RATE:
            child[point] = 1 if child[point] == 0 else 0
    return child

# define gene crossover function, from lab8
def crossover(parent, pop):
    if np.random.rand() < CROSS_RATE:
        # randomly select another individual from population
        i = np.random.randint(0, POP_SIZE, size=1)    
        # choose crossover points(bits)
        cross_points = np.random.randint(0, 2, size=DNA_SIZE).astype(np.bool)
        # produce one child
        parent[cross_points] = pop[i, cross_points]  
    return parent

Index(['Fixations_ID', 'participant', 'image', 'X Pos', 'Y Pos', 'Start Time',
       'Stop Time', 'Duration', 'Samples in Fixation'],
      dtype='object')


In [114]:
# define functions to extract feature from DNA
def extract(dna):
    extracted = []
    for i in range(len(dna)):
        if dna[i] == 1:
            extracted.append(features[i])
    return extracted

# define functions to create data from the features selected.
def create_data(extracted):
    # for speed, only take 1/10 of data
    dummy_data = data[extracted]. iloc[[i for i in range(len(data)) if i % 10 == 0]]
    traindata = dummy_data.loc[data['image'] < 1]
    validatedata = dummy_data.loc[data['image'] == 1]

    # separate the data into input and target
    t_input = traindata.iloc[:, :len(extracted)]
    t_target = traindata.iloc[:, -2]

    v_input = validatedata.iloc[:, :len(extracted)]
    v_target = validatedata.iloc[:, -2]

    X_t = Variable(torch.Tensor(t_input.values).float())
    Y_t = Variable(torch.Tensor(t_target.values).long())

    X_v = Variable(torch.Tensor(v_input.values).float())
    Y_v = Variable(torch.Tensor(v_target.values).long())
    return X_t, Y_t, X_v, Y_v

In [116]:
"""
Step 6: Train the hybrid model
"""

# Initialize the population DNA, add all 1 to the population
pop = np.random.randint(2, size=(POP_SIZE, DNA_SIZE))
pop = np.append(pop, [[1]*9],axis = 0)


for t in range(N_GENERATIONS):
    print('------------- Generation ', t,'-------------')
    # fitness values for all populations
    fitness = []
    fit=np.array([])
    for p in pop:
        print("examining: ",p )
        features_extracted = extract(p)
        
        X_t, Y_t, X_v, Y_v = create_data(features_extracted)
              
        # use selected features to train the model
        loss = train(X_t, Y_t, X_v, Y_v)
        
        # add the loss related value to the fitness
        fitness.append(loss)
    
    # select parent 1 index
    p1 = fitness.index(min(fitness))
    if min(fitness) < 0.1:
        print('End-----------', pop[p1], "fit: ", fitness[p1])
        break
              
    selected_pop = select(pop, fitness)
    selected_pop_copy = selected_pop.copy()
    for parent in selected_pop:
        child = crossover(parent, selected_pop_copy)       
        child = mutate(child)
        parent[:] = child
    if t == N_GENERATIONS -1:
        print('End-----------', pop[p1], "fit: ", fitness[p1])

------------- Generation  0 -------------
examining:  [1 0 1 0 1 1 0 1 0]
Epoch:  0  counter:  25
loss 0.31864934906363485
accuracies 88.0
Validate 0.263240784406662 100.0
Epoch:  10  counter:  25
loss 0.0006949081388302147
accuracies 100.0
Validate 0.051680855453014374 100.0
Epoch:  20  counter:  25
loss 0.00026721382047981026
accuracies 100.0
Validate 0.036379266530275345 100.0
Epoch:  30  counter:  25
loss 0.00014556694077327847
accuracies 100.0
Validate 0.028918417170643806 100.0


KeyboardInterrupt: 