In [None]:
import numpy as np
import pandas as pd
from pandas import read_csv
from keras.models import Sequential
from keras.optimizers import *

from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import to_categorical

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
import seaborn as sn

In [None]:
def plotConf(conf, labels="ABCDE"):
    """Show a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    conf : array-like, shape (n, n)
        Confusion matrix to draw.
    labels : sequence, optional
        Names used for both the row index and the columns of the heatmap.
        Defaults to the letters "A".."E"; its length must equal ``n`` or
        pandas raises a ``ValueError`` when the frame is built.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    class_labels = list(labels)
    df_cm = pd.DataFrame(conf, index=class_labels, columns=class_labels)
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True)
    plt.show()

In [None]:
def createTrain(X_trainIn, y_train, high):
    """Build zero-padded 3D training arrays with one sample per ``num_window``.

    Parameters
    ----------
    X_trainIn : pandas.DataFrame
        Rows of features plus a ``num_window`` column identifying the window
        each row belongs to. Every other column is treated as a feature and
        must be convertible to float.
    y_train : pandas.Series
        Labels indexed like ``X_trainIn``. The first label found in a window
        is used as the label of that window.
    high : sequence of numbers (or a single number)
        ``high[0]``, truncated to an int, is the number of time-points
        (rows) every output sample has.

    Returns
    -------
    trainArr : numpy.ndarray, shape (n_windows, n_timepoints, n_features)
        Windows in ascending ``num_window`` order. Windows with fewer rows
        than ``n_timepoints`` are padded with zero rows at the end; longer
        windows are truncated to the first ``n_timepoints`` rows.
    yTrainArr : numpy.ndarray, shape (n_windows, 1)
        One label per window.
    """
    numTimepoints = int(np.ravel(high)[0])
    featureMask = ~X_trainIn.columns.isin(['num_window'])
    windows = X_trainIn.groupby('num_window', sort=True)

    # Zero-initialised outputs double as the padding, so no filler rows
    # have to be built and appended per window.
    trainArr = np.zeros((windows.ngroups, numTimepoints, int(featureMask.sum())))
    yTrainArr = np.zeros((windows.ngroups, 1))

    for i, (_, rows) in enumerate(windows):
        values = np.asarray(rows.loc[:, featureMask], dtype=float)[:numTimepoints]
        trainArr[i, :len(values)] = values
        yTrainArr[i, 0] = y_train.loc[rows.index].iloc[0]

    return trainArr, yTrainArr

In [None]:
def createTest(X_testIn, y_test, high):
    """Build zero-padded 3D test arrays with one sample per ``num_window``.

    Parameters
    ----------
    X_testIn : pandas.DataFrame
        Rows of features plus a ``num_window`` column identifying the window
        each row belongs to. Every other column is treated as a feature and
        must be convertible to float.
    y_test : pandas.Series
        Labels indexed like ``X_testIn``. The first label found in a window
        is used as the label of that window.
    high : sequence of numbers (or a single number)
        ``high[0]``, truncated to an int, is the number of time-points
        (rows) every output sample has.

    Returns
    -------
    testArr : numpy.ndarray, shape (n_windows, n_timepoints, n_features)
        Windows in ascending ``num_window`` order. Windows with fewer rows
        than ``n_timepoints`` are padded with zero rows at the end; longer
        windows are truncated to the first ``n_timepoints`` rows.
    yTestArr : numpy.ndarray, shape (n_windows, 1)
        One label per window.
    """
    numTimepoints = int(np.ravel(high)[0])
    featureMask = ~X_testIn.columns.isin(['num_window'])
    windows = X_testIn.groupby('num_window', sort=True)

    # Zero-initialised outputs double as the padding, so no filler rows
    # have to be built and appended per window.
    testArr = np.zeros((windows.ngroups, numTimepoints, int(featureMask.sum())))
    yTestArr = np.zeros((windows.ngroups, 1))

    for i, (_, rows) in enumerate(windows):
        values = np.asarray(rows.loc[:, featureMask], dtype=float)[:numTimepoints]
        testArr[i, :len(values)] = values
        yTestArr[i, 0] = y_test.loc[rows.index].iloc[0]

    return testArr, yTestArr

In [None]:
def lstmModel(trainArr, yTrainArr, units=64, dropout=0.5, lr=0.001):
    """Build and compile a single-layer LSTM classifier.

    Layer sizes are taken from the array shapes only; no training happens
    here.

    Parameters
    ----------
    trainArr : array with shape (samples, timesteps, features)
        Only ``shape[1]`` and ``shape[2]`` are read, to set the LSTM input
        shape.
    yTrainArr : array with shape (samples, outputs)
        Only ``shape[1]`` is read, to set the width of the softmax output
        layer (e.g. the number of one-hot columns).
    units : int, optional
        Number of LSTM units. Defaults to 64.
    dropout : float, optional
        Dropout rate applied after the LSTM layer. Defaults to 0.5.
    lr : float, optional
        Learning rate passed to the Adam optimizer. Defaults to 0.001.

    Returns
    -------
    keras.models.Sequential
        Model compiled with categorical cross-entropy loss and an accuracy
        metric.
    """
    n_timesteps, n_features = trainArr.shape[1], trainArr.shape[2]
    n_outputs = yTrainArr.shape[1]

    model = Sequential([
        LSTM(units, input_shape=(n_timesteps, n_features)),
        Dropout(dropout),
        Dense(n_outputs, activation='softmax'),
    ])
    # NOTE(review): `lr=` is the keyword the original code used; newer Keras
    # releases name this argument `learning_rate` -- confirm before upgrading.
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr), metrics=['accuracy'])

    return model

In [None]:
# Read the dataset; the first row of the CSV supplies the column names.
filename = "fs_raw_logo.csv"
df = pd.read_csv(filename, header=0)

In [None]:
# Discard windows whose number of rows (one row per time-point) is unusual.
# Only windows whose row count lies between the 25th and 75th percentile of
# all window row counts (inclusive) are kept: those can be padded to a common
# length, whereas the others would need too much padding.

numWindowsPerSample = df.groupby('num_window')['num_window'].count()
low = numWindowsPerSample.quantile([0.25]).values
high = numWindowsPerSample.quantile([0.75]).values

print(low)
print(high)

# Element-wise bounds checks on the per-window row counts
atLow = low <= numWindowsPerSample.values
atHigh = high >= numWindowsPerSample.values
windowBounds = np.logical_and(atLow, atHigh)

windowsToUse = numWindowsPerSample[windowBounds]

# Keep only rows belonging to a retained window and save the result
df = df[df['num_window'].isin(windowsToUse.index)]
df.to_csv('window.csv')

In [None]:
# Pull out the user name, window id and class label columns.
names = df['user_name']
window = df['num_window']
y = df['classe']
# X is every remaining column except the label and the user name
# (num_window is deliberately still part of X).
X = df.drop(columns=['classe', 'user_name'], errors='ignore')

In [None]:
# Leave-one-group-out evaluation: sklearn's LeaveOneGroupOut yields one
# train/test split per distinct value in `names`, so a fresh model is trained
# once per held-out user and evaluated on that user's windows.

# Imported once here instead of on every loop iteration; it is not referenced
# by the code in this cell.
from keras.wrappers.scikit_learn import KerasClassifier

logo = LeaveOneGroupOut()
lossArr = []   # one Keras `History.history` dict per fold
accArr = []
confArr = []   # one confusion matrix per fold

batch=20
epochs=4

# Fix the label space once from the full dataset so every fold is one-hot
# encoded to the same width and every confusion matrix has the same shape,
# even when a held-out user is missing one of the classes.
numClasses = int(y.max()) + 1
classLabels = np.unique(y).astype(int)

for train_index, test_index in logo.split(X, y, names):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    trainArr, yTrainArr = createTrain(X_train, y_train, high)
    testArr, yTestArr = createTest(X_test, y_test, high)

    print("Train X shape: ",trainArr.shape)
    print("Train y shape: ",yTrainArr.shape)
    print("Test X shape: ",testArr.shape)
    print("Test y shape: ",yTestArr.shape)

    y_trainArr = to_categorical(yTrainArr, num_classes=numClasses)
    y_testArr = to_categorical(yTestArr, num_classes=numClasses)
    print("New train y shape: ", y_trainArr.shape)
    print("New test y shape: ", y_testArr.shape)

    # A new model per fold; the held-out fold is passed as validation data so
    # its loss/accuracy are recorded for every epoch.
    lstm = lstmModel(trainArr, y_trainArr)
    outModel = lstm.fit(trainArr, y_trainArr, epochs=epochs, validation_data=(testArr,y_testArr), batch_size=batch, verbose=1)
    lossArr.append(outModel.history)

    # Class index = position of the largest softmax output / the one-hot 1
    ypred = np.argmax(lstm.predict(testArr, batch_size=batch), axis=-1)
    yTrue = np.argmax(y_testArr, axis=1)

    conf = confusion_matrix(yTrue, ypred, labels=classLabels)
    confArr.append(conf)
    plotConf(conf)

In [None]:
# Plot training and validation loss per epoch, averaged over every fold
# recorded in lossArr (however many folds there are).
plt.figure()

# Stack the per-fold curves and average across folds (axis 0 = fold)
loss = np.mean([fold['loss'] for fold in lossArr], axis=0).tolist()
valloss = np.mean([fold['val_loss'] for fold in lossArr], axis=0).tolist()

plt.plot(loss)
plt.plot(valloss)
plt.legend(['train','test'])
plt.xlabel('Epoch')
plt.ylabel('Average Loss')
# Label ticks 1..N for however many epochs were recorded
plt.xticks(np.arange(len(loss)), [str(e + 1) for e in range(len(loss))])
plt.show()

print("final training loss: ", loss[-1])
print("final validation loss: ", valloss[-1])

In [None]:
# Plot training and validation accuracy per epoch, averaged over every fold
# recorded in lossArr (however many folds there are).
plt.figure()

# The history key for the accuracy metric is 'acc' in some Keras versions and
# 'accuracy' in others; use whichever one the recorded histories contain.
accKey = 'acc' if 'acc' in lossArr[0] else 'accuracy'
valAccKey = 'val_' + accKey

# Stack the per-fold curves and average across folds (axis 0 = fold)
trainAcc = np.mean([fold[accKey] for fold in lossArr], axis=0).tolist()
valAcc = np.mean([fold[valAccKey] for fold in lossArr], axis=0).tolist()

plt.plot(trainAcc)
plt.plot(valAcc)
plt.xlabel('Epoch')
plt.ylabel('Average Accuracy')
plt.legend(['train','test'])
# Label ticks 1..N for however many epochs were recorded
plt.xticks(np.arange(len(trainAcc)), [str(e + 1) for e in range(len(trainAcc))])
plt.show()

print("final training acc: ", trainAcc[-1])
print("final validation acc: ", valAcc[-1])