In [1]:

import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import itertools
# Setting seed for reproducibility
np.random.seed(1234)
# A plain Python variable named PYTHONHASHSEED has no effect on hashing, so the
# value is also exported to the environment. The interpreter reads this variable
# at start-up, which means the export only reaches child processes; to affect
# this kernel it must be set before the kernel is launched.
# NOTE(review): other RNG sources (Python's `random`, the TensorFlow backend)
# are not seeded here -- confirm whether that matters for reproducibility.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)

from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential,load_model,model_from_json
from keras.layers import Dense, Dropout, LSTM,BatchNormalization,TimeDistributed,Flatten,Bidirectional


Using TensorFlow backend.


In [6]:
# Read the training table. The CSV has no header row, so pandas numbers the
# columns 0..N-1 instead of consuming the first record as column names.
df = pd.read_csv(
    'train_data_gender_alpha.csv',
    header=None,
)
df.shape

(170381, 16)

In [7]:
# Convert the DataFrame to a 2-D NumPy array so that rows and columns can be
# addressed positionally in the windowing code below.
data = np.array(df)
# NumPy abbreviates large arrays when printing, so only the head and tail show.
print(data)

[['L4sumit19.txt' -4.533593 2.8898349999999997 ... 2.576351 -2.39371 'M']
 ['L4sumit19.txt' -13.180301 9.44009 ... 9.075501 -6.358237 'M']
 ['L4sumit19.txt' 11.00239 4.290553 ... 4.078790000000001 8.358266 'M']
 ...
 ['L2geetamami15.txt' -57.116985 9.978982 ... 13.281536 -136.356799 'F']
 ['L2geetamami15.txt' 39.529358 4.609792 ... 4.297975 60.14177900000001
  'F']
 ['L2geetamami15.txt' -17.339984 3.501564 ... 5.478219 -28.539515 'F']]


In [8]:
# Window length: every sample handed to the network spans 50 consecutive rows.
sequence_length = 50


def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows shaped (seq_length, len(seq_cols)) from a 2-D array.

    Parameters
    ----------
    id_df : 2-D NumPy array holding the rows of one recording, in order.
    seq_length : number of consecutive rows in each window.
    seq_cols : list of column indices to keep in every window.

    Yields
    ------
    One window per start row, advancing one row at a time. Window ``k`` covers
    rows ``k`` .. ``k + seq_length - 1``.

    Notes
    -----
    Start rows run from 0 to ``n_rows - seq_length - 1``, so the window that
    would end on the very last row is not produced, and nothing is yielded
    when the array has ``seq_length`` rows or fewer.
    """
    selected = id_df[:, seq_cols]
    n_rows = selected.shape[0]

    for first_row in range(n_rows - seq_length):
        yield selected[first_row:first_row + seq_length, :]
  

In [9]:

# Columns 1-14 hold the numeric features and column 15 the 'M'/'F' label;
# column 0 (the recording's file name) is only used to group rows.
sequence_cols = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

# Build the windows one recording at a time so no window mixes rows from two
# files. Recordings with `sequence_length` rows or fewer produce no windows;
# they are skipped because an empty list would make np.concatenate fail on
# mismatched dimensions.
# `file_id` is used instead of `id` to avoid shadowing the builtin.
seq_gen = (
    windows
    for windows in (
        list(gen_sequence(data[data[:, 0] == file_id], sequence_length, sequence_cols))
        for file_id in np.unique(data[:, 0])
    )
    if windows
)

# Result shape: (total windows, sequence_length, len(sequence_cols)).
seq_array = np.concatenate(list(seq_gen))
seq_array.shape


(48281, 50, 15)

In [10]:
# Number of leading windows used for training; the remainder is the test set.
# NOTE(review): the split is positional, so consecutive (overlapping) windows
# from one recording can land on both sides of the boundary -- confirm that
# this leakage is acceptable.
n_train = 35000

# Within seq_array, columns 0-13 are the features and column 14 is the label.
# seq_array is object-typed because it also carries the string label, so the
# feature slices are cast to float32 before being handed to the network.
train_final = seq_array[:n_train, :, 0:14].astype('float32')
test_final = seq_array[n_train:, :, 0:14].astype('float32')

# All rows of a window come from the same recording, so the label of the
# window's first row is used as the label of the whole window.
train_label = seq_array[:n_train, 0, 14]
test_label = seq_array[n_train:, 0, 14]

# `preprocessing` is already imported in the first cell.
lb = preprocessing.LabelBinarizer()
train_label = lb.fit_transform(train_label)
# Reuse the encoding learned on the training labels; re-fitting on the test
# labels could assign different codes (e.g. if only one class is present).
test_label = lb.transform(test_label)

train_label = train_label.reshape((train_label.shape[0], 1)).astype('int')
test_label = test_label.reshape((test_label.shape[0], 1)).astype('int')
test_label.shape

(13281, 1)

In [11]:
# Sanity check: report the shape of each array in the order
# train features, train labels, test features, test labels.
for split_array in (train_final, train_label, test_final, test_label):
    print(split_array.shape)


(35000, 50, 14)
(35000, 1)
(13281, 50, 14)
(13281, 1)


In [12]:
# Input width and output width are taken from the prepared arrays.
nb_features = train_final.shape[2]
nb_out = train_label.shape[1]

model = Sequential()

# Bidirectional LSTM over the full window; returns the whole sequence so the
# next recurrent layer can consume it.
model.add(Bidirectional(LSTM(units=256, return_sequences=True),
                        input_shape=(sequence_length, nb_features)))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(units=128, return_sequences=True))
model.add(BatchNormalization())

# Last recurrent layer collapses the sequence to a single vector.
model.add(LSTM(units=64, return_sequences=False))
model.add(BatchNormalization())

# NOTE(review): this Dense layer has no activation, so it is a purely linear
# map ahead of the sigmoid output -- confirm that this is intended.
model.add(Dense(units=32))
model.add(Dense(units=nb_out, activation='sigmoid'))

# Pass the configured optimizer instance to compile(); previously the instance
# was created and discarded while compile() received the string 'adam'.
optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 50, 512)           555008    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 512)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 50, 512)           2048      
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 128)           328192    
_________________________________________________________________
batch_normalization_2 (Batch (None, 50, 128)           512       
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                49408     
_________________________________________________________________
batch_normalization_3 (Batch (None, 64)                256       
__________

In [None]:
# Train for 50 epochs in mini-batches of 32, with 10% of the training data
# held out for validation and per-epoch progress output enabled.
model.fit(
    train_final,
    train_label,
    batch_size=32,
    epochs=50,
    verbose=1,
    validation_split=0.1,
)

In [None]:
# Step 1: write the network architecture to disk as JSON.
model_json = model.to_json()
with open("model_gender_alpha.json", "w") as architecture_file:
    architecture_file.write(model_json)

# Step 2: write the trained weights to disk in HDF5 format.
model.save_weights("model_gender_alpha.h5")

print("Saved model to disk")

In [None]:
# load json and create model
# The context manager closes the file even if reading raises.
with open('model_gender_alpha.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model_gender_alpha.h5")
print("Loaded model from disk")

In [None]:
# A model rebuilt from JSON + weights must be compiled before evaluate().
# The configured optimizer instance is passed to compile(); previously it was
# created and discarded while compile() received the string 'adam'.
optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
loaded_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# evaluate() returns [loss, accuracy] given metrics=['accuracy'].
scores = loaded_model.evaluate(test_final, test_label, verbose=1, batch_size=32)
scores[1]

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          save_path='confusion_matrix_gender_alpha.png'):
    """
    Print and plot a confusion matrix, optionally saving the figure.

    Parameters
    ----------
    cm : square 2-D array of counts; rows are drawn on the y-axis
        ('True label') and columns on the x-axis ('Predicted label').
    classes : sequence of tick labels, in the same order as the rows/columns
        of `cm`.
    normalize : when True, each row is divided by its sum before plotting.
        Rows that sum to zero are left as all zeros instead of becoming NaN.
    title : figure title.
    cmap : matplotlib colormap used for the heat map.
    save_path : file the figure is written to; pass None or '' to skip saving.

    Returns
    -------
    None. The figure is left as the current pyplot figure so the caller can
    show it with `plt.show()`.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Avoid 0/0 for classes with no true samples: dividing the all-zero
        # row by 1 keeps it at zero.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    fig = plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate each cell; switch to white text on dark cells for readability.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Set the axis labels before tight_layout() so the layout accounts for them.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, bbox_inches='tight')

In [None]:
# test metrics
scores_test = loaded_model.evaluate(test_final,test_label, verbose=2)
print('Accuracy: {}'.format(scores_test[1]))

# make predictions and compute confusion matrix
# The sigmoid output is thresholded at 0.5 explicitly; this matches what
# Sequential.predict_classes did for a single sigmoid unit and also works on
# Keras/TensorFlow versions where predict_classes has been removed.
y_pred_test = (loaded_model.predict(test_final) > 0.5).astype('int32')
y_true_test = test_label

# sklearn's confusion_matrix puts true labels on the rows and predicted
# labels on the columns.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm_test = confusion_matrix(y_true_test, y_pred_test)
print(cm_test)

# compute precision and recall
precision_test = precision_score(y_true_test, y_pred_test)
recall_test = recall_score(y_true_test, y_pred_test)
# Guard the harmonic mean against 0/0 when the positive class is never
# predicted correctly.
pr_sum = precision_test + recall_test
f1_test = 2 * (precision_test * recall_test) / pr_sum if pr_sum > 0 else 0.0
print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

In [None]:
# Plot non-normalized confusion matrix
# Tick labels must follow the integer encoding of the labels. LabelBinarizer
# stores its classes in sorted order ('F' -> 0, 'M' -> 1), so the names are
# taken from the fitted binarizer rather than hard-coded as ['M', 'F'], which
# swapped the two labels on the plot.
class_names = list(lb.classes_)
# plot_confusion_matrix creates its own figure, so no plt.figure() call is
# needed here (it only produced an extra empty figure).
plot_confusion_matrix(cm_test, classes=class_names,
                      title='Confusion matrix')
plt.show()