In [0]:
import numpy as np
import pandas as pd
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import auc, roc_curve

In [3]:
# Preview the first rows of the raw CSV. The frame is bound to its own name
# (not `data`) so that re-running this cell cannot overwrite the `data` dict
# of feature sets that later cells index by key.
raw_df = pd.read_csv('./DSL-StrongPasswordData.csv')
raw_df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,2,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,2,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,2,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,2,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,2,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [0]:
from sklearn.decomposition import PCA

def load_data(path='./DSL-StrongPasswordData.csv'):
    """Read the CSV and split it into feature sets and per-row labels.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the path this notebook has
        always read from, so existing ``load_data()`` calls are unaffected.

    Returns
    -------
    data : dict
        Maps a feature-set name to a DataFrame. The only entry is
        ``'total'``: every column of the file except ``subject``,
        ``sessionIndex`` and ``rep``.
    subjects : numpy.ndarray
        Values of the ``subject`` column, one label per row.
    """
    df = pd.read_csv(path)

    # Bookkeeping columns are dropped; everything else is used as a feature.
    data = {
        'total': df.drop(columns=['subject', 'sessionIndex', 'rep']),
    }

    return data, df['subject'].values

In [14]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize

from keras.callbacks import CSVLogger


def nn_model(input_dim, output_dim, nodes=40, dropout_rate=None):
    """Build and compile a fully connected classifier with two hidden layers.

    Both hidden layers are ``nodes`` units wide with ReLU activation; when
    ``dropout_rate`` is truthy each of them is followed by a Dropout layer.
    An ``output_dim`` of 1 gets a sigmoid output and binary cross-entropy
    loss; any other value gets a softmax output and categorical
    cross-entropy. The model is compiled with the 'adam' optimizer and
    reports accuracy.
    """
    # Only the first hidden layer is told the width of the input.
    hidden_layer_kwargs = (
        {'input_dim': input_dim, 'activation': 'relu'},
        {'activation': 'relu'},
    )

    model = Sequential()
    for layer_kwargs in hidden_layer_kwargs:
        model.add(Dense(nodes, **layer_kwargs))
        if dropout_rate:
            model.add(Dropout(dropout_rate))

    # Output activation and loss are chosen together from the output width.
    if output_dim == 1:
        out_activation, loss_name = 'sigmoid', 'binary_crossentropy'
    else:
        out_activation, loss_name = 'softmax', 'categorical_crossentropy'

    model.add(Dense(output_dim, activation=out_activation))
    model.compile(loss=loss_name,
                  optimizer='adam', metrics=['accuracy'])
    return model


# Train one network per (feature set, hidden-layer width) pair and write its
# per-epoch log, architecture (JSON) and weights (HDF5) to the working
# directory. `data`, `y` and `Y` defined here are read again by later cells.
if __name__ == '__main__':
    data, y = load_data()

    # One hot encoding of target vector
    Y = pd.get_dummies(y).values
    n_classes = Y.shape[1]

    # Hidden-layer widths to train; a single width is configured.
    for nodes in [300]:
        for key, X in data.items():
            print('Running : ', key, nodes, X.shape)

            # Hold out 20% of the rows, stratified on the raw labels `y`.
            # Only the training part is used in this cell.
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=0.2, random_state=1, stratify=y)
            
            # Rescale every sample (row) to unit norm. sklearn's `normalize`
            # does NOT standardise features to mean 0 / std 1.
            # NOTE(review): if standardisation was intended, a fitted scaler
            # is needed instead - confirm.
            X_scaled = normalize(X_train)

            # Add callback that streams epoch results to a csv file
            # https://keras.io/callbacks/
            csv_logger = CSVLogger('training_{}_{}.log'.format(
                key, nodes))

            # Train the neural network model (0.2 is the dropout rate)
            n_features = X.shape[1]
            model = nn_model(n_features, n_classes, nodes, 0.2)
            history = model.fit(X_scaled, Y_train,
                                epochs=100,
                                batch_size=5,
                                verbose=1,
                                callbacks=[csv_logger])

            # Serialize model to JSON
            model_json = model.to_json()
            with open('model_{}_{}.json'.format(
                key, nodes), 'w') as f:
                f.write(model_json)

            # Serialize weights to HDF5
            model.save_weights('model_{}_{}.h5'.format(
                key, nodes))

Running :  total 300 (20400, 31)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
E

In [0]:
import json

# Parse the saved architecture into a dict; the file is closed as soon as it
# has been read.
with open('./model_total_300.json', 'r') as json_file:
    architecture = json.load(json_file)

# model_from_json is given a JSON string, so the parsed dict is serialized
# again. Only the layer structure is rebuilt here; no weights are loaded.
classifier = model_from_json(json.dumps(architecture))

In [40]:
from keras.models import model_from_json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve
from collections import defaultdict
from sklearn.preprocessing import normalize

# Reload every saved model from disk and report its accuracy on the train
# and test splits. Relies on `data`, `Y` and `y` already being defined in the
# kernel; this cell does not create them.
key_list = ['total']
nodes_list = [300]
# models_dict[key][nodes] -> reloaded, compiled model
models_dict = defaultdict(dict)

for key in key_list:
    X = data[key]
    # Same arguments as the split made before training, so the held-out rows
    # here are the ones the model was not fitted on.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=1, stratify=y)
        
    for nodes in nodes_list:
        json_filepath = 'model_{}_{}.json'.format(key, nodes)
        hdf5_filepath = 'model_{}_{}.h5'.format(key, nodes)
        with open(json_filepath, 'r') as f:
            # Architecture comes from the JSON file, weights from the HDF5 file.
            model = model_from_json(f.read())
            model.load_weights(hdf5_filepath)
            # Compile with the loss/metric that evaluate() reports below.
            model.compile(loss='categorical_crossentropy',
                          optimizer='adam', metrics=['accuracy'])
            # Accuracy is only printed for the 300-node model.
            if nodes == 300:
                print('Data : {:>5s}, Nodes : {}'
                      .format(key, nodes))
                # Inputs get the same row-wise `normalize` as during training;
                # score[1] is the accuracy metric.
                score = model.evaluate(normalize(X_train), 
                                       Y_train, verbose=0)
                print('  Accuracy for train set : {:.4f}'
                      .format(score[1]))
                score = model.evaluate(normalize(X_test), 
                                       Y_test, verbose=0)
                print('  Accuracy for test set  : {:.4f}'
                      .format(score[1]))
            
            models_dict[key][nodes] = model

Data : total, Nodes : 300
  Accuracy for train set : 0.9864
  Accuracy for test set  : 0.9056


In [126]:
from sklearn.metrics import accuracy_score, roc_curve, auc

# Plot and score the micro-averaged ROC curve of each reloaded model on the
# held-out split: the one-hot targets and the model outputs are flattened so
# that every (sample, class) pair is scored as one binary decision.

# Choose the model width explicitly instead of relying on whatever value the
# loop variable `nodes` was left with by an earlier cell.
nodes = nodes_list[-1]

fig, ax = plt.subplots(figsize=(10, 10))
for key in key_list:
    X = data[key]
    # Same split arguments as used for training and evaluation.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=1, stratify=y)

    model = models_dict[key][nodes]
    Y_pred = model.predict(normalize(X_test))
    fpr, tpr, thresholds = roc_curve(
        Y_test.flatten(), Y_pred.flatten())

    # The file name is fixed, so with several keys only the last curve
    # remains on disk.
    np.savetxt('nn.out', (fpr, tpr, thresholds))

    roc_auc = auc(fpr, tpr)
    print(roc_auc)

    # Draw the curve; without this the figure is rendered empty.
    ax.plot(fpr, tpr, label='{} (AUC = {:.4f})'.format(key, roc_auc))

# Diagonal reference line for a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('Micro-averaged ROC curve')
ax.legend(loc='lower right')
plt.show()

0.998205276816609


<matplotlib.figure.Figure at 0x7f7ab95459b0>