In [117]:
%pylab inline

import datetime, time

import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split

Populating the interactive namespace from numpy and matplotlib


In [118]:
# Load the training and test CSVs from the working directory
train_path = '../working/train_f.csv'
test_path = '../working/test_f.csv'

wnv_train = pd.read_csv(train_path)
wnv_test = pd.read_csv(test_path)

In [119]:
# Convert the label column into a binary (one-hot) class matrix.
# The target is taken by position (second column of the training frame), so
# use .iloc: .ix silently switches between label- and position-based lookup
# depending on the column index type and is deprecated in pandas.
labels = wnv_train.iloc[:, 1].values.astype('int')
yr_train = np_utils.to_categorical(labels)

In [120]:
# Fixed seed so the train/holdout split (and every metric computed on the
# holdout) is reproducible after a kernel restart.
SPLIT_SEED = 42

# Raw feature matrices, selected by position: training features start at
# column 2, test features at column 1.
# .iloc replaces the deprecated, label/position-ambiguous .ix indexer.
Xr_train = wnv_train.iloc[:, 2:].values.astype('float64')
Xr_test = wnv_test.iloc[:, 1:].values.astype('float64')

# Normalize: fit the scaler on the training features only, then apply the
# same transform to both the training and the test matrices.
scaler = StandardScaler()
scaler.fit(Xr_train)
Xn_train = scaler.transform(Xr_train)
Xn_test = scaler.transform(Xr_test)

# Hold out part of the labelled data for evaluation.
# NOTE(review): the scaler above is fit before this split, so holdout rows
# contribute to the scaling statistics -- confirm that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    Xn_train, yr_train, random_state=SPLIT_SEED)

In [122]:
# Sizes derived from the data
input_dim = Xn_train.shape[1]   # number of feature columns
nb_classes = y_train.shape[1]   # width of the one-hot label matrix

# Architecture and regularisation settings
hidden_dim, hidden_layers = 512, 2
dropout = 0.25

In [123]:
# Drop any previously bound model before building a fresh one
model = None

# Build the network: a stack of Dense -> ReLU -> Dropout blocks followed by
# a softmax output layer.
model = Sequential()

# (input size, output size) of each hidden Dense layer: the first maps the
# features to hidden_dim, the remaining hidden_layers - 1 keep that width.
hidden_shapes = [(input_dim, hidden_dim)] + [(hidden_dim, hidden_dim)] * (hidden_layers - 1)
for n_in, n_out in hidden_shapes:
    model.add(Dense(n_in, n_out, init='lecun_uniform'))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))

# Output layer: one unit per class, normalised with softmax
model.add(Dense(hidden_dim, nb_classes, init='lecun_uniform'))
model.add(Activation('softmax'))

In [124]:
# Compile the network with a categorical cross-entropy loss and the
# Adadelta optimizer
compile_options = dict(loss='categorical_crossentropy', optimizer='adadelta')
model.compile(**compile_options)

In [None]:
print("Training...")
# Train on the training split only; no internal validation split is used
fit_options = dict(nb_epoch=10, batch_size=16, validation_split=0.0,
                   show_accuracy=True, verbose=1)
model.fit(X_train, y_train, **fit_options)

Training...
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7

In [108]:
# Holdout predictions: hard class labels and per-class probabilities
preds = model.predict_classes(X_test, verbose=0)
proba = model.predict_proba(X_test, verbose=0)
# print() call form, matching the training cell; same output for a single
# argument on Python 2 and also valid on Python 3.
print(y_test)

[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]


In [109]:
# Class ids for the holdout rows: column 1 of the one-hot label matrix
# (1 for class 1, 0 otherwise).
y_true = y_test[:, 1]

# accuracy: fraction of holdout rows whose predicted class equals the label
matches = (preds == y_true)
print(matches.mean())

# f1 score (per-class precision / recall / f1 report)
print(metrics.classification_report(y_true, preds))

# confusion matrix
print(metrics.confusion_matrix(y_true, preds))

# auc, scored with the predicted probability of class 1
print(metrics.roc_auc_score(y_true, proba[:, 1]))

0.948610582413
             precision    recall  f1-score   support

        0.0       0.95      1.00      0.97      2491
        1.0       0.56      0.04      0.07       136

avg / total       0.93      0.95      0.93      2627

[[2487    4]
 [ 131    5]]
0.856114364654


In [110]:
# Largest predicted class-1 probability currently held in `proba`
proba[:, 1].max()

0.90986024435449553

In [112]:
# Score the normalized test set.
# NOTE: this rebinds `proba`, which previously held the holdout
# probabilities; the holdout metrics cell must not be re-run after this
# one without recomputing the holdout predictions first.
proba = model.predict_proba(Xn_test)
# print() call form: same output on Python 2, also valid on Python 3
print(np.max(proba[:, 1]))

0.99736629842


In [116]:
# Local-time stamp (YYYYMMDD_HHMMSS) used to make output filenames unique
st = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

def write_preds(proba, fname):
    """Write predictions to a two-column CSV file.

    Parameters
    ----------
    proba : sequence
        One value per row; written to the ``WnvPresent`` column.
    fname : str
        Path of the CSV file to create.

    The ``Id`` column is numbered 1..len(proba). The file is written with a
    header row and without the DataFrame index.
    """
    row_ids = list(range(1, len(proba) + 1))
    submission = pd.DataFrame({"Id": row_ids, "WnvPresent": proba})
    submission.to_csv(fname, index=False, header=True)

# Save column 1 of `proba` as the WnvPresent column of a timestamped CSV
write_preds(proba[:,1], "../working/keras_test_" + st + ".csv")