In [1]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the pre-encoded dataset. The path is relative, so the .h5 file must be
# in the kernel's working directory.
data = pd.read_hdf('aa_one_hot_highlow.h5')

In [3]:
# Preview the first rows to check the columns (class, prest_id, conc_cf, aa_one_hot).
data.head()

Unnamed: 0,class,prest_id,conc_cf,aa_one_hot
1,0,140099,2.9154,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,0,140225,1.4877,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,0,140325,1.5029,"[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,..."
8,0,140354,0.92003,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
10,0,140500,2.2183,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [4]:
# Preview the last rows as well; in the saved output the head rows are class 0
# and the tail rows are class 1.
data.tail()

Unnamed: 0,class,prest_id,conc_cf,aa_one_hot
45201,1,4550047,12.28,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
45202,1,4550053,11.18,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
45203,1,4550054,10.94,"[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,..."
45204,1,4560007,11.3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,..."
45205,1,4560013,9.38,"[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [5]:
# Shape of one sample's one-hot matrix.
# NOTE: [1] is a label-based lookup (index label 1, which is the first row of
# this frame), not positional element 1.
data['aa_one_hot'][1].shape

(149, 20)

In [6]:
# Stack the per-sample one-hot matrices into a single 3D array of shape
# (num_samples, max_len, width).
one_hot_matrices = data['aa_one_hot'].values
num_samples = data.shape[0]

# Read the matrix dimensions from the data instead of hard-coding 149 x 20, so
# this cell keeps working if the encoding is regenerated with a different size.
max_len, width = one_hot_matrices[0].shape

X = np.zeros((num_samples, max_len, width))
for idx, aa_one_hot in enumerate(one_hot_matrices):
    # The assignment raises if a matrix cannot be broadcast to (max_len, width).
    X[idx, :, :] = aa_one_hot

# Class label for each sample, aligned with the first axis of X.
y = data['class'].values

In [7]:
# Hold out 30% of the samples; the fixed random_state makes the split reproducible.
# NOTE(review): the split is not stratified on y — confirm the class balance of
# both parts is acceptable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [24]:
def make_model(x_train, y_train, x_test, y_test, filt_width=3, epochs=1):
    """Train a small 1D-CNN binary classifier and save it to disk.

    Simple model per Yoon Kim (2014), here without a pooling layer (hence the
    ``model_no_pool`` file name):
    Conv1D(10 filters, ``filt_width``, ReLU) -> Flatten -> Dropout(0.5)
    -> Dense(32) -> Dropout(0.5) -> Dense(1, sigmoid).

    Parameters
    ----------
    x_train, x_test : array with a ``.shape`` attribute
        Inputs laid out as (n_samples, sequence_length, n_channels). The
        network's input shape is taken from ``x_train.shape[1:]``.
    y_train, y_test : array-like
        Targets for the binary cross-entropy loss.
    filt_width : int, default 3
        Kernel size of the Conv1D layer; also used in the saved file name.
    epochs : int, default 1
        Number of training epochs.

    Returns
    -------
    None
        The trained model is written to ``model_no_pool<filt_width>.h5`` in
        the current working directory.
    """
    from keras.models import Sequential
    from keras.layers import Conv1D, Dense, Dropout, Flatten

    # Derive the per-sample input shape from the data that is actually passed
    # in, rather than hard-coding (149, 20).
    input_shape = tuple(x_train.shape[1:])
    filename = 'model_no_pool' + str(filt_width) + '.h5'

    model = Sequential()
    model.add(Conv1D(10, filt_width, activation='relu', input_shape=input_shape))
    model.add(Flatten())
    model.add(Dropout(0.5))
    # NOTE(review): this Dense layer has no activation, so it is linear —
    # confirm that is intended.
    model.add(Dense(32))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # NOTE(review): the held-out split is used as validation data here, so the
    # reported val_* metrics are monitored on the same data later called "test".
    model.fit(x_train, y_train, batch_size=50, epochs=epochs,
              validation_data=(x_test, y_test), verbose=2)
    model.save(filename)
    print('model saved as: ', filename)

In [27]:
# Number of training epochs passed to every make_model call in this notebook.
epochs = 25

In [28]:
# Filter width 3; the trained model is written to model_no_pool3.h5.
make_model(x_train, y_train, x_test, y_test, filt_width=3, epochs=epochs)

Train on 15822 samples, validate on 6781 samples
Epoch 1/25
 - 9s - loss: 0.7022 - acc: 0.5250 - val_loss: 0.6838 - val_acc: 0.5599
Epoch 2/25
 - 7s - loss: 0.6687 - acc: 0.5920 - val_loss: 0.6605 - val_acc: 0.6076
Epoch 3/25
 - 7s - loss: 0.6530 - acc: 0.6124 - val_loss: 0.6568 - val_acc: 0.6186
Epoch 4/25
 - 7s - loss: 0.6442 - acc: 0.6248 - val_loss: 0.6496 - val_acc: 0.6222
Epoch 5/25
 - 7s - loss: 0.6386 - acc: 0.6294 - val_loss: 0.6486 - val_acc: 0.6238
Epoch 6/25
 - 7s - loss: 0.6326 - acc: 0.6435 - val_loss: 0.6470 - val_acc: 0.6266
Epoch 7/25
 - 7s - loss: 0.6318 - acc: 0.6421 - val_loss: 0.6476 - val_acc: 0.6266
Epoch 8/25
 - 7s - loss: 0.6276 - acc: 0.6480 - val_loss: 0.6479 - val_acc: 0.6248
Epoch 9/25
 - 7s - loss: 0.6249 - acc: 0.6485 - val_loss: 0.6551 - val_acc: 0.6201
Epoch 10/25
 - 7s - loss: 0.6255 - acc: 0.6521 - val_loss: 0.6511 - val_acc: 0.6217
Epoch 11/25
 - 7s - loss: 0.6247 - acc: 0.6509 - val_loss: 0.6464 - val_acc: 0.6275
Epoch 12/25
 - 7s - loss: 0.6208 - a

In [29]:
# Filter width 4; the trained model is written to model_no_pool4.h5.
make_model(x_train, y_train, x_test, y_test, filt_width=4, epochs=epochs)

Train on 15822 samples, validate on 6781 samples
Epoch 1/25
 - 9s - loss: 0.6977 - acc: 0.5401 - val_loss: 0.6684 - val_acc: 0.5894
Epoch 2/25
 - 7s - loss: 0.6609 - acc: 0.6043 - val_loss: 0.6538 - val_acc: 0.6136
Epoch 3/25
 - 7s - loss: 0.6490 - acc: 0.6242 - val_loss: 0.6479 - val_acc: 0.6254
Epoch 4/25
 - 7s - loss: 0.6371 - acc: 0.6353 - val_loss: 0.6471 - val_acc: 0.6245
Epoch 5/25
 - 7s - loss: 0.6335 - acc: 0.6406 - val_loss: 0.6467 - val_acc: 0.6301
Epoch 6/25
 - 7s - loss: 0.6297 - acc: 0.6394 - val_loss: 0.6462 - val_acc: 0.6213
Epoch 7/25
 - 7s - loss: 0.6289 - acc: 0.6502 - val_loss: 0.6444 - val_acc: 0.6279
Epoch 8/25
 - 7s - loss: 0.6233 - acc: 0.6505 - val_loss: 0.6461 - val_acc: 0.6238
Epoch 9/25
 - 7s - loss: 0.6226 - acc: 0.6507 - val_loss: 0.6453 - val_acc: 0.6282
Epoch 10/25
 - 7s - loss: 0.6198 - acc: 0.6568 - val_loss: 0.6472 - val_acc: 0.6229
Epoch 11/25
 - 7s - loss: 0.6193 - acc: 0.6556 - val_loss: 0.6477 - val_acc: 0.6238
Epoch 12/25
 - 7s - loss: 0.6199 - a

In [30]:
# Filter width 5; the trained model is written to model_no_pool5.h5.
make_model(x_train, y_train, x_test, y_test, filt_width=5, epochs=epochs)

Train on 15822 samples, validate on 6781 samples
Epoch 1/25
 - 10s - loss: 0.6978 - acc: 0.5351 - val_loss: 0.6750 - val_acc: 0.5822
Epoch 2/25
 - 8s - loss: 0.6620 - acc: 0.5983 - val_loss: 0.6532 - val_acc: 0.6189
Epoch 3/25
 - 8s - loss: 0.6465 - acc: 0.6265 - val_loss: 0.6538 - val_acc: 0.6148
Epoch 4/25
 - 8s - loss: 0.6356 - acc: 0.6340 - val_loss: 0.6464 - val_acc: 0.6268
Epoch 5/25
 - 8s - loss: 0.6328 - acc: 0.6448 - val_loss: 0.6469 - val_acc: 0.6272
Epoch 6/25
 - 8s - loss: 0.6256 - acc: 0.6492 - val_loss: 0.6465 - val_acc: 0.6204
Epoch 7/25
 - 8s - loss: 0.6238 - acc: 0.6483 - val_loss: 0.6452 - val_acc: 0.6223
Epoch 8/25
 - 8s - loss: 0.6188 - acc: 0.6534 - val_loss: 0.6445 - val_acc: 0.6251
Epoch 9/25
 - 8s - loss: 0.6160 - acc: 0.6609 - val_loss: 0.6442 - val_acc: 0.6270
Epoch 10/25
 - 8s - loss: 0.6132 - acc: 0.6660 - val_loss: 0.6421 - val_acc: 0.6285
Epoch 11/25
 - 9s - loss: 0.6116 - acc: 0.6639 - val_loss: 0.6473 - val_acc: 0.6259
Epoch 12/25
 - 9s - loss: 0.6098 - 

In [31]:
# Filter width 6; the trained model is written to model_no_pool6.h5.
make_model(x_train, y_train, x_test, y_test, filt_width=6, epochs=epochs)

Train on 15822 samples, validate on 6781 samples
Epoch 1/25
 - 10s - loss: 0.6979 - acc: 0.5411 - val_loss: 0.6722 - val_acc: 0.5816
Epoch 2/25
 - 9s - loss: 0.6576 - acc: 0.6074 - val_loss: 0.6510 - val_acc: 0.6166
Epoch 3/25
 - 9s - loss: 0.6433 - acc: 0.6279 - val_loss: 0.6467 - val_acc: 0.6231
Epoch 4/25
 - 9s - loss: 0.6372 - acc: 0.6325 - val_loss: 0.6432 - val_acc: 0.6298
Epoch 5/25
 - 9s - loss: 0.6284 - acc: 0.6443 - val_loss: 0.6432 - val_acc: 0.6297
Epoch 6/25
 - 9s - loss: 0.6243 - acc: 0.6506 - val_loss: 0.6454 - val_acc: 0.6287
Epoch 7/25
 - 9s - loss: 0.6204 - acc: 0.6538 - val_loss: 0.6436 - val_acc: 0.6226
Epoch 8/25
 - 9s - loss: 0.6181 - acc: 0.6579 - val_loss: 0.6477 - val_acc: 0.6256
Epoch 9/25
 - 9s - loss: 0.6177 - acc: 0.6569 - val_loss: 0.6429 - val_acc: 0.6265
Epoch 10/25
 - 9s - loss: 0.6107 - acc: 0.6623 - val_loss: 0.6434 - val_acc: 0.6293
Epoch 11/25
 - 9s - loss: 0.6063 - acc: 0.6699 - val_loss: 0.6432 - val_acc: 0.6312
Epoch 12/25
 - 9s - loss: 0.6043 - 

In [32]:
# Filter width 8; the trained model is written to model_no_pool8.h5.
make_model(x_train, y_train, x_test, y_test, filt_width=8, epochs=epochs)

Train on 15822 samples, validate on 6781 samples
Epoch 1/25
 - 11s - loss: 0.6904 - acc: 0.5451 - val_loss: 0.6579 - val_acc: 0.6107
Epoch 2/25
 - 10s - loss: 0.6498 - acc: 0.6220 - val_loss: 0.6462 - val_acc: 0.6332
Epoch 3/25
 - 10s - loss: 0.6372 - acc: 0.6382 - val_loss: 0.6455 - val_acc: 0.6200
Epoch 4/25
 - 10s - loss: 0.6303 - acc: 0.6456 - val_loss: 0.6427 - val_acc: 0.6260
Epoch 5/25
 - 10s - loss: 0.6195 - acc: 0.6532 - val_loss: 0.6439 - val_acc: 0.6276
Epoch 6/25
 - 10s - loss: 0.6159 - acc: 0.6603 - val_loss: 0.6384 - val_acc: 0.6390
Epoch 7/25
 - 10s - loss: 0.6051 - acc: 0.6704 - val_loss: 0.6337 - val_acc: 0.6433
Epoch 8/25
 - 10s - loss: 0.6007 - acc: 0.6778 - val_loss: 0.6301 - val_acc: 0.6503
Epoch 9/25
 - 10s - loss: 0.5940 - acc: 0.6792 - val_loss: 0.6299 - val_acc: 0.6455
Epoch 10/25
 - 10s - loss: 0.5890 - acc: 0.6827 - val_loss: 0.6293 - val_acc: 0.6464
Epoch 11/25
 - 10s - loss: 0.5860 - acc: 0.6847 - val_loss: 0.6267 - val_acc: 0.6464
Epoch 12/25
 - 10s - loss

In [33]:
# Filter width 10; the trained model is written to model_no_pool10.h5.
make_model(x_train, y_train, x_test, y_test, filt_width=10, epochs=epochs)

Train on 15822 samples, validate on 6781 samples
Epoch 1/25
 - 13s - loss: 0.6955 - acc: 0.5397 - val_loss: 0.6656 - val_acc: 0.6045
Epoch 2/25
 - 11s - loss: 0.6546 - acc: 0.6108 - val_loss: 0.6474 - val_acc: 0.6229
Epoch 3/25
 - 11s - loss: 0.6376 - acc: 0.6379 - val_loss: 0.6451 - val_acc: 0.6288
Epoch 4/25
 - 11s - loss: 0.6291 - acc: 0.6473 - val_loss: 0.6400 - val_acc: 0.6306
Epoch 5/25
 - 11s - loss: 0.6210 - acc: 0.6576 - val_loss: 0.6403 - val_acc: 0.6335
Epoch 6/25
 - 11s - loss: 0.6166 - acc: 0.6555 - val_loss: 0.6394 - val_acc: 0.6318
Epoch 7/25
 - 11s - loss: 0.6073 - acc: 0.6686 - val_loss: 0.6367 - val_acc: 0.6416
Epoch 8/25
 - 11s - loss: 0.6055 - acc: 0.6724 - val_loss: 0.6348 - val_acc: 0.6409
Epoch 9/25
 - 11s - loss: 0.5942 - acc: 0.6789 - val_loss: 0.6360 - val_acc: 0.6343
Epoch 10/25
 - 11s - loss: 0.5875 - acc: 0.6835 - val_loss: 0.6259 - val_acc: 0.6470
Epoch 11/25
 - 11s - loss: 0.5799 - acc: 0.6872 - val_loss: 0.6201 - val_acc: 0.6503
Epoch 12/25
 - 11s - loss

In [34]:
# Filter width 12; the trained model is written to model_no_pool12.h5.
make_model(x_train, y_train, x_test, y_test, filt_width=12, epochs=epochs)

Train on 15822 samples, validate on 6781 samples
Epoch 1/25
 - 13s - loss: 0.6911 - acc: 0.5423 - val_loss: 0.6566 - val_acc: 0.6111
Epoch 2/25
 - 12s - loss: 0.6470 - acc: 0.6221 - val_loss: 0.6461 - val_acc: 0.6248
Epoch 3/25
 - 11s - loss: 0.6297 - acc: 0.6425 - val_loss: 0.6437 - val_acc: 0.6322
Epoch 4/25
 - 11s - loss: 0.6136 - acc: 0.6640 - val_loss: 0.6278 - val_acc: 0.6362
Epoch 5/25
 - 11s - loss: 0.5998 - acc: 0.6728 - val_loss: 0.6230 - val_acc: 0.6524
Epoch 6/25
 - 11s - loss: 0.5909 - acc: 0.6832 - val_loss: 0.6172 - val_acc: 0.6552
Epoch 7/25
 - 11s - loss: 0.5827 - acc: 0.6895 - val_loss: 0.6120 - val_acc: 0.6586
Epoch 8/25
 - 11s - loss: 0.5719 - acc: 0.6968 - val_loss: 0.6132 - val_acc: 0.6579
Epoch 9/25
 - 11s - loss: 0.5661 - acc: 0.6976 - val_loss: 0.6129 - val_acc: 0.6506
Epoch 10/25
 - 11s - loss: 0.5609 - acc: 0.7034 - val_loss: 0.6155 - val_acc: 0.6580
Epoch 11/25
 - 11s - loss: 0.5580 - acc: 0.7056 - val_loss: 0.6166 - val_acc: 0.6573
Epoch 12/25
 - 11s - loss

## Make an ensemble with these models

In [35]:
# Reload every saved network from disk, one per Conv1D filter width, and bind
# each to its own notebook-level name.
import keras

(model_no_pool3, model_no_pool4, model_no_pool5, model_no_pool6,
 model_no_pool8, model_no_pool10, model_no_pool12) = [
    keras.models.load_model('model_no_pool{}.h5'.format(width_tag))
    for width_tag in (3, 4, 5, 6, 8, 10, 12)
]


In [36]:
# Ensemble accuracy on the training split.
# x_train / y_train are read directly instead of rebinding the notebook-level
# `x` / `y`: `y` holds the full label vector consumed by train_test_split, so
# overwriting it here would break re-running that earlier cell.
ensemble_members = [model_no_pool3, model_no_pool4, model_no_pool5,
                    model_no_pool6, model_no_pool8, model_no_pool10,
                    model_no_pool12]

# One array of sigmoid scores per member model.
member_scores = [member.predict(x_train) for member in ensemble_members]

# Soft vote: average the members' scores, then round to a 0/1 prediction.
y_pred_total = np.round(sum(member_scores) / len(ensemble_members))

# Labels are reshaped to a column so the comparison lines up with the
# column-shaped predictions instead of broadcasting.
z = y_pred_total == np.reshape(y_train, [-1, 1])

print('Train acc')
np.sum(z)/len(z)

Train acc


0.7737327771457464

In [37]:
# Ensemble accuracy on the held-out split.
# x_test / y_test are read directly instead of rebinding the notebook-level
# `x` / `y`: `y` holds the full label vector consumed by train_test_split, so
# overwriting it here would break re-running that earlier cell.
ensemble_members = [model_no_pool3, model_no_pool4, model_no_pool5,
                    model_no_pool6, model_no_pool8, model_no_pool10,
                    model_no_pool12]

# One array of sigmoid scores per member model.
member_scores = [member.predict(x_test) for member in ensemble_members]

# Soft vote: average the members' scores, then round to a 0/1 prediction.
y_pred_total = np.round(sum(member_scores) / len(ensemble_members))

# Labels are reshaped to a column so the comparison lines up with the
# column-shaped predictions instead of broadcasting.
z = y_pred_total == np.reshape(y_test, [-1, 1])

print('Test acc')
np.sum(z)/len(z)

Test acc


0.66376640613478843

# Filter Visualization

In [None]:
# Try to visualize the filters learned by the width-8 model.
# weights[0] is used below as the Conv1D kernel — presumably the first entry
# returned by get_weights(); confirm with the shape check that follows.
weights = model_no_pool8.get_weights()

In [None]:
# Shape of the first weight array — presumably the Conv1D kernel laid out as
# (filter width, input channels, number of filters); confirm against the output.
weights[0].shape

In [None]:
#I'm expecting 10 filters that are 8 x 20
#if I'm interpreting this correctly the weights[0] is and array [8x20 filter, 10 filters]

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(30,5))
for ind in range(1,11):
    w = weights[0][:,:,ind-1]
    plt.subplot(2,5,ind)
    plt.imshow(w)
    plt.colorbar()

Since the data has only a single hot pixel in each row, I would expect to see similar patterns in the filters. This doesn't look much like that, so either there is more work to be done here or my expectation is wrong.