In [4]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import keras
from keras.layers import Dense, Dropout, BatchNormalization
from keras.models import Sequential
from sklearn.metrics import brier_score_loss, accuracy_score
from keras.models import load_model

def decodePhed(x):
    """Convert a Phred-scaled quality into the probability of being correct.

    Computes ``1 - 10 ** (-x / 10)``: the term ``10 ** (-x / 10)`` is the
    Phred error probability, and its complement is returned.

    Parameters
    ----------
    x : number or numpy.ndarray
        Phred-scaled quality value(s); arrays are processed element-wise.

    Returns
    -------
    Same kind as ``x`` (float or array of floats).
    """
    error_probability = 10 ** (-x / 10.0)
    return 1 - error_probability

In [27]:
# Path (relative to the notebook's working directory) of the JSON-lines file
# that json2csv() parses below.
test_file = "sims/tcompared_tmapped100_tsim100.json"

In [28]:
def json2csv(file):
    """Parse a JSON-lines file of alignment records into a DataFrame.

    Despite the name, nothing is written to CSV: every non-blank line of
    ``file`` is decoded as one JSON object and turned into one row.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON-lines file to read.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order (later cells index them by position):
        ``correctly_mapped`` (1 when the key is present with a truthy value,
        else 0), ``mapping_quality``, ``score``, ``secondary_score`` (first
        element of the record's ``secondary_score`` list),
        ``secondary_score_size`` (length of that list) and ``identity``.
        A missing key, or a missing/empty ``secondary_score`` list, yields 0.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.

    Side effects
    ------------
    Prints the number of records parsed.
    """
    df_dict = {
        'correctly_mapped': [],
        'mapping_quality': [],
        'score': [],
        'secondary_score': [],
        'secondary_score_size': [],
        'identity': [],
    }
    record_count = 0

    # The file is only read, so open it read-only; "r+" would also require
    # write permission on the input.
    with open(file, "r") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) and empty files
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            line_dict = json.loads(line)

            # Use the value's truthiness rather than mere key presence, so an
            # explicit `false` is not counted as a correct mapping.
            df_dict['correctly_mapped'].append(
                1 if line_dict.get('correctly_mapped') else 0)

            for key in ('mapping_quality', 'score', 'identity'):
                df_dict[key].append(line_dict[key] if key in line_dict else 0)

            # A present-but-empty list is treated like a missing one rather
            # than raising IndexError on [0].
            secondary_scores = line_dict.get('secondary_score') or []
            df_dict['secondary_score'].append(
                secondary_scores[0] if secondary_scores else 0)
            df_dict['secondary_score_size'].append(len(secondary_scores))

            record_count += 1
    print(record_count)

    return pd.DataFrame(df_dict)
    
# Parse the whole input file; json2csv also prints how many lines it read.
df = json2csv(test_file)

1000000


In [29]:
# Peek at the first raw (un-normalised) rows.
df.head()

Unnamed: 0,correctly_mapped,mapping_quality,score,secondary_score,secondary_score_size,identity
0,1,15,105,100,1,0.99
1,1,60,105,0,0,0.99
2,1,60,105,0,1,0.99
3,1,60,105,0,0,0.99
4,1,60,110,0,0,1.0


In [30]:
# Work on a copy so the raw values in `df` stay available (they are needed
# again later for decodePhed and the labels).
ndf = df.copy()

In [31]:
# Rescale the feature columns of `ndf` from the raw values in `df`.
# mapping_quality is divided by the fixed constant 60; the remaining columns
# are divided by their own maximum in this dataset.
ndf['mapping_quality'] = df['mapping_quality'] / 60.0
for column in ('score', 'secondary_score', 'secondary_score_size'):
    ndf[column] = df[column] / df[column].max()

In [8]:
# Peek at the first rows after rescaling.
ndf.head()

Unnamed: 0,correctly_mapped,mapping_quality,score,secondary_score,secondary_score_size,identity
0,1,0.25,0.954545,0.909091,0.007874,0.99
1,1,1.0,0.954545,0.0,0.0,0.99
2,1,1.0,0.954545,0.0,0.007874,0.99
3,1,1.0,0.954545,0.0,0.0,0.99
4,1,1.0,1.0,0.0,0.0,1.0


In [9]:
# Number of rows labelled as incorrectly mapped (correctly_mapped == 0).
incorrect_amount = ndf.loc[ndf['correctly_mapped'] == 0, 'correctly_mapped'].count()
incorrect_amount

2366

In [9]:
# Per-class sample counts for an 80/20 train/test split, sized by the
# minority (incorrectly mapped) class. int() truncates like the previous
# `.astype(np.int64)` did, but also works when `incorrect_amount` is a plain
# Python int (`.astype` exists only on NumPy scalars).
train_incorrect_amount = int(incorrect_amount * 0.8)
test_incorrect_amount = int(incorrect_amount * 0.2)
train_incorrect_amount, test_incorrect_amount

(1940, 485)

In [10]:
# Feature matrix: the five rescaled columns that follow the label column.
X = ndf[['mapping_quality', 'score', 'secondary_score',
         'secondary_score_size', 'identity']]
# Target kept as a one-column DataFrame so `y.correctly_mapped` works below.
y = ndf[['correctly_mapped']]


In [11]:
# Sanity check: y is a one-column frame holding the 0/1 label.
y.head()

Unnamed: 0,correctly_mapped
0,1
1,1
2,1
3,1
4,1


In [11]:
# Shuffle order for the rows. A dedicated, seeded generator makes the
# train/test split (and everything downstream) reproducible across kernel
# restarts without touching NumPy's global random state.
permu_index = np.random.RandomState(42).permutation(X.shape[0])

In [12]:
# Reorder features and labels with the same permutation so rows stay paired.
X, y = X.iloc[permu_index], y.iloc[permu_index]

In [13]:
# Class masks over the shuffled rows, computed once.
is_correct = y.correctly_mapped == 1
is_incorrect = y.correctly_mapped == 0

# Within each class the first `train_incorrect_amount` rows go to training
# and the next `test_incorrect_amount` rows go to testing, so both splits
# contain the same number of rows from each class.
train_rows = slice(None, train_incorrect_amount)
test_rows = slice(train_incorrect_amount,
                  train_incorrect_amount + test_incorrect_amount)

X_train_correct = X[is_correct].iloc[train_rows]
y_train_correct = y[is_correct].iloc[train_rows]
X_train_incorrect = X[is_incorrect].iloc[train_rows]
y_train_incorrect = y[is_incorrect].iloc[train_rows]

X_test_correct = X[is_correct].iloc[test_rows]
y_test_correct = y[is_correct].iloc[test_rows]
X_test_incorrect = X[is_incorrect].iloc[test_rows]
y_test_incorrect = y[is_incorrect].iloc[test_rows]

for part in (X_train_correct, X_train_incorrect, X_test_correct, X_test_incorrect):
    print(part.shape)


(1940, 5)
(1940, 5)
(485, 5)
(485, 5)


In [14]:
# Stack the per-class pieces into plain NumPy arrays: correctly mapped rows
# first, incorrectly mapped rows second.
X_train = np.concatenate((X_train_correct, X_train_incorrect), axis=0)
X_test = np.concatenate((X_test_correct, X_test_incorrect), axis=0)

# Labels are flattened to 1-D vectors.
y_train = np.concatenate((y_train_correct.values.ravel(),
                          y_train_incorrect.values.ravel()))
y_test = np.concatenate((y_test_correct.values.ravel(),
                         y_test_incorrect.values.ravel()))

for arr in (X_train, X_test, y_train, y_test):
    print(arr.shape)

(3880, 5)
(970, 5)
(3880,)
(970,)


In [15]:
# One-hot training targets: column 0 marks correctly mapped (label 1),
# column 1 marks incorrectly mapped (label 0). The array is sized from
# y_train itself so the boolean masks always match its length, even when a
# class supplied fewer rows than `train_incorrect_amount`.
y_train_class = np.zeros((y_train.shape[0], 2))

y_train_class[y_train==1, :] = [1, 0]
y_train_class[y_train==0, :] = [0, 1]

In [16]:
# One-hot test targets: column 0 marks correctly mapped (label 1), column 1
# marks incorrectly mapped (label 0). The array is sized from y_test itself
# so the boolean masks always match its length, even when a class supplied
# fewer rows than `test_incorrect_amount`.
y_test_class = np.zeros((y_test.shape[0], 2))

y_test_class[y_test==1, :] = [1, 0]
y_test_class[y_test==0, :] = [0, 1]

In [17]:
# Display both one-hot target arrays for a visual check.
y_train_class, y_test_class

(array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]]), array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]]))

#### Neural Network Hyperparameters

In [18]:
# Network dimensions: 5 input features, 2 output classes (one-hot targets).
input_layer = 5
output_layer = 2 

# Hidden layer 1: unit count and the dropout rate applied after it.
h_layer1 = 8
dropout1 = 0.25

# Hidden layer 2.
h_layer2 = 16
dropout2 = 0.5

# Hidden layer 3.
h_layer3 = 16
dropout3 = 0.5

# Hidden layer 4.
h_layer4 = 8
dropout4 = 0.5

#### Neural Network Architecture

In [19]:
# Fully connected classifier: four ReLU hidden blocks followed by a softmax
# output. Blocks 1, 2 and 4 are Dense -> BatchNormalization -> Dropout;
# block 3 has no BatchNormalization.
model = Sequential([
    # Block 1 (declares the input shape)
    Dense(h_layer1, activation='relu', input_shape=(input_layer, )),
    BatchNormalization(),
    Dropout(dropout1),

    # Block 2
    Dense(h_layer2, activation='relu'),
    BatchNormalization(),
    Dropout(dropout2),

    # Block 3
    Dense(h_layer3, activation='relu'),
    Dropout(dropout3),

    # Block 4
    Dense(h_layer4, activation='relu'),
    BatchNormalization(),
    Dropout(dropout4),

    # Output: one probability per class
    Dense(output_layer, activation='softmax'),
])

In [20]:
# Print the layer-by-layer shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 48        
_________________________________________________________________
batch_normalization_1 (Batch (None, 8)                 32        
_________________________________________________________________
dropout_1 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                144       
_________________________________________________________________
batch_normalization_2 (Batch (None, 16)                64        
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
__________

In [21]:
# Categorical cross-entropy matches the two-column one-hot targets and the
# softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
# Train on the balanced training split. The balanced test split is passed as
# validation data, so the per-epoch validation metrics are computed on the
# same rows that later cells treat as the test set.
model.fit(X_train, y_train_class, batch_size=128, epochs=30, 
          verbose=1, validation_data=(X_test, y_test_class), shuffle=True)

Train on 3880 samples, validate on 970 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f8e3eb8d978>

In [39]:
# Up to 100 test rows whose label is 0 (incorrectly mapped).
X_sample = X_test[y_test == 0][0:100]

# Each output row has two columns, ordered like the one-hot targets:
# column 0 = correctly mapped, column 1 = incorrectly mapped.
model.predict(X_sample)

array([[2.9785424e-03, 9.9702150e-01],
       [4.0060543e-03, 9.9599397e-01],
       [4.0610153e-03, 9.9593902e-01],
       [2.9079167e-03, 9.9709201e-01],
       [3.6337967e-03, 9.9636614e-01],
       [3.7259595e-03, 9.9627399e-01],
       [3.4123980e-03, 9.9658763e-01],
       [2.6794891e-03, 9.9732047e-01],
       [4.0731775e-03, 9.9592680e-01],
       [4.1578365e-03, 9.9584216e-01],
       [4.5123817e-03, 9.9548763e-01],
       [3.1134614e-03, 9.9688655e-01],
       [3.3317728e-03, 9.9666828e-01],
       [1.5628597e-03, 9.9843711e-01],
       [2.6183671e-03, 9.9738163e-01],
       [2.4493532e-02, 9.7550642e-01],
       [4.1501536e-03, 9.9584985e-01],
       [4.1450365e-03, 9.9585491e-01],
       [2.9354070e-03, 9.9706465e-01],
       [1.3714946e-03, 9.9862850e-01],
       [3.7143626e-03, 9.9628568e-01],
       [3.7166786e-03, 9.9628335e-01],
       [4.1655381e-03, 9.9583447e-01],
       [4.0026437e-03, 9.9599731e-01],
       [3.8617493e-03, 9.9613827e-01],
       [1.4854586e-03, 9.

In [40]:
# Persist the full model under the file name that the load_model() cell
# below reads ('model_2.h5'). It was previously written to 'model_2.model',
# so a fresh run could not reload the model it had just trained.
model.save('model_2.h5')
# Also store the weights on their own.
model.save_weights('model_2-w.weights')

In [12]:
# Reload a saved model from disk, replacing the in-memory `model`.
# NOTE(review): this reads 'model_2.h5'; make sure the save step writes to
# the same file name.
model = load_model('model_2.h5')

In [13]:
# Score every row of the full dataset, using the same five feature columns
# (positions 1-5 of `ndf`) that were used to build X.
y_pred = model.predict(ndf.iloc[:, 1:6].values) 

In [14]:
# Baseline: probability of a correct mapping implied by the raw mapping quality.
orig = decodePhed(df.mapping_quality.values)
# Ground truth: 1 = correctly mapped, 0 = not.
labels = df.correctly_mapped.values
# Network's probability of "correctly mapped" (column 0 of the softmax).
# Copy it: a plain slice is a view, so the in-place thresholding done further
# down would otherwise overwrite y_pred as well.
nn_pred = y_pred[:, 0].copy()

In [59]:
# Mask of rows where the network is confident either way: predicted
# probability of "correctly mapped" is at least 0.90 or at most 0.10.
indx = np.logical_or(y_pred[:, 0] >= 0.90, y_pred[:, 0] <= 0.10)

In [57]:
# Network predictions restricted to the confident rows selected by `indx`.
nn_pred[indx]

array([0.99913365, 0.999366  , 0.99913365, ..., 0.9991448 , 0.99913365,
       0.99950445], dtype=float32)

In [15]:
# Brier score of the network's probabilities (lower is better). Must run
# while nn_pred still holds probabilities, i.e. before it is thresholded.
brier_score_loss(labels, nn_pred)

0.008267885534199824

In [16]:
# Brier score of the mapping-quality baseline, for comparison (lower is
# better). Must run before `orig` is thresholded.
brier_score_loss(labels, orig) 

0.0017856050625612286

In [17]:
# Turn the network's probabilities into hard 0/1 predictions at 0.5.
# Rebinding to a new array (rather than assigning in place) leaves y_pred
# untouched even if nn_pred is a view of it, and makes the cell idempotent.
# The original float dtype is kept.
nn_pred = (nn_pred >= 0.5).astype(nn_pred.dtype)

In [18]:
# Overall accuracy of the thresholded network predictions on the full dataset.
accuracy_score(labels, nn_pred)

0.991138

In [19]:
# Turn the baseline probabilities into hard 0/1 predictions at 0.5.
# A new array is created instead of mutating in place, so re-running the
# cell is harmless; the original float dtype is kept.
orig = (orig >= 0.5).astype(orig.dtype)

In [20]:
# Overall accuracy of the thresholded mapping-quality baseline.
accuracy_score(labels, orig)

0.998043

In [23]:
# Network accuracy restricted to rows whose true label is 0 (incorrectly mapped).
accuracy_score(labels[labels == 0], nn_pred[labels == 0])

0.9928148774302621

In [24]:
# Baseline accuracy restricted to rows whose true label is 0 (incorrectly mapped).
accuracy_score(labels[labels == 0], orig[labels == 0])

0.9065934065934066

In [25]:
# Network accuracy restricted to rows whose true label is 1 (correctly mapped).
accuracy_score(labels[labels == 1], nn_pred[labels == 1])

0.9911340230986514

In [26]:
# Baseline accuracy restricted to rows whose true label is 1 (correctly mapped).
accuracy_score(labels[labels == 1], orig[labels == 1])

0.9982598828829009