In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
%matplotlib inline
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from lbs.dl.encoders import enc_seq_onehot, enc_label, enc_pssm
import pickle
from sklearn.utils import shuffle

In [2]:
# Load the training frame produced by the train/test split step.
# NOTE: unpickling can execute arbitrary code; only load files generated by
# the trusted upstream notebook.
with open('./../3_Train_Test_Split/out/train_data.p', 'rb') as fh:
    df = pickle.load(fh)

In [3]:
def encode(df, max_len=500, pssm_dir='/home/db/psiblast/PSSM_3_IT', rng=None):
    """Encode every sufficiently short sequence of ``df`` into padded arrays.

    For each row the one-hot encoding of the sequence and the PSSM read from
    ``<pssm_dir>/<index>.pssm`` are concatenated along axis 1, and the label
    string is encoded with the same padding. Each entry is shifted inside the
    fixed-length window by a random left padding offset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'sequence'`` and ``'socket_assignment'``;
        the index value is used as entry id and as PSSM file stem.
    max_len : int, optional
        Window length passed as ``pad_length`` to the encoders. Sequences
        longer than this are silently skipped, so the returned arrays may
        hold fewer entries than ``df`` has rows.
    pssm_dir : str, optional
        Directory containing the ``.pssm`` files.
    rng : object with ``randint(a, b)`` (both bounds inclusive), optional
        Source of the padding offsets, e.g. ``random.Random(seed)`` for
        reproducible encodings. Defaults to the global ``random`` module.
        Note that ``numpy.random.RandomState.randint`` excludes the upper
        bound and is therefore not a drop-in replacement.

    Returns
    -------
    tuple
        ``(features, labels, entries)`` where ``features`` and ``labels`` are
        ``np.asarray`` of the per-entry encodings and ``entries`` is the list
        of index values that were kept, in input order.
    """
    rng = random if rng is None else rng
    sequences = []
    labels = []
    entries = []
    for entry, seq, label in zip(df.index, df['sequence'].tolist(), df['socket_assignment'].tolist()):
        if len(seq) > max_len:
            # Does not fit into the fixed-size window: skipped.
            continue
        # One offset per entry, shared by features and labels so they stay aligned.
        pad_left = rng.randint(0, max_len - len(seq))
        onehot = enc_seq_onehot(seq, pad_length=max_len, pad_left=pad_left)
        pssm = enc_pssm('%s/%s.pssm' % (pssm_dir, entry), pad_length=max_len, pad_left=pad_left)
        sequences.append(np.concatenate((onehot, pssm), axis=1))
        labels.append(enc_label(label, pad_length=max_len, pad_left=pad_left))
        entries.append(entry)
    return (np.asarray(sequences), np.asarray(labels), entries)

In [4]:
def decode(pred, enc_seq):
    """Strip padding from per-position predictions.

    Parameters
    ----------
    pred : array-like, indexed as [entry][position][class]
        Model output; the value at class index 1 is kept for every position.
    enc_seq : array-like, indexed as [entry][position][feature]
        The encoded model input. A position whose feature vector is entirely
        zero is treated as padding and dropped.

    Returns
    -------
    numpy.ndarray
        One 1-D array of class-1 scores per entry. When all entries keep the
        same number of positions this is a regular 2-D array; otherwise it is
        a 1-D object array holding the per-entry arrays (building a ragged
        array implicitly raises ``ValueError`` on recent NumPy versions).
    """
    decoded_preds = []
    for entry_pred, entry in zip(pred, enc_seq):
        entry_pred = np.asarray(entry_pred)
        entry = np.asarray(entry)
        # Only positions present in both arrays are considered.
        n_pos = min(len(entry_pred), len(entry))
        # Padding rows are all-zero regardless of the feature width.
        is_residue = np.any(entry[:n_pos] != 0, axis=-1)
        decoded_preds.append(entry_pred[:n_pos][is_residue, 1])

    if len({len(p) for p in decoded_preds}) <= 1:
        return np.asarray(decoded_preds)

    # Ragged result: fill an object array element by element.
    ragged = np.empty(len(decoded_preds), dtype=object)
    for idx, p in enumerate(decoded_preds):
        ragged[idx] = p
    return ragged

In [5]:
from lbs.dl.logger import Logger
from lbs.dl.metrics import total_accuracy
from keras.models import Model
from keras.models import Sequential
from keras.layers import SpatialDropout1D, SpatialDropout2D, MaxPooling1D, MaxPooling2D, Dense, TimeDistributed, Convolution1D, BatchNormalization,Input,merge,LSTM, Dropout,Embedding,Bidirectional
from keras.optimizers import Adam
from keras.regularizers import l2
import keras.backend as K
import numpy as np
def wcc(weights):
    """Return a class-weighted categorical cross-entropy loss for Keras.

    ``weights`` holds one factor per class (last axis of the targets); it is
    wrapped in a backend variable once and captured by the returned closure.
    """
    weights = K.variable(weights)

    def loss(y_true, y_pred):
        # Renormalise so the class scores sum to one along the last axis.
        normalised = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        # Keep values strictly inside (0, 1) so the logarithm stays finite.
        clipped = K.clip(normalised, K.epsilon(), 1 - K.epsilon())
        weighted_log_lik = y_true * K.log(clipped) * weights
        # Negative weighted log-likelihood, summed over the class axis.
        return -K.sum(weighted_log_lik, -1)

    return loss
# Per-class factors handed to wcc() when the model is compiled: the loss term
# of class index 1 is weighted five times as much as that of class index 0.
weights = np.array([1,5])

Using TensorFlow backend.


In [6]:
def Model():
    """Build and compile the per-position two-class convolutional network.

    The stack is two 1-D convolutions (64 filters each, kernel sizes 28 and
    21, 'same' padding, ReLU, L2 kernel regularisation of 1e-4), each followed
    by dropout (0.5, then 0.25), then a Dense(128) layer created without an
    activation argument and a 2-unit softmax layer named 'out'. The input
    shape is (500, 40). The model is compiled with Adam (lr 0.0005), the
    weighted loss from ``wcc(weights)`` and the ``total_accuracy`` metric.

    NOTE(review): this function shadows ``keras.models.Model`` imported
    earlier in the notebook; the name is kept because later cells call it.
    """
    lr = 0.0005
    layers = [
        Convolution1D(64, 28, padding='same', activation='relu',
                      kernel_regularizer=l2(0.0001), input_shape=(500, 40)),
        Dropout(0.5),
        Convolution1D(64, 21, padding='same', activation='relu',
                      kernel_regularizer=l2(0.0001)),
        Dropout(0.25),
        Dense(128),
        Dense(2, activation='softmax', name='out'),
    ]
    model = Sequential(layers)
    model.compile(optimizer=Adam(lr=lr), loss=wcc(weights), metrics=[total_accuracy])
    return model

In [7]:
# Five-fold cross-validation: fold i uses the rows with df['val'] == i for
# validation and all other rows for training. The out-of-fold prediction of
# every validation entry is stored in cv_results as a string of 0/1 labels.
cv_results = {}
for i in range(1, 6):
    print('CV #%s:' % i)
    # Project-specific callback; presumably it writes the best weights of this
    # fold to out_path/out_fn (the load_weights call below reads that file) -
    # TODO confirm against lbs.dl.logger.
    ch1 = Logger(lab_pos=1, out_path='models/final_seq_pssm/', out_fn='final_seq_pssm_%s.h5' % (i), out_log='final_seq_pssm.txt' )
    callbacks_list = [ch1]
    # Fresh, untrained network for every fold.
    model = Model()
    ### Get data
    val_df = df[df['val'] == i]
    train_df = df[df['val'] != i]
    # encode() draws a random left padding per entry, so the encoded arrays
    # differ between runs unless the random module is seeded.
    train_data= encode(train_df)
    valid_data = encode(val_df)
    ### Train
    # The returned history object `h` is not used in the cells shown here.
    h = model.fit(train_data[0], train_data[1],
                  validation_data = (valid_data[0], valid_data[1]),
                  batch_size=64,
                  epochs=50,
                  verbose=0, callbacks=callbacks_list)
    ### Predict
    # Replace the end-of-training weights with the checkpoint stored on disk
    # for this fold before predicting the validation entries.
    model.load_weights('models/final_seq_pssm/final_seq_pssm_%s.h5' % i)
    preds = model.predict(valid_data[0])
    ### Decode predictions and add results to dict
    decoded_preds = decode(preds, valid_data[0])
    for entry, decoded_pred in zip(valid_data[2], decoded_preds):
        # Round each class-1 score to 0 or 1 and join into one label string.
        assignment  = ''.join(str(int(label)) for label in np.rint(decoded_pred))
        cv_results[entry] = assignment
# NOTE: `model` (the last fold's network object) stays in the namespace and is
# reused by later cells, which only swap its weights.

CV #1:
Best F1 score: 0.404 (prec: 0.384, sens: 0.426)
Best F1 score: 0.412 (prec: 0.300, sens: 0.657)
Best F1 score: 0.441 (prec: 0.320, sens: 0.712)
Best F1 score: 0.488 (prec: 0.455, sens: 0.526)
Best F1 score: 0.495 (prec: 0.434, sens: 0.575)
Best F1 score: 0.505 (prec: 0.499, sens: 0.512)
Best F1 score: 0.508 (prec: 0.434, sens: 0.613)
Best F1 score: 0.511 (prec: 0.471, sens: 0.558)
Best F1 score: 0.511 (prec: 0.448, sens: 0.594)
Best F1 score: 0.513 (prec: 0.480, sens: 0.551)
Best F1 score: 0.517 (prec: 0.472, sens: 0.570)
CV #2:
Best F1 score: 0.409 (prec: 0.334, sens: 0.526)
Best F1 score: 0.494 (prec: 0.442, sens: 0.561)
Best F1 score: 0.525 (prec: 0.452, sens: 0.627)
Best F1 score: 0.535 (prec: 0.464, sens: 0.631)
Best F1 score: 0.536 (prec: 0.447, sens: 0.669)
Best F1 score: 0.537 (prec: 0.509, sens: 0.569)
Best F1 score: 0.538 (prec: 0.503, sens: 0.578)
Best F1 score: 0.542 (prec: 0.508, sens: 0.581)
CV #3:
Best F1 score: 0.346 (prec: 0.543, sens: 0.254)
Best F1 score: 0.44

In [8]:
# Sanity check: number of entries with a cross-validation prediction next to
# the shape of the training frame. The first number is smaller than the row
# count if encode() skipped any sequence for being too long.
len(cv_results), df.shape

(10438, (10438, 25))

In [9]:
# Turn the {entry: label string} dict into a one-column frame indexed by entry.
df_deepcoil = pd.DataFrame.from_dict(cv_results, orient='index')
df_deepcoil.columns = ['deepcoil_assignment']
# Drop a prediction column left over from a previous execution of this cell
# so that re-running it does not append a duplicate 'deepcoil_assignment'.
df = pd.concat([df.drop(columns=['deepcoil_assignment'], errors='ignore'), df_deepcoil], axis=1, sort=True)

In [10]:
# Create the output directory if needed so the writes below do not fail on a
# fresh checkout.
os.makedirs('out/pssm', exist_ok=True)
# Persist the training frame with the cross-validation predictions attached.
df.to_pickle('out/pssm/cv_results.p')
df.to_csv('out/pssm/cv_results.csv')

In [11]:
# Load the held-out test frame produced by the train/test split step.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('./../3_Train_Test_Split/out/test_data.p', 'rb') as fh:
    df_test = pickle.load(fh)

In [12]:
# Encode the test frame once; encode() returns (features, labels, entry ids).
# All fold models below are evaluated on this same encoding.
test_data = encode(df_test)

In [13]:
# Predict the test set with each of the five fold checkpoints and stack the
# decoded class-1 scores per entry (one row per fold).
# NOTE: `model` is the network object left in the namespace by the
# cross-validation cell; only its weights are replaced here.
test_ensemble_results = {}
for i in range(1, 6):
    # Same checkpoint path as used elsewhere in this notebook (a stray '//'
    # in the original path was removed).
    model.load_weights('models/final_seq_pssm/final_seq_pssm_%s.h5' % i)
    preds = model.predict(test_data[0])
    decoded_preds = decode(preds, test_data[0])
    for decoded_pred, entry in zip(decoded_preds, test_data[2]):
        if i == 1:
            # First fold starts the stack for this entry.
            test_ensemble_results[entry] = decoded_pred
        else:
            test_ensemble_results[entry] = np.vstack((test_ensemble_results[entry], decoded_pred))

In [14]:
# Collapse the stacked fold predictions of every entry into one label string.
test_avg_results = {}
for key, value in test_ensemble_results.items():
    # Mean over axis 0, i.e. across the stacked per-fold score vectors.
    avg_results = np.average(value, axis=0)
    # np.rint rounds to the nearest integer (exact halves go to the even
    # value, so 0.5 becomes 0); each position contributes one character.
    assignment = ''.join(map(str, map(int, np.rint(avg_results))))
    test_avg_results[key] = assignment

In [15]:
# Sanity check: number of entries with an ensemble prediction next to the
# shape of the test frame.
len(test_avg_results), df_test.shape

(1193, (1193, 24))

In [16]:
# Turn the {entry: label string} dict into a one-column frame indexed by entry.
df_deepcoil = pd.DataFrame.from_dict(test_avg_results, orient='index')
df_deepcoil.columns = ['deepcoil_assignment']
# Drop a prediction column left over from a previous execution of this cell
# so that re-running it does not append a duplicate 'deepcoil_assignment'.
df_test = pd.concat([df_test.drop(columns=['deepcoil_assignment'], errors='ignore'), df_deepcoil], axis=1, sort=True)

In [17]:
# Create the output directory if needed so the writes below do not fail on a
# fresh checkout.
os.makedirs('out/pssm', exist_ok=True)
# Persist the test frame with the ensemble predictions attached.
df_test.to_pickle('out/pssm/test_results.p')
df_test.to_csv('out/pssm/test_results.csv')

#### Li (2016) benchmark dataset

In [18]:
# Load the Li (2016) benchmark frame; this replaces the previous df_test.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('./../3_Train_Test_Split/out/li2016.p', 'rb') as fh:
    df_test = pickle.load(fh)

In [19]:
# Encode the benchmark frame once; encode() returns (features, labels, entry
# ids). All fold models below are evaluated on this same encoding.
test_data = encode(df_test)

In [20]:
# Score the benchmark set with each of the five fold checkpoints; per entry,
# the decoded class-1 scores of the folds are stacked row by row.
# NOTE: `model` is the network object left in the namespace by the
# cross-validation cell; only its weights are replaced here.
test_ensemble_results = {}
for i in range(1, 6):
    model.load_weights('models/final_seq_pssm/final_seq_pssm_%s.h5' % i)
    preds = model.predict(test_data[0])
    decoded_preds = decode(preds, test_data[0])
    for entry, decoded_pred in zip(test_data[2], decoded_preds):
        # Fold 1 seeds the entry; later folds append one more row.
        test_ensemble_results[entry] = (
            decoded_pred
            if i == 1
            else np.vstack((test_ensemble_results[entry], decoded_pred))
        )

In [21]:
# Collapse the stacked fold predictions of every entry into one label string.
test_avg_results = {}
for key, value in test_ensemble_results.items():
    # Mean over axis 0, i.e. across the stacked per-fold score vectors.
    avg_results = np.average(value, axis=0)
    # np.rint rounds to the nearest integer (exact halves go to the even
    # value, so 0.5 becomes 0); each position contributes one character.
    assignment = ''.join(map(str, map(int, np.rint(avg_results))))
    test_avg_results[key] = assignment

In [22]:
# Sanity check: number of entries with an ensemble prediction next to the
# shape of the benchmark frame.
len(test_avg_results), df_test.shape

(518, (518, 24))

In [23]:
# Turn the {entry: label string} dict into a one-column frame indexed by entry.
df_deepcoil = pd.DataFrame.from_dict(test_avg_results, orient='index')
df_deepcoil.columns = ['deepcoil_assignment']
# Drop a prediction column left over from a previous execution of this cell
# so that re-running it does not append a duplicate 'deepcoil_assignment'.
df_test = pd.concat([df_test.drop(columns=['deepcoil_assignment'], errors='ignore'), df_deepcoil], axis=1, sort=True)

In [24]:
# Create the output directory if needed so the writes below do not fail on a
# fresh checkout.
os.makedirs('out/pssm', exist_ok=True)
# Persist the benchmark frame with the ensemble predictions attached.
df_test.to_pickle('out/pssm/li2016_results.p')
df_test.to_csv('out/pssm/li2016_results.csv')