In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("..") 

from ndac.data_processing import encode_sequence
from ndac.predict import train_clstm
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Read previously trained model

In [2]:
# Load the test table from CSV, using the first CSV column as the row index.
# Later cells read its 'aa_seq', 'expressed' and 'uniprot_id' columns.
DF_test = pd.read_csv('DF_test.csv', index_col=0)  # filtered in sastry_preprocessing.ipynb
# Load the saved Keras model from its HDF5 file.
aa_model = load_model('aa_model.h5')  # trained in ndac_workflow.ipynb

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


## Predict expression for unseen PrESTs

In [3]:
# Encode the amino-acid sequences and their expression labels for the model.
# NOTE(review): max_length=200 presumably pads/truncates each sequence to 200
# residues -- confirm against ndac.data_processing.encode_sequence.
# y_test is not used again in the cells shown here.
X_test, y_test = encode_sequence(DF_test['aa_seq'], DF_test['expressed'],
                                 max_length=200)
# Model output for the encoded test sequences (treated below as the predicted
# probability of expression).
pred = aa_model.predict_proba(X_test)
# Store the predictions next to the inputs; this mutates DF_test in place, so
# every later cell sees the extra 'pred' column.
DF_test.loc[:, 'pred'] = pred

We now have a predicted expression probability for each PrEST. Our knowledge of its true expression ('expressed') will be used for a retrospective analysis to estimate experimental savings.

In [4]:
# Show every PrEST of one example protein (UniProt ID O43506), ordered from the
# highest to the lowest predicted probability, next to the measured outcome.
(
    DF_test
    .loc[DF_test.uniprot_id == 'O43506', ['uniprot_id', 'aa_seq', 'pred', 'expressed']]
    .sort_values('pred', ascending=False)
)

Unnamed: 0,uniprot_id,aa_seq,pred,expressed
33072,O43506,DAFYHPLEVDVILTGIDIWTASNPLPTSGDLDNVLEDFSIWKNYNL...,0.896197,1.0
33074,O43506,MVQLHQDTDPQIPKGQPCTLNSSEGGARPAVPHTLFSSALDRWLHN...,0.321429,1.0
5,O43506,FVGWWTHQRFVELVVVVDNIRYLFSQSNATTVQHEVFNVVNIVDSF...,0.160074,0.0
33073,O43506,IAHQMELQLSYNFTLKQSSFVGWWTHQRFVELVVVVDNIRYLFSQS...,0.062105,0.0


### Original Data Statistics

In [5]:
# Size of the full test set before any model-guided selection:
# one row per experiment, possibly several rows per protein.
print('Original total number of experiments:', DF_test.shape[0])
# dropna=False keeps the count identical to len(unique()), which counts a missing ID once.
print('Original total number of proteins:', DF_test.uniprot_id.nunique(dropna=False))

Original total number of experiments: 11562
Original total number of proteins: 4759


In [6]:
# Baseline statistics of the test set: how many experiments passed (expressed == 1.0),
# what fraction of all experiments that is, and how many distinct proteins were
# covered by at least one passing experiment.
print('Original Number of passed experiments:', len(DF_test[DF_test.expressed == 1.0]))
print('Original Pass rate: %.2f%%' % (np.true_divide(len(DF_test[DF_test.expressed == 1.0]), len(DF_test)) * 100))
# unique() over the passing rows yields every protein with one OR MORE passing
# experiments, hence ">=1" rather than ">1".
print('Number of proteins with >=1 high expression:', len(DF_test[DF_test.expressed == 1.0].uniprot_id.unique()))

Original Number of passed experiments: 5684
Original Pass rate: 49.16%
Number of proteins with >1 high expression: 3587


### Grab only the top predicted PrESTs for each protein

In [7]:
# Retrospective simulation of model-guided experiment selection.
#
# In each round, the highest-scoring untested PrEST of every still-unresolved
# protein is "run". A protein is resolved as soon as one of its selected PrESTs
# is truly expressed; its remaining PrESTs are then never tested. The number of
# selected rows is compared with the full test set to estimate saved experiments.

n = 4  # number of selection rounds, i.e. at most n PrESTs are tried per protein
np.random.seed(0)

# Accumulates every PrEST selected so far across all rounds.
output_df = pd.DataFrame(columns=['prest_id','uniprot_id','expressed','pred','aa_seq'])
# PrESTs still available for selection; a copy, so DF_test itself is not modified.
remaining_df = DF_test.copy()

for i in range(n):
    print('Iteration', i)

    # Sort by protein, then by descending prediction, and keep the first row per
    # protein: the top-predicted remaining PrEST of each protein.
    new_output_df = (
        remaining_df
        .sort_values(['uniprot_id', 'pred'], ascending=[True, False])
        .drop_duplicates('uniprot_id')
    )
    output_df = pd.concat([output_df, new_output_df])

    # NOTE(review): pred_pos_proteins is not used within this cell -- remove it
    # if no later cell reads it.
    pred_pos_proteins = set(output_df[output_df.pred > 0.5].uniprot_id)
    # Proteins for which at least one selected PrEST was truly expressed.
    true_pos_proteins = set(output_df[output_df.expressed == 1.0].uniprot_id)
    print('Total number of proposed experiments:', len(output_df))
    print('Total number of expressed proteins:', len(true_pos_proteins))
    # Resolved proteins per selected experiment (not per-experiment success).
    print('Overall pass rate:', np.true_divide(len(true_pos_proteins), len(output_df)))

    # Prepare for next iteration: discard the rows just selected, then every
    # remaining PrEST of a protein that has already been resolved.
    remaining_df = remaining_df.drop(new_output_df.index)
    remaining_df = remaining_df[~remaining_df.uniprot_id.isin(true_pos_proteins)]

    # Blank line between rounds.
    print()
print('Percent saved experiments:', (1 - np.true_divide(len(output_df), len(DF_test))) * 100, '%')

Iteration 0
Total number of proposed experiments: 4759
Total number of expressed proteins: 3079
Overall pass rate: 0.6469846606429922
Iteration 1
Total number of proposed experiments: 6439
Total number of expressed proteins: 3543
Overall pass rate: 0.5502407206087901
Iteration 2
Total number of proposed experiments: 6816
Total number of expressed proteins: 3586
Overall pass rate: 0.5261150234741784
Iteration 3
Total number of proposed experiments: 6896
Total number of expressed proteins: 3587
Overall pass rate: 0.5201566125290024
Percent saved experiments: 40.3563397336101 %
