### Validation using an independent dataset (MMRF-TARGET)

In [1]:
import pickle
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read the MMRF-TARGET validation table from the notebook's working directory
# and display its (rows, columns) shape as the cell output.
df = pd.read_csv('./MMRF_TARGET_dataset.csv')
df.shape

(556, 33)

In [3]:
# Feature columns passed to the predictor. The order here is the order in
# which the columns are selected from `df` and handed to `regr.predict`.
cols = [
    'downstream_exon_count',
    'last_exon',
    'PTC_to_start_codon',
    'dist_to_stop_codon',
    'PTC_exon_length',
    'PTC_to_intron',
    'upstream_exon_count',
    'mRNA_half_life',
    '50nt_to_last_EJ',
    'LOEUF',
    'AF',
    '5UTR_length',
    '3UTR_length',
    'Transcript_length',
]

# Preview the first row restricted to the feature columns.
df[cols].head(1)

Unnamed: 0,downstream_exon_count,last_exon,PTC_to_start_codon,dist_to_stop_codon,PTC_exon_length,PTC_to_intron,upstream_exon_count,mRNA_half_life,50nt_to_last_EJ,LOEUF,AF,5UTR_length,3UTR_length,Transcript_length
0,19,0,4189,4718,990,31,10,315.133085,0,0.557,0.0,349,4719,9257


In [4]:
# Count missing values per feature column (isna is the pandas alias of isnull).
df[cols].isna().sum()

downstream_exon_count     0
last_exon                 0
PTC_to_start_codon        0
dist_to_stop_codon        0
PTC_exon_length           0
PTC_to_intron             0
upstream_exon_count       0
mRNA_half_life            0
50nt_to_last_EJ           0
LOEUF                    74
AF                        0
5UTR_length               0
3UTR_length               0
Transcript_length         0
dtype: int64

In [5]:
# Drop the rows that have no LOEUF value (the only feature column with
# missing entries in the null count above) and renumber the index from 0.
df = df.dropna(subset=['LOEUF']).reset_index(drop=True)
df.shape

(482, 33)

In [6]:
# Load the pickled predictor used by the functions and cells below.
# SECURITY NOTE: unpickling can execute arbitrary code, so only load model
# files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading
# (the bare `pickle.load(open(...))` form left it open until garbage collection).
with open('./nmd_eff_predictor', 'rb') as model_file:
    regr = pickle.load(model_file)

def predict_NMD_efficiency(
    downstream_exon_count: int,
    last_exon: int,
    PTC_to_start_codon: int,
    dist_to_stop_codon: int,
    PTC_exon_length: int,
    PTC_to_intron: int,
    upstream_exon_count: int,
    mRNA_half_life: float,
    c50nt_to_last_EJ: int,
    LOEUF: float,
    AF: float,
    UTR5_length: int,
    UTR3_length: int,
    Transcript_length: int,
) -> float:
    """Return the model's prediction for one set of feature values.

    The fourteen arguments are packed, in signature order, into a single
    feature row and passed to the module-level ``regr`` predictor.

    Returns:
        The first (and only) element of ``regr.predict`` for that row.

    Raises:
        AssertionError: if any argument is NaN ('Replace NaN value').
    """
    feature_row = [
        downstream_exon_count, last_exon, PTC_to_start_codon,
        dist_to_stop_codon, PTC_exon_length, PTC_to_intron, upstream_exon_count,
        mRNA_half_life, c50nt_to_last_EJ, LOEUF, AF,
        UTR5_length, UTR3_length, Transcript_length,
    ]

    # Check every feature up front (no short-circuit) before asserting.
    nan_flags = [np.isnan(feature) for feature in feature_row]
    assert not any(nan_flags), 'Replace NaN value'

    # predict() takes a 2-D input: wrap the single row, then unwrap the single result.
    return regr.predict([feature_row])[0]


def predict_NMD_efficiency_df(examples):
    """Return the model's predictions for a batch of examples.

    ``examples`` is converted with ``pd.DataFrame(examples)`` and the resulting
    frame is passed unchanged to the module-level ``regr`` predictor, so its
    columns must already be the ones the model expects.

    Returns:
        Whatever ``regr.predict`` returns for the frame.

    Raises:
        AssertionError: if the frame contains any missing value
            ('Replace NaN value').
    """
    features = pd.DataFrame(examples)

    # Refuse to predict on incomplete rows.
    has_missing = features.isnull().values.any()
    assert not has_missing, 'Replace NaN value'

    return regr.predict(features)

In [7]:
# Predict on the filtered validation rows using the feature columns in `cols`
# order, then compare the predictions with the observed `NMD_efficiency`
# column via Pearson correlation.
pred = regr.predict(df[cols])
# c: correlation coefficient, p: its p-value
c, p = stats.pearsonr(pred, df['NMD_efficiency'])
print("Pearson R :" , c)
print("P-value : " , p)

Pearson R : 0.46692101222307253
P-value :  1.8041235831646276e-27
