### NMD efficiency predictor based on RandomForestRegressor

In [1]:
%matplotlib inline
import pickle
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

In [9]:
'''
Description of columns
        downstream_exon_count (as Downstream exon count): The number of exons downstream of the PTC
        last_exon (as Last exon): 1 if the PTC is located on the last exon; 0 otherwise
        PTC_to_start_codon (as Dist PTC to start codon): The distance between the PTC and the start codon
        dist_to_stop_codon (as Dist PTC to normal stop codon): The distance between the PTC and normal stop codon
        PTC_exon_length (as PTC-containing exon length): The length of the PTC-containing exon
        PTC_to_intron (as Dist PTC to downstream EJ): The distance between the PTC and downstream exon junction
        upstream_exon_count (as Upstream exon count): The number of exons upstream of the PTC
        mRNA_half_life (as mRNA half-life): The half-life of mRNA
        50nt_to_last_EJ (as Last 50nt penultimate exon): 1 if the PTC is located within the last 50 nt of the penultimate exon; 0 otherwise
        LOEUF (as LOEUF): Gene-level degree of mutational constraints
        AF (as Allele frequency): Allele frequency in the gnomAD database
        UTR5_length (as 5'UTR length): The length of the 5'UTR
        UTR3_length (as 3'UTR length): The length of the 3'UTR
        Transcript_length (as Transcript length): The length of the transcript
'''

# Load the pre-trained regressor used by the prediction helpers below.
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load './nmd_eff_predictor' when it comes from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open('./nmd_eff_predictor', 'rb') as model_file:
    regr = pickle.load(model_file)

def predict_NMD_efficiency(downstream_exon_count: int, last_exon: int, PTC_to_start_codon: int,
                           dist_to_stop_codon: int, PTC_exon_length: int, PTC_to_intron: int,
                           upstream_exon_count: int, mRNA_half_life: float, c50nt_to_last_EJ: int,
                           LOEUF: float, AF: float, UTR5_length: int, UTR3_length: int,
                           Transcript_length: int) -> float:
    """Predict the NMD efficiency for a single set of 14 feature values.

    The arguments are packed, in signature order, into a one-row input that
    is handed to the module-level regressor ``regr``.

    Returns:
        The first (and only) element of ``regr.predict`` for that row.

    Raises:
        AssertionError: with message 'Replace NaN value' if any argument is NaN.
    """
    features = [
        downstream_exon_count, last_exon, PTC_to_start_codon, dist_to_stop_codon,
        PTC_exon_length, PTC_to_intron, upstream_exon_count, mRNA_half_life,
        c50nt_to_last_EJ, LOEUF, AF, UTR5_length, UTR3_length, Transcript_length,
    ]

    # Count NaN entries across every feature before calling the model.
    nan_count = 0
    for feature in features:
        nan_count += np.isnan(feature)
    assert nan_count == 0, 'Replace NaN value'

    # predict() takes a 2-D input, so wrap the single row and unwrap the result.
    return regr.predict([features])[0]


def predict_NMD_efficiency_df(examples):
    """Predict the NMD efficiency for several instances at once.

    ``examples`` is converted to a ``pandas.DataFrame`` and passed as a whole
    to the module-level regressor ``regr``.

    Returns:
        Whatever ``regr.predict`` returns for the frame.

    Raises:
        AssertionError: with message 'Replace NaN value' if any cell is null.
    """
    frame = pd.DataFrame(examples)

    # Reject the input if any cell is missing.
    has_missing = frame.isnull().values.any()
    assert not has_missing, 'Replace NaN value'

    return regr.predict(frame)



### Predict the NMD efficiency with a single instance

In [4]:
# Single-instance example; keyword arguments make each feature explicit.
predict_NMD_efficiency(
    downstream_exon_count=3,
    last_exon=0,
    PTC_to_start_codon=2625,
    dist_to_stop_codon=963,
    PTC_exon_length=1399,
    PTC_to_intron=586,
    upstream_exon_count=15,
    mRNA_half_life=332.338,
    c50nt_to_last_EJ=0,
    LOEUF=0.25,
    AF=0.000016,
    UTR5_length=125,
    UTR3_length=964,
    Transcript_length=3714,
)


0.8102955147698261

### Predict the NMD efficiency with a dataset containing multiple instances

In [10]:
# One row per instance; values follow the parameter order of predict_NMD_efficiency.
example_dataset = [
    [3, 0, 2625, 963, 1399, 586, 15, 332.338457, 0, 0.25, 0.000016, 125, 964, 3714],
    [4, 0, 2337, 579, 86, 29, 20, 333.74517, 0, 0.117, 0.000015, 123, 580, 3040],
]

predict_NMD_efficiency_df(example_dataset)



   0   1     2    3     4    5   6           7   8      9         10   11  \
0   3   0  2625  963  1399  586  15  332.338457   0  0.250  0.000016  125   
1   4   0  2337  579    86   29  20  333.745170   0  0.117  0.000015  123   

    12    13  
0  964  3714  
1  580  3040  


array([0.81029551, 1.69715546])