In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy import stats
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


In [2]:
# Read the dataset from the CSV file in the working directory and show its
# (rows, columns) shape as the cell output.
df = pd.read_csv(filepath_or_buffer='tcga_dataset.csv')
df.shape

(4257, 49)

In [3]:
# Predictor columns used throughout the notebook: the null check in this
# cell, the per-feature correlations, and the random-forest models all
# index `df` with this list.
cols = [
    'downstream_exon_count',
    'last_exon',
    'PTC_to_start_codon',
    'dist_to_stop_codon',
    'PTC_exon_length',
    'PTC_to_intron',
    'upstream_exon_count',
    'mRNA_half_life',
    '50nt_to_last_EJ',
    'LOEUF',
    'AF',
    '5UTR_length',
    '3UTR_length',
    'Transcript_length',
]

# Number of missing values in each predictor column.
df[cols].isna().sum()

downstream_exon_count      0
last_exon                  0
PTC_to_start_codon         0
dist_to_stop_codon         0
PTC_exon_length            0
PTC_to_intron              0
upstream_exon_count        0
mRNA_half_life             0
50nt_to_last_EJ            0
LOEUF                    130
AF                         0
5UTR_length                0
3UTR_length                0
Transcript_length          0
dtype: int64

In [4]:
# Keep only the rows that have a LOEUF value, then renumber the index from 0
# so positional and label-based access agree again.
df = df.dropna(subset=['LOEUF']).reset_index(drop=True)
df.shape

(4127, 49)

In [5]:
# Preview the first row to see which columns the table carries.
df.head(n=1)

Unnamed: 0,Cancer_type,Cancer_type_count,NMF_cluster,build,chromosome,start,end,Hugo_Symbol,Transcript_ID,HGVSc,...,depth_RNA,VAF_DNA_RNA_ratio,NMD_efficiency,AF,AF Group,LOEUF,LOEUF_bin,5UTR_length,3UTR_length,Transcript_length
0,ACC,12,1,GRCh38,chr12,98546362,98546362,TMPO,ENST00000556029,c.994G>T,...,44.0,0.528592,0.919772,0.0,[0],0.737,3.0,356,370,1722


### Correlation analysis

In [6]:
# Pearson correlation of every predictor in `cols` with NMD efficiency.
# `crr` collects the correlation coefficients and `pvalue` the matching
# p-values; both lists stay index-aligned with `cols`.
crr = []
pvalue = []
for feature in cols:
    # Use distinct names for the column label and the statistic: the previous
    # version rebound the loop variable to the coefficient, which only worked
    # because the column lookup happened before the assignment.
    r, p = stats.pearsonr(df[feature], df['NMD_efficiency'])
    crr.append(r)
    pvalue.append(p)

In [7]:
# Tabulate the per-feature correlation results: one row per entry in `cols`,
# with its coefficient and p-value.
pd.DataFrame(
    {
        'name': cols,
        'corr': crr,
        'p-value': pvalue,
    }
)

Unnamed: 0,name,corr,p-value
0,downstream_exon_count,0.258525,5.2388589999999995e-64
1,last_exon,-0.478237,6.316310999999999e-235
2,PTC_to_start_codon,-0.076805,7.817535e-07
3,dist_to_stop_codon,0.090351,6.083344e-09
4,PTC_exon_length,-0.180007,2.143971e-31
5,PTC_to_intron,-0.171502,1.3035140000000001e-28
6,upstream_exon_count,0.012724,0.4138306
7,mRNA_half_life,0.124783,8.602908e-16
8,50nt_to_last_EJ,-0.174488,1.424919e-29
9,LOEUF,-0.123524,1.672111e-15


### Overall performance using cross-validation

In [8]:
# Mean 5-fold cross-validated score of a random forest predicting
# NMD efficiency from the columns in `cols`. With no `scoring` argument,
# cross_val_score uses the estimator's own `score` method (R^2 for regressors).
#
# random_state pins the forest's bootstrap sampling and feature sub-sampling so
# the reported number is reproducible on a fresh kernel; 0 matches the seed
# this notebook already uses for train_test_split.
regr = RandomForestRegressor(
    n_estimators=10000,
    max_features=3,
    n_jobs=-1,
    random_state=0,
)
scores = cross_val_score(regr, df[cols], df['NMD_efficiency'], cv=5)
np.mean(scores)

0.4373697587087708

### Feature importance from the random forest regressor

In [10]:
# Fit a random forest on an 80/20 train/test split and report its score on
# the held-out 20% (the regressor's `score` method returns R^2).
#
# Both the split and the forest are seeded: previously only the split had a
# random_state, so the fitted model, its test score and the feature
# importances read from it changed on every run.
regr = RandomForestRegressor(
    n_estimators=10000,
    max_features=3,
    n_jobs=-1,
    random_state=0,
)

X = df[cols]
y = df['NMD_efficiency']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

0.45930321592201606

In [11]:
# Pair each training column name with the importance the fitted forest
# assigned to it (scikit-learn's impurity-based `feature_importances_`).
# Rows follow the column order seen during fit; they are not sorted.
importance_df = pd.DataFrame(
    {
        'feature': regr.feature_names_in_,
        'importance': regr.feature_importances_,
    }
)

importance_df

Unnamed: 0,feature,importance
0,downstream_exon_count,0.133673
1,last_exon,0.079472
2,PTC_to_start_codon,0.093779
3,dist_to_stop_codon,0.083282
4,PTC_exon_length,0.0906
5,PTC_to_intron,0.085233
6,upstream_exon_count,0.050848
7,mRNA_half_life,0.060456
8,50nt_to_last_EJ,0.019892
9,LOEUF,0.065102
