In [47]:
import os
import json

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.pipeline import FeatureUnion


import sys

# Root directory of the project data. The original cluster location remains the
# default; set the METASTATIC_POTENTIAL_DATA_PATH environment variable to run
# the notebook against a copy of the data elsewhere without editing this cell.
data_path = os.environ.get('METASTATIC_POTENTIAL_DATA_PATH',
                           '/home/hmbaghda/orcd/pool/metastatic_potential/')
# Seed value kept with the other notebook-level settings.
random_state = 42

# custom classes
# '../../' is inserted at position 1 of the module search path before importing
# `utils` (presumably the directory that holds utils.py -- confirm; the entry is
# relative, so it depends on the notebook's working directory).
sys.path.insert(1, '../../')
from utils import ProteinFeatureSelector, RNAFeatureSelector, MeanCenterer, ModalitySelector

Load files:

In [48]:
# Every table read in this cell sits in the `processed` sub-directory.
processed_dir = os.path.join(data_path, 'processed')

# Joint expression table; `expr_joint` is an independent copy of it.
X = pd.read_csv(os.path.join(processed_dir, 'expr_joint.csv'), index_col=0)
expr_joint = X.copy()

# Target vector: the `mean` column of the metastatic potential table, flattened.
mp_table = pd.read_csv(os.path.join(processed_dir, 'metastatic_potential_joint.csv'),
                       index_col=0)
mp_joint = mp_table['mean']
y = mp_joint.values.ravel()

# Single-modality tables; their column labels define which columns of the
# joint table belong to which modality.
expr_protein = pd.read_csv(os.path.join(processed_dir, 'expr_protein.csv'), index_col=0)
expr_rna = pd.read_csv(os.path.join(processed_dir, 'expr.csv'), index_col=0)
protein_cols, rna_cols = expr_protein.columns, expr_rna.columns

# with open("protein_cols.txt", "w") as f: f.writelines(f"{item}\n" for item in protein_cols)
# with open("rna_cols.txt", "w") as f: f.writelines(f"{item}\n" for item in rna_cols)

# Per-modality value arrays sliced out of the joint table.
X_protein, X_rna = X[protein_cols].values, X[rna_cols].values


Mapping of feature names between protein and RNA:

In [49]:
################## get mapper ##################
# Lookup keyed by UniProt ID; the mapping loop below reads the gene symbol
# for each protein feature from it.
uid_mapper_path = os.path.join(data_path, 'processed', 'uniprot_mapper.json')
with open(uid_mapper_path, "r") as mapper_fh:
    uid_mapper = json.load(mapper_fh)
    
# manually mapped some that failed to map using uniprot ID
# Keys are UniProt IDs, values are gene symbols. This table is only consulted
# when `uid_mapper` holds a missing value for the ID AND the fallback name
# parsed from the feature label starts with a digit.
manual_map = {'Q9TNN7': 'HLA-C',
'P16189': 'HLA-A',
'P30456': 'HLA-A',
'P30443': 'HLA-A',
'P05534': 'HLA-A',
'P18462': 'HLA-A',
'P01892': 'HLA-A',
'P13746': 'HLA-A',
'P01891': 'HLA-A',
'P30483': 'HLA-B',
'P30484': 'HLA-B',
'P03989': 'HLA-B',
'P30460': 'HLA-B',
'P30461': 'HLA-B',
'Q95365': 'HLA-B',
'P16188': 'HLA-A',
'Q95604': 'HLA-C',
'Q07000': 'HLA-C',
'P30499': 'HLA-C',
'P30501': 'HLA-C',
'P30504': 'HLA-C',
'Q95IE3': 'HLA-DRB1',
'P04229': 'HLA-DRB1',
'P20039': 'HLA-DRB1',
'P13760': 'HLA-DRB1',
'Q5Y7A7': 'HLA-DRB1',
'Q9GIY3': 'HLA-DRB1',
'Q9TQE0': 'HLA-DRB1',
'Q30134': 'HLA-DRB1'}
    
################## map each feature to its gene symbol ##################
def _gene_symbol_for(protein_id):
    """Return the gene symbol for one protein feature label.

    The label is split on '|': its second field, truncated at the first '-',
    is the UniProt ID looked up in `uid_mapper`. If the mapper holds a missing
    value, the last '|' field up to '_HUMAN' is used instead, unless that name
    starts with a digit, in which case `manual_map` supplies the symbol.
    Missing keys raise KeyError.
    """
    accession = protein_id.split('|')[1].split('-')[0]
    mapped_symbol = uid_mapper[accession]
    if not pd.isna(mapped_symbol):
        return mapped_symbol

    entry_name = protein_id.split('|')[-1].split('_HUMAN')[0]
    return manual_map[accession] if entry_name[0].isdigit() else entry_name


# one gene symbol per protein column, in column order
protein_names = [_gene_symbol_for(label) for label in protein_cols]
    
    
# gene symbol of an RNA feature = everything before the first ' (' in its label
rna_names = [label.partition(' (')[0] for label in rna_cols]

# maps protein feature name --> gene symbol
protein_map = {feature: symbol for feature, symbol in zip(protein_cols, protein_names)}
# maps rna feature name --> gene symbol
rna_map = {feature: symbol for feature, symbol in zip(rna_cols, rna_names)}

# maps rna feature name --> protein feature name (including one to many mappings)
# Group the protein features by gene symbol once, so each RNA feature becomes a
# single dict lookup instead of a scan over every protein feature.
proteins_by_symbol = {}
for pf, pv in protein_map.items():
    proteins_by_symbol.setdefault(pv, []).append(pf)

# RNA features without a protein counterpart map to an empty list; list(...)
# gives every RNA feature its own list even when two share a gene symbol.
feature_map = {rf: list(proteins_by_symbol.get(rn, [])) for rf, rn in rna_map.items()}

Initialize and fit the consensus joint linear SVR model:

In [50]:
# Previously identified hyperparameters

# regularisation strength and epsilon handed to the linear SVR
C_best = 5.8358063592204745
epsilon_best = 0.5015578182902036

# `n_features` handed to the RNA and protein feature selectors
n_features_best_rna = 16371
n_features_best_protein = 1475

In [51]:
# model initialization
# Each modality has its own branch with three steps: modality selection,
# feature selection, mean centering.

# protein-specific pipeline
protein_steps = [
    ("select_protein", ModalitySelector(modality="protein")),
    ("feature_selection_protein",
     ProteinFeatureSelector(method="top_residuals", n_features=n_features_best_protein)),
    ("mean_centering_protein", MeanCenterer()),
]
protein_pipeline = Pipeline(protein_steps)

# RNA-specific pipeline
rna_steps = [
    ("select_rna", ModalitySelector(modality="rna")),
    ("feature_selection_rna",
     RNAFeatureSelector(method="top_residuals", n_features=n_features_best_rna)),
    ("mean_centering_rna", MeanCenterer()),
]
rna_pipeline = Pipeline(rna_steps)

# the two branches are combined by a FeatureUnion, protein first
combined_pipeline = FeatureUnion([
    ("protein_pipeline", protein_pipeline),
    ("rna_pipeline", rna_pipeline),
])

# linear SVR with the tuned C / epsilon on top of the combined features
svr_model = SVR(kernel='linear', C=C_best, epsilon=epsilon_best)
best_steps = [
    ("feature_processing", combined_pipeline),
    ("model", svr_model),
]
best_pipeline = Pipeline(best_steps)


In [52]:
# model fitting -- on full dataset
# The pipeline is given both modalities together as a (protein, rna) tuple.
X = (X_protein, X_rna)
best_pipeline.fit(X, y)

# sanity check of same fitted coefs
model_coefs = pd.read_csv(os.path.join(data_path, 'interim', 'joint_features.csv'),
                          index_col = 0)
saved_coefs = model_coefs['SVM coefficient'].values
fitted_coefs = best_pipeline.named_steps['model'].coef_.flatten()

# Check the shapes first: np.allclose broadcasts its arguments, so a length
# mismatch would otherwise either raise an unrelated ValueError or, for
# broadcastable shapes, pass silently.
assert saved_coefs.shape == fitted_coefs.shape, (
    'Inconsistency between HB fit and NM fit: {} saved vs {} fitted coefficients'.format(
        saved_coefs.shape, fitted_coefs.shape))
assert np.allclose(saved_coefs, fitted_coefs), 'Inconsistency between HB fit and NM fit'

Load the interactions identified by the genetic algorithm:

In [56]:
# interaction table read from joint_ols_perm_interactions.csv
interactions_path = os.path.join(data_path, 'processed', 'joint_ols_perm_interactions.csv')
hc = pd.read_csv(interactions_path, index_col=0)

# keep only the rows whose `bh_fdr_perm` is at most 0.2
passes_fdr = hc['bh_fdr_perm'] <= 0.2
hc = hc.loc[passes_fdr].copy()
hc.head()

Unnamed: 0,feature_1,feature_2,coef_perm,pval_perm,bh_fdr_perm,feature_type,coef_OLS,t_statistic_OLS,feature_1_rank,feature_1_gene_name,feature_2_rank,feature_2_gene_name
0,WDR36 (134430),FYCO1 (79443),-0.002153,0.053,0.151923,all,-0.170331,2.92677,17276,WDR36,5132,FYCO1
1,STAT5B (6777),SP1 (6667),-0.002279,0.018,0.1125,cancer_cell_map,-0.182883,3.056964,12420,STAT5B,7582,SP1
3,SREBF2 (6721),ZC3H10 (84872),-0.001728,0.063,0.151923,all,-0.194467,3.142241,15968,SREBF2,11608,ZC3H10
4,TBC1D5 (9779),SLC22A13 (9390),-0.000436,0.062,0.151923,all,-0.84243,2.802959,5020,TBC1D5,16550,SLC22A13
5,PLXNA3 (55558),TBC1D5 (9779),-0.003814,0.13,0.193243,all,-0.07208,2.703746,1092,PLXNA3,5020,TBC1D5


In [63]:
# every distinct feature that appears on either side of a retained interaction
interacting = set(hc.feature_1.tolist()) | set(hc.feature_2.tolist())
feature_names = sorted(interacting)

# count how many of them are RNA feature names and how many are protein ones
n_interacting = len(feature_names)
n_from_rna = len(interacting.intersection(rna_map))
n_from_protein = len(interacting.intersection(protein_map))

print('{} of {} interacting features are from RNA'.format(n_from_rna, n_interacting))
print('{} of {} interacting features are from protein'.format(n_from_protein, n_interacting))
                                                           

39 of 39 interacting features are from RNA
0 of 39 interacting features are from protein


In [92]:
# get the corresponding protein interacting pairs if they were included in the feature selection
# this could serve as a comparison to the interaction effects of the transcriptomics

# `value in series` tests the Series *index*, not its values, so membership is
# checked against a set built from the values of the `feature_name` column.
selected_features = set(model_coefs.feature_name)

for row in hc.itertuples(index=True):
    r1, r2 = row.feature_1, row.feature_2 # selected interaction features

    p1, p2 = feature_map[r1], feature_map[r2] # corresponding protein features

    # check if the corresponding protein pair was selected in the model:
    # each side needs at least one of its protein features among the model features
    first_selected = any(p_ in selected_features for p_ in p1)
    second_selected = any(p_ in selected_features for p_ in p2)
    if first_selected and second_selected:
        print('The corresponding protein feature pair was selected for in the model')
        

Unfortunately, there is no interaction pair for which both interacting RNA features were also selected in protein form in the model.




In [97]:
# fns = [
#     os.path.join(data_path, 'processed', 'joint_ols_perm_interactions.csv'), 
#     os.path.join(data_path, 'interim', 'joint_features.csv'), 
#     os.path.join(data_path, 'processed', 'uniprot_mapper.json'), 
#     os.path.join(data_path, 'processed',  'expr_joint.csv'), 
#     os.path.join(data_path, 'processed', 'metastatic_potential_joint.csv'), 
#     os.path.join(data_path, 'processed',  'expr.csv'), 
#     os.path.join(data_path, 'processed',  'expr_protein.csv'),

# ]
# for fn in fns:
#     print('scp hmbaghda@orcd-login001.mit.edu:{} Downloads/for_nm/.'.format(fn))
# print()