In [1]:
import ast
import os
import sys

from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectFromModel

from sklearn.svm import SVR

from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import pearsonr
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin

import seaborn as sns
import matplotlib.pyplot as plt

sys.path.insert(1, '../')
from utils import get_stats, mixup, write_pickled_object
from utils import FeatureSelector, MeanCenterer

In [2]:
# Notebook-wide configuration: data location, base seed and thread budget.
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'
random_state = 42 + 5

n_cores = 30

# Cap the thread pools of the numerical backends at `n_cores`.
# NOTE(review): numpy is imported in the cell above, before these variables
# are set; confirm the backends still pick them up at this point.
os.environ.update({
    thread_var: str(n_cores)
    for thread_var in (
        "OMP_NUM_THREADS",
        "MKL_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "VECLIB_MAXIMUM_THREADS",
        "NUMEXPR_NUM_THREADS",
    )
})

In [3]:
class ModalitySelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one modality out of a paired input.

    The input to ``transform`` is expected to be an indexable pair
    ``(X_protein, X_rna)``; the element matching ``modality`` is returned
    unchanged.

    Parameters
    ----------
    modality : {'protein', 'rna'}
        Which element of the pair to return.

    Raises
    ------
    ValueError
        If ``modality`` is not ``'protein'`` or ``'rna'``.
    """

    def __init__(self, modality):
        if modality not in ['protein', 'rna']:
            raise ValueError("modality must be 'protein' or 'rna'")
        self.modality = modality

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return the element of ``X`` that corresponds to ``self.modality``.

        Raises
        ------
        ValueError
            If ``self.modality`` was changed to an unsupported value after
            construction. Previously this case fell through and returned
            ``None`` silently.
        """
        # X is expected to be a tuple: (X_protein, X_rna)
        if self.modality == 'protein':
            return X[0]  # Return X_protein
        elif self.modality == 'rna':
            return X[1]  # Return X_rna
        else:
            raise ValueError("modality must be 'protein' or 'rna'")

In [4]:
# Load the joint expression matrix and the target, then split the joint matrix
# into its protein and RNA column groups (all converted to numpy arrays).
processed_dir = os.path.join(data_path, 'processed')

X = pd.read_csv(os.path.join(processed_dir, 'expr_joint.csv'), index_col=0)
y_table = pd.read_csv(os.path.join(processed_dir, 'metastatic_potential_joint.csv'), index_col=0)
y = y_table['mean'].values.ravel()

# The single-modality tables are read only for their column names.
expr_protein = pd.read_csv(os.path.join(processed_dir, 'expr_protein.csv'), index_col=0)
expr_rna = pd.read_csv(os.path.join(processed_dir, 'expr.csv'), index_col=0)
protein_cols, rna_cols = expr_protein.columns, expr_rna.columns

X_protein = X[protein_cols].values
X_rna = X[rna_cols].values

X = X.values

Let's take a look at the results from the model selection:

In [4]:
# Table of model-selection results; later cells read its `test_corr` and
# `best_params` columns.
model_selection_csv = os.path.join(data_path, 'interim', 'pipeline_model_selection_joint.csv')
res = pd.read_csv(model_selection_csv, index_col=0)

In [6]:
# Median test correlation over the rows of the model-selection results.
res['test_corr'].median()

np.float64(0.4999762101515235)

In [7]:
# Mean test correlation over the rows of the model-selection results.
res['test_corr'].mean()

np.float64(0.4187227101546025)

Select a best consensus model and re-run on new folds to see the performance:

In [23]:
# Collect the best hyperparameters recorded for each row of `res`.
# Each `best_params` entry is the string form of a dict, so it is parsed with
# ast.literal_eval and read by key. Splitting the text on ', ' and indexing by
# position breaks as soon as the key order or number of keys changes.
C_best = []
epsilon_best = []
n_features_protein_best = []
# n_features_rna_best = []
for bp in res.best_params:
    print(bp)
    print('------------------------------------------------------')
    params = ast.literal_eval(bp)
    C_best.append(float(params['SVR__C']))
    epsilon_best.append(float(params['SVR__epsilon']))
    n_features_protein_best.append(int(params['FeatureSelector__n_features_protein']))
#     n_features_rna_best.append(int(params['FeatureSelector__n_features_rna']))

{'FeatureSelector__n_features_protein': 1000, 'FeatureSelector__n_features_rna': 19138, 'SVR__C': 0.026086387887828258, 'SVR__epsilon': 0.9286935961490541}
------------------------------------------------------
{'FeatureSelector__n_features_protein': 250, 'FeatureSelector__n_features_rna': 19138, 'SVR__C': 0.008704273636852068, 'SVR__epsilon': 0.17757859001378923}
------------------------------------------------------
{'FeatureSelector__n_features_protein': 12755, 'FeatureSelector__n_features_rna': 19138, 'SVR__C': 0.0008197823321850357, 'SVR__epsilon': 0.5662105975031672}
------------------------------------------------------
{'FeatureSelector__n_features_protein': 250, 'FeatureSelector__n_features_rna': 19138, 'SVR__C': 0.6605425628026751, 'SVR__epsilon': 0.8391358275197625}
------------------------------------------------------
{'FeatureSelector__n_features_protein': 12755, 'FeatureSelector__n_features_rna': 5000, 'SVR__C': 0.0002904955381806338, 'SVR__epsilon': 0.00861497068083276}

# Start

Correlation of shared features:

In [195]:
# Drop protein features whose name also appears among the RNA features, ranked
# by the absolute Pearson correlation between the two measurements.
X = pd.read_csv(os.path.join(data_path, 'processed',  'expr_joint.csv'), index_col = 0)
y = pd.read_csv(os.path.join(data_path, 'processed', 'metastatic_potential_joint.csv'), index_col = 0)['mean'].values.ravel()

expr_protein = pd.read_csv(os.path.join(data_path, 'processed',  'expr_protein.csv'), index_col = 0)
expr_rna = pd.read_csv(os.path.join(data_path, 'processed',  'expr.csv'), index_col = 0)

protein_cols = expr_protein.columns
rna_cols = expr_rna.columns

X_protein = X[protein_cols]
X_rna = X[rna_cols]


# Reduce both sets of column labels to a bare name so they can be matched:
# protein labels keep the text after the last '|' and before '_HUMAN',
# RNA labels keep the text before ' ('.
prot_names = [x.split('|')[-1].split('_HUMAN')[0] for x in X_protein.columns]
rna_names = [x.split(' (')[0] for x in X_rna.columns]
common_features = sorted(set(prot_names).intersection(rna_names))
print('{} features are shared between the {} in rna and {} in protein'.format(len(common_features), 
                                                                             len(rna_names), 
                                                                             len(prot_names)))

# Map each bare name back to its original column label.
prot_feature_map = dict(zip(prot_names, X_protein.columns))
rna_feature_map = dict(zip(rna_names, X_rna.columns))

# `pearsonr` is imported directly from scipy.stats; the `stats` namespace is
# not imported, so `stats.pearsonr` raised a NameError on a fresh kernel.
# The rows are collected in a list and the frame is built once instead of
# being grown one `.loc` assignment at a time.
corr_records = [
    {'feature': feature,
     'pearson_corr': pearsonr(X_rna[rna_feature_map[feature]],
                              X_protein[prot_feature_map[feature]]).statistic}
    for feature in common_features
]
corr_res = pd.DataFrame(corr_records, columns = ['feature', 'pearson_corr'])

corr_res['pearson_corr'] = corr_res['pearson_corr'].abs()
corr_res = corr_res.sort_values(by = 'pearson_corr', ascending = False)

# Number of top-ranked shared features whose protein column is removed.
filter_features = 5541
exclude_features = [prot_feature_map[ef] for ef in corr_res.iloc[:filter_features, :].feature.tolist()]


X_protein = X_protein.drop(columns = exclude_features)
X = X[X_rna.columns.tolist() + X_protein.columns.tolist()].values
X_rna = X_rna.values
X_protein = X_protein.values


5541 features are shared between the 19138 in rna and 12755 in protein


Correlation with multiple features:

In [28]:
# Exclude protein features that exceed an absolute correlation threshold with
# at least `n` RNA features.
processed_dir = os.path.join(data_path, 'processed')

X = pd.read_csv(os.path.join(processed_dir, 'expr_joint.csv'), index_col=0)
y = pd.read_csv(os.path.join(processed_dir, 'metastatic_potential_joint.csv'), index_col=0)['mean'].values.ravel()

expr_protein = pd.read_csv(os.path.join(processed_dir, 'expr_protein.csv'), index_col=0)
expr_rna = pd.read_csv(os.path.join(processed_dir, 'expr.csv'), index_col=0)
protein_cols, rna_cols = expr_protein.columns, expr_rna.columns

X_protein = X[protein_cols]
X_rna = X[rna_cols]

rna_df, protein_df = X_rna, X_protein
n_rna_features = rna_df.shape[1]

# np.cov over both inputs gives the joint covariance of all features; the
# rows for RNA and the columns for protein form the cross-covariance block.
cov_matrix = np.cov(rna_df.values.T, protein_df.values.T)[:n_rna_features, n_rna_features:]
rna_std = rna_df.std()
protein_std = protein_df.std()

# Divide by the outer product of the standard deviations to get correlations.
std_outer = rna_std.values.reshape(-1, 1) @ protein_std.values.reshape(1, -1)
correlation_matrix = pd.DataFrame(
    cov_matrix / std_outer,
    index=rna_df.columns,
    columns=protein_df.columns,
).abs()

threshold = 0.4
n = 5
# For each protein column, count the RNA rows above the threshold.
n_above_threshold = (correlation_matrix > threshold).sum(axis=0)
exclude_features = n_above_threshold.index[n_above_threshold >= n].tolist()
print(len(exclude_features))


X_protein = X_protein.drop(columns=exclude_features)
kept_columns = X_rna.columns.tolist() + X_protein.columns.tolist()
X = X[kept_columns].values
X_rna = X_rna.values
X_protein = X_protein.values

4217


Most important features from each:

In [137]:
import sys
sys.path.insert(1, '../')
from utils import read_pickled_object

In [145]:
# Load the previously saved single-modality pipelines.
# NOTE(review): these are pickled objects; only load files from a trusted source.
rna_model_path = os.path.join(data_path, 'processed', 'best_model.pickle')
protein_model_path = os.path.join(data_path, 'processed', 'best_model_protein.pickle')

best_pipeline_rna = read_pickled_object(rna_model_path)
best_pipeline_protein = read_pickled_object(protein_model_path)

In [146]:
# Reload the joint data and split it by modality. Unlike the first load cell,
# `X` itself is left as a DataFrame here.
processed_dir = os.path.join(data_path, 'processed')

X = pd.read_csv(os.path.join(processed_dir, 'expr_joint.csv'), index_col=0)
y_table = pd.read_csv(os.path.join(processed_dir, 'metastatic_potential_joint.csv'), index_col=0)
y = y_table['mean'].values.ravel()

# The single-modality tables are read only for their column names.
expr_protein = pd.read_csv(os.path.join(processed_dir, 'expr_protein.csv'), index_col=0)
expr_rna = pd.read_csv(os.path.join(processed_dir, 'expr.csv'), index_col=0)
protein_cols, rna_cols = expr_protein.columns, expr_rna.columns

X_protein = X[protein_cols].values
X_rna = X[rna_cols].values

In [147]:
def _rank_by_abs_coef(model):
    """Indices of ``model.coef_[0]`` ordered from largest to smallest magnitude."""
    return np.argsort(np.abs(model.coef_[0]))[::-1]


# Refit each single-modality pipeline on every sample and rank its features by
# the magnitude of the fitted coefficients.
# NOTE(review): the ranking uses all samples, and X_protein is subset below
# before the cross-validation cell runs, so held-out folds influence which
# protein features are kept. Confirm this is intended.
best_pipeline_rna.fit(X_rna, y)
best_pipeline_protein.fit(X_protein, y)

# The third pipeline step is presumably the fitted SVR -- verify against the
# saved pipelines.
svm_rna = best_pipeline_rna.steps[2][1]
svm_protein = best_pipeline_protein.steps[2][1]

feature_rank_rna = _rank_by_abs_coef(svm_rna)
feature_rank_protein = _rank_by_abs_coef(svm_protein)

top_n_rna = 7500#int(np.round(X_rna.shape[1]/3))
top_n_protein = 2500#int(np.round(X_protein.shape[1]/3))

top_rna = feature_rank_rna[:top_n_rna]
top_protein = feature_rank_protein[:top_n_protein]

# Only the protein matrix is reduced; the RNA subset stays disabled.
X_protein = X_protein[:, top_protein]
# X_rna = X_rna[:, top_rna]


It looks like the best-performing model consistently uses all features and a linear SVM. We will take the median C and epsilon values across folds:

In [148]:
# X = pd.read_csv(os.path.join(data_path, 'processed',  'expr_joint.csv'), index_col = 0)
# y = pd.read_csv(os.path.join(data_path, 'processed', 'metastatic_potential_joint.csv'), index_col = 0)['mean'].values.ravel()

# expr_protein = pd.read_csv(os.path.join(data_path, 'processed',  'expr_protein.csv'), index_col = 0)
# expr_rna = pd.read_csv(os.path.join(data_path, 'processed',  'expr.csv'), index_col = 0)

# protein_cols = expr_protein.columns
# rna_cols = expr_rna.columns

# X_protein = X[protein_cols].values
# X_rna = X[rna_cols].values

# # X = X.values

In [149]:
def _modality_branch(modality):
    """Pipeline for one modality: select it from the input pair, then mean-center.

    Optional per-modality feature selection (FeatureSelector, "top_n_cv") and
    PLS feature reduction steps were tried here and are left disabled.
    """
    return Pipeline([
        ("select_" + modality, ModalitySelector(modality=modality)),
        ("mean_centering_" + modality, MeanCenterer()),
    ])


protein_pipeline = _modality_branch("protein")
rna_pipeline = _modality_branch("rna")

# Process both modalities and concatenate them, protein first.
combined_pipeline = FeatureUnion([
    ("protein_pipeline", protein_pipeline),
    ("rna_pipeline", rna_pipeline),
])

# Consensus model: linear SVR using the median C and epsilon of the per-fold
# best values collected above.
consensus_svr = SVR(
    kernel='linear',
    C=np.median(C_best),
    epsilon=np.median(epsilon_best),
)

best_steps = [
    ("feature_processing", combined_pipeline),
    # ("feature_reduction", PLSRegression(n_components=100)),
    ("model", consensus_svr),
]
best_pipeline = Pipeline(best_steps)
# write_pickled_object(best_pipeline, 
#                     os.path.join(data_path, 'processed', 'best_model_joint.pickle'))

In [150]:
# Evaluate the consensus pipeline on a new shuffled 10-fold split, seeded with
# random_state + 1.
np.random.seed(random_state + 1)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=random_state+1)
n_synthetic = 1000
augment = False
# res = {}

baseline_linear = LinearRegression(n_jobs = n_cores)

results = []
# enumerate() hides the number of folds from tqdm, so pass it explicitly to
# get a real progress bar.
for k, (train_idx, test_idx) in tqdm(enumerate(outer_cv.split(X_rna, y)),
                                     total=outer_cv.get_n_splits()):
#     X_train_all, X_test_all = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    X_train_rna, X_test_rna = X_rna[train_idx], X_rna[test_idx]
    X_train_protein, X_test_protein = X_protein[train_idx], X_protein[test_idx]

    # The pipeline's ModalitySelector steps expect a (protein, rna) pair.
    X_train = (X_train_protein, X_train_rna)
    X_test = (X_test_protein, X_test_rna)

    train_corr, test_corr, train_mse, test_mse = get_stats(best_pipeline, y_train, y_test, X_train, X_test)

#     # random - y
#     y_train_rand = np.random.permutation(y_train)
#     res_y_rand = get_stats(best_pipeline, y_train_rand, y_test, X_train, X_test)
#     _, test_corr_y_rand, _, test_mse_y_rand = res_y_rand    

#     # random - X (features)
#     X_train_rand_protein = X_train_protein[:, np.random.permutation(X_train_protein.shape[1])]
#     X_train_rand_rna = X_train_rna[:, np.random.permutation(X_train_rna.shape[1])]
#     X_train_rand =  (X_train_rand_protein, X_train_rand_rna)
#     res_X_rand = get_stats(best_pipeline, y_train, y_test, X_train_rand, X_test)
#     _, test_corr_X_rand, _, test_mse_X_rand = res_X_rand  

#     # linear simple
#     linear_res = get_stats(baseline_linear, y_train, y_test, X_train_all, X_test_all)
#     _, test_corr_linear, _, test_mse_linear = linear_res


    results.append({
        "fold": k,
        "train_corr": train_corr,
        "test_corr": test_corr,
        'train_mse': train_mse, 
        'test_mse': test_mse,
#         'random_y_test_corr': test_corr_y_rand, 
#         'random_y_test_mse': test_mse_y_rand, 
#         'random_X_test_corr': test_corr_X_rand, 
#         'random_X_test_mse': test_mse_X_rand, 
#         'linear_baseline_test_corr': test_corr_linear, 
#         'linear_baseline_test_mse': test_mse_linear, 
        })
#     res[k] = {'test': y_test, 'pred': y_test_pred, 'train': y_train}

# Build the summary frame once, after all folds, instead of on every iteration.
best_res_df = pd.DataFrame(results)

10it [00:09,  1.02it/s]


In [151]:
# Mean test correlation across the folds of the re-evaluation.
best_res_df['test_corr'].mean()

np.float64(0.437142887730199)

In [152]:
# Median test correlation across the folds of the re-evaluation.
best_res_df['test_corr'].median()

np.float64(0.43007705399299956)

In [22]:
# viz_df = best_res_df[[col for col in best_res_df if 'test' in col or col == 'fold']]
# viz_df_corr = viz_df[[col for col in viz_df if 'corr' in col]]
# viz_df_mse = viz_df[[col for col in viz_df if 'mse' in col]]
# viz_dfs = {'Pearson Correlation': viz_df_corr, 
#           'Mean Squared Error': viz_df_mse}

In [102]:
# fig, ax = plt.subplots(ncols = 2, figsize = (10,5))

# for i, (metric, viz_df) in enumerate(viz_dfs.items()):
#     viz_df = pd.melt(viz_df, value_name=metric, var_name = 'Model Type')
#     sns.violinplot(data = viz_df, x = 'Model Type', y = metric, ax = ax[i])
#     ax[i].set_xticklabels(ax[i].get_xticklabels(), 
#                      rotation = 30, ha = 'center')

In [24]:
# NOTE(review): this cell's execution count (24) predates the cross-validation
# cell (150), so the value displayed below comes from an earlier run.
best_res_df['test_corr'].median()

np.float64(0.4259590925001584)

In [25]:
# NOTE(review): same expression as the previous cell; its execution count (25)
# also predates the cross-validation cell (150).
best_res_df['test_corr'].median()

np.float64(0.4259590925001584)

# To do:
1) try a different seed?
2) why is the model selection pipeline so much better than the best model here, when it wasn't anywhere else?