In [32]:
import os
import pickle
import pathlib

from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import pearsonr
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Root directory under which the notebook reads and writes its data files.
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'
# Base seed reused by the stochastic steps below.
random_state = 42 + 3

# Thread budget requested from the numerical back-ends.
n_cores = 30
# NOTE(review): these variables are typically read when the BLAS/OpenMP
# runtime is first loaded, and numpy is imported in an earlier cell --
# confirm the limits actually take effect when set here.
for _thread_env_var in (
    "OMP_NUM_THREADS",
    "MKL_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_thread_env_var] = str(n_cores)

In [3]:
def write_pickled_object(object_, file_name: str) -> None:
    """Pickle ``object_`` to disk, forcing a ``.pickle`` file extension.

    Every suffix of the final path component is dropped and replaced by
    ``.pickle`` (``model.tar.gz`` -> ``model.pickle``); a name without any
    suffix simply gets ``.pickle`` appended.

    Parameters:
    - object_: Any picklable Python object.
    - file_name (str): Destination path, with or without an extension.
    """
    p = pathlib.Path(file_name)
    extensions = "".join(p.suffixes)
    if extensions:
        # Trim the suffixes from the END of the path only. A plain
        # str.replace would also rewrite matching text inside directory
        # names, and would mangle the path when a directory contains a dot
        # but the file itself has no extension.
        file_name = str(p)[:-len(extensions)] + '.pickle'
    else:
        file_name = file_name + '.pickle'

    with open(file_name, 'wb') as handle:
        pickle.dump(object_, handle)

In [4]:
# Feature selection transformer
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the columns with the highest coefficient of variation (std / mean).

    Parameters:
    - method (str): Selection strategy; only ``'top_n_cv'`` is supported.
    - n_features (int, optional): Number of columns to keep. ``None`` keeps
      every column, ordered by decreasing coefficient of variation.

    Raises:
    - ValueError: if ``method`` is not a supported strategy.
    """

    def __init__(self, method='top_n_cv', n_features=None):
        if method not in ['top_n_cv']:
            raise ValueError('Incorrect feature selection method implemented')
        self.method = method
        self.n_features = n_features

    def fit(self, X, y=None):
        """Rank the columns of ``X`` and remember the indices to keep."""
        if self.method == 'top_n_cv':
            X = np.asarray(X)
            # Zero-mean columns produce inf/NaN; that is handled explicitly
            # below, so silence the corresponding floating-point warnings.
            with np.errstate(divide='ignore', invalid='ignore'):
                self.coefficient_of_variation_ = np.std(X, axis=0) / np.mean(X, axis=0)
            # 0/0 (e.g. an all-zero column) is NaN. argsort puts NaN last, so
            # after the reversal such columns would be ranked FIRST; push
            # them to the bottom of the ranking instead.
            ranking_key = np.where(
                np.isnan(self.coefficient_of_variation_),
                -np.inf,
                self.coefficient_of_variation_,
            )
            self.top_indices_ = np.argsort(ranking_key)[::-1][:self.n_features]
        return self

    def transform(self, X, y=None):
        """Return only the columns selected during ``fit``."""
        return np.asarray(X)[:, self.top_indices_]
    
class MeanCenterer(TransformerMixin, BaseEstimator):
    """Stateless transformer that removes each column's mean from ``X``.

    NOTE(review): the mean is recomputed from whatever array is handed to
    ``transform`` instead of being learned in ``fit``. Held-out data are
    therefore centred with their own statistics, and a single-row input maps
    to all zeros -- confirm this is the intended behaviour.
    """

    def fit(self, X, y=None):
        """No-op: nothing is learned from the training data."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` minus the per-column mean of that same ``X``."""
        column_means = np.mean(X, axis=0)
        return X - column_means
    
def pearson_corr_scorer(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions."""
    correlation, _p_value = pearsonr(y_true, y_pred)
    return correlation


In [5]:
# Load the joint feature matrix and the 'mean' target column as plain arrays
# (the first CSV column is used as the index and dropped from the values).
processed_dir = os.path.join(data_path, 'processed')
X = pd.read_csv(os.path.join(processed_dir, 'expr_joint.csv'), index_col=0).values
y = (
    pd.read_csv(os.path.join(processed_dir, 'metastatic_potential_joint.csv'), index_col=0)['mean']
    .values
    .ravel()
)

Let's take a look at the results from the model selection:

In [6]:
# Per-fold results table written by the model-selection run.
model_selection_csv = os.path.join(data_path, 'interim', 'pipeline_model_selection_joint.csv')
res = pd.read_csv(model_selection_csv, index_col=0)

In [3]:
# Average held-out correlation across the model-selection folds.
res['test_corr'].mean()

Select a best consensus model and re-run on new folds to see the performance:

In [2]:
# Collect the C and epsilon chosen in each fold from the stringified
# parameter dictionaries.
# NOTE(review): the parsing is positional -- it assumes C is the third
# ', '-separated entry and epsilon the last one (followed by the closing
# brace); verify against the actual contents of `best_params`.
C_best, epsilon_best = [], []
for bp in res.best_params:
    print(bp)
    print('------------------------------------------------------')
    entries = bp.split(', ')
    c_text = entries[2].split(': ')[1]
    epsilon_text = entries[-1].split(': ')[1][:-1]  # drop the trailing character
    C_best.append(float(c_text))
    epsilon_best.append(float(epsilon_text))

It looks like the best-performing model consistently uses all features and a linear SVM. We will take the median C and epsilon values across folds:

In [9]:
# Consensus model: feature selection -> mean centring -> linear SVR whose
# C and epsilon are the medians of the per-fold best values.
consensus_svr = SVR(
    kernel='linear',
    C=np.median(C_best),
    epsilon=np.median(epsilon_best),
)
best_steps = [
    # NOTE(review): 19138 is presumably the total number of features
    # (i.e. "keep everything") -- confirm against the input matrix.
    ("feature_reduction", FeatureSelector(n_features=19138)),
    ("mean_centering", MeanCenterer()),
    ("model", consensus_svr),
]
best_pipeline = Pipeline(best_steps)

# Persist the (still unfitted) pipeline.
best_model_path = os.path.join(data_path, 'processed', 'best_model.pickle')
write_pickled_object(best_pipeline, best_model_path)

In [10]:
# Replace X / y with the arrays from 'expr.csv' and 'metastatic_potential.csv'
# (the first CSV column is used as the index and dropped from the values).
processed_dir = os.path.join(data_path, 'processed')
X = pd.read_csv(os.path.join(processed_dir, 'expr.csv'), index_col=0).values
y = (
    pd.read_csv(os.path.join(processed_dir, 'metastatic_potential.csv'), index_col=0)['mean']
    .values
    .ravel()
)

In [11]:
def mixup(X, y, n_synthetic, alpha=2, random_state=None):
    """
    Create synthetic samples using the mixup technique.

    Each synthetic sample is the convex combination
    ``lambda * a + (1 - lambda) * b`` of two distinct, randomly chosen rows,
    with ``lambda`` drawn from ``Beta(alpha, alpha)``; the same weights are
    applied to the corresponding targets.

    Parameters:
    - X (array-like): 2D array of shape (n_samples, n_features).
    - y (array-like): 1D array with one target per row of X.
    - n_synthetic (int): Number of synthetic samples to generate.
    - alpha (float): Parameter for the Beta distribution controlling the mixup ratio.
    - random_state (int, optional): Seed for reproducibility. When given, a
      private generator is used so the global NumPy random state is left
      untouched; when None, draws come from the global NumPy generator.

    Returns:
    - synthetic_X (np.ndarray): A 2D array of shape (n_synthetic, n_features).
    - synthetic_y (np.ndarray): A 1D array of shape (n_synthetic,).

    Raises:
    - ValueError: if samples are requested but fewer than two rows are available.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # RandomState(seed) yields exactly the draws that np.random.seed(seed)
    # followed by the module-level functions would, but without resetting
    # the global stream that callers may also be relying on.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_samples, n_features = X.shape
    if n_synthetic > 0 and n_samples < 2:
        raise ValueError('mixup needs at least two samples to combine')

    synthetic_X = np.zeros((n_synthetic, n_features))
    synthetic_y = np.zeros((n_synthetic, ))

    for i in range(n_synthetic):
        # Randomly select two distinct samples to mix
        idx1, idx2 = rng.choice(n_samples, size=2, replace=False)

        # Generate mixup coefficient from a Beta distribution
        lambda_ = rng.beta(alpha, alpha)

        # Create a synthetic sample
        synthetic_X[i] = lambda_ * X[idx1] + (1 - lambda_) * X[idx2]
        synthetic_y[i] = lambda_ * y[idx1] + (1 - lambda_) * y[idx2]

    return synthetic_X, synthetic_y



In [12]:
def get_stats(model, y_train, y_test, X_train, X_test):
    """Fit ``model`` on the training split and score it on both splits.

    Note that ``model`` is fitted in place.

    Returns:
    - tuple: ``(train_corr, test_corr, train_mse, test_mse)`` where the
      correlations are Pearson coefficients between targets and predictions.
    """
    model.fit(X_train, y_train)

    # Pair each split's targets with the model's predictions for that split.
    train_pair = (y_train, model.predict(X_train))
    test_pair = (y_test, model.predict(X_test))

    correlations = [pearsonr(*pair)[0] for pair in (train_pair, test_pair)]
    errors = [mean_squared_error(*pair) for pair in (train_pair, test_pair)]

    return correlations[0], correlations[1], errors[0], errors[1]

In [55]:
# Re-evaluate the consensus pipeline on fresh outer folds, alongside three
# references: shuffled targets, shuffled training features, and a plain
# linear regression.
np.random.seed(random_state + 1)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=random_state+1)
n_synthetic = 1000
augment = False

baseline_linear = LinearRegression(n_jobs = n_cores)

results = []
# Wrap the split generator itself and pass `total` so tqdm can show
# progress and an ETA instead of a bare iteration counter.
fold_splits = tqdm(outer_cv.split(X, y), total=outer_cv.get_n_splits())
for k, (train_idx, test_idx) in enumerate(fold_splits):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # augment data
    if augment:
        synthetic_X, synthetic_y = mixup(X_train, y_train, n_synthetic = n_synthetic, alpha = 2, random_state = random_state)
        X_train = np.concatenate((X_train, synthetic_X), axis = 0)
        y_train = np.concatenate((y_train, synthetic_y), axis = 0)

    train_corr, test_corr, train_mse, test_mse = get_stats(best_pipeline, y_train, y_test, X_train, X_test)

    # random - y: train on permuted targets, score on the real test targets
    y_train_rand = np.random.permutation(y_train)
    res_y_rand = get_stats(best_pipeline, y_train_rand, y_test, X_train, X_test)
    _, test_corr_y_rand, _, test_mse_y_rand = res_y_rand

    # random - X (features): permute the training columns only, so the
    # fitted weights no longer line up with the test-set columns
    X_train_rand = X_train[:, np.random.permutation(X_train.shape[1])]
    res_X_rand = get_stats(best_pipeline, y_train, y_test, X_train_rand, X_test)
    _, test_corr_X_rand, _, test_mse_X_rand = res_X_rand

    # linear simple
    linear_res = get_stats(baseline_linear, y_train, y_test, X_train, X_test)
    _, test_corr_linear, _, test_mse_linear = linear_res

    results.append({
        "fold": k,
        "train_corr": train_corr,
        "test_corr": test_corr,
        'train_mse': train_mse,
        'test_mse': test_mse,
        'random_y_test_corr': test_corr_y_rand,
        'random_y_test_mse': test_mse_y_rand,
        'random_X_test_corr': test_corr_X_rand,
        'random_X_test_mse': test_mse_X_rand,
        'linear_baseline_test_corr': test_corr_linear,
        'linear_baseline_test_mse': test_mse_linear,
        })

# Build the summary table once, after all folds have been collected,
# rather than re-creating it on every iteration.
best_res_df = pd.DataFrame(results)

10it [02:07, 12.76s/it]


In [58]:
# Keep the fold id and every held-out ("test") metric, then split the
# metrics into one frame per metric family for plotting.
test_columns = [col for col in best_res_df.columns if col == 'fold' or 'test' in col]
viz_df = best_res_df[test_columns]
viz_df_corr = viz_df.filter(like='corr')
viz_df_mse = viz_df.filter(like='mse')
viz_dfs = {
    'Pearson Correlation': viz_df_corr,
    'Mean Squared Error': viz_df_mse,
}

In [1]:
# One violin panel per metric family, comparing the model variants across folds.
fig, ax = plt.subplots(ncols = 2, figsize = (10,5))

# The loop variables are deliberately NOT called `viz_df`, so the frame of
# that name built in the previous cell is not overwritten by plotting.
for i, (metric, metric_df) in enumerate(viz_dfs.items()):
    long_df = pd.melt(metric_df, value_name=metric, var_name = 'Model Type')
    sns.violinplot(data = long_df, x = 'Model Type', y = metric, ax = ax[i])
    ax[i].set_title(metric)
    # Rotate via tick_params rather than re-setting the labels; the default
    # horizontal alignment of x tick labels is already 'center'.
    ax[i].tick_params(axis = 'x', labelrotation = 30)

fig.tight_layout()