In [1]:
import os
import pickle
import pathlib

from tqdm import tqdm

import numpy as np
import pandas as pd

import optuna
from optuna.samplers import CmaEsSampler, TPESampler, RandomSampler
from optuna.distributions import CategoricalDistribution

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import pearsonr
from sklearn.utils import shuffle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'
random_state = 42 + 2

n_cores = 80
os.environ["OMP_NUM_THREADS"] = str(n_cores)
os.environ["MKL_NUM_THREADS"] = str(n_cores)
os.environ["OPENBLAS_NUM_THREADS"] = str(n_cores)
os.environ["VECLIB_MAXIMUM_THREADS"] = str(n_cores)
os.environ["NUMEXPR_NUM_THREADS"] = str(n_cores)

In [3]:
def write_pickled_object(object_, file_name: str) -> None:
    if '.' in file_name:
        p = pathlib.Path(file_name)
        extensions = "".join(p.suffixes)
        file_name = str(p).replace(extensions, '.pickle')
    else:
        file_name = file_name + '.pickle'

    with open(file_name, 'wb') as handle:
        pickle.dump(object_, handle)

In [4]:
# Feature selection transformer
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, method='top_n_cv', n_features=None):
        if method not in ['top_n_cv']:#, 'all_features']:
            raise ValueError('Incorrect feature selection method implemented')
        self.method = method
        self.n_features = n_features

    def fit(self, X, y=None):
        if self.method == 'top_n_cv':
            self.coefficient_of_variation_ = np.std(X, axis=0) / np.mean(X, axis=0)
            self.top_indices_ = np.argsort(self.coefficient_of_variation_)[::-1][:self.n_features]
#         elif self.method == 'all_features':
#             self.top_indices_ = range(X.shape[1])
        return self
    def transform(self, X, y=None):
        return X[:, self.top_indices_]
    
class MeanCenterer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
#         self.mean_ = np.mean(X, axis=0)
        return self

    def transform(self, X, y=None):
        return X - np.mean(X, axis=0)
    
def pearson_corr_scorer(y_true, y_pred):
    return pearsonr(y_true, y_pred)[0]

class PLSRegression_X(PLSRegression):
    def transform(self, X, y=None):
        X_transformed = super().transform(X, y)
        if isinstance(X_transformed, tuple):
            X_transformed = X_transformed[0]
        return X_transformed

In [6]:
outer_folds=10
inner_folds=5
n_trials = 200

Let's take a look at the results:

In [7]:
res = pd.read_csv(os.path.join(data_path, 'interim', 'pipeline_model_selection.csv'), index_col = 0)

In [8]:
res.test_corr.mean()

np.float64(0.5048246138993638)

Select a best consensus model and re-run on new folds to see the performance:

In [9]:
C_best = []
epsilon_best = []
for bp in res.best_params:
    print(bp)
    print('------------------------------------------------------')
    C_best.append(float(bp.split(', ')[2].split(': ')[1]))
    epsilon_best.append(float(bp.split(', ')[-1].split(': ')[1][:-1]))

{'FeatureSelector__n_features': 19138, 'model_type': 'SVR', 'SVR__C': 0.0005719969499161776, 'SVR__epsilon': 0.754774672907146}
------------------------------------------------------
{'FeatureSelector__n_features': 19138, 'model_type': 'SVR', 'SVR__C': 0.13740150254792138, 'SVR__epsilon': 0.792875261527864}
------------------------------------------------------
{'FeatureSelector__n_features': 19138, 'model_type': 'SVR', 'SVR__C': 0.00012293594680558607, 'SVR__epsilon': 0.1414183067920836}
------------------------------------------------------
{'FeatureSelector__n_features': 19138, 'model_type': 'SVR', 'SVR__C': 0.0001309759625950283, 'SVR__epsilon': 0.0033638552879614743}
------------------------------------------------------
{'FeatureSelector__n_features': 19138, 'model_type': 'SVR', 'SVR__C': 62.549312779871485, 'SVR__epsilon': 1.083336855506116}
------------------------------------------------------
{'FeatureSelector__n_features': 19138, 'model_type': 'SVR', 'SVR__C': 0.000168892469

Looks like consistently, the best performing model uses all features and a linear SVM. We will take the median C and epsiolon value across folds:

In [10]:
best_steps = [
    ("feature_reduction", FeatureSelector(n_features = 19138)),
    ("mean_centering", MeanCenterer()),
]
best_steps.append(("model", SVR(
    kernel='linear',
    C=np.median(C_best),
    epsilon=np.median(epsilon_best)
#     random_state=random_state,
#     n_jobs=n_cores
)))
best_pipeline = Pipeline(best_steps)

In [11]:
X = pd.read_csv(os.path.join(data_path, 'processed',  'expr.csv'), index_col = 0).values
y = pd.read_csv(os.path.join(data_path, 'processed', 'metastatic_potential.csv'), index_col = 0)['mean'].values.ravel()

In [65]:
def mixup(X, y, n_synthetic, alpha=2, random_state=None):
    """
    Create synthetic samples using the mixup technique.

    Parameters:
    - n_synthetic (int): Number of synthetic samples to generate.
    - alpha (float): Parameter for the Beta distribution controlling the mixup ratio.
    - random_seed (int, optional): Random seed for reproducibility.

    Returns:
    - synthetic_data (np.ndarray): A 2D array of shape (n_synthetic, features) with synthetic samples.
    """
    if random_state is not None:
        np.random.seed(random_state)
    
    n_samples, n_features = X.shape
    synthetic_X = np.zeros((n_synthetic, n_features))
    synthetic_y = np.zeros((n_synthetic, ))

    for i in range(n_synthetic):
        # Randomly select two samples to mix
        idx1, idx2 = np.random.choice(n_samples, size=2, replace=False)
        
        # Generate mixup coefficient from a Beta distribution
        lambda_ = np.random.beta(alpha, alpha)
        
        # Create a synthetic sample
        synthetic_X[i] = lambda_ * X[idx1] + (1 - lambda_) * X[idx2]
        synthetic_y[i] = lambda_ * y[idx1] + (1 - lambda_) * y[idx2]
    
    return synthetic_X, synthetic_y



In [None]:
outer_cv = KFold(n_splits=10, shuffle=True, random_state=random_state+1)
n_synthetic = 1000

res = {}

results = []
for k, (train_idx, test_idx) in tqdm(enumerate(outer_cv.split(X, y))):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    # augment data
    synthetic_X, synthetic_y = mixup(X, y, n_synthetic = n_synthetic, alpha = 2, random_state = random_state)
    X_train = np.concatenate((X_train, synthetic_X), axis = 0)
    y_train = np.concatenate((y_train, synthetic_y), axis = 0)
    
    
    best_pipeline.fit(X_train, y_train)

    y_train_pred = best_pipeline.predict(X_train)
    y_test_pred = best_pipeline.predict(X_test)

    train_corr = pearsonr(y_train, y_train_pred)[0]
    test_corr = pearsonr(y_test, y_test_pred)[0]

    results.append({
        "fold": k,
        "train_corr": train_corr,
        "test_corr": test_corr,
        })
    best_res_df = pd.DataFrame(results)
    res[k] = {'test': y_test, 'pred': y_test_pred, 'train': y_train}

3it [01:00, 20.12s/it]

In [59]:
best_res_df

Unnamed: 0,fold,train_corr,test_corr
0,0,0.971755,0.850419
1,1,0.973514,0.80845
2,2,0.970723,0.79176
3,3,0.97346,0.825299
4,4,0.972731,0.857133
5,5,0.972461,0.758532
6,6,0.973004,0.775226
7,7,0.971306,0.841415
8,8,0.972276,0.83223
9,9,0.97181,0.909345


In [60]:
best_res_df

Unnamed: 0,fold,train_corr,test_corr
0,0,0.941063,0.548148
1,1,0.938265,0.404039
2,2,0.940764,0.229311
3,3,0.939848,0.559639
4,4,0.937392,0.563501
5,5,0.942795,0.346068
6,6,0.938975,0.521602
7,7,0.936515,0.574523
8,8,0.935324,0.598116
9,9,0.934136,0.715371


In [63]:
best_res_df.test_corr.median()

np.float64(0.8249808986851747)

In [64]:
0.82**2

0.6723999999999999