In [54]:
# load libraries
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from scipy import stats

import warnings
import os

# suppress all warnings
warnings.filterwarnings('ignore')

# set seed
np.random.seed(42)

Define functions for running models

In [55]:
# function to run linear model
def runLM(X, y, path):

  # create dataframe to store results
  model_df = pd.DataFrame(columns=['Model', 'Fold', 'Spearman', 'Pearson'])

  # initialize the outer folds (5 folds, 80% train, 20% test)
  outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

  # initialize variables to store best model correlation and features
  best_corr = 0
  best_fold = 0
  best_feat = None

  # loop through each of the outer five folds
  fold = 1
  for train_index, test_index in outer_cv.split(X):

    # split train and test
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # initialize linear regression model
    reg = linear_model.LinearRegression()

    # fit model
    reg.fit(X_train, y_train)

    # get predicted values for test data
    y_pred = reg.predict(X_test)

    # compute correlations
    s_cor = stats.spearmanr(y_pred, y_test)
    p_cor = stats.pearsonr(y_pred, y_test)

    # save model correlation and features (if better than previous)
    if s_cor[0] > best_corr:
            best_corr = s_cor[0]
            best_fold = fold
            best_feat = reg.coef_

    # save results to dataframe
    new_row = pd.DataFrame({'Model': ['Linear'], 'Fold': [fold], 'Spearman': [s_cor[0]], 'Pearson': [p_cor[0]]})
    model_df = pd.concat([model_df, new_row],ignore_index = True)

    # print results from fold
    #print("Fold", fold, "Spearman correlation:", s_cor[0])

    fold += 1

  # print best results
  print("------ Best correlation:", best_corr, "from Fold", best_fold)

  # create feature importance dataframe
  feature_importance = pd.DataFrame({
      'Peak': X_train.columns,
      'Weight': best_feat.flatten()
  }).sort_values(by='Weight', ascending=False)

  # save results to dataframe
  feature_importance.to_csv((path+"/lm_features.csv"), index=False)
  model_df.to_csv((path+"/lm.csv"), index=False)

In [56]:
# function to run LASSO
def runLASSO(X, y, path):

  # create dataframe to store results
  model_df = pd.DataFrame(columns=['Model', 'Fold', 'Spearman', 'Pearson', 'alpha', 'max_iter'])

  # initialize the outer folds (5 folds, 80% train, 20% test)
  outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

  # initialize variables to store best model correlation and features
  best_corr = 0
  best_fold = 0
  best_feat = None

  # loop through each of the outer five folds
  fold = 1
  for train_index, test_index in outer_cv.split(X):

    # split train and test
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # initialize LASSO model
    lasso = linear_model.Lasso()

    # specify parameters for optimization
    parameters = {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
        'max_iter': [500, 1000, 5000, 7500]
      }

    # identify optimal parameters
    reg = GridSearchCV(
        estimator = lasso,
        param_grid = parameters,
        #verbose=2
      )

    # fit model
    reg.fit(X_train, y_train)

    # get best model parameters
    reg_best = reg.best_estimator_

    alpha = reg.best_params_['alpha']
    max_iter = reg.best_params_['max_iter']

    # get predicted values for test data
    y_pred = reg_best.predict(X_test)

    # compute correlations
    s_cor = stats.spearmanr(y_pred, y_test)
    p_cor = stats.pearsonr(y_pred, y_test.values.ravel())

    # save model correlation and features (if better than previous)
    if s_cor[0] > best_corr:
            best_corr = s_cor[0]
            best_fold = fold
            best_feat = reg_best.coef_

    # save results to dataframe
    new_row = pd.DataFrame({'Model': ['LASSO'], 'Fold': [fold], 'Spearman': [s_cor[0]], 'Pearson': [p_cor[0]],
                            'alpha': [alpha], 'max_iter': [max_iter]})
    model_df = pd.concat([model_df, new_row],ignore_index = True)

    # print results from fold
    #print("Fold", fold, "Spearman correlation:", s_cor[0])

    fold += 1

  # print results
  print("------ Best correlation:", best_corr, "from Fold", best_fold)

  # create feature importance dataframe
  feature_importance = pd.DataFrame({
      'Peak': X_train.columns,
      'Weight': best_feat
  }).sort_values(by='Weight', ascending=False)

  # save feature importance dataframe
  feature_importance.to_csv((path+"/lasso_features.csv"), index=False)
  model_df.to_csv((path+"/lasso.csv"), index=False)

In [57]:
# run LM and LASSO for each dataset pair
def runModels(df, pair, outdir):

  # specify column names
  var = {"gcsi_ccle": ["gdsc_median", "gcsi_ccle_spearman"],
        "gcsi_gdsc": ["ccle_median", "gcsi_gdsc_spearman"],
        "gdsc_ccle": ["gcsi_median", "gdsc_ccle_spearman"]}

  # get needed columns and remove NA
  subset = df[[var[pair][0], var[pair][1], "gc", "n_exon", "length"]]
  subset = subset.dropna()

  # get X and y
  X = subset[[var[pair][0], "gc", "n_exon", "length"]]
  y = subset[[var[pair][1]]]

  # create folder to store results
  path = os.path.join(outdir, pair)
  os.makedirs(path, exist_ok=True)

  # run models
  print("--- running LM for " + pair)
  runLM(X, y, path)

  print("--- running LASSO for " + pair)
  runLASSO(X, y, path)


In [58]:
# function to run models for each dataset
def runAllPairs(df, outdir):

  # load in dataset
  df = pd.read_csv(df)

  # run models
  print("\nStarting gCSI/CCLE:")
  runModels(df, "gcsi_ccle", outdir)

  print("\nStarting gCSI/GDSC:")
  runModels(df, "gcsi_gdsc", outdir)

  print("\nStarting GDSC/CCLE:")
  runModels(df, "gdsc_ccle", outdir)

Run models for each pipeline

In [73]:
runAllPairs("gene_stability.csv", "Gene_Expression")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle
------ Best correlation: 0.1555690574797088 from Fold 5
--- running LASSO for gcsi_ccle
------ Best correlation: 0.22428912363811845 from Fold 4

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc
------ Best correlation: 0.26174780670624737 from Fold 3
--- running LASSO for gcsi_gdsc
------ Best correlation: 0.18052975262476517 from Fold 4

Starting GDSC/CCLE:
--- running LM for gdsc_ccle
------ Best correlation: 0.23411142633235357 from Fold 5
--- running LASSO for gdsc_ccle
------ Best correlation: 0.23436980252882988 from Fold 5


In [74]:
runAllPairs("transcript_stability.csv", "Isoform_Expression")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle
------ Best correlation: 0.32932286804954286 from Fold 4
--- running LASSO for gcsi_ccle
------ Best correlation: 0.3295279702825283 from Fold 4

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc
------ Best correlation: 0.27667257024767433 from Fold 4
--- running LASSO for gcsi_gdsc
------ Best correlation: 0.27702433995214876 from Fold 4

Starting GDSC/CCLE:
--- running LM for gdsc_ccle
------ Best correlation: 0.41338472573643154 from Fold 4
--- running LASSO for gdsc_ccle
------ Best correlation: 0.41348680572774044 from Fold 4


In [59]:

runAllPairs("ciri_stability.csv", "CIRI2")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle
------ Best correlation: 0.12264864269617226 from Fold 3
--- running LASSO for gcsi_ccle
------ Best correlation: 0.1605967684221346 from Fold 4

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc
------ Best correlation: 0.38181818181818183 from Fold 4
--- running LASSO for gcsi_gdsc
------ Best correlation: 0.5757575757575757 from Fold 4

Starting GDSC/CCLE:
--- running LM for gdsc_ccle
------ Best correlation: 0.21757512627648798 from Fold 3
--- running LASSO for gdsc_ccle
------ Best correlation: 0.22424242424242422 from Fold 2


In [69]:
runAllPairs("circ_stability.csv", "CIRCexplorer2")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle
------ Best correlation: 0.180458023679647 from Fold 2
--- running LASSO for gcsi_ccle
------ Best correlation: 0.37645082467929136 from Fold 2

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc
------ Best correlation: 0.30882995527018386 from Fold 1
--- running LASSO for gcsi_gdsc
------ Best correlation: 0.305905618431955 from Fold 5

Starting GDSC/CCLE:
--- running LM for gdsc_ccle
------ Best correlation: 0.02941176470588235 from Fold 5
--- running LASSO for gdsc_ccle
------ Best correlation: 0 from Fold 0


In [70]:
runAllPairs("cfnd_stability.csv", "circRNA_finder")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle
------ Best correlation: 0.29073217434513365 from Fold 4
--- running LASSO for gcsi_ccle
------ Best correlation: 0.22212099711589933 from Fold 5

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc
------ Best correlation: 0.4159887773536933 from Fold 5
--- running LASSO for gcsi_gdsc
------ Best correlation: 0.3635352096747466 from Fold 3

Starting GDSC/CCLE:
--- running LM for gdsc_ccle
------ Best correlation: 0.38909909560510225 from Fold 3
--- running LASSO for gdsc_ccle
------ Best correlation: 0.45565367623615144 from Fold 3


In [71]:
runAllPairs("fcrc_stability.csv", "find_circ")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle
------ Best correlation: 0.2401407472854148 from Fold 4
--- running LASSO for gcsi_ccle
------ Best correlation: 0.2203166509877976 from Fold 1

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc
------ Best correlation: 0.2086409288980511 from Fold 1
--- running LASSO for gcsi_gdsc
------ Best correlation: 0.30357999614732184 from Fold 3

Starting GDSC/CCLE:
--- running LM for gdsc_ccle
------ Best correlation: 0.41048868290137236 from Fold 4
--- running LASSO for gdsc_ccle
------ Best correlation: 0.40548737334101925 from Fold 4
