In [25]:
# load libraries
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from scipy import stats

import warnings
import os

# suppress all warnings
warnings.filterwarnings('ignore')

# set seed
np.random.seed(42)

Define functions for running models

In [26]:
# function to run linear model
def runLM(X, y, path, col):

  # create dataframe to store results
  model_df = pd.DataFrame(columns=['Model', 'Fold', 'Spearman', 'Pearson'])

  # initialize the outer folds (5 folds, 80% train, 20% test)
  outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

  # initialize variables to store best model correlation and features
  best_corr = 0
  best_fold = 0
  best_feat = None

  # loop through each of the outer five folds
  fold = 1
  for train_index, test_index in outer_cv.split(X):

    # split train and test
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # initialize linear regression model
    reg = linear_model.LinearRegression()

    # fit model
    reg.fit(X_train, y_train)

    # get predicted values for test data
    y_pred = reg.predict(X_test)

    # compute correlations
    s_cor = stats.spearmanr(y_pred, y_test)
    p_cor = stats.pearsonr(y_pred, y_test)

    # save model correlation and features (if better than previous)
    if abs(s_cor[0]) > best_corr:
            best_corr = s_cor[0]
            best_fold = fold
            best_feat = reg.coef_

    # save results to dataframe
    new_row = pd.DataFrame({'Model': ['Linear'], 'Fold': [fold], 'Spearman': [s_cor[0]], 'Pearson': [p_cor[0]]})
    model_df = pd.concat([model_df, new_row],ignore_index = True)

    fold += 1

  # print best results
  print("------ Best correlation:", best_corr, "from Fold", best_fold)

  # create feature importance dataframe
  feature_importance = pd.DataFrame({
      'Peak': X_train.columns,
      'Weight': best_feat.flatten()
  }).sort_values(by='Weight', ascending=False)

  # save results to dataframe
  feature_importance.to_csv((path+"/"+col+"_lm_features.csv"), index=False)
  model_df.to_csv((path+"/"+col+"_lm.csv"), index=False)

In [27]:
# run LM and LASSO for each dataset pair
def runModels(df, pair, outdir):

  # create folder to store results
  path = os.path.join(outdir, pair)
  os.makedirs(path, exist_ok=True)

  # specify column names
  var = {"gcsi_ccle": ["gdsc_median", "gcsi_ccle_spearman"],
        "gcsi_gdsc": ["ccle_median", "gcsi_gdsc_spearman"],
        "gdsc_ccle": ["gcsi_median", "gdsc_ccle_spearman"]}

  # get needed columns and remove NA
  subset = df[[var[pair][0], var[pair][1], "gc", "n_exon", "length"]]
  subset = subset.dropna()

  # loop through each variable
  for col in [var[pair][0], "gc", "n_exon", "length"]:

    # get X and y
    X = subset[[col]]
    y = subset[[var[pair][1]]]

    # run models
    print("--- running LM for " + pair + " for " + col)
    runLM(X, y, path, col)


In [28]:
# function to run models for each dataset
def runAllPairs(df, outdir):

  # load in dataset
  df = pd.read_csv(df)

  # run models
  print("\nStarting gCSI/CCLE:")
  runModels(df, "gcsi_ccle", outdir)

  print("\nStarting gCSI/GDSC:")
  runModels(df, "gcsi_gdsc", outdir)

  print("\nStarting GDSC/CCLE:")
  runModels(df, "gdsc_ccle", outdir)

Run models for each pipeline

In [29]:
runAllPairs("gene_stability.csv", "Gene_Expression")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle for gdsc_median
------ Best correlation: -0.025234766238305843 from Fold 5
--- running LM for gcsi_ccle for gc
------ Best correlation: 0.019588151306191206 from Fold 4
--- running LM for gcsi_ccle for n_exon
------ Best correlation: 0.10586514825191538 from Fold 2
--- running LM for gcsi_ccle for length
------ Best correlation: 0.22428912363811845 from Fold 4

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc for ccle_median
------ Best correlation: 0.2066693327283207 from Fold 3
--- running LM for gcsi_gdsc for gc
------ Best correlation: 0.04452357946311545 from Fold 3
--- running LM for gcsi_gdsc for n_exon
------ Best correlation: 0.05984006421406075 from Fold 5
--- running LM for gcsi_gdsc for length
------ Best correlation: 0.18052975262476517 from Fold 4

Starting GDSC/CCLE:
--- running LM for gdsc_ccle for gcsi_median
------ Best correlation: 0.19252796506781436 from Fold 5
--- running LM for gdsc_ccle for gc
------ Best correla

In [30]:
runAllPairs("transcript_stability.csv", "Isoform_Expression")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle for gdsc_median
------ Best correlation: 0.2763604410344527 from Fold 4
--- running LM for gcsi_ccle for gc
------ Best correlation: 0.018833293317480616 from Fold 4
--- running LM for gcsi_ccle for n_exon
------ Best correlation: 0.10900265012818157 from Fold 4
--- running LM for gcsi_ccle for length
------ Best correlation: 0.29167879433346966 from Fold 4

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc for ccle_median
------ Best correlation: 0.1906499473092695 from Fold 4
--- running LM for gcsi_gdsc for gc
------ Best correlation: 0.02007242277930821 from Fold 5
--- running LM for gcsi_gdsc for n_exon
------ Best correlation: 0.07167548319371818 from Fold 4
--- running LM for gcsi_gdsc for length
------ Best correlation: 0.26587166961816205 from Fold 4

Starting GDSC/CCLE:
--- running LM for gdsc_ccle for gcsi_median
------ Best correlation: 0.37776032456621134 from Fold 1
--- running LM for gdsc_ccle for gc
------ Best correlatio

In [31]:

runAllPairs("ciri_stability.csv", "CIRI2")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle for gdsc_median
------ Best correlation: -0.2869926774029444 from Fold 5
--- running LM for gcsi_ccle for gc
------ Best correlation: 0.33784810166000834 from Fold 3
--- running LM for gcsi_ccle for n_exon
------ Best correlation: -0.09862992959790293 from Fold 5
--- running LM for gcsi_ccle for length
------ Best correlation: 0.1605967684221346 from Fold 4

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc for ccle_median
------ Best correlation: 0.6315584534755296 from Fold 3
--- running LM for gcsi_gdsc for gc
------ Best correlation: 0.5757575757575757 from Fold 4
--- running LM for gcsi_gdsc for n_exon
------ Best correlation: -0.445130226322946 from Fold 5
--- running LM for gcsi_gdsc for length
------ Best correlation: -0.3939393939393939 from Fold 5

Starting GDSC/CCLE:
--- running LM for gdsc_ccle for gcsi_median
------ Best correlation: 0.25241260113576186 from Fold 5
--- running LM for gdsc_ccle for gc
------ Best correlation:

In [32]:
runAllPairs("circ_stability.csv", "CIRCexplorer2")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle for gdsc_median
------ Best correlation: 0.4347438741806116 from Fold 5
--- running LM for gcsi_ccle for gc
------ Best correlation: 0.3115455682086025 from Fold 5
--- running LM for gcsi_ccle for n_exon
------ Best correlation: -0.22659400634440224 from Fold 5
--- running LM for gcsi_ccle for length
------ Best correlation: 0.37645082467929136 from Fold 2

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc for ccle_median
------ Best correlation: 0.29259552473639505 from Fold 3
--- running LM for gcsi_gdsc for gc
------ Best correlation: 0.5390704911994877 from Fold 3
--- running LM for gcsi_gdsc for n_exon
------ Best correlation: 0.43213801416104974 from Fold 5
--- running LM for gcsi_gdsc for length
------ Best correlation: 0.526211929024087 from Fold 3

Starting GDSC/CCLE:
--- running LM for gdsc_ccle for gcsi_median
------ Best correlation: 0.09999999999999999 from Fold 4
--- running LM for gdsc_ccle for gc
------ Best correlation: 

In [33]:
runAllPairs("cfnd_stability.csv", "circRNA_finder")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle for gdsc_median
------ Best correlation: 0.21339762225080422 from Fold 4
--- running LM for gcsi_ccle for gc
------ Best correlation: 0.20280428819675578 from Fold 4
--- running LM for gcsi_ccle for n_exon
------ Best correlation: 0.27442069090199933 from Fold 5
--- running LM for gcsi_ccle for length
------ Best correlation: 0.24035019471397015 from Fold 5

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc for ccle_median
------ Best correlation: 0.31567366592283747 from Fold 1
--- running LM for gcsi_gdsc for gc
------ Best correlation: -0.10288690400740812 from Fold 5
--- running LM for gcsi_gdsc for n_exon
------ Best correlation: 0.4841060629367038 from Fold 5
--- running LM for gcsi_gdsc for length
------ Best correlation: 0.4110541536602924 from Fold 4

Starting GDSC/CCLE:
--- running LM for gdsc_ccle for gcsi_median
------ Best correlation: 0.4346977721012687 from Fold 3
--- running LM for gdsc_ccle for gc
------ Best correlation

In [34]:
runAllPairs("fcrc_stability.csv", "find_circ")


Starting gCSI/CCLE:
--- running LM for gcsi_ccle for gdsc_median
------ Best correlation: 0.3252025715867288 from Fold 1
--- running LM for gcsi_ccle for gc
------ Best correlation: -0.08651641005013049 from Fold 5
--- running LM for gcsi_ccle for n_exon
------ Best correlation: 0.1278121893503362 from Fold 3
--- running LM for gcsi_ccle for length
------ Best correlation: 0.2203166509877976 from Fold 1

Starting gCSI/GDSC:
--- running LM for gcsi_gdsc for ccle_median
------ Best correlation: 0.1884719856594349 from Fold 2
--- running LM for gcsi_gdsc for gc
------ Best correlation: 0.188110068817058 from Fold 1
--- running LM for gcsi_gdsc for n_exon
------ Best correlation: 0.18197680814733247 from Fold 4
--- running LM for gcsi_gdsc for length
------ Best correlation: 0.30357999614732184 from Fold 3

Starting GDSC/CCLE:
--- running LM for gdsc_ccle for gcsi_median
------ Best correlation: 0.3998509335097213 from Fold 1
--- running LM for gdsc_ccle for gc
------ Best correlation: 0.