In [277]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
    BaggingClassifier, 
    RandomForestClassifier, 
    ExtraTreesClassifier, 
    AdaBoostClassifier,
    StackingClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)

# to do PCA 
from sklearn.decomposition import PCA

# for cross validation
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold

import sklearn.metrics as metrics
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
)

from joblib import dump
from joblib import load

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
# To tune a model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config

set_config(display='diagram')

Train data prepossessing

In [278]:
train_data = pd.read_csv('fulvestrantTrain.tsv', sep='\t')

In [279]:
train_data.head()

Unnamed: 0,Fulvestrant_response,type,mut_C1orf222,mut_CAMTA1,mut_H6PD,mut_SPEN,mut_HSPG2,mut_ARID1A,mut_ZSCAN20,mut_CSMD2,...,rna_MAFIP,rna_CD24,rna_HLA-DRB3,rna_LOC389831,rna_MGC70870,rna_LOC100233156,rna_LOC389834,rna_LOC283788,rna_RNA5-8S5,rna_LOC102723780
Breast_SQ68,1,Breast,0,0,0,0,0,0,1,0,...,1.765641,3.610263,0.0,0.144895,0.109381,0.0,0.37707,0.74383,7.430427,0.379076
Breast_EI813,0,Breast,0,0,0,0,0,0,0,0,...,0.68372,1.901263,0.0,1.973497,0.560144,0.763473,0.508316,1.166169,9.509599,1.445901
Breast_PV9,1,Breast,0,0,0,0,1,0,1,0,...,2.98508,4.12575,0.160901,1.316661,0.265797,1.167377,1.31625,1.320358,7.672418,0.230426
Breast_KY76,0,Breast,0,0,0,0,0,0,0,0,...,0.442882,0.320471,0.077141,3.301013,0.063881,0.32325,0.515944,0.27292,4.727648,0.222489
Breast_EPQV2,1,Breast,0,0,0,1,0,1,0,0,...,3.361561,7.31791,0.071225,2.624524,0.108735,1.40204,1.296005,1.550906,10.491553,2.474496


In [280]:
train = train_data.drop('Fulvestrant_response', axis = 1)

In [281]:
train.shape

(48, 15989)

In [282]:
train.columns = train.columns.str.replace('.', '-')

  train.columns = train.columns.str.replace('.', '-')


In [283]:
#create dummies for type column
train = pd.get_dummies(train, columns=['type'], drop_first=True)
train.head()

Unnamed: 0,mut_C1orf222,mut_CAMTA1,mut_H6PD,mut_SPEN,mut_HSPG2,mut_ARID1A,mut_ZSCAN20,mut_CSMD2,mut_MACF1,mut_CDCP2,...,rna_RNA5-8S5,rna_LOC102723780,type_Bowel,type_BrainCNS,type_Breast,type_Kidney,type_Lung,type_Ovary,type_Prostate,type_Skin
Breast_SQ68,0,0,0,0,0,0,1,0,0,0,...,7.430427,0.379076,0,0,1,0,0,0,0,0
Breast_EI813,0,0,0,0,0,0,0,0,0,0,...,9.509599,1.445901,0,0,1,0,0,0,0,0
Breast_PV9,0,0,0,0,1,0,1,0,0,0,...,7.672418,0.230426,0,0,1,0,0,0,0,0
Breast_KY76,0,0,0,0,0,0,0,0,0,0,...,4.727648,0.222489,0,0,1,0,0,0,0,0
Breast_EPQV2,0,0,0,1,0,1,0,0,0,0,...,10.491553,2.474496,0,0,1,0,0,0,0,0


In [284]:
train.shape

(48, 15996)

In [285]:
# zero_variance_cols_from_training_data = ['rna_MIR6728', 'rna_MIR6084', 'rna_RNVU1-20', 'rna_MIR568',
#        'rna_MIR7110', 'rna_HTN3', 'rna_GYPB', 'rna_SNORA29', 'rna_MIR7-1',
#        'rna_MIR455', 'rna_SNORA70C', 'rna_MIR5699', 'rna_SNORA19', 'rna_HBBP1',
#        'rna_MIR7851', 'rna_SNORA34', 'rna_SNORA2B', 'rna_AHSP', 'rna_MIR212',
#        'rna_MIR636', 'rna_MIR5196', 'rna_MIR330', 'rna_MIR6807',
#        'rna_SNORA71D', 'rna_P2RX6P', 'rna_SNORD83B', 'rna_SNORA11E']

In [286]:
# # Drop columns with zero variance
# train = train.drop(columns=zero_variance_cols_from_training_data)
# train.shape

In [287]:
#scale test data
scaler = MinMaxScaler() 
train = pd.DataFrame(scaler.fit_transform(train), columns=train.columns)

Preprossesing test data

In [288]:
test = pd.read_csv('testData.tsv', sep='\t')

In [289]:
test

Unnamed: 0,type,mut_C1orf222,mut_CAMTA1,mut_H6PD,mut_SPEN,mut_HSPG2,mut_ARID1A,mut_ZSCAN20,mut_CSMD2,mut_MACF1,...,rna_MAFIP,rna_CD24,rna_HLA.DRB3,rna_LOC389831,rna_MGC70870,rna_LOC100233156,rna_LOC389834,rna_LOC283788,rna_RNA5.8S5,rna_LOC102723780
BrainCNS_VFF746,BrainCNS,0,0,0,0,0,0,0,1,0,...,0.370241,2.571563,0.0,0.963339,0.133134,0.0,0.084735,0.711156,13.008779,0.448631
Bowel_KI64,Bowel,1,1,1,0,0,0,0,1,1,...,1.930737,1.18322,0.0,2.600574,0.094808,0.40677,0.603006,1.173358,11.705745,0.21826
Bowel_QEVT62,Bowel,0,0,0,0,0,0,0,0,0,...,4.210233,7.482356,0.0,3.37876,1.26441,0.3337,1.46936,0.277728,3.262672,2.91663
Lung_TXY395,Lung,0,0,0,0,0,0,0,0,0,...,0.127407,0.0,0.828134,2.794684,0.065409,0.0,0.091516,0.558419,10.159543,0.615718
Lung_JIL5,Lung,0,0,0,0,0,0,0,0,0,...,2.546692,4.055924,0.307951,2.450686,0.560719,1.406771,0.580068,1.408821,13.650738,0.712231
Blood_CB485,Blood,0,0,1,1,0,1,1,1,1,...,3.844657,0.825484,0.081115,2.106693,1.519104,2.339254,0.186431,2.521264,10.997504,2.044275
Skin_SFL7,Skin,0,0,0,0,0,0,0,1,0,...,1.75666,0.177772,0.092193,0.879886,0.142429,0.251139,0.681834,0.621057,8.113169,0.484338
Skin_VAK5,Skin,1,0,0,0,1,0,0,0,0,...,3.066494,0.0,2.905245,2.401002,0.065441,2.869348,1.210588,0.77373,11.872706,1.93981
Ovary_XT38,Ovary,0,0,0,0,0,0,0,1,0,...,1.090664,4.135198,0.0,1.534913,0.0,0.0,0.125957,0.554509,11.689054,0.512495


In [290]:
#create dummies for type column
test = pd.get_dummies(test, columns=['type'], drop_first=True)

In [291]:
test

Unnamed: 0,mut_C1orf222,mut_CAMTA1,mut_H6PD,mut_SPEN,mut_HSPG2,mut_ARID1A,mut_ZSCAN20,mut_CSMD2,mut_MACF1,mut_CDCP2,...,rna_LOC100233156,rna_LOC389834,rna_LOC283788,rna_RNA5.8S5,rna_LOC102723780,type_Bowel,type_BrainCNS,type_Lung,type_Ovary,type_Skin
BrainCNS_VFF746,0,0,0,0,0,0,0,1,0,0,...,0.0,0.084735,0.711156,13.008779,0.448631,0,1,0,0,0
Bowel_KI64,1,1,1,0,0,0,0,1,1,1,...,0.40677,0.603006,1.173358,11.705745,0.21826,1,0,0,0,0
Bowel_QEVT62,0,0,0,0,0,0,0,0,0,0,...,0.3337,1.46936,0.277728,3.262672,2.91663,1,0,0,0,0
Lung_TXY395,0,0,0,0,0,0,0,0,0,1,...,0.0,0.091516,0.558419,10.159543,0.615718,0,0,1,0,0
Lung_JIL5,0,0,0,0,0,0,0,0,0,0,...,1.406771,0.580068,1.408821,13.650738,0.712231,0,0,1,0,0
Blood_CB485,0,0,1,1,0,1,1,1,1,0,...,2.339254,0.186431,2.521264,10.997504,2.044275,0,0,0,0,0
Skin_SFL7,0,0,0,0,0,0,0,1,0,0,...,0.251139,0.681834,0.621057,8.113169,0.484338,0,0,0,0,1
Skin_VAK5,1,0,0,0,1,0,0,0,0,0,...,2.869348,1.210588,0.77373,11.872706,1.93981,0,0,0,0,1
Ovary_XT38,0,0,0,0,0,0,0,1,0,0,...,0.0,0.125957,0.554509,11.689054,0.512495,0,0,0,1,0


In [292]:
new_columns = ['type_Breast', 'type_Prostate', 'type_Kidney']

# Add new columns only if they do not exist
for new_column in new_columns:
    if new_column not in test.columns:
        test[new_column] = 0

test.head()

  test[new_column] = 0


Unnamed: 0,mut_C1orf222,mut_CAMTA1,mut_H6PD,mut_SPEN,mut_HSPG2,mut_ARID1A,mut_ZSCAN20,mut_CSMD2,mut_MACF1,mut_CDCP2,...,rna_RNA5.8S5,rna_LOC102723780,type_Bowel,type_BrainCNS,type_Lung,type_Ovary,type_Skin,type_Breast,type_Prostate,type_Kidney
BrainCNS_VFF746,0,0,0,0,0,0,0,1,0,0,...,13.008779,0.448631,0,1,0,0,0,0,0,0
Bowel_KI64,1,1,1,0,0,0,0,1,1,1,...,11.705745,0.21826,1,0,0,0,0,0,0,0
Bowel_QEVT62,0,0,0,0,0,0,0,0,0,0,...,3.262672,2.91663,1,0,0,0,0,0,0,0
Lung_TXY395,0,0,0,0,0,0,0,0,0,1,...,10.159543,0.615718,0,0,1,0,0,0,0,0
Lung_JIL5,0,0,0,0,0,0,0,0,0,0,...,13.650738,0.712231,0,0,1,0,0,0,0,0


In [293]:
test.columns = test.columns.str.replace('.', '-')

  test.columns = test.columns.str.replace('.', '-')


In [294]:
test.columns = test.columns.str.replace('rna_THRA1-BTR', 'rna_THRA1/BTR')

In [295]:
test = test[train.columns]

In [296]:
#scale test data
test = pd.DataFrame(scaler.transform(test), columns=test.columns)

In [297]:
fulv_model = load('fulv_lasso.joblib')
gefit_model = load('gefit_lasso.joblib')
mito_full_model = load('mito_full_lasso.joblib')
mito_lasso_model = load('mito_lasso.joblib')
repa_gef_model = load('repa_gef.joblib')
repa_model = load('repa_lasso.joblib')

In [298]:
train_fulv = fulv_model.predict(train)

In [299]:
train_fulv

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0], dtype=int64)

In [300]:
test_fulv = fulv_model.predict(test)
test_fulv

array([0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [301]:
test_gefit = gefit_model.predict(test)
test_gefit

array([0, 0, 0, 0, 0, 0, 1, 1, 0], dtype=int64)

In [302]:
test_mito_full = mito_full_model.predict(test)
test_mito_full

array([1, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [303]:
test_mito = mito_lasso_model.predict(test)
test_mito

array([1, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [304]:
test_repa = repa_model.predict(test)
test_repa

array([0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

In [305]:
import pandas as pd
import numpy as np

# Create a DataFrame
test_repa_gef = pd.DataFrame({'Repa': test_repa, 'Gefit': test_gefit})

# Display the resulting DataFrame
print(test_repa_gef)

   Repa  Gefit
0     0      0
1     0      0
2     0      0
3     1      0
4     0      0
5     0      0
6     0      1
7     1      1
8     0      0


In [306]:
test_repa_gef_op = repa_gef_model.predict(test_repa_gef)
test_repa_gef_op

array([0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

Model_output.tsv

In [315]:
test_model_op = pd.DataFrame({'Fulvestrant_response': test_fulv,
'Mitomycin_response': test_mito,
'Gefitinib_response': test_gefit,
'Rapamycin_response': test_repa,
'Rapamycin.Gefitinib_response': test_repa_gef_op,
'Mitomycin.Fulvestrant_response': test_mito_full,
'Fulvestrant_key_feature': 'rna_SNORA52',
'Mitomycin_key_feature': 'rna_SNORA62',
'Gefitinib_key_feature': 'mut_OR5L2',
'Rapamycin_key_feature': 'rna_SNORA74A',
'Rapamycin.Gefitinib_key_feature': 'rna_SNORA74A',
'Mitomycin.Fulvestrant_key_feature': 'rna_MARCH10'
}, 

# Create a DataFrame with custom index values
index = ['BrainCNS_VFF746',
'Bowel_KI64',
'Bowel_QEVT62',
'Lung_TXY395',
'Lung_JIL5',
'Blood_CB485',
'Skin_SFL7',
'Skin_VAK5',
'Ovary_XT38'])

# Path to save the TSV file
tsv_file_path = "model_output.tsv"

# Save the DataFrame as a TSV file
test_model_op.to_csv(tsv_file_path, sep='\t')

print(f"DataFrame has been saved as {tsv_file_path}")


DataFrame has been saved as ex_model_output2.tsv


Model_info.tsv

In [317]:
import pandas as pd

# Example data for the DataFrame
data = {
    'Fulvestrant_model': ['Lantern_ful.ipynb', 'Logistic Regression using L1 Regularization', '1.0','51', 'Coefficients of Logistic regression', 'rna_SNORA52','1', '0', '0','0', '0', '0','0', '0', '0'],
    'Mitomycin_model': ['Lantern_mito.ipynb', 'Logistic Regression using L1 Regularization', '1.0','107', 'Coefficients of Logistic regression', 'rna_SNORA62','0', '0', '0','0', '0', '0','1', '0', '0'],
    'Gefitinib_model': ['Lantern_gefit.ipynb', 'Logistic Regression using L1 Regularization', '1.0','140', 'Coefficients of Logistic regression', 'mut_OR5L2','0', '0', '0','0', '0', '0','0', '0', '0'],
    'Rapamycin_model': ['Lantern_repa.ipynb', 'Logistic Regression using L1 Regularization', '1.0','570', 'Coefficients of Logistic regression', 'rna_SNORA74A','0', '0', '0','0', '0', '0','0', '0', '0'],
    'Rapamycin.Gefitinib_model': ['Lantern_repa-gef.ipynb', 'Logistic Regression using L1 Regularization', '0.6','2', 'Meta Classifier - Coefficients of Logistic regression', 'rna_SNORA74A','0', '0', '0','0', '0', '0','0', '0', '0'],
    'Mitomycin.Fulvestrant_model': ['Lantern_mito-ful.ipynb', 'Meta Logistic Classfier', '1.0','653', 'Coefficients of Logistic regression', 'rna_MARCH10','0', '0', '1','0', '0', '0','0', '0', '0'],
}

# Create a DataFrame with custom index values
index_values = ['Filename',
'Algorithm used',
'Training accuracy',
'Number of features',
'Feature evaluation method',
'Overall key feature',
'Breast key feature',
'BrainCNS key feature',
'Bowel key feature',
'Blood key feature',
'Skin key feature',
'Lung key feature',
'Ovary key feature',
'Prostate key feature',
'Kidney key feature']
df = pd.DataFrame(data, index=index_values)

# Set headers
df.columns.name = 'General'

# Path to save the TSV file
tsv_file_path = "model_info.tsv"

# Save the DataFrame as a TSV file
df.to_csv(tsv_file_path, sep='\t')

print(f"DataFrame has been saved as {tsv_file_path}")


DataFrame has been saved as example_model_info2.tsv
