In [100]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
    BaggingClassifier, 
    RandomForestClassifier, 
    ExtraTreesClassifier, 
    AdaBoostClassifier,
    StackingClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)

# to do PCA 
from sklearn.decomposition import PCA

# for cross validation
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold

import sklearn.metrics as metrics
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
)

from joblib import dump
from joblib import load


from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
# To tune a model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config

# Ask scikit-learn to display estimators as diagrams instead of plain text.
set_config(display='diagram')

In [101]:
# Read the tab-separated training table into a DataFrame.
TRAIN_TSV = 'rapamycinGefitinibTrain.tsv'
df = pd.read_csv(TRAIN_TSV, sep='\t')

In [102]:
# Preview the first five rows of the raw table.
df.head()

Unnamed: 0,Rapamycin-Gefitinib_response,type,mut_C1orf222,mut_CAMTA1,mut_H6PD,mut_SPEN,mut_HSPG2,mut_ARID1A,mut_ZSCAN20,mut_CSMD2,...,rna_MAFIP,rna_CD24,rna_HLA-DRB3,rna_LOC389831,rna_MGC70870,rna_LOC100233156,rna_LOC389834,rna_LOC283788,rna_RNA5-8S5,rna_LOC102723780
Breast_SQ68,1,Breast,0,0,0,0,0,0,1,0,...,1.765641,3.610263,0.0,0.144895,0.109381,0.0,0.37707,0.74383,7.430427,0.379076
Breast_EI813,0,Breast,0,0,0,0,0,0,0,0,...,0.68372,1.901263,0.0,1.973497,0.560144,0.763473,0.508316,1.166169,9.509599,1.445901
Breast_PV9,0,Breast,0,0,0,0,1,0,1,0,...,2.98508,4.12575,0.160901,1.316661,0.265797,1.167377,1.31625,1.320358,7.672418,0.230426
Breast_KY76,0,Breast,0,0,0,0,0,0,0,0,...,0.442882,0.320471,0.077141,3.301013,0.063881,0.32325,0.515944,0.27292,4.727648,0.222489
Breast_EPQV2,1,Breast,0,0,0,1,0,1,0,0,...,3.361561,7.31791,0.071225,2.624524,0.108735,1.40204,1.296005,1.550906,10.491553,2.474496


In [103]:
# Replace literal dots in the column names with hyphens.
# regex=False pins literal matching explicitly: older pandas releases default
# to regex=True and emit a FutureWarning for a single-character pattern such
# as '.', which as a regular expression would mean "any character".
df.columns = df.columns.str.replace('.', '-', regex=False)

  df.columns = df.columns.str.replace('.', '-')


In [104]:
# One-hot encode the categorical `type` column; drop_first=True leaves out the
# indicator for the first category level.
# NOTE(review): this overwrites `df` and consumes the `type` column, so running
# the cell a second time without reloading the data fails because `type` is gone.
df = pd.get_dummies(df, columns=['type'], drop_first=True)
df.head()

Unnamed: 0,Rapamycin-Gefitinib_response,mut_C1orf222,mut_CAMTA1,mut_H6PD,mut_SPEN,mut_HSPG2,mut_ARID1A,mut_ZSCAN20,mut_CSMD2,mut_MACF1,...,rna_RNA5-8S5,rna_LOC102723780,type_Bowel,type_BrainCNS,type_Breast,type_Kidney,type_Lung,type_Ovary,type_Prostate,type_Skin
Breast_SQ68,1,0,0,0,0,0,0,1,0,0,...,7.430427,0.379076,0,0,1,0,0,0,0,0
Breast_EI813,0,0,0,0,0,0,0,0,0,0,...,9.509599,1.445901,0,0,1,0,0,0,0,0
Breast_PV9,0,0,0,0,0,1,0,1,0,0,...,7.672418,0.230426,0,0,1,0,0,0,0,0
Breast_KY76,0,0,0,0,0,0,0,0,0,0,...,4.727648,0.222489,0,0,1,0,0,0,0,0
Breast_EPQV2,1,0,0,0,1,0,1,0,0,0,...,10.491553,2.474496,0,0,1,0,0,0,0,0


In [105]:
# variances = df.var()

# # Get column indices where variance is zero
# zero_variance_cols = variances[variances == 0].index

In [106]:
# # Drop columns with zero variance
# df = df.drop(columns=zero_variance_cols)
# df.shape

# SPLITTING 

In [107]:
# Separate the response column (label) from every other column (features).
TARGET_COLUMN = 'Rapamycin-Gefitinib_response'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

In [108]:
# Class balance of the label across the whole table.
y.value_counts()

0    26
1    22
Name: Rapamycin-Gefitinib_response, dtype: int64

In [109]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class mix
# similar in both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
    stratify=y,
)
print("Number of rows in train data =", len(X_train))
print("Number of rows in test data =", len(X_test))

Number of rows in train data = 38
Number of rows in test data = 10


In [110]:
# Class balance of the held-out labels.
y_test.value_counts()

0    5
1    5
Name: Rapamycin-Gefitinib_response, dtype: int64

# SCALING

In [111]:
scaler = MinMaxScaler()

# Learn each column's min/max from the training rows only, then apply that
# same mapping to the test rows so no test information leaks into the scaling.
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames with the original column names.
# NOTE(review): no index is passed, so both frames get a fresh 0..n-1 index and
# no longer share row labels with y_train / y_test; anything that aligns on the
# index would need `index=` set here.
X_train = pd.DataFrame(train_scaled, columns=X.columns)
X_test = pd.DataFrame(test_scaled, columns=X.columns)

# MODELLING PART

In [112]:
# Load two previously fitted models from disk.
# NOTE(review): joblib.load unpickles arbitrary objects, so these files must
# come from a trusted source. Presumably one model per drug (rapamycin and
# gefitinib), judging by the file names — confirm. The inputs passed to them
# below are scaled by a scaler fitted in this notebook; verify that this matches
# how the models were originally trained.
repa_model = load('repa_lasso.joblib')
gefit_model = load('gefit_lasso.joblib')

In [113]:
# First-stage predictions of each loaded model on the scaled training rows.
repa = repa_model.predict(X_train)
gefit = gefit_model.predict(X_train)

In [114]:
# First-stage predictions of each loaded model on the scaled test rows.
repa_test = repa_model.predict(X_test)
gefit_test = gefit_model.predict(X_test)

In [115]:
# Two-column training frame for the second-stage model: one column per
# first-stage model's predictions, rows in the same order as X_train.
induvidual = pd.DataFrame({'Repa': repa, 'Gefit': gefit})
induvidual.head()

Unnamed: 0,Repa,Gefit
0,0,0
1,0,1
2,1,0
3,0,0
4,0,0


In [116]:
# Same two-column layout built from the test-set predictions, rows in the
# same order as X_test.
induvidual_test = pd.DataFrame({'Repa': repa_test, 'Gefit': gefit_test})
induvidual_test.head()

Unnamed: 0,Repa,Gefit
0,0,0
1,1,0
2,1,1
3,1,1
4,0,1


In [117]:
# Second-stage classifier: logistic regression fitted on the two first-stage
# prediction columns against the training labels.
# NOTE(review): despite the variable name, only random_state is passed, so
# scikit-learn's default regularisation applies (L2 per its documented
# defaults), not an L1/lasso penalty. Set the penalty explicitly or rename
# if lasso was intended.
lasso_logreg = LogisticRegression(random_state = 4)
lasso_logreg.fit(induvidual, y_train)

# Persist the fitted second-stage model; dump returns the list of written
# file names, which the notebook displays as the cell output.
dump(lasso_logreg, 'repa_gef.joblib')

['repa_gef.joblib']

In [118]:
# Score the second-stage model on the rows it was fitted on.
y_pred = lasso_logreg.predict(induvidual)

# Fraction of training labels reproduced by the predictions.
accuracy = accuracy_score(y_true=y_train, y_pred=y_pred)
print("Train Accuracy:", accuracy)

Train Accuracy: 0.631578947368421


In [119]:
# Score the second-stage model on the held-out rows.
# (y_pred and accuracy are reused from the training cell and overwritten here.)
y_pred = lasso_logreg.predict(induvidual_test)

# Fraction of held-out labels matched by the predictions.
# NOTE(review): the recorded run printed 0.3 on 10 held-out rows split 5/5
# between the classes, i.e. below the 0.5 expected from guessing. Worth
# investigating before relying on this stacked model.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.3


In [120]:
# Show the held-out labels with their row identifiers.
y_test

BrainCNS_QMQT2    0
Breast_PV9        0
Ovary_PM7         1
Lung_ND782        1
Kidney_SZ892      0
Skin_GMZV17       0
Lung_MN34         1
Bowel_KRW49       0
Kidney_SA2        1
BrainCNS_NZ6      1
Name: Rapamycin-Gefitinib_response, dtype: int64

In [121]:
# Column 1 of predict_proba: the probability assigned to the second entry of
# lasso_logreg.classes_ (label 1 for this 0/1 target) for each test row.
pred_test = lasso_logreg.predict_proba(induvidual_test)[:,1]
pred_test

array([0.38939138, 0.50947976, 0.48612077, 0.48612077, 0.36741421,
       0.50947976, 0.38939138, 0.38939138, 0.38939138, 0.38939138])

In [122]:
# Pull the fitted weights and bias term off the second-stage model and show them.
coefficients, intercept = lasso_logreg.coef_, lasso_logreg.intercept_

print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [[ 0.48779485 -0.09345478]]
Intercept: [-0.44987124]


In [123]:
# Collect every non-zero coefficient (positive or negative) together with its
# column position, then list them from largest to smallest magnitude.
coefficients = lasso_logreg.coef_[0]  # the single coefficient row of the fitted binary model

# (position, value) pairs for coefficients that are not exactly zero.
non_zero_coefficients = [pair for pair in enumerate(coefficients) if pair[1] != 0]

# Order by absolute value, biggest first.
sorted_coefficients = sorted(non_zero_coefficients, key=lambda pair: abs(pair[1]), reverse=True)

print("Coefficients:")
for index, coef in sorted_coefficients:
    print(f"{induvidual.columns[index]}: {coef}")

Coefficients:
Repa: 0.4877948460036033
Gefit: -0.09345477578878315


In [124]:
# Names of the input columns whose coefficient is non-zero, in column order.
columns = [induvidual.columns[position] for position, _ in non_zero_coefficients]

columns

['Repa', 'Gefit']

In [125]:
# Number of input columns that kept a non-zero coefficient.
len(columns)

2