In [55]:
# load libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
from scipy import stats

In [None]:
# specify file paths
features = "peakmat_bin.csv"
response = "pac_bin.csv"

# read in files
X = pd.read_csv(features)
X = X.iloc[:, 1:]                       # remove cell line labels
y = pd.read_csv(response)['response']   # read only drug response column

In [57]:
# split the training dataframe into train and val
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Un-penalized Logistic Model Training and Testing

In [58]:
# initialize logistic regression model
log = LogisticRegression(penalty=None, solver='lbfgs', max_iter=1000)

# fit model
log.fit(X_train, y_train)

# get predicted values for test data
y_pred = log.predict(X_test)

# compute precision and recall (and F1 score)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Precision: 1.0
Recall: 0.5
F1-Score: 0.6666666666666666


LASSO Model Training and Testing

In [59]:
# initialize LASSO model
lasso = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)

# specify parameters for optimization
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
  }

# identify optimal parameters
clf = GridSearchCV(
    estimator = lasso,
    param_grid = parameters,
    #verbose=2
  )
clf.fit(X_train, y_train)
print('Best params:', clf.best_params_ )

Best params: {'C': 10}


In [60]:
# test best model parameters on test data
clf_best = clf.best_estimator_

# get predicted values for test data
y_pred = clf_best.predict(X_test)

# compute precision and recall (and F1 score)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Precision: 1.0
Recall: 0.5
F1-Score: 0.6666666666666666


Ridge Model Training and Testing

In [61]:
# initialize Ridge model
ridge = LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000)

# specify parameters for optimization
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
  }

# identify optimal parameters
clf = GridSearchCV(
    estimator = ridge,
    param_grid = parameters,
    #verbose=2
  )
clf.fit(X_train, y_train)
print('Best params:', clf.best_params_ )

Best params: {'C': 0.01}


In [62]:
# test best model parameters on test data
clf_best = clf.best_estimator_

# get predicted values for test data
y_pred = clf_best.predict(X_test)

# compute precision and recall (and F1 score)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Precision: 0.0
Recall: 0.0
F1-Score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ElasticNet Model Training and Testing

In [63]:
# initialize Elastic Net model
en = LogisticRegression(penalty='l2', solver='saga', max_iter = 1000)

# specify parameters for optimization
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
  }

# identify optimal parameters
clf = GridSearchCV(
    estimator = en,
    param_grid = parameters,
    #verbose=2
  )
clf.fit(X_train, y_train)
print('Best params:', clf.best_params_ )



Best params: {'C': 0.01}


In [64]:
# test best model parameters on test data
clf_best = clf.best_estimator_

# get predicted values for test data
y_pred = clf_best.predict(X_test)

# compute precision and recall (and F1 score)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Precision: 0.0
Recall: 0.0
F1-Score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Training and Testing

In [65]:
# initialize Random Forest model
rf = RandomForestClassifier()

# specify parameters for optimization
parameters = {
    'n_estimators': [10, 50, 100, 150, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', 50, 100, 200]
}

# identify optimal parameters
clf = GridSearchCV(
    estimator = rf,
    param_grid = parameters,
    #verbose=2
  )
clf.fit(X_train, y_train)
print('Best params:', clf.best_params_ )

Best params: {'max_depth': 10, 'max_features': 50, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}


In [66]:
# test best model parameters on test data
clf_best = clf.best_estimator_

# get predicted values for test data
y_pred = clf_best.predict(X_test)

# compute precision and recall (and F1 score)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Precision: 1.0
Recall: 0.5
F1-Score: 0.6666666666666666
