In [None]:
# Predict recurrence from the presence/absence of the 100 most commonly mutated genes (logistic regression)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV


# Hold out 40% of the samples for testing. The split is stratified on y and
# seeded so that re-running the cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    gene_matrix,
    y,
    test_size=0.4,
    random_state=0,
    stratify=y,
)
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

# Display names for the two classes, in label order
# (presumably 0 = no recurrence, 1 = recurrence -- confirm against how y is built).
target_names = ['No Recurrence', 'Recurrence']

# Candidate values of C (inverse regularization strength) for the L2 penalty.
param_grid = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]}

# Cross-validated search over C, selecting on balanced accuracy.
lr_model = LogisticRegression(penalty='l2')
grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    scoring='balanced_accuracy',
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters: ", best_params)
print("Best score: ", grid_search.best_score_)

# Evaluate the selected model on the held-out split: hard labels for the
# classification metrics, and the second predict_proba column as the ROC score.
predictions = grid_search.predict(X_test)
y_score = grid_search.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_score)

print("Predictions: ", predictions)
print("Actual: ", y_test)
print("Balanced Accuracy: ", balanced_accuracy_score(y_test, predictions))
print("ROC AUC: ", roc_auc)
print(
    classification_report(
        y_test,
        predictions,
        target_names=target_names,
    )
)

In [None]:
from sklearn import svm

# Support vector classifier on the same gene presence/absence features.
#
# Hold out 40% of the samples for testing. The split is stratified on y and
# seeded so that re-running the cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    gene_matrix, y, test_size=0.4, random_state=0, stratify=y
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Candidate values of the SVC regularization parameter C.
param_grid = {
    'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
}

# probability=True makes SVC fit an internal cross-validated calibration so
# that predict_proba is available. That calibration shuffles the data, so the
# estimator is seeded here; without random_state the probabilities (and the
# ROC AUC computed from them below) change from run to run.
svm_model = svm.SVC(probability=True, random_state=0)

# Cross-validated search over C, selecting on balanced accuracy.
grid_search = GridSearchCV(svm_model, param_grid, scoring='balanced_accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Hard label predictions from the selected model on the held-out split.
predictions = grid_search.predict(X_test)

# Display names for the two classes, in label order
# (presumably 0 = no recurrence, 1 = recurrence -- confirm against how y is built).
target_names = ['No Recurrence', 'Recurrence']

# The second predict_proba column is used as the ranking score for ROC AUC.
y_score = grid_search.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_score)

print("Predictions: ",predictions)
print("Actual: ",y_test)
print("Balanced Accuracy: ",balanced_accuracy_score(y_test, predictions))
print("ROC AUC: ",roc_auc)
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
# Neural network/Multi-layer perceptron (MLP) algorithm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Hold out 40% of the samples for testing. The split is stratified on y and
# seeded so that re-running the cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    gene_matrix,
    y,
    test_size=0.4,
    random_state=0,
    stratify=y,
)
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

# Display names for the two classes, in label order
# (presumably 0 = no recurrence, 1 = recurrence -- confirm against how y is built).
target_names = ['No Recurrence', 'Recurrence']

# Search space: a single hidden layer of varying width, crossed with the
# L2 regularization strength alpha.
param_grid = {
    'hidden_layer_sizes': [(1,), (2,), (5,), (10,), (15,)],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
}

# Seeded MLP trained with the L-BFGS solver; the grid search selects on
# balanced accuracy and uses all available cores (n_jobs=-1).
clf = MLPClassifier(solver='lbfgs', random_state=1, max_iter=2000)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters: ", best_params)
print("Best score: ", grid_search.best_score_)

# Evaluate the selected model on the held-out split: hard labels for the
# classification metrics, and the second predict_proba column as the ROC score.
predictions = grid_search.predict(X_test)
y_score = grid_search.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_score)

print("Predictions: ", predictions)
print("Actual: ", y_test)
print("Balanced Accuracy: ", balanced_accuracy_score(y_test, predictions))
print("ROC AUC: ", roc_auc)
print(
    classification_report(
        y_test,
        predictions,
        target_names=target_names,
    )
)