### Import the necessary libraries

In [1]:
import pickle 
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
# %matplotlib inline

import os
import warnings
import itertools

from sklearn.model_selection import cross_val_score, cross_val_predict, LeaveOneOut, GridSearchCV
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

# Seed NumPy's global (legacy) random generator so NumPy-based randomness is repeatable.
np.random.seed(seed=10)

# Expose only the CUDA device with index 1 to libraries that honour this variable.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

### Get preprocessed data (241 samples)

In [2]:
# Load the preprocessed feature matrix and label vector.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load these files if they come from a trusted source.
# The context managers guarantee the file handles are closed after loading.
with open("../data/preprocessed/article-data-features.p", "rb") as features_file:
    X = pickle.load(features_file)
with open("../data/preprocessed/article-data-labels.p", "rb") as labels_file:
    y = pickle.load(labels_file)

# sanity check: show which container types were loaded
print(type(X))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


### Create function to plot Confusion Matrix without Normalization

### Create function to plot a Normalized Confusion Matrix

### Create function to plot ROC curve

### Create a function that evaluates a model using leave-one-out cross-validation (LOOCV)

In [None]:
def fit_model(model, X, y):
    """Evaluate ``model`` with leave-one-out cross-validation and print a report.

    The estimator is cross-validated twice: once to collect the out-of-fold
    class predictions and once to collect the out-of-fold ``predict_proba``
    outputs. From those the function derives the per-fold accuracies, the ROC
    curve, the AUC and the confusion matrix, and prints a text report.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier that implements
        ``predict`` and ``predict_proba``.
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        True labels. NOTE(review): column 1 of ``predict_proba`` is used as
        the score, so this presumably expects a binary target -- confirm.

    Returns
    -------
    numpy.ndarray
        Column 1 of the out-of-fold ``predict_proba`` output, one value per
        sample, to be used on the meta-learning step.
    """
    # prepare a LOOCV object (number of folds equals the number of samples)
    loocv = LeaveOneOut()

    # perform cross-validation and get the predictions and predictions probabilities
    preds = cross_val_predict(model, X, y, cv=loocv)
    predprobs = cross_val_predict(model, X, y, cv=loocv, method='predict_proba')[:, 1]

    # Each LOOCV fold holds exactly one test sample, so the accuracy of a fold
    # is 1.0 when that sample is predicted correctly and 0.0 otherwise. Deriving
    # it from the predictions avoids a third, redundant cross-validation pass.
    cv_score = (np.asarray(preds) == np.asarray(y)).astype(float)

    # calculate fpr and tpr values using the y_true and predictions probabilities
    fpr, tpr, _ = metrics.roc_curve(y, predprobs)

    # calculate the auc score based on fpr and tpr values
    auc_score = metrics.auc(fpr, tpr)

    # generate the confusion matrix for the model results
    cm = metrics.confusion_matrix(y, preds)

    # print model report
    print("\nModel Report\n")
    print(model) # print the used params for the model
    print("\nAccuracy (CV Score) : Mean - %.7g | Std - %.7g" % (np.mean(cv_score), np.std(cv_score)))
    print("\nAUC Score : %f" % auc_score)
    print("\n" + metrics.classification_report(y, preds)) # print a complete classification metrics report

    # get current model name (text before the first "(" of the estimator repr);
    # only needed by the plotting helpers below, which are currently disabled
    model_name = str(model).split('(')[0]

    # plot confusion matrix
    #plot_confusion_matrix(cm, model_name)

    # plot normalized confusion matrix
    #plot_normalized_confusion_matrix(cm, model_name)

    # plot the roc curve
    #plot_roc_curve(fpr, tpr, auc_score, model_name)

    return predprobs # return prediction probabilities to be used on meta-learning step

### Create a DecisionTreeClassifier baseline model using default parameters

In [None]:
# Decision tree baseline with default hyper-parameters (fixed seed for repeatability).
dt = DecisionTreeClassifier(random_state=10)

# perform model fitting
dt_predprobs = fit_model(dt, X, y)

# export prediction probabilities; the context manager flushes and closes the file
with open("./predictions/dt-predprobs.p", "wb") as predprobs_file:
    pickle.dump(dt_predprobs, predprobs_file)

### Create a RandomForestClassifier baseline model using default parameters

In [None]:
# Random forest baseline with default hyper-parameters (fixed seed for repeatability).
rf = RandomForestClassifier(random_state=10)

# perform model fitting
rf_predprobs = fit_model(rf, X, y)

# export prediction probabilities; the context manager flushes and closes the file
with open("./predictions/rf-predprobs.p", "wb") as predprobs_file:
    pickle.dump(rf_predprobs, predprobs_file)

### Create an SVM baseline model tuned by grid search over a reduced set of parameters

In [None]:
# Reduced hyper-parameter grid: an RBF kernel (gamma x C) and a linear kernel (C only).
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [0.01, 1, 10, 100]},
                    {'kernel': ['linear'], 'C': [0.01, 1, 10, 100]}]

# Wrap the SVC in a grid search so the parameters are re-selected inside every
# cross-validation fold run by fit_model. probability=True is required because
# fit_model calls predict_proba.
clf = GridSearchCV(SVC(probability=True, random_state=10), tuned_parameters)

# perform model fitting
svc_predprobs = fit_model(clf, X, y)

# export prediction probabilities; the context manager flushes and closes the file
with open("./predictions/svc-predprobs.p", "wb") as predprobs_file:
    pickle.dump(svc_predprobs, predprobs_file)

### Create a GBM baseline model using default parameters

In [None]:
# Gradient boosting baseline with default hyper-parameters (fixed seed for repeatability).
gbm = GradientBoostingClassifier(random_state=10)

# perform model fitting
gbm_predprobs = fit_model(gbm, X, y)

# export prediction probabilities; the context manager flushes and closes the file
with open("./predictions/gbm-predprobs.p", "wb") as predprobs_file:
    pickle.dump(gbm_predprobs, predprobs_file)

### Create an XGBoost baseline model using default parameters

In [None]:
# XGBoost baseline with default hyper-parameters.
# NOTE(review): this rebinds `xgb`, which the import cell uses as the alias of
# the `xgboost` module; the name is kept so any cell relying on `xgb` being the
# classifier keeps working, but a distinct name would be safer.
xgb = XGBClassifier()

# ignore deprecation warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# perform model fitting
xgb_predprobs = fit_model(xgb, X, y)

# export prediction probabilities; the context manager flushes and closes the file
with open("./predictions/xgb-predprobs.p", "wb") as predprobs_file:
    pickle.dump(xgb_predprobs, predprobs_file)

### Compare all generated ROC curves

In [None]:
# For every model, derive the ROC curve points and the area under that curve
# from the true labels and the model's out-of-fold prediction probabilities.

def _roc_with_auc(scores):
    """Return ``(fpr, tpr, auc)`` for ``scores`` evaluated against the labels ``y``."""
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(y, scores)
    return false_pos_rate, true_pos_rate, metrics.auc(false_pos_rate, true_pos_rate)

fpr_dt, tpr_dt, auc_dt = _roc_with_auc(dt_predprobs)
fpr_rf, tpr_rf, auc_rf = _roc_with_auc(rf_predprobs)
fpr_svc, tpr_svc, auc_svc = _roc_with_auc(svc_predprobs)
fpr_gbm, tpr_gbm, auc_gbm = _roc_with_auc(gbm_predprobs)
fpr_xgb, tpr_xgb, auc_xgb = _roc_with_auc(xgb_predprobs)

# plot all roc curves into the same image
fig, ax = plt.subplots()
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])

# dashed diagonal: reference line where TPR equals FPR
ax.plot([0, 1], [0, 1], 'k--', lw=2)

# One entry per model: (false positive rates, true positive rates, colour, legend label).
# Each curve must be drawn as TPR against FPR; plotting FPR against itself
# would collapse the curve onto the diagonal.
roc_curves = [
    (fpr_dt, tpr_dt, 'darkorange', 'DT (AUC = %f)' % auc_dt),
    (fpr_rf, tpr_rf, 'navy', 'RF (AUC = %f)' % auc_rf),
    (fpr_svc, tpr_svc, 'aqua', 'SVM (AUC = %f)' % auc_svc),
    (fpr_gbm, tpr_gbm, 'cornflowerblue', 'GBDT (AUC = %f)' % auc_gbm),
    (fpr_xgb, tpr_xgb, 'deeppink', 'XGB (AUC = %f)' % auc_xgb),
]
for curve_fpr, curve_tpr, curve_color, curve_label in roc_curves:
    ax.plot(curve_fpr, curve_tpr, color=curve_color, label=curve_label)

ax.set_xlabel('Taxa de falsos positivos') # False positive rate
ax.set_ylabel('Taxa de verdadeiros positivos') # True positive rate
ax.set_title('Curva ROC') # ROC Curve
ax.legend(loc='lower right')

# save plot as image (no extension given, so matplotlib's default format is used)
fig.savefig('./figures/roc-curves/model-comparison-roc-curves')
plt.show()