In [None]:
import numpy as np
import pandas as pd
from numpy.random import seed
from sklearn.ensemble import GradientBoostingClassifier
import random
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

class ClassifierGAM:
    """Fit one gradient-boosting classifier per drug and print test accuracy.

    Training features/labels and test features are read from the CSV files
    given to the constructor; test labels are always read from ``Y_test.csv``.
    Column ``i`` of both label tables is used for ``drug_set[i]``.
    NOTE(review): this assumes the label files list the drugs in exactly the
    order of ``drug_set`` -- confirm against the CSV headers.
    """

    def __init__(self, train_file, label_file, test_file, seeds):
        """Load all data sets.

        Args:
            train_file: CSV of training features with an ``ID`` index column.
            label_file: CSV of training labels with an ``ID`` index column.
            test_file: CSV of test features with an ``ID`` index column.
            seeds: Iterable of integer seeds; one model is fitted per drug
                and seed.
        """
        # Missing feature values are replaced by 0. The ID index is dropped
        # here, so rows are matched to ``y_train`` purely by position.
        # NOTE(review): assumes train_file and label_file share row order.
        self.x_train = pd.read_csv(train_file, index_col='ID').replace(np.nan,0).to_numpy()
        # Swap the 0/1 training labels -- presumably so they agree with the
        # test encoding below (S -> 1, R -> 0); confirm against the label file.
        self.y_train = pd.read_csv(label_file, index_col='ID').replace([0, 1], [1, 0])
        self.x_test = pd.read_csv(test_file, index_col='ID').replace(np.nan, 0)
        # Test labels: 'S' -> 1, 'R' -> 0, missing -> 2. The value 2 is a
        # sentinel; such rows are excluded per drug during evaluation.
        self.y_test = pd.read_csv('Y_test.csv', index_col='ID').replace([np.nan, 'S', 'R'], [2, 1, 0])
        self.drug_set = ['AMIKACIN', 'ETHAMBUTOL', 'ETHIONAMIDE', 'ISONIAZID', 'KANAMYCIN',\
                         'LEVOFLOXACIN', 'MOXIFLOXACIN', 'RIFAMPICIN']
        self.seeds = seeds

    @staticmethod
    def _labelled_test_split(x_test, y_test, column):
        """Return ID-aligned test features and labels for one label column.

        Rows whose label equals the missing-value sentinel ``2`` are dropped,
        as are labelled IDs that have no feature row. Both outputs are
        selected with the same ID sequence, so row ``k`` of the features
        always belongs to label ``k`` even when the two tables are stored in
        different orders. IDs are assumed to be unique within each table.

        Args:
            x_test: Feature DataFrame indexed by sample ID.
            y_test: Label DataFrame indexed by sample ID.
            column: Positional index of the label column to use.

        Returns:
            Tuple ``(features, labels)`` of NumPy arrays with equal length.
        """
        labels = y_test.iloc[:, column]
        labels = labels[labels != 2]
        shared_ids = labels.index.intersection(x_test.index)
        return x_test.loc[shared_ids].to_numpy(), labels.loc[shared_ids].to_numpy()

    def evaluate_classifier(self):
        """Print, for every drug, its name followed by one accuracy per seed."""
        for i, drug in enumerate(self.drug_set):
            print(drug)
            # The data for a drug does not depend on the seed, so slice it
            # once here instead of once per seed.
            x_test, y_test = self._labelled_test_split(self.x_test, self.y_test, i)
            y_train = self.y_train.iloc[:, i].to_numpy()
            for j in self.seeds:
                # Seed NumPy's global RNG as well as the estimator itself.
                seed(j)
                clf = GradientBoostingClassifier( loss='log_loss', learning_rate=0.1, n_estimators=50,\
                                                criterion='friedman_mse', max_depth=3, random_state=j,\
                                                verbose=0, warm_start=True)

                clf.fit(self.x_train, y_train)
                acc = clf.score(x_test, y_test)
                print(acc)
                              

def main():
    """Evaluate each feature set with one randomly drawn seed.

    A single seed is sampled once and shared by all feature sets; every
    training feature file is paired with the test feature file listed next
    to it, and all runs use the same training label file.
    """
    label_file = 'Y9_train.csv'
    # (training features, test features) for each feature set.
    file_pairs = (
        ('GAM_train.csv', 'GAM_all.csv'),
        ('WHO23_train_I.csv', 'WHO23_all_I.csv'),
        ('WHO21_train_I.csv', 'WHO21_all_I.csv'),
    )
    seeds = random.sample(range(1000000), 1)

    for train_path, test_path in file_pairs:
        ClassifierGAM(train_path, label_file, test_path, seeds).evaluate_classifier()


if __name__ == "__main__":
    main()

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn import metrics
from numpy.random import seed
import matplotlib.pyplot as plt


# Cross-validated ROC analysis: for every fold and every label column, print
# the probability threshold that maximises Youden's J (TPR - FPR) and show
# the ROC curve.

# Training features indexed by ID; missing values are replaced by 0.
x = pd.read_csv('GAM_train.csv', index_col='ID').replace(np.nan, 0).to_numpy()
# Training labels indexed by ID with the 0/1 values swapped; used below as a
# 2-D array with one column per output.
y = pd.read_csv('Y9_train.csv', index_col='ID').replace([0, 1], [1, 0]).to_numpy()

# Initialize the Gradient Boosting model and the MultiOutputClassifier
# (the wrapper fits one copy of the base model per label column).
clf = GradientBoostingClassifier(loss='log_loss', learning_rate=0.1, n_estimators=50,
                                 criterion='friedman_mse', max_depth=3,
                                 verbose=0, warm_start=True)
mlf = MultiOutputClassifier(clf)

# Initialize StratifiedKFold with 10 splits. No random_state is given, so the
# shuffled folds differ from run to run.
skf = StratifiedKFold(n_splits=10, shuffle=True)

# The stratification target is np.argmax(y, axis=1): for each row, the index
# of the first column holding that row's maximum label (NOT the first column).
for fold, (train_idx, test_idx) in enumerate(skf.split(x, np.argmax(y, axis=1))):

    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    mlf.fit(x_train, y_train)

    # One probability array per output; column 1 is used below.
    # NOTE(review): assumes every output has two classes in the training
    # fold, otherwise column 1 does not exist -- confirm.
    yhat = mlf.predict_proba(x_test)

    # Metrics for each output in the fold
    for i in range(y_test.shape[1]):
        
        fpr, tpr, thresh = metrics.roc_curve(y_test[:, i],yhat[i][:, 1])
        
        # Youden's J statistic at every candidate threshold; its maximiser
        # is reported as the optimal threshold.
        j_scores = tpr - fpr 
        optimal_idx = np.argmax(j_scores)
        optimal_threshold = thresh[optimal_idx]
        
        print(optimal_threshold)
        
        # One ROC figure is shown per output per fold.
        plt.plot(fpr, tpr)
        plt.ylabel('Sensitivity')
        plt.xlabel('1 - Specificity')
        plt.show()
