# classification and undersampling

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import accuracy_score, matthews_corrcoef, classification_report

tob_data = pd.read_csv('tob_processed.csv', sep=',')
tob_data = tob_data.drop(columns=['Unnamed: 0'])

col_data = pd.read_csv('col_processed.csv', sep=',')
col_data = col_data.drop(columns=['Unnamed: 0'])

tob_X = tob_data.drop(columns=['Tob_Pot_1','ID','MCS Cluster','Structure','Class'])
tob_y = tob_data['Class']

tob_X = tob_X.convert_dtypes()
tob_y = tob_y.convert_dtypes()

col_X = col_data.drop(columns=['Col_Pot_1','Structure','ID','Class','MCS Cluster'])
col_y = col_data['Class']

col_X = col_X.convert_dtypes()
col_y = col_y.convert_dtypes()


tob_X_train, tob_X_test, tob_y_train, tob_y_test = train_test_split(tob_X, tob_y, test_size=0.25, stratify=tob_y)

col_X_train, col_X_test, col_y_train, col_y_test = train_test_split(col_X, col_y, test_size=0.25, stratify=col_y)


rus = RandomUnderSampler(random_state=42)

tob_X_resampled, tob_y_resampled = rus.fit_resample(tob_X_train, tob_y_train)

col_X_resampled, col_y_resampled = rus.fit_resample(col_X_train, col_y_train)

# data processing for classification

In [4]:
tob_data = tob_data.drop(columns=['ID','Tob_Pot_1','RB','clogP','tPSA','Hacc','Hdon','LogSw','Mol Weight','MCS Cluster'])

In [6]:
tob_data = tob_data.convert_dtypes()
# tob_data.dtypes

In [None]:
tob_data_train, tob_data_test = train_test_split(tob_data, test_size=0.3)

tob_data_train_rus, tob_data_test_rus = rus.fit_resample(tob_data_train, tob_data_test)

In [7]:
X_smiles = tob_data['Structure']  # Column with SMILES strings
y = tob_data['Class']  # Binary target column

rusamp = RandomUnderSampler(random_state=42)

# Perform random undersampling on the target (y), which contains 1s and 0s
X_resamp, y_resamp = rusamp.fit_resample(X_smiles.values.reshape(-1, 1), y)

# After resampling, X_resampled will be the SMILES column with the undersampled target labels
df_resampled = pd.DataFrame({'Structure': X_resamp.flatten(), 'Class': y_resamp})
# df_resampled now contains the undersampled SMILES data and the corresponding target labels

In [15]:
df_resampled_train, df_resampled_test = train_test_split(df_resampled, test_size=0.25, )
df_resampled_train_strat, df_resampled_test_strat = train_test_split(df_resampled, test_size=0.25, stratify=y_resamp)

In [28]:
count=0
count1=0
count2=0
count3=0
for i in df_resampled_train['Class']:
    if i == 0:
        count+=1
    if i == 1:
        count1+=1
for j in df_resampled_train_strat['Class']:
    if j == 0:
        count2+=1
    if j == 1:
        count3+=1 
print('Number of active in random split:', count)
print('Number of inactive in random split:', count1)

print('\nNumber of active in strat split:', count2)
print('Number of inactive in strat split:', count3)

print('\nTotal compounds in train set:', 'Rand:',len(df_resampled_train))
print('Total compounds in test set:','Rand:', len(df_resampled_test))

print('\nTotal compounds in train set:', 'Strat:',len(df_resampled_train))
print('Total compounds in test set:','Strat:', len(df_resampled_test))

print('\nTotal in undersampled dataset:', len(df_resampled),'( reduced from', len(tob_data),')')

Number of active in random split: 255
Number of inactive in random split: 237

Number of active in strat split: 246
Number of inactive in strat split: 246

Total compounds in train set: Rand: 492
Total compounds in test set: Rand: 164

Total compounds in train set: Strat: 492
Total compounds in test set: Strat: 164

Total in undersampled dataset: 656 ( reduced from 5889 )


In [476]:
df_resampled_train.to_csv('./tobramycin_undersampled_train.csv')
df_resampled_test.to_csv('./tobramycin_undersampled_test.csv')

In [471]:
count=0
for x in df_resampled['Class']:
    if x == 0:
        count+=1
print(count, 'out of', len(df_resampled))


328 out of 656


In [351]:
count = 0

for x in tob_X_train:
    if x == 0:
        count+=1
print(count)
print('total:', len(tob_X_train))

0
total: 4416


In [250]:
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem

def compute_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    
    # fpgen = AllChem.GetMorganGenerator(radius=2)

    descriptors = [
        # Descriptors.CalcMolDescriptors(mol)
        Descriptors.ExactMolWt(mol),
        Descriptors.qed(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.MolLogP(mol),
        Descriptors.TPSA(mol),
        # fpgen.GetSparseCountFingerprint(mol),
    ]
    return descriptors

x = 'C[C@H]1C(=O)N[C@@H](Cc2ccc(cc2)OCC(=O)[N@]3CCC[C@@H](C3)C(=O)N[C@H](C(=O)N1)[C@@H](C)O)C(=O)NCc4cccnc4'


compute_descriptors(x)

[594.280197556, 0.29205617459964683, 4, 8, 5, -0.5733999999999935, 179.06]

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_fingerprint(smiles, radius=2, n_bits=1024):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    else:
        return None

# Apply the function to generate fingerprints
X['Fingerprint'] = X['Structure'].apply(smiles_to_fingerprint)

# Convert fingerprints to a numpy array
fingerprint_arr = np.array([np.array(fp) for fp in X['Fingerprint']])


In [423]:
from sklearn.manifold import TSNE

tsne1 = TSNE(n_components=2, random_state=42, perplexity=10)
a_tsne1 = tsne1.fit_transform(active_df)

tsne2 = TSNE(n_components=2, random_state=42, perplexity=10)
a_tsne2 = tsne2.fit_transform(inactive_df)

tsne3 = TSNE(n_components=2, random_state=42, perplexity=10)
a_tsne3 = tsne3.fit_transform(tob_X)

# tsne4 = TSNE(n_components=2, random_state=42, perplexity=100)
# a_tsne4 = tsne4.fit_transform(tob_X)

In [None]:
import seaborn as sns

plt.figure()
plt.subplots(2,2, figsize=(15,15))

plt.subplot(221)
sns.scatterplot(x=a_tsne1[:, 0], y=a_tsne1[:,1], s=10, marker='o')
plt.legend(['Perplexity: 5'], prop={'size': 10})

plt.subplot(222)
sns.scatterplot(x=a_tsne2[:, 0], y=a_tsne2[:,1], s=10, marker='o')
plt.legend(['Perplexity: 10'])

plt.subplot(223)
sns.scatterplot(x=a_tsne3[:, 0], y=a_tsne3[:,1], s=10, marker='o')
plt.legend(['Perplexity: 30'])
plt.ylabel('t-SNE Component 2', loc='center')
plt.xlabel('t-SNE Component 1', loc='center')

plt.subplot(224)
sns.scatterplot(x=a_tsne4[:, 0], y=a_tsne4[:,1], s=10, marker='o')
plt.legend(['Perplexity: 100'])

plt.suptitle('\n\nt-SNEs of Tobramycin Dataset', fontsize=25)
plt.subplots_adjust();


In [None]:
plt.figure(figsize=(10,10))

#sns.scatterplot(x=a_tsne3[:,0], y=a_tsne3[:,1], s=10, marker='o', alpha=0.5)
sns.scatterplot(x=a_tsne1[:, 0], y=a_tsne1[:,1], s=10, marker='o', color='limegreen')
sns.scatterplot(x=a_tsne2[:,0], y=a_tsne2[:,1], s=10, marker='o', color='red', alpha=0.7)

In [425]:
a_tsne1

array([[-6.27873878e+01,  1.02107105e+01],
       [-6.64829712e+01,  1.56520996e+01],
       [-6.30487823e+01,  6.28686714e+00],
       [-6.13106613e+01,  1.04721155e+01],
       [ 2.07706242e+01, -1.93809357e+01],
       [-4.35910950e+01,  1.18387423e+01],
       [ 2.32884903e+01,  8.64790440e+00],
       [ 3.96652184e+01,  2.02136350e+00],
       [ 1.70183258e+01,  4.98022318e+00],
       [ 2.68326054e+01,  1.60929470e+01],
       [ 6.65212326e+01,  2.40335965e+00],
       [ 5.70748520e+01,  2.72946239e+00],
       [ 3.28162231e+01,  1.92635117e+01],
       [ 5.97909584e+01,  5.29493189e+00],
       [ 4.72058144e+01,  1.19866590e+01],
       [ 4.11607857e+01,  1.25695210e+01],
       [ 2.75086441e+01,  1.80808086e+01],
       [ 6.03058472e+01,  6.62091064e+00],
       [ 5.93582535e+01,  3.85351324e+00],
       [ 6.06292839e+01,  4.25769043e+00],
       [ 6.03455086e+01,  2.92965961e+00],
       [ 7.07976837e+01, -2.40921187e+00],
       [ 3.35144119e+01,  1.95825577e+01],
       [ 6.

In [None]:
from sklearn.manifold import TSNE

# tsne11 = TSNE(n_components=2, random_state=42, perplexity=5)
# a_tsne11 = tsne11.fit_transform(tob_X_resampled)

# tsne21 = TSNE(n_components=2, random_state=42, perplexity=10)
# a_tsne21 = tsne21.fit_transform(tob_X_resampled)

# tsne31 = TSNE(n_components=2, random_state=42, perplexity=30)
# a_tsne31 = tsne31.fit_transform(tob_X_resampled)

# tsne41 = TSNE(n_components=2, random_state=42, perplexity=100)
# a_tsne41 = tsne41.fit_transform(tob_X_resampled)

import seaborn as sns

plt.figure(figsize=(10,10))
# plt.subplots(2,2, figsize=(15,15))

# plt.subplot(221)
# sns.scatterplot(x=a_tsne11[:, 0], y=a_tsne11[:,1], s=10, marker='o', color='red')
# plt.legend(['Perplexity: 5'], prop={'size': 10})

# plt.subplot(222)
# sns.scatterplot(x=a_tsne21[:, 0], y=a_tsne21[:,1], s=10, marker='o', color='red')
# plt.legend(['Perplexity: 10'])

# plt.subplot(223)
# sns.scatterplot(x=a_tsne31[:, 0], y=a_tsne31[:,1], s=10, marker='o', color='red')
# plt.legend(['Perplexity: 30'])
# plt.ylabel('t-SNE Component 2', loc='center')
# plt.xlabel('t-SNE Component 1', loc='center')

# plt.subplot(224)
# sns.scatterplot(x=a_tsne41[:, 0], y=a_tsne41[:,1], s=10, marker='o', color='red')
# plt.legend(['Perplexity: 100'])

# plt.subplot(221)
sns.scatterplot(x=a_tsne1[:, 0], y=a_tsne1[:,1], s=10, marker='o')

# plt.subplot(222)
# sns.scatterplot(x=a_tsne2[:, 0], y=a_tsne2[:,1], s=10, marker='o')

# plt.subplot(223)
# sns.scatterplot(x=a_tsne3[:, 0], y=a_tsne3[:,1], s=10, marker='o')

# plt.subplot(224)
# sns.scatterplot(x=a_tsne4[:, 0], y=a_tsne4[:,1], s=10, marker='o')


# plt.title('t-SNEs of Tobramycin Dataset', fontsize=12)
# plt.subplots_adjust();


# Algorithm testing

Random Forest Classifier

In [283]:
# tobramycin

from sklearn.metrics import roc_auc_score

tob_model = rfc(n_estimators=1000, max_depth=10, random_state=42)
tob_model.fit(tob_X_resampled, tob_y_resampled)

tob_y_pred = tob_model.predict(tob_X_test)
print('Tobramycin:')
print("Accuracy:", accuracy_score(tob_y_test, tob_y_pred));
print(classification_report(tob_y_test, tob_y_pred, zero_division=True))

tob_mcc = matthews_corrcoef(tob_y_test, tob_y_pred)
print('MCC:', tob_mcc)

tob_y_pred_prob = tob_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_pred_prob):.4f}')


# colistin
col_model = rfc(n_estimators=1000, max_depth=10, random_state=42)
col_model.fit(col_X_resampled, col_y_resampled)

col_y_pred = col_model.predict(col_X_test)
print('\nColistin:')
print("Accuracy:", accuracy_score(col_y_test, col_y_pred));
print(classification_report(col_y_test, col_y_pred, zero_division=True))

col_mcc = matthews_corrcoef(col_y_test, col_y_pred)
print('MCC:', col_mcc)

col_y_pred_prob = col_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_pred_prob):.4f}')

Tobramycin:
Accuracy: 0.5811269517990496
              precision    recall  f1-score   support

         0.0       0.95      0.59      0.73      1391
         1.0       0.06      0.46      0.11        82

    accuracy                           0.58      1473
   macro avg       0.51      0.53      0.42      1473
weighted avg       0.90      0.58      0.69      1473

MCC: 0.023957490118233234
ROC-AUC: 0.5413

Colistin:
Accuracy: 0.6224703419399861
              precision    recall  f1-score   support

         0.0       0.94      0.62      0.75      1294
         1.0       0.15      0.60      0.24       139

    accuracy                           0.62      1433
   macro avg       0.54      0.61      0.49      1433
weighted avg       0.86      0.62      0.70      1433

MCC: 0.13831460342152685
ROC-AUC: 0.6712


Logistic Regression

In [296]:
from sklearn.linear_model import LogisticRegression

# tobramycin

tob_LR_model = LogisticRegression(max_iter=10000)
tob_LR_model.fit(tob_X_resampled, tob_y_resampled)

tob_y_pred_LR = tob_LR_model.predict(tob_X_test)
print('Tobramycin')
print("Accuracy:", accuracy_score(tob_y_test, tob_y_pred_LR));
print(classification_report(tob_y_test, tob_y_pred_LR, zero_division=True))

tob_LR_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_LR)
print('MCC:', tob_LR_mcc)

tob_y_pred_prob_LR = tob_LR_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_pred_prob_LR):.4f}')


# colistin

col_LR_model = LogisticRegression(max_iter=10000)
col_LR_model.fit(col_X_resampled, col_y_resampled)

col_y_pred_LR = col_LR_model.predict(col_X_test)
print('\nColistin')
print('Accuracy:', accuracy_score(col_y_test, col_y_pred_LR))
print(classification_report(col_y_test, col_y_pred_LR))

col_LR_mcc = matthews_corrcoef(col_y_test, col_y_pred_LR)
print('MCC:', col_LR_mcc)

col_y_pred_prob_LR = col_LR_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_pred_prob_LR):.4f}')


Tobramycin
Accuracy: 0.5451459606245757
              precision    recall  f1-score   support

         0.0       0.94      0.55      0.70      1391
         1.0       0.05      0.39      0.09        82

    accuracy                           0.55      1473
   macro avg       0.49      0.47      0.39      1473
weighted avg       0.89      0.55      0.66      1473

MCC: -0.02560947072192967
ROC-AUC: 0.4822

Colistin
Accuracy: 0.6273551988834613
              precision    recall  f1-score   support

         0.0       0.95      0.62      0.75      1294
         1.0       0.16      0.68      0.26       139

    accuracy                           0.63      1433
   macro avg       0.56      0.65      0.51      1433
weighted avg       0.87      0.63      0.70      1433

MCC: 0.18352221140183306
ROC-AUC: 0.6969


KNeighbours Classifier

In [346]:
from sklearn.neighbors import KNeighborsClassifier

# tobramycin

tob_KNN_model = KNeighborsClassifier(n_neighbors=10)
tob_KNN_model.fit(tob_X_resampled, tob_y_resampled)

tob_y_pred_KNN = tob_KNN_model.predict(tob_X_test)
print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_KNN))
print(classification_report(tob_y_test, tob_y_pred_KNN))

tob_KNN_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_KNN)
print('MCC:', tob_KNN_mcc)

tob_y_pred_prob_KNN = tob_KNN_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_pred_prob_KNN):.4f}')

# colistin

col_KNN_model = KNeighborsClassifier(n_neighbors=10)
col_KNN_model.fit(col_X_resampled, col_y_resampled)

col_y_pred_KNN = col_KNN_model.predict(col_X_test)
print('Colistin')
print('Accuracy:', accuracy_score(col_y_test, col_y_pred_KNN))
print(classification_report(col_y_test, col_y_pred_KNN))

col_KNN_mcc = matthews_corrcoef(col_y_test, col_y_pred_KNN)
print('MCC:', col_KNN_mcc)

col_y_pred_prob_KNN = col_KNN_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_pred_prob_KNN):.4f}')


Tobramycin
Accuracy: 0.6558044806517311
              precision    recall  f1-score   support

         0.0       0.95      0.67      0.79      1391
         1.0       0.07      0.44      0.12        82

    accuracy                           0.66      1473
   macro avg       0.51      0.55      0.46      1473
weighted avg       0.90      0.66      0.75      1473

MCC: 0.052181015243465946
ROC-AUC: 0.5525
Colistin
Accuracy: 0.6280530355896721
              precision    recall  f1-score   support

         0.0       0.92      0.64      0.76      1294
         1.0       0.13      0.49      0.20       139

    accuracy                           0.63      1433
   macro avg       0.52      0.57      0.48      1433
weighted avg       0.84      0.63      0.70      1433

MCC: 0.08103001328671121
ROC-AUC: 0.5736


Support Vector Classification

In [None]:
from sklearn.svm import SVC

# tobramycin

tob_svm_model = SVC(kernel='poly')
tob_svm_model.fit(tob_X_resampled, tob_y_resampled)

tob_y_pred_svm = tob_svm_model.predict(tob_X_test)
print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_svm))
print(classification_report(tob_y_test, tob_y_pred_svm))

tob_svm_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_svm)
print('MCC:', tob_svm_mcc)

tob_y_pred_prob_svm = tob_svm_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_pred_prob_svm):.4f}')


# colistin

col_svm_model = SVC(kernel='linear')
col_svm_model.fit(col_X_resampled, col_y_resampled)

col_y_pred_svm = col_svm_model.predict(col_X_test)
print('Colistin')
print('Accuracy:', accuracy_score(col_y_test, col_y_pred_svm))
print(classification_report(col_y_test, col_y_pred_svm))

col_svm_mcc = matthews_corrcoef(col_y_test, col_y_pred_svm)
print('MCC:', col_svm_mcc)

col_y_pred_prob_svm = col_svm_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_pred_prob_svm):.4f}')


Decision Tree Classifier

In [335]:
from sklearn.tree import DecisionTreeClassifier

# tobramycin

tob_tree_model = DecisionTreeClassifier(max_depth=10000)
tob_tree_model.fit(tob_X_resampled, tob_y_resampled)

tob_y_pred_tree = tob_tree_model.predict(tob_X_test)
print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_tree))
print(classification_report(tob_y_test, tob_y_pred_tree))

tob_tree_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_tree)
print('MCC:', tob_tree_mcc)

tob_y_pred_prob_tree = tob_tree_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_pred_prob_tree):.4f}')

# colistin

col_tree_model = DecisionTreeClassifier(max_depth=10000)
col_tree_model.fit(col_X_resampled, col_y_resampled)

col_y_pred_tree = col_tree_model.predict(col_X_test)
print('Colistin')
print('Accuracy:', accuracy_score(col_y_test, col_y_pred_tree))
print(classification_report(col_y_test, col_y_pred_tree))

col_tree_mcc = matthews_corrcoef(col_y_test, col_y_pred_tree)
print('MCC:', col_tree_mcc)

col_y_pred_prob_tree = col_tree_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_pred_prob_tree):.4f}')

Tobramycin
Accuracy: 0.5621181262729125
              precision    recall  f1-score   support

         0.0       0.95      0.57      0.71      1391
         1.0       0.06      0.45      0.10        82

    accuracy                           0.56      1473
   macro avg       0.50      0.51      0.41      1473
weighted avg       0.90      0.56      0.68      1473

MCC: 0.009198312796642434
ROC-AUC: 0.5099
Colistin
Accuracy: 0.5938590369853455
              precision    recall  f1-score   support

         0.0       0.93      0.60      0.73      1294
         1.0       0.13      0.58      0.22       139

    accuracy                           0.59      1433
   macro avg       0.53      0.59      0.47      1433
weighted avg       0.85      0.59      0.68      1433

MCC: 0.10653293796752551
ROC-AUC: 0.5888


Naive Bayes

In [340]:
from sklearn.naive_bayes import GaussianNB

# tobramycin

tob_nb = GaussianNB()
tob_nb.fit(tob_X_resampled, tob_y_resampled)

tob_y_pred_nb = tob_nb.predict(tob_X_test)
print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_nb))
print(classification_report(tob_y_test, tob_y_pred_nb))

tob_nb_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_nb)
print('MCC:', tob_nb_mcc)

tob_y_pred_prob_nb = tob_nb.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_pred_prob_nb):.4f}')


col_nb = GaussianNB()
col_nb.fit(col_X_resampled, col_y_resampled)

col_y_pred_nb = col_nb.predict(col_X_test)
print('Colistin')
print("Accuracy:", accuracy_score(col_y_test, col_y_pred_nb))
print(classification_report(col_y_test, col_y_pred_nb))

col_nb_mcc = matthews_corrcoef(col_y_test, col_y_pred_nb)
print('MCC:', col_nb_mcc)

col_y_pred_prob_nb = col_nb.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_pred_prob_nb):.4f}')

Tobramycin
Accuracy: 0.5315682281059063
              precision    recall  f1-score   support

         0.0       0.94      0.54      0.68      1391
         1.0       0.05      0.45      0.10        82

    accuracy                           0.53      1473
   macro avg       0.50      0.49      0.39      1473
weighted avg       0.89      0.53      0.65      1473

MCC: -0.005736592223098953
ROC-AUC: 0.4986
Colistin
Accuracy: 0.5652477320307048
              precision    recall  f1-score   support

         0.0       0.95      0.55      0.69      1294
         1.0       0.15      0.73      0.24       139

    accuracy                           0.57      1433
   macro avg       0.55      0.64      0.47      1433
weighted avg       0.87      0.57      0.65      1433

MCC: 0.1626467514431777
ROC-AUC: 0.6753


Gradient Boosting Classifier

In [345]:
from sklearn.ensemble import GradientBoostingClassifier

# tobramycin

tob_gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=1, random_state=42)
tob_gb.fit(tob_X_resampled, tob_y_resampled)
tob_y_pred_gb = tob_gb.predict(tob_X_test)

print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_gb))
print(classification_report(tob_y_test, tob_y_pred_gb))

tob_gb_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_gb)
print('MCC', tob_gb_mcc)

tob_y_pred_prob_gb = tob_gb.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_pred_prob_gb):.4f}')

# colistin

col_gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=1, random_state=42)
col_gb.fit(col_X_resampled, col_y_resampled)
col_y_pred_gb = col_gb.predict(col_X_test)
print('Colistin')
print("Accuracy:", accuracy_score(col_y_test, col_y_pred_gb))
print(classification_report(col_y_test, col_y_pred_gb))

col_gb_mcc = matthews_corrcoef(col_y_test, col_y_pred_gb)
print('MCC:', col_gb_mcc)

col_y_pred_prob_gb = col_gb.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_pred_prob_gb):.4f}')

Tobramycin
Accuracy: 0.5655125594025797
              precision    recall  f1-score   support

         0.0       0.95      0.57      0.71      1391
         1.0       0.07      0.51      0.12        82

    accuracy                           0.57      1473
   macro avg       0.51      0.54      0.41      1473
weighted avg       0.90      0.57      0.68      1473

MCC 0.0373840716832947
ROC-AUC: 0.5232
Colistin
Accuracy: 0.6140963014654571
              precision    recall  f1-score   support

         0.0       0.94      0.61      0.74      1294
         1.0       0.15      0.63      0.24       139

    accuracy                           0.61      1433
   macro avg       0.54      0.62      0.49      1433
weighted avg       0.86      0.61      0.69      1433

MCC: 0.1436350302501236
ROC-AUC: 0.6662
