# classification and oversampling the minority

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import accuracy_score, matthews_corrcoef, classification_report, roc_auc_score

tob_data = pd.read_csv('tob_processed.csv', sep=',')
tob_data = tob_data.drop(columns=['Unnamed: 0'])

col_data = pd.read_csv('col_processed.csv', sep=',')
col_data = col_data.drop(columns=['Unnamed: 0'])

tob_X = tob_data.drop(columns=['Tob_Pot_1','ID','MCS Cluster','Structure','Class'])
tob_y = tob_data['Class']

tob_X = tob_X.convert_dtypes()
tob_y = tob_y.convert_dtypes()

col_X = col_data.drop(columns=['Col_Pot_1','Structure','ID','Class','MCS Cluster'])
col_y = col_data['Class']

col_X = col_X.convert_dtypes()
col_y = col_y.convert_dtypes()


tob_X_train, tob_X_test, tob_y_train, tob_y_test = train_test_split(tob_X, tob_y, test_size=0.25, stratify=tob_y)

col_X_train, col_X_test, col_y_train, col_y_test = train_test_split(col_X, col_y, test_size=0.25, stratify=col_y)


ros = RandomOverSampler(random_state=42)

tob_X_oversampled, tob_y_oversampled = ros.fit_resample(tob_X_train, tob_y_train)

col_X_oversampled, col_y_oversampled = ros.fit_resample(col_X_train, col_y_train)

 # data processing for classification

In [7]:
tob_data = tob_data.drop(columns=['ID','Tob_Pot_1','RB','clogP','tPSA','Hacc','Hdon','LogSw','Mol Weight','MCS Cluster'])

In [8]:
tob_data = tob_data.convert_dtypes()

In [None]:
tob_data_train, tob_data_test = train_test_split(tob_data, test_size=0.3)

##tob_data_train_rus, tob_data_test_rus = ros.fit_resample(tob_data_train, tob_data_test)

In [10]:
X_smiles = tob_data['Structure']  # Column with SMILES strings
y = tob_data['Class']  # Binary target column

rosamp = RandomOverSampler(random_state=42)

# Perform random undersampling on the target (y), which contains 1s and 0s
X_resamp, y_resamp = rosamp.fit_resample(X_smiles.values.reshape(-1, 1), y)

# After resampling, X_resampled will be the SMILES column with the undersampled target labels
df_resampled = pd.DataFrame({'Structure': X_resamp.flatten(), 'Class': y_resamp})
# df_resampled now contains the undersampled SMILES data and the corresponding target labels

In [20]:
df_resampled_train, df_resampled_test = train_test_split(df_resampled, test_size=0.25)
df_resampled_train_strat, df_resampled_test_strat = train_test_split(df_resampled, test_size=0.25, stratify=y_resamp)

# df_resampled_train.to_csv('./tobramycin_oversampled_train.csv')
# df_resampled_test.to_csv('./tobramycin_oversampled_test.csv')

In [23]:
count=0
count1=0
count2=0
count3=0
for i in df_resampled_train['Class']:
    if i == 0:
        count+=1
    if i == 1:
        count1+=1
for j in df_resampled_train_strat['Class']:
    if j == 0:
        count2+=1
    if j == 1:
        count3+=1 
print('Number of active in random split:', count)
print('Number of inactive in random split:', count1)

print('\nNumber of active in strat split:', count2)
print('Number of inactive in strat split:', count3)

print('\nTotal compounds in train set:', 'Rand:',len(df_resampled_train))
print('Total compounds in test set:','Rand:', len(df_resampled_test))

print('\nTotal compounds in train set:', 'Strat:',len(df_resampled_train))
print('Total compounds in test set:','Strat:', len(df_resampled_test))

print('\nTotal in undersampled dataset:', len(df_resampled),'( increased from', len(tob_data),')')

Number of active in random split: 4160
Number of inactive in random split: 4181

Number of active in strat split: 4171
Number of inactive in strat split: 4170

Total compounds in train set: Rand: 8341
Total compounds in test set: Rand: 2781

Total compounds in train set: Strat: 8341
Total compounds in test set: Strat: 2781

Total in undersampled dataset: 11122 ( increased from 5889 )


SMOTE Oversampling

In [19]:
from imblearn.over_sampling import SMOTE, ADASYN

adasyn = ADASYN()

X_smote_resamp, y_smote_resamp = adasyn.fit_resample(X_smiles.values.reshape(-1, 1),y)

ValueError: could not convert string to float: 'COc1c2cc(cc1)C(=O)[N@]3CC[C@@H](CNC(=O)CCCc4nn[n@](c4)CCO2)CC3'

In [11]:
count = 0 
for x in tob_y_oversampled:
    if x == 1:
        count += 1
    else:
        continue
print(count)
print('total:', len(tob_X_oversampled))

4170
total: 8340


In [14]:
from sklearn.manifold import TSNE

tsne1 = TSNE(n_components=2, random_state=42, perplexity=5)
a_tsne1 = tsne1.fit_transform(tob_X)

tsne2 = TSNE(n_components=2, random_state=42, perplexity=10)
a_tsne2 = tsne2.fit_transform(tob_X)

tsne3 = TSNE(n_components=2, random_state=42, perplexity=30)
a_tsne3 = tsne3.fit_transform(tob_X)

tsne4 = TSNE(n_components=2, random_state=42, perplexity=100)
a_tsne4 = tsne4.fit_transform(tob_X)

In [None]:
tsne11 = TSNE(n_components=2, random_state=42, perplexity=5)
a_tsne11 = tsne11.fit_transform(tob_X_oversampled)

tsne21 = TSNE(n_components=2, random_state=42, perplexity=10)
a_tsne21 = tsne21.fit_transform(tob_X_oversampled)

tsne31 = TSNE(n_components=2, random_state=42, perplexity=30)
a_tsne31 = tsne31.fit_transform(tob_X_oversampled)

tsne41 = TSNE(n_components=2, random_state=42, perplexity=100)
a_tsne41 = tsne41.fit_transform(tob_X_oversampled)

In [None]:
import seaborn as sns

plt.figure(figsize=(10,10))
# plt.subplots(2,2, figsize=(15,15))

# plt.subplot(221)
# sns.scatterplot(x=a_tsne11[:, 0], y=a_tsne11[:,1], s=10, marker='o', color='orange')
# plt.legend(['Perplexity: 5'], prop={'size': 10})

# plt.subplot(222)
# sns.scatterplot(x=a_tsne21[:, 0], y=a_tsne21[:,1], s=10, marker='o', color='orange')
# plt.legend(['Perplexity: 10'])

# plt.subplot(223)
# sns.scatterplot(x=a_tsne31[:, 0], y=a_tsne31[:,1], s=10, marker='o', color='orange')
# plt.legend(['Perplexity: 30'])
plt.ylabel('t-SNE Component 2', loc='center')
plt.xlabel('t-SNE Component 1', loc='center')

# plt.subplot(224)
# sns.scatterplot(x=a_tsne41[:, 0], y=a_tsne41[:,1], s=10, marker='o', color='orange')
# plt.legend(['Perplexity: 100'])

# plt.subplot(221)
sns.scatterplot(x=a_tsne1[:, 0], y=a_tsne1[:,1], s=10, marker='o')

# plt.subplot(222)
# sns.scatterplot(x=a_tsne2[:, 0], y=a_tsne2[:,1], s=10, marker='o')

# plt.subplot(223)
# sns.scatterplot(x=a_tsne3[:, 0], y=a_tsne3[:,1], s=10, marker='o')

# plt.subplot(224)
# sns.scatterplot(x=a_tsne4[:, 0], y=a_tsne4[:,1], s=10, marker='o')

plt.suptitle('\n\nt-SNEs of Tobramycin Dataset', fontsize=15)
# plt.subplots_adjust();


# Algorithm testing

Random Forest

In [25]:
# tobramycin

tob_model = rfc(n_estimators=1000, max_depth=10, random_state=42)
tob_model.fit(tob_X_oversampled, tob_y_oversampled)

tob_y_pred = tob_model.predict(tob_X_test)
print('Tobramycin:')
print("Accuracy:", accuracy_score(tob_y_test, tob_y_pred));
print(classification_report(tob_y_test, tob_y_pred, zero_division=True))

tob_mcc = matthews_corrcoef(tob_y_test, tob_y_pred)
print('MCC:', tob_mcc)

tob_y_prob_pred = tob_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_prob_pred):.4f}')


# colistin
col_model = rfc(n_estimators=1000, max_depth=10, random_state=42)
col_model.fit(col_X_oversampled, col_y_oversampled)

col_y_pred = col_model.predict(col_X_test)
print('\nColistin:')
print("Accuracy:", accuracy_score(col_y_test, col_y_pred));
print(classification_report(col_y_test, col_y_pred, zero_division=True))

col_mcc = matthews_corrcoef(col_y_test, col_y_pred)
print('MCC:', col_mcc)

col_y_prob_pred = col_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_prob_pred):.4f}')


Tobramycin:
Accuracy: 0.8968092328581126
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.95      1391
         1.0       0.10      0.11      0.11        82

    accuracy                           0.90      1473
   macro avg       0.52      0.53      0.53      1473
weighted avg       0.90      0.90      0.90      1473

MCC: 0.051235634842755416
ROC-AUC: 0.6231

Colistin:
Accuracy: 0.8129797627355199
              precision    recall  f1-score   support

         0.0       0.92      0.86      0.89      1294
         1.0       0.21      0.34      0.26       139

    accuracy                           0.81      1433
   macro avg       0.57      0.60      0.58      1433
weighted avg       0.85      0.81      0.83      1433

MCC: 0.16501846686533986
ROC-AUC: 0.6917


Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

# tobramycin

tob_LR_model = LogisticRegression(max_iter=10000)
tob_LR_model.fit(tob_X_oversampled, tob_y_oversampled)

tob_y_pred_LR = tob_LR_model.predict(tob_X_test)
print('Tobramycin')
print("Accuracy:", accuracy_score(tob_y_test, tob_y_pred_LR));
print(classification_report(tob_y_test, tob_y_pred_LR, zero_division=True))

tob_LR_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_LR)
print('MCC:', tob_LR_mcc)

tob_y_prob_pred_LR = tob_LR_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_prob_pred_LR):.4f}')


# colistin

col_LR_model = LogisticRegression(max_iter=10000)
col_LR_model.fit(col_X_oversampled, col_y_oversampled)

col_y_pred_LR = col_LR_model.predict(col_X_test)
print('\nColistin')
print('Accuracy:', accuracy_score(col_y_test, col_y_pred_LR))
print(classification_report(col_y_test, col_y_pred_LR))

col_LR_mcc = matthews_corrcoef(col_y_test, col_y_pred_LR)
print('MCC:', col_LR_mcc)

col_y_prob_pred_LR = col_LR_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_prob_pred_LR):.4f}')


Tobramycin
Accuracy: 0.5261371350984385
              precision    recall  f1-score   support

         0.0       0.96      0.52      0.67      1391
         1.0       0.07      0.63      0.13        82

    accuracy                           0.53      1473
   macro avg       0.52      0.58      0.40      1473
weighted avg       0.91      0.53      0.64      1473

MCC: 0.07059782878631787
ROC-AUC: 0.6126

Colistin
Accuracy: 0.6643405443126309
              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78      1294
         1.0       0.17      0.66      0.28       139

    accuracy                           0.66      1433
   macro avg       0.56      0.66      0.53      1433
weighted avg       0.87      0.66      0.73      1433

MCC: 0.20046095028631017
ROC-AUC: 0.7151


KNeighbours Classifier

In [36]:
from sklearn.neighbors import KNeighborsClassifier

# tobramycin

tob_KNN_model = KNeighborsClassifier(n_neighbors=10)
tob_KNN_model.fit(tob_X_oversampled, tob_y_oversampled)

tob_y_pred_KNN = tob_KNN_model.predict(tob_X_test)
print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_KNN))
print(classification_report(tob_y_test, tob_y_pred_KNN))

tob_KNN_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_KNN)
print('MCC:', tob_KNN_mcc)

tob_y_prob_pred_KNN = tob_KNN_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_prob_pred_KNN):.4f}')


# colistin

col_KNN_model = KNeighborsClassifier(n_neighbors=10)
col_KNN_model.fit(col_X_oversampled, col_y_oversampled)

col_y_pred_KNN = col_KNN_model.predict(col_X_test)
print('Colistin')
print('Accuracy:', accuracy_score(col_y_test, col_y_pred_KNN))
print(classification_report(col_y_test, col_y_pred_KNN))

col_KNN_mcc = matthews_corrcoef(col_y_test, col_y_pred_KNN)
print('MCC:', col_KNN_mcc)

col_y_prob_pred_KNN = col_KNN_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_prob_pred_KNN):.4f}')


Tobramycin
Accuracy: 0.7257298031228785
              precision    recall  f1-score   support

         0.0       0.94      0.76      0.84      1391
         1.0       0.05      0.22      0.08        82

    accuracy                           0.73      1473
   macro avg       0.50      0.49      0.46      1473
weighted avg       0.89      0.73      0.80      1473

MCC: -0.013319092509819629
ROC-AUC: 0.5206
Colistin
Accuracy: 0.6727145847871598
              precision    recall  f1-score   support

         0.0       0.92      0.69      0.79      1294
         1.0       0.14      0.47      0.22       139

    accuracy                           0.67      1433
   macro avg       0.53      0.58      0.51      1433
weighted avg       0.85      0.67      0.74      1433

MCC: 0.10293127119208735
ROC-AUC: 0.5985


Support Vector Classification

In [12]:
from sklearn.svm import SVC

# # tobramycin

# tob_svm_model = SVC(kernel='sigmoid')
# tob_svm_model.fit(tob_X_oversampled, tob_y_oversampled)

# tob_y_pred_svm = tob_svm_model.predict(tob_X_test)
# print('Tobramycin')
# print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_svm))
# print(classification_report(tob_y_test, tob_y_pred_svm))

# tob_svm_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_svm)
# print('MCC:', tob_svm_mcc)

# colistin

col_svm_model = SVC(kernel='sigmoid')
col_svm_model.fit(col_X_oversampled, col_y_oversampled)

col_y_pred_svm = col_svm_model.predict(col_X_test)
print('Colistin')
print('Accuracy:', accuracy_score(col_y_test, col_y_pred_svm))
print(classification_report(col_y_test, col_y_pred_svm))

col_svm_mcc = matthews_corrcoef(col_y_test, col_y_pred_svm)
print('MCC:', col_svm_mcc)

Colistin
Accuracy: 0.49406838799720865
              precision    recall  f1-score   support

         0.0       0.91      0.49      0.63      1294
         1.0       0.11      0.56      0.18       139

    accuracy                           0.49      1433
   macro avg       0.51      0.52      0.41      1433
weighted avg       0.83      0.49      0.59      1433

MCC: 0.028437873567189846


Decision Tree Classifier

In [37]:
from sklearn.tree import DecisionTreeClassifier

# tobramycin

tob_tree_model = DecisionTreeClassifier(max_depth=100)
tob_tree_model.fit(tob_X_oversampled, tob_y_oversampled)

tob_y_pred_tree = tob_tree_model.predict(tob_X_test)
print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_tree))
print(classification_report(tob_y_test, tob_y_pred_tree))

tob_tree_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_tree)
print('MCC:', tob_tree_mcc)

tob_y_prob_pred_tree = tob_tree_model.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_prob_pred_tree):.4f}')

# colistin

col_tree_model = DecisionTreeClassifier(max_depth=100)
col_tree_model.fit(col_X_oversampled, col_y_oversampled)

col_y_pred_tree = col_tree_model.predict(col_X_test)
print('Colistin')
print('Accuracy:', accuracy_score(col_y_test, col_y_pred_tree))
print(classification_report(col_y_test, col_y_pred_tree))

col_tree_mcc = matthews_corrcoef(col_y_test, col_y_pred_tree)
print('MCC:', col_tree_mcc)

col_y_prob_pred_tree = col_tree_model.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_prob_pred_tree):.4f}')


Tobramycin
Accuracy: 0.8995247793618466
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.95      1391
         1.0       0.12      0.13      0.13        82

    accuracy                           0.90      1473
   macro avg       0.54      0.54      0.54      1473
weighted avg       0.90      0.90      0.90      1473

MCC: 0.07622159416186773
ROC-AUC: 0.5394
Colistin
Accuracy: 0.845778087927425
              precision    recall  f1-score   support

         0.0       0.91      0.92      0.91      1294
         1.0       0.18      0.17      0.18       139

    accuracy                           0.85      1433
   macro avg       0.55      0.55      0.55      1433
weighted avg       0.84      0.85      0.84      1433

MCC: 0.09350933128226971
ROC-AUC: 0.5456


Naive Bayes

In [38]:
from sklearn.naive_bayes import GaussianNB

# tobramycin

tob_nb = GaussianNB()
tob_nb.fit(tob_X_oversampled, tob_y_oversampled)

tob_y_pred_nb = tob_nb.predict(tob_X_test)
print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_nb))
print(classification_report(tob_y_test, tob_y_pred_nb))

tob_nb_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_nb)
print('MCC:', tob_nb_mcc)

tob_y_prob_pred_nb = tob_nb.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_prob_pred_nb):.4f}')

# colistin

col_nb = GaussianNB()
col_nb.fit(col_X_oversampled, col_y_oversampled)

col_y_pred_nb = col_nb.predict(col_X_test)
print('Colistin')
print("Accuracy:", accuracy_score(col_y_test, col_y_pred_nb))
print(classification_report(col_y_test, col_y_pred_nb))

col_nb_mcc = matthews_corrcoef(col_y_test, col_y_pred_nb)
print('MCC:', col_nb_mcc)

col_y_prob_pred_nb = col_nb.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_prob_pred_nb):.4f}')


Tobramycin
Accuracy: 0.6401900882552614
              precision    recall  f1-score   support

         0.0       0.96      0.65      0.77      1391
         1.0       0.08      0.51      0.14        82

    accuracy                           0.64      1473
   macro avg       0.52      0.58      0.45      1473
weighted avg       0.91      0.64      0.74      1473

MCC: 0.07633985528604799
ROC-AUC: 0.6194
Colistin
Accuracy: 0.6008374040474529
              precision    recall  f1-score   support

         0.0       0.95      0.59      0.73      1294
         1.0       0.15      0.68      0.25       139

    accuracy                           0.60      1433
   macro avg       0.55      0.64      0.49      1433
weighted avg       0.87      0.60      0.68      1433

MCC: 0.16442861464109568
ROC-AUC: 0.6772


Gradient Boosting Classifier

In [39]:
from sklearn.ensemble import GradientBoostingClassifier

# tobramycin

tob_gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=1, random_state=42)
tob_gb.fit(tob_X_oversampled, tob_y_oversampled)
tob_y_pred_gb = tob_gb.predict(tob_X_test)

print('Tobramycin')
print('Accuracy:', accuracy_score(tob_y_test, tob_y_pred_gb))
print(classification_report(tob_y_test, tob_y_pred_gb))

tob_gb_mcc = matthews_corrcoef(tob_y_test, tob_y_pred_gb)
print('MCC', tob_gb_mcc)

tob_y_prob_pred_gb = tob_gb.predict_proba(tob_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(tob_y_test, tob_y_prob_pred_gb):.4f}')


# colistin

col_gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=1, random_state=42)
col_gb.fit(col_X_oversampled, col_y_oversampled)
col_y_pred_gb = col_gb.predict(col_X_test)
print('Colistin')
print("Accuracy:", accuracy_score(col_y_test, col_y_pred_gb))
print(classification_report(col_y_test, col_y_pred_gb))

col_gb_mcc = matthews_corrcoef(col_y_test, col_y_pred_gb)
print('MCC:', col_gb_mcc)

col_y_prob_pred_gb = col_gb.predict_proba(col_X_test)[:,1] # probability for class 1 (active)
print(f'ROC-AUC: {roc_auc_score(col_y_test, col_y_prob_pred):.4f}')


Tobramycin
Accuracy: 0.7542430414120842
              precision    recall  f1-score   support

         0.0       0.95      0.78      0.86      1391
         1.0       0.09      0.38      0.15        82

    accuracy                           0.75      1473
   macro avg       0.52      0.58      0.50      1473
weighted avg       0.91      0.75      0.82      1473

MCC 0.08388154264756088
ROC-AUC: 0.6224
Colistin
Accuracy: 0.7292393579902303
              precision    recall  f1-score   support

         0.0       0.93      0.76      0.83      1294
         1.0       0.17      0.47      0.25       139

    accuracy                           0.73      1433
   macro avg       0.55      0.61      0.54      1433
weighted avg       0.86      0.73      0.78      1433

MCC: 0.15095738473786507
ROC-AUC: 0.6917
