In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, auc, accuracy_score, roc_auc_score, precision_score, recall_score, make_scorer, get_scorer, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from dcekit.validation import ApplicabilityDomain

In [None]:
# Load the merged dataset from the CSV next to the notebook.
AF = pd.read_csv('./06June_database_merged_part_af.csv')

# Every column except 'Smiles' and 'Activity' is used as a model input.
# NOTE(review): assumes the remaining columns are all numeric descriptors — confirm.
X = AF.drop(columns=['Smiles', 'Activity'])
y_with_labels = AF['Activity']

# Encode the string labels as a binary target: Active -> 1, Inactive -> 0.
y = y_with_labels.map({'Active': 1, 'Inactive': 0})

# Series.map yields NaN for any value that is not a key of the dict, so a typo
# or an unexpected class in the CSV would otherwise slip through silently and
# only surface later as an obscure error in the split or the model fit.
unmapped_labels = y_with_labels[y.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Activity' column: {list(unmapped_labels)}"
    )

print(y_with_labels)
print(y)


print("Shape of X:", X.shape)
print("Shape of y_with_labels:", y_with_labels.shape)
print("Shape of y:", y.shape)

In [None]:
# Hold out 20% of the rows as a test set. The split is shuffled, seeded for
# reproducibility, and stratified on y so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
    shuffle=True,
    stratify=y,
)

# Quick sanity check of the resulting partition sizes.
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
# Random forest classifier with fixed hyperparameters; n_jobs=-1 uses all
# available cores and random_state pins the tree construction.
model_random_forest = RandomForestClassifier(
    n_estimators=550,
    max_depth=30,
    n_jobs=-1,
    random_state=25,
)

In [None]:
# RBF-kernel support vector classifier with fixed hyperparameters.
# NOTE(review): no feature scaling is applied anywhere in this notebook before
# fitting; RBF kernels are scale-sensitive — confirm the CSV columns are
# already normalized.
model_SVC = SVC(
    C=10,
    gamma=0.01,
    kernel='rbf',
    random_state=25,
)

In [None]:
# Train the random forest on the training partition only.
model_random_forest.fit(X=X_train, y=y_train)

In [None]:
# Train the support vector classifier on the training partition only.
model_SVC.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions (0 = Inactive, 1 = Active) on the held-out test set,
# one array per fitted model.
pred_random_forest = model_random_forest.predict(X=X_test)
pred_SVC = model_SVC.predict(X=X_test)

In [None]:
# ROC AUC is a ranking metric: it needs a continuous score per sample. Feeding
# it the hard 0/1 predictions collapses the ROC curve to a single operating
# point and under-reports the model. Use the predicted probability of the
# positive class instead (column 1 corresponds to label 1, since classes_ is
# sorted as [0, 1]).
proba_random_forest = model_random_forest.predict_proba(X_test)[:, 1]

# Threshold-dependent metrics use the hard predictions; roc_auc uses the scores.
metrics_random_forest = {
    'accuracy': accuracy_score(y_test, pred_random_forest),
    'precision': precision_score(y_test, pred_random_forest),
    'recall': recall_score(y_test, pred_random_forest),
    'f1': f1_score(y_test, pred_random_forest),
    'roc_auc': roc_auc_score(y_test, proba_random_forest),
}

for metric_name, metric_value in metrics_random_forest.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Confusion matrix for the random forest on the test set. Following
# scikit-learn's convention, rows are true classes and columns are predicted
# classes, in sorted label order (0 = Inactive, 1 = Active).
confusion_matrix_RF = confusion_matrix(
    y_true=y_test,
    y_pred=pred_random_forest,
)
confusion_matrix_RF

In [None]:
# ROC AUC is a ranking metric: it needs a continuous score per sample. Feeding
# it the hard 0/1 predictions collapses the ROC curve to a single operating
# point and under-reports the model. The SVC is built without probability=True,
# so predict_proba is unavailable; the signed distance to the separating
# hyperplane from decision_function is a valid ranking score (larger values
# favour the positive class).
scores_SVC = model_SVC.decision_function(X_test)

# Threshold-dependent metrics use the hard predictions; roc_auc uses the scores.
metrics_SVC = {
    'accuracy': accuracy_score(y_test, pred_SVC),
    'precision': precision_score(y_test, pred_SVC),
    'recall': recall_score(y_test, pred_SVC),
    'f1': f1_score(y_test, pred_SVC),
    'roc_auc': roc_auc_score(y_test, scores_SVC),
}

for metric_name, metric_value in metrics_SVC.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Confusion matrix for the SVC on the test set. Following scikit-learn's
# convention, rows are true classes and columns are predicted classes, in
# sorted label order (0 = Inactive, 1 = Active).
confusion_matrix_SVC = confusion_matrix(
    y_true=y_test,
    y_pred=pred_SVC,
)
confusion_matrix_SVC

In [None]:
# Separate data for the y-randomization check below. Plain assignment would
# only create a second name for the same object, so take real copies to keep
# the original X / y safe from anything done to the "_copy" variables.
X_copy = X.copy()
y_copy = y.copy()

# Independent 80/20 stratified split (different seed from the main split).
# Stratify on the labels that are actually being split.
X_train_copy, X_test_copy, y_train_copy, y_test_copy = train_test_split(
    X_copy,
    y_copy,
    test_size=0.2,
    random_state=0,
    stratify=y_copy,
)

In [None]:
# Y-randomization (y-scrambling) check for the random forest: refit the model
# on permuted training labels and record test accuracy for each permutation.
# NOTE(review): n_estimators=450 / max_depth=40 differ from the main model
# (n_estimators=550, max_depth=30) — confirm which configuration is intended.
acc_shuffle_RF = []
for seed in range(10):
    # Permute a fresh copy of the labels each round. Overwriting y_train_copy
    # itself would make every round (and every re-run of this cell) depend on
    # whatever shuffled state was left behind by previous executions.
    y_train_shuffled = y_train_copy.sample(frac=1, replace=False, random_state=seed)

    # Model building
    shuffle_RF_model = RandomForestClassifier(n_estimators=450, max_depth=40, n_jobs=-1, random_state=25)
    # Pass a plain array so it is explicit that labels are paired with the
    # feature rows by position, not by pandas index.
    shuffle_RF_model.fit(X_train_copy, y_train_shuffled.to_numpy())

    # Predict on the untouched test set
    shuffle_RF_pred = shuffle_RF_model.predict(X_test_copy)

    # Accuracy against the true (unshuffled) test labels
    acc = accuracy_score(y_test_copy, shuffle_RF_pred)
    acc_shuffle_RF.append(acc)

# Accuracy of each of the 10 label-shuffled models
acc_shuffle_RF

In [None]:
# Y-randomization (y-scrambling) check for the SVC: refit the model on permuted
# training labels and record test accuracy for each permutation.
acc_shuffle_SVC = []
for seed in range(10):
    # Permute a fresh copy of the labels each round. Overwriting y_train_copy
    # itself would make every round (and every re-run of this cell) depend on
    # whatever shuffled state was left behind by previous executions.
    y_train_shuffled = y_train_copy.sample(frac=1, replace=False, random_state=seed)

    # Model building (same hyperparameters as model_SVC)
    shuffle_SVC_model = SVC(C=10, gamma=0.01, kernel='rbf', random_state=25)
    # Pass a plain array so it is explicit that labels are paired with the
    # feature rows by position, not by pandas index.
    shuffle_SVC_model.fit(X_train_copy, y_train_shuffled.to_numpy())

    # Predict on the untouched test set
    shuffle_SVC_pred = shuffle_SVC_model.predict(X_test_copy)

    # Accuracy against the true (unshuffled) test labels
    acc = accuracy_score(y_test_copy, shuffle_SVC_pred)
    acc_shuffle_SVC.append(acc)

# Accuracy of each of the 10 label-shuffled models
acc_shuffle_SVC

In [None]:
# Persist both fitted classifiers to the current working directory.
joblib.dump(value=model_random_forest, filename='RF_model.joblib')
joblib.dump(value=model_SVC, filename='SVC_model.joblib')

Applicability Domain

In [None]:
# k-nearest-neighbour applicability domain model from dcekit, fitted on the
# same training features the classifiers were trained on.
# NOTE(review): rate_of_outliers=0.001 and n_neighbors=2 are hard-coded here —
# confirm their exact meaning and the chosen values against the dcekit docs.
ad_dcekit = ApplicabilityDomain(
    method_name='knn',
    rate_of_outliers=0.001,
    n_neighbors=2,
)
ad_dcekit.fit(X_train)

In [None]:
# Apply the fitted applicability domain model to the test features.
# NOTE(review): check the dcekit docs for the return convention (sign / meaning
# of the values) before interpreting this as an outlier flag.
outlier_dcekit = ad_dcekit.predict(X_test)

In [None]:
# Persist the fitted applicability domain model alongside the classifiers.
joblib.dump(value=ad_dcekit, filename='ad_model.joblib')