In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score, make_scorer, get_scorer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

In [None]:
# Load the merged activity table from the working directory.
AF = pd.read_csv('./06June_database_merged_part_af.csv')

# Every column except the SMILES string and the activity label is used as a feature.
X = AF.drop(columns=['Smiles', 'Activity'])
y_with_labels = AF['Activity']

# Encode the textual label as a binary target (Active -> 1, Inactive -> 0).
y = y_with_labels.map({'Active': 1, 'Inactive': 0})

# Series.map turns every label missing from the mapping (typos, different casing,
# stray whitespace, missing values) into NaN. Fail here with a clear message
# instead of letting the NaNs surface later in the stratified split or the fits.
_unmapped = y.isna()
if _unmapped.any():
    raise ValueError(
        "Unexpected 'Activity' labels (expected 'Active' or 'Inactive'): "
        f"{sorted(y_with_labels[_unmapped].astype(str).unique())}"
    )

print(y_with_labels)
print(y)


print("Shape of X:", X.shape)
print("Shape of y_with_labels:", y_with_labels.shape)
print("Shape of y:", y.shape)

In [None]:
# Hold out 20% of the rows as a test set. The split is shuffled, stratified on
# the binary target, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=45,
)

# Report the size of each partition.
for _caption, _subset in (
    ("Shape of X_tr:", X_train),
    ("Shape of y_tr:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(_caption, _subset.shape)

In [None]:
# Metrics recorded for every candidate in the grid searches below.
# The keys become the `mean_test_<key>` / `rank_test_<key>` columns of
# `cv_results_` and are what `refit=` refers to.
gscv_scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    # ROC AUC has to be computed from continuous scores. make_scorer(roc_auc_score)
    # would feed it the hard 0/1 output of predict(), which collapses the curve to
    # a single operating point. The built-in scorer uses decision_function /
    # predict_proba instead.
    'roc_auc': get_scorer('roc_auc'),
}

In [None]:
# Baseline comparison: fit ten classifiers with (mostly) default hyper-parameters
# on the training split and report their metrics on the held-out test split.
logreg = LogisticRegression(random_state=45, max_iter=1000)
radomfor = RandomForestClassifier(random_state=45)
gradboost = GradientBoostingClassifier(random_state=45)
gaussian = GaussianNB()
kneighbor = KNeighborsClassifier()
mlp = MLPClassifier(random_state=45)
dectree = DecisionTreeClassifier(random_state=45)
svc = SVC(random_state=45)
linsvc = LinearSVC(random_state=45, max_iter=100000)
sgdc = SGDClassifier(random_state=45, max_iter=100000)

# Parallel lists: models[i] is reported under model_names[i].
models = [logreg, radomfor, gradboost, gaussian, kneighbor, mlp, dectree, svc, linsvc, sgdc]
model_names = ['Logistic Regression', 'Random Forest', 'Gradient Boosting', 'Gaussian Naive Bayes', 'K-Nearest Neighbors', 'MLP', 'Decision Tree', 'SVC', 'Linear SVC', 'SGD']

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)

    # Hard class predictions drive the threshold-based metrics.
    y_pred = model.predict(X_test)

    # ROC AUC needs a continuous ranking score, not 0/1 labels. Use the
    # probability of the positive class when the estimator exposes it, and
    # fall back to the signed decision value otherwise (SVC without
    # probability=True, LinearSVC, hinge-loss SGD).
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    roc_auc = metrics.roc_auc_score(y_test, y_score)

    print(f"Metrics for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print()



In [None]:
# Random-forest hyper-parameter search: exhaustive 10-fold CV over forest size
# and maximum tree depth. All metrics in `gscv_scoring` are recorded and the
# final model is refit on the candidate with the best accuracy.
rf_param_grid = {
    'n_estimators': list(range(100, 1050, 50)),   # 100, 150, ..., 1000
    'max_depth': list(range(10, 60, 10)),         # 10, 20, ..., 50
    # Left out of the search for now: min_samples_split (2..10) and
    # min_samples_leaf (1..10).
}

rf = RandomForestClassifier(random_state=45)
grid_rf = GridSearchCV(
    rf,
    rf_param_grid,
    scoring=gscv_scoring,
    refit='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=10,
)
grid_rf.fit(X_train, y_train)

# Persist the full per-candidate CV table, then display the winning setting.
rf_gscv_results = pd.DataFrame(grid_rf.cv_results_)
rf_gscv_results.to_csv('grid_rf.cv_results_june10.csv')
grid_rf.best_params_

In [None]:
# MLP hyper-parameter search: exhaustive 10-fold CV over network shape,
# activation function and learning-rate schedule. All metrics in
# `gscv_scoring` are recorded and the final model is refit on the candidate
# with the best accuracy.
mlp_param_grid = {
    'hidden_layer_sizes': [(150, 100, 50), (50, 50), (100, 100), (100,)],
    'activation': ['tanh', 'relu'],
    'learning_rate': ['constant', 'adaptive'],
    # Left out of the search for now: alpha and batch_size ([200, 400]).
}

multilp = MLPClassifier(random_state=45)
grid_mlp = GridSearchCV(
    multilp,
    mlp_param_grid,
    scoring=gscv_scoring,
    refit='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_mlp.fit(X_train, y_train)

# Persist the full per-candidate CV table, then display the winning setting.
mlp_gscv_results = pd.DataFrame(grid_mlp.cv_results_)
mlp_gscv_results.to_csv('grid_mlp.cv_results.csv')
grid_mlp.best_params_

In [None]:
# SVC hyper-parameter search: exhaustive 10-fold CV over kernel, C and gamma.
# All metrics in `gscv_scoring` are recorded and the final model is refit on
# the candidate with the best accuracy.
svc_C_values = [0.01, 1, 10, 100]

# `gamma` is ignored by the linear kernel, so crossing it with kernel='linear'
# would fit the same model once per gamma value. Giving the linear kernel its
# own sub-grid removes those duplicate fits while searching exactly the same
# set of distinct models.
svc_param_grid = [
    {
        'kernel': ['linear'],
        'C': svc_C_values,
    },
    {
        'kernel': ['poly', 'rbf', 'sigmoid'],
        'C': svc_C_values,
        'gamma': ['scale', 0.1, 0.01, 'auto'],
    },
]

supportvc = SVC(random_state=45)
grid_svc = GridSearchCV(
    estimator=supportvc,
    param_grid=svc_param_grid,
    scoring=gscv_scoring,
    refit='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=4,
)

grid_svc.fit(X_train, y_train)

# Persist the full per-candidate CV table, then display the winning setting.
# Rows for the linear kernel have no `param_gamma` value.
svc_gscv_results = pd.DataFrame(grid_svc.cv_results_)
svc_gscv_results.to_csv('grid_svc.cv_results.csv')
grid_svc.best_params_