In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score,
    roc_auc_score, cohen_kappa_score, roc_curve, make_scorer
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import classification_report

In [2]:
X = pd.read_csv('../../data/4-modelling/1-datainput/aaindex-morgan_chembl.csv')
y = X.pop('Class')
X

Unnamed: 0,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,BIOV880101,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,3.066707,-2.170256,-1.977858,-2.333132,-2.488069,-2.677095,2.738514,2.638519,-0.334663,-2.453463,...,0,0,0,1,0,1,0,0,0,0
1,3.066707,-2.170256,-1.977858,-2.333132,-2.488069,-2.677095,2.738514,2.638519,-0.334663,-2.453463,...,0,0,0,0,0,1,1,0,0,0
2,1.383463,0.391879,-0.959167,-0.895293,-0.823573,0.666852,0.722144,-0.242320,0.448794,0.459470,...,0,0,0,0,0,0,0,0,0,0
3,1.383463,0.391879,-0.959167,-0.895293,-0.823573,0.666852,0.722144,-0.242320,0.448794,0.459470,...,0,0,0,1,0,0,0,0,0,0
4,1.383463,0.391879,-0.959167,-0.895293,-0.823573,0.666852,0.722144,-0.242320,0.448794,0.459470,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1353,-0.411820,0.698588,1.259762,1.148029,0.956758,0.614219,-0.637768,-0.880549,0.794245,0.890624,...,1,0,0,0,0,0,0,0,0,0
1354,-0.437278,-0.165105,-0.783151,-0.720472,0.166092,-1.055810,0.203921,1.306768,0.001005,-1.074539,...,0,0,0,0,0,0,0,0,0,0
1355,-0.437278,-0.165105,-0.783151,-0.720472,0.166092,-1.055810,0.203921,1.306768,0.001005,-1.074539,...,0,0,0,0,0,1,0,0,0,0
1356,-0.437278,-0.165105,-0.783151,-0.720472,0.166092,-1.055810,0.203921,1.306768,0.001005,-1.074539,...,0,0,0,0,0,0,0,0,0,0


In [3]:
y

0       1
1       1
2       1
3       1
4       1
       ..
1353    0
1354    0
1355    0
1356    0
1357    0
Name: Class, Length: 1358, dtype: int64

In [4]:
if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
    X = X.values
if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
    y = y.values

In [5]:
# Models
models = {
    "MLP": MLPClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42)
}

# Cross-validation and metrics
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metrics = [
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1-Score", f1_score),
    ("AUC", roc_auc_score),
    ("Cohen Kappa", cohen_kappa_score)
]

results = []

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    metric_scores = {metric[0]: [] for metric in metrics}
    
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

        for metric_name, metric_func in metrics:
            if metric_name == "AUC" and y_proba is not None:
                score = metric_func(y_test, y_proba)
            elif metric_name != "AUC":
                score = metric_func(y_test, y_pred)
            else:
                score = np.nan
            
            metric_scores[metric_name].append(score)

    result = {
        "Model": model_name,
        **{f"Mean {metric[0]}": np.mean(scores) for metric, scores in zip(metrics, metric_scores.values())},
        **{f"Std {metric[0]}": np.std(scores) for metric, scores in zip(metrics, metric_scores.values())}
    }
    results.append(result)

# Save results to CSV
results_df = pd.DataFrame(results)

Evaluating MLP...
Evaluating Random Forest...
Evaluating XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Evaluating Logistic Regression...


In [6]:
results_df

Unnamed: 0,Model,Mean Accuracy,Mean Recall,Mean Precision,Mean F1-Score,Mean AUC,Mean Cohen Kappa,Std Accuracy,Std Recall,Std Precision,Std F1-Score,Std AUC,Std Cohen Kappa
0,MLP,0.849023,0.855632,0.845357,0.849925,0.925523,0.698021,0.014753,0.029846,0.021877,0.014637,0.014632,0.029535
1,Random Forest,0.813683,0.842473,0.800419,0.819217,0.910608,0.627341,0.023487,0.036311,0.045833,0.018105,0.016763,0.047073
2,XGBoost,0.840211,0.843922,0.839915,0.841146,0.927601,0.680402,0.022407,0.021803,0.039399,0.019123,0.015521,0.044829
3,Logistic Regression,0.77023,0.743736,0.78909,0.764656,0.861438,0.540401,0.032552,0.022048,0.051239,0.026216,0.027712,0.065127
