# Benchmarking
This notebook trains classical ML baselines (KNN, LR, RF, GBT, MLP) and compares their performance with that of the GT model.

## Loading data

In [None]:
from utils import *  # kept first so the explicit imports below still win over same-named utils exports
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Fix the seed, load the data object H and build the three loaders.
# NOTE(review): set_seed, load_trans and mask_and_batch_trans are not defined in this
# notebook (presumably they come from `utils`); the (train, test, val) order is taken
# from this unpacking only — confirm against the helper.
set_seed(222)
H = load_trans()
train_loader, test_loader, val_loader = mask_and_batch_trans(H)

In [None]:
# Extract the IDs returned by get_unmasked_node_ids for each loader; they are used
# below as patient IDs to select the matching rows of the tabular data.
# NOTE(review): get_unmasked_node_ids is defined outside this notebook — confirm that
# it returns the patient IDs of the target (unmasked) nodes of each split.
train_patient_ids = get_unmasked_node_ids(train_loader)
test_patient_ids = get_unmasked_node_ids(test_loader)
val_patient_ids = get_unmasked_node_ids(val_loader)
# Last expression of the cell: displays the three split sizes.
len(train_patient_ids), len(test_patient_ids), len(val_patient_ids)

In [None]:
# Leakage check: no patient ID may appear in more than one split.
train_set, test_set, val_set = (
    set(train_patient_ids),
    set(test_patient_ids),
    set(val_patient_ids),
)

# Pairwise intersections; each one must be empty.
train_test_overlap = train_set & test_set
train_val_overlap = train_set & val_set
test_val_overlap = test_set & val_set

assert len(train_test_overlap) == 0, "There is an overlap between train and test patient IDs!"
assert len(train_val_overlap) == 0, "There is an overlap between train and validation patient IDs!"
assert len(test_val_overlap) == 0, "There is an overlap between test and validation patient IDs!"

print("No overlap between train, test, and validation patient IDs.")

In [None]:
# Read the patient table and drop the 'Unnamed: 0' column
# (presumably an index column saved with the CSV — verify against the export).
patient_data = pd.read_csv(r"/PATIENT_DATA.csv").drop(columns=['Unnamed: 0'])

In [None]:
# Select the rows of the patient table that belong to each split, matching on
# 'patient_id' against the ID sets built from the loaders.
train_df = patient_data[patient_data['patient_id'].isin(train_set)]
test_df = patient_data[patient_data['patient_id'].isin(test_set)]
val_df = patient_data[patient_data['patient_id'].isin(val_set)]

In [None]:
# The identifier was only needed for the split; remove it so it is not used as a feature.
train_df = train_df.drop(columns=['patient_id'])
test_df = test_df.drop(columns=['patient_id'])
val_df = val_df.drop(columns=['patient_id'])

## Splitting data

In [None]:
# split labels for each set
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']

X_val = val_df.drop('label', axis=1)
y_val = val_df['label']

X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

In [None]:
# Display the shapes of all feature/label objects as a quick consistency check.
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

## Search/plot metrics
Reference for grid search on a fixed validation split (without cross-validation): https://stackoverflow.com/questions/34624978/is-there-easy-way-to-grid-search-without-cross-validation-in-python

In [None]:
def grid_search(model, param_grid, X_train, y_train, X_val, y_val, X_test, y_test):
    """Hold-out grid search (no cross-validation), selected by validation F1.

    Every parameter combination in ``param_grid`` is fitted on the training
    split and scored with ``f1_score`` on the validation split. The candidate
    with the highest validation F1 (the first one on ties) is kept, evaluated
    on the test split and returned.

    Each candidate is fitted on a fresh ``clone`` of ``model``. This matters
    for two reasons:

    * when ``param_grid`` is a list of grids with different keys, a parameter
      set by one candidate (e.g. ``l1_ratio``) can no longer leak into later
      candidates or into the returned model;
    * the returned estimator is the very object that obtained the reported
      validation score, instead of a refit that may differ for estimators
      with random initialisation.

    Args:
        model: Unfitted scikit-learn estimator used as a template. It is not
            modified or fitted by this function.
        param_grid: Dict, or list of dicts, accepted by ``ParameterGrid``.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels used for model selection.
        X_test, y_test: Test features and labels; only used to report the
            final F1 score of the selected model.

    Returns:
        The fitted estimator with the best validation F1 score.

    Raises:
        ValueError: If ``param_grid`` yields no candidates.
    """
    param_candidates = ParameterGrid(param_grid)
    n_candidates = len(param_candidates)
    print(f'{n_candidates} candidates')

    best_model, best_params, best_score = None, None, None
    for i, params in enumerate(param_candidates):
        # Fresh copy per candidate: no parameter or fitted state is shared between runs.
        candidate = clone(model).set_params(**params)
        candidate.fit(X_train, y_train)
        score = f1_score(y_val, candidate.predict(X_val))
        print(f'{i+1}/{n_candidates}: ', params, score)

        # Strict '>' keeps the first candidate when several share the best score.
        if best_score is None or score > best_score:
            best_model, best_params, best_score = candidate, params, score

    if best_model is None:
        raise ValueError('param_grid produced no parameter candidates')

    print(f'Best parameters: {best_params}')
    print(f'Best validation F1 score: {best_score}')

    test_f1_score = f1_score(y_test, best_model.predict(X_test))
    print(f'Test set F1 score: {test_f1_score}')

    return best_model

In [None]:
def plot_metrics(model, X, y, set_name):
    """Compute evaluation metrics of a fitted binary classifier on one data split.

    Despite its name, this function draws nothing; it only returns the metrics.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X: Feature matrix of the split.
        y: True labels of the split.
        set_name: Name of the split (e.g. "Test"); currently unused.

    Returns:
        Dict with the keys "Accuracy", "Balanced accuracy", "Recall",
        "Precision", "F1 Score", "AUROC", "AUPRC" and "Confusion matrix".
    """
    hard_preds = model.predict(X)
    # Column 1 of predict_proba is used as the score of the positive class.
    pos_scores = model.predict_proba(X)[:, 1]

    return {
        "Accuracy": accuracy_score(y, hard_preds),
        "Balanced accuracy": balanced_accuracy_score(y, hard_preds),
        "Recall": recall_score(y, hard_preds, average='binary', zero_division=0),
        "Precision": precision_score(y, hard_preds, average='binary', zero_division=0),
        "F1 Score": f1_score(y, hard_preds, average='binary', zero_division=0),
        "AUROC": roc_auc_score(y, pos_scores),
        "AUPRC": average_precision_score(y, pos_scores),
        "Confusion matrix": confusion_matrix(y, hard_preds),
    }

## KNN

In [None]:
# Hold-out grid search for the k-nearest-neighbours baseline.
knn = KNeighborsClassifier()
knn_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 15],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]  # Minkowski power: 1 = Manhattan, 2 = Euclidean
}
best_model_knn = grid_search(knn, knn_grid, X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
# Test-split metrics of the selected KNN model.
# NOTE: this needs the four-argument plot_metrics(model, X, y, set_name); later cells
# rebind the name `plot_metrics` with different signatures.
plot_metrics(best_model_knn, X_test, y_test, "Test")

In [None]:
#joblib.dump(best_model_knn, '/KNN.joblib')

## LR

In [None]:
# Hold-out grid search for the logistic-regression baseline.
# The grid is a list of sub-grids so that each penalty is only combined with the
# solvers listed for it; 'l1_ratio' appears only in the elastic-net sub-grid.
log_reg = LogisticRegression()
log_reg_grid = [
    {
        'penalty': ['l1'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200]
    },
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 200]
    },
    {
        'penalty': ['elasticnet'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'l1_ratio': [0.1, 0.5, 0.7, 1.0],
        'max_iter': [100, 200]
    },
]
best_model_log_reg = grid_search(log_reg, log_reg_grid, X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
# Test-split metrics of the selected logistic-regression model.
# NOTE: this needs the four-argument plot_metrics(model, X, y, set_name); later cells
# rebind the name `plot_metrics` with different signatures.
plot_metrics(best_model_log_reg, X_test, y_test, "Test")

In [None]:
#joblib.dump(best_model_log_reg, '/LR.joblib')

## RF

In [None]:
# Hold-out grid search for the random-forest baseline.
# NOTE(review): no random_state is passed to the estimator; reproducibility relies on
# whatever set_seed() configures — confirm.
rf = RandomForestClassifier()
rf_grid = {
    'n_estimators': [100, 150],
    'max_depth': [5, 7],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}
best_model_rf = grid_search(rf, rf_grid, X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
# Test-split metrics of the selected random-forest model.
# NOTE: this needs the four-argument plot_metrics(model, X, y, set_name); later cells
# rebind the name `plot_metrics` with different signatures.
plot_metrics(best_model_rf, X_test, y_test, "Test")

In [None]:
import joblib
#joblib.dump(best_model_rf, '/RF.joblib')

## GBT

In [None]:
# Hold-out grid search for the gradient-boosted-trees baseline.
# NOTE(review): no random_state is passed to the estimator; reproducibility relies on
# whatever set_seed() configures — confirm.
from sklearn.ensemble import GradientBoostingClassifier
gbt = GradientBoostingClassifier()
gbt_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
best_model_gbt = grid_search(gbt, gbt_grid, X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
# Test-split metrics of the selected gradient-boosted-trees model.
# NOTE: this needs the four-argument plot_metrics(model, X, y, set_name); later cells
# rebind the name `plot_metrics` with different signatures.
plot_metrics(best_model_gbt, X_test, y_test, "Test")

In [None]:
#joblib.dump(best_model_gbt, '/GBT.joblib')

## MLP

In [None]:
# Hold-out grid search for the multi-layer-perceptron baseline.
# NOTE(review): no random_state is passed to the estimator; reproducibility relies on
# whatever set_seed() configures — confirm.
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp_grid = {
    'hidden_layer_sizes': [(128, 64, 32), (150, 100, 50), (50, 30)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 400]
}
best_model_mlp = grid_search(mlp, mlp_grid, X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
# Test-split metrics of the selected MLP model.
# NOTE: this needs the four-argument plot_metrics(model, X, y, set_name); later cells
# rebind the name `plot_metrics` with different signatures.
plot_metrics(best_model_mlp, X_test, y_test, "Test")

In [None]:
# Persist the selected MLP model to disk.
# NOTE(review): the equivalent dump calls for the other baselines are commented out
# in their cells — confirm whether those models were saved elsewhere.
import joblib
joblib.dump(best_model_mlp, '/MLP.joblib')

## Curves

In [None]:
# Reload the previously saved baseline models for the comparison plots.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a
# trusted location. These paths differ from the dump paths used earlier in the
# notebook; confirm they point to the intended models.
import joblib
loaded_mlp = joblib.load('/nfs/home/heloss23/sweep/max-f1/baselines/mlp_model.joblib')
loaded_knn = joblib.load('/nfs/home/heloss23/sweep/max-f1/baselines/knn_model.joblib')
loaded_gbt = joblib.load('/nfs/home/heloss23/sweep/max-f1/baselines/gbt_model.joblib')
loaded_rf = joblib.load('/nfs/home/heloss23/sweep/max-f1/baselines/rf_model.joblib')
loaded_log_reg = joblib.load('/nfs/home/heloss23/sweep/max-f1/baselines/log_reg_model.joblib')

In [None]:
# Display name -> fitted baseline model; the keys are used as legend and bar labels
# by the plotting functions below.
models = {
    'KNN': loaded_knn,
    'GBT': loaded_gbt,
    'LR': loaded_log_reg,
    'RF': loaded_rf,
    'MLP': loaded_mlp
}

In [None]:
# loading special loss models

def create_model_loss(config, loss_type="bce", alpha=None, gamma=None):
    """Instantiate the graph model selected by ``config["model_type"]``.

    Args:
        config: Mapping with the key "model_type" ('gat', 'graphsage' or
            'graphtransformer') plus "hidden_size", "num_layers", "dropout"
            and "activation_function"; "num_heads" is additionally read for
            'gat' and 'graphtransformer'.
        loss_type: Forwarded to the model constructor as ``loss_type``.
        alpha: Forwarded to the model constructor as ``alpha``.
        gamma: Forwarded to the model constructor as ``gamma``.

    Returns:
        The newly constructed model (also printed to stdout).

    Raises:
        ValueError: If ``config["model_type"]`` is not a supported value.
            Previously this case failed with an UnboundLocalError.
        KeyError: If a required key is missing from ``config``.
    """
    model_type = config["model_type"]
    loss_kwargs = {"loss_type": loss_type, "alpha": alpha, "gamma": gamma}

    if model_type == 'gat':
        model = GATnorm(config["hidden_size"], config["num_layers"], config["dropout"], config["activation_function"], config["num_heads"], **loss_kwargs)
    elif model_type == 'graphsage':
        model = SAGEnorm(config["hidden_size"], config["num_layers"], config["dropout"], config["activation_function"], **loss_kwargs)
    elif model_type == 'graphtransformer':
        model = GraphTransformernorm(config["hidden_size"], config["num_layers"], config["dropout"], config["activation_function"], config["num_heads"], **loss_kwargs)
    else:
        # Fail with a clear message instead of reaching print(model) with no model bound.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'gat', 'graphsage' or 'graphtransformer'"
        )

    print(model)
    return model

def load_checkpoint(basemodel_path, checkpoint_path, test_loader, load_state_dicts=True, loss_type='focal', alpha=0.75, gamma=1, device='cuda'):
    """Rebuild a model from a base-model file and restore its training state from a checkpoint.

    The architecture is created from ``base_model["config"]`` via
    ``create_model_loss``; the loss settings come from this function's
    ``loss_type``/``alpha``/``gamma`` arguments. The values stored in the
    checkpoint config are only printed, not used.

    Args:
        basemodel_path: Path of the file holding the base model's "config".
        checkpoint_path: Path of the checkpoint holding "config",
            "model_state_dict", "optimizer_state_dict" and "scheduler_state_dict".
        test_loader: Unused by the active code (only referenced in the
            commented-out sanity check at the end).
        load_state_dicts: When True, restore model, optimizer and scheduler
            state from the checkpoint; when False they keep their fresh state.
        loss_type, alpha, gamma: Forwarded to ``create_model_loss``.
        device: Device the model and the optimizer state tensors are moved to.

    Returns:
        Tuple ``(model_loaded, optimizer, scheduler)``; the model is in eval mode.
    """
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    # No map_location is given, so `device` does not affect where the tensors are loaded.
    base_model = torch.load(basemodel_path) #, map_location=device
    print(base_model["config"])

    checkpoint = torch.load(checkpoint_path) #, map_location=device
    print(checkpoint["config"]["loss_type"], checkpoint["config"]["alpha"])

    model_loaded = create_model_loss(base_model["config"], loss_type=loss_type, alpha=alpha, gamma=gamma)

    if load_state_dicts:
        model_loaded.load_state_dict(checkpoint["model_state_dict"])

    # set_optim is defined outside this notebook; it builds the optimizer from the base config.
    optimizer = set_optim(base_model["config"], model_loaded)

    if load_state_dicts:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

        # Move every tensor held in the optimizer state to the target device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    scheduler = StepLR(optimizer, step_size=1, gamma=0.95)

    if load_state_dicts:
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])

    model_loaded.to(device)
    model_loaded.eval()

    # first_batch = next(iter(test_loader))
    # with torch.inference_mode():
    #     first_batch.to(device)
    #     loaded_model_output = model_loaded(first_batch)

    return model_loaded, optimizer, scheduler #, loaded_model_output

# Restore the GT model with load_checkpoint's default loss settings and default device ('cuda').
basemodel_path = r"/BASEMODEL_PATH.pth"
checkpoint_path = r"/CHECKPOINT_PATH.pth" 
model_loaded, optimizer, scheduler = load_checkpoint(basemodel_path, checkpoint_path, test_loader)

In [None]:
def plot_metrics(models, X_test, y_test):
    """Draw a three-panel comparison of the baseline models on the test split.

    Left panel: bar chart of F1 scores. Middle panel: ROC curves with the
    AUROC in the legend. Right panel: precision-recall curves with the
    average precision in the legend.

    Args:
        models: Mapping of display name -> fitted classifier exposing
            ``predict`` and ``predict_proba``.
        X_test: Test feature matrix.
        y_test: True test labels.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(18, 6))
    plt.subplot(1, 3, 1)  # create the left panel first; it is filled after the loop

    f1_scores = {}
    for name, clf in models.items():
        hard_preds = clf.predict(X_test)
        pos_scores = clf.predict_proba(X_test)[:, 1]
        f1_scores[name] = f1_score(y_test, hard_preds)

        # Middle panel: ROC curve of this model.
        fpr, tpr, _ = roc_curve(y_test, pos_scores)
        plt.subplot(1, 3, 2)
        plt.plot(fpr, tpr, label=f'{name} ({roc_auc_score(y_test, pos_scores):.4f})')

        # Right panel: precision-recall curve of this model.
        precision, recall, _ = precision_recall_curve(y_test, pos_scores)
        plt.subplot(1, 3, 3)
        plt.plot(recall, precision, label=f'{name} ({average_precision_score(y_test, pos_scores):.4f})')

    # Left panel: F1 bars, each annotated with its value.
    plt.subplot(1, 3, 1)
    bars = plt.bar(f1_scores.keys(), f1_scores.values())
    plt.xlabel('Model')
    plt.ylabel('F1 score')
    plt.title('F1 scores - Baselines')
    plt.xticks(rotation=45)
    for rect in bars:
        top = rect.get_height()
        x_mid = rect.get_x() + rect.get_width() / 2
        plt.text(x_mid, top, f'{top:.4f}', ha='center', va='bottom')

    # Middle panel decoration, including the chance diagonal.
    plt.subplot(1, 3, 2)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('AUROC - Baselines')
    plt.legend(loc='best')

    # Right panel decoration.
    plt.subplot(1, 3, 3)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('AUPRC - Baselines')
    plt.legend(loc='best')

    plt.tight_layout()
    plt.show()

# Draw the baseline-only comparison (three-argument plot_metrics defined in this cell).
plot_metrics(models, X_test, y_test)

In [None]:
def plot_metrics(models, gt_model, test_loader, X_test, y_test, device='cuda'):
    """Draw a three-panel comparison of the baseline models and the GT model.

    Left panel: F1 scores of all models, sorted ascending. Middle panel: ROC
    curves with the AUROC in the legend. Right panel: precision-recall curves
    with the average precision in the legend. The GT model is drawn with a
    dashed line.

    The legend values of the GT model use the same functions as the baselines
    (``roc_auc_score`` and ``average_precision_score``), so the numbers are
    directly comparable. The GT F1 score is computed from the predictions
    returned by ``test``.

    Args:
        models: Mapping of display name -> fitted classifier exposing
            ``predict`` and ``predict_proba``.
        gt_model: The GT model, evaluated through ``test``.
        test_loader: Loader passed to ``test`` for the GT model.
        X_test: Tabular test features for the baselines.
        y_test: True test labels for the baselines.
        device: Device passed to ``test``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    f1_scores = {}
    plt.figure(figsize=(18, 6))

    for name, model in models.items():
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        f1_scores[name] = f1_score(y_test, y_pred)
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

        plt.subplot(1, 3, 2)
        plt.plot(fpr, tpr, label=f'{name} ({roc_auc_score(y_test, y_pred_proba):.4f})')

        plt.subplot(1, 3, 3)
        plt.plot(recall, precision, label=f'{name} ({average_precision_score(y_test, y_pred_proba):.4f})')

    # `test` is defined outside this notebook; accuracy and loss are not needed here.
    # NOTE(review): assumes probs/preds/labels are array-likes scikit-learn accepts
    # (they were already passed to roc_curve) — confirm against `test`.
    _, _, test_probs, test_preds, test_labels = test(test_loader, gt_model, device)

    f1_scores['GT'] = f1_score(test_labels, test_preds)
    fpr_gt, tpr_gt, _ = roc_curve(test_labels, test_probs)
    precision_gt, recall_gt, _ = precision_recall_curve(test_labels, test_probs)

    plt.subplot(1, 3, 2)
    plt.plot(fpr_gt, tpr_gt, label=f'GT ({roc_auc_score(test_labels, test_probs):.4f})', linestyle='--')

    plt.subplot(1, 3, 3)
    plt.plot(recall_gt, precision_gt, label=f'GT ({average_precision_score(test_labels, test_probs):.4f})', linestyle='--')

    # Ascending order, so the best model ends up as the right-most bar.
    sorted_f1_scores = dict(sorted(f1_scores.items(), key=lambda item: item[1]))

    plt.subplot(1, 3, 1)
    bars = plt.bar(sorted_f1_scores.keys(), sorted_f1_scores.values())
    plt.xlabel('Model')
    plt.ylabel('F1 score')
    plt.title('F1 scores')
    plt.xticks(rotation=45)

    # Annotate every bar with its F1 value.
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.4f}', ha='center', va='bottom')

    plt.subplot(1, 3, 2)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('AUROC')
    plt.legend(loc='best')

    plt.subplot(1, 3, 3)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('AUPRC')
    plt.legend(loc='best')

    plt.tight_layout()
    plt.show()

In [None]:
# Re-seed and rebuild the loaders with the same seed as at the top of the notebook,
# then draw the baselines + GT comparison.
# NOTE(review): presumably this reproduces the exact test split used for the tabular
# baselines — confirm that mask_and_batch_trans is deterministic under set_seed.
set_seed(222)
H = load_trans()
train_loader, test_loader, val_loader = mask_and_batch_trans(H)
plot_metrics(models, model_loaded, test_loader, X_test, y_test, device='cuda')