In [None]:
import pickle
import random
import warnings
from pathlib import Path

warnings.filterwarnings("ignore")

import _config as cfg
import DataSets_validation as ds
import kaplanmeier as km
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearnex import patch_sklearn

patch_sklearn()

from sklearn import svm
from sklearn.metrics import (accuracy_score, auc, f1_score, precision_score,
                             recall_score, roc_auc_score, roc_curve)
from tqdm.notebook import tqdm

# Fix the seeds of both the stdlib and the NumPy global RNGs so that
# repeated runs of the notebook draw the same random numbers.
random.seed(1024)
np.random.seed(1024)

# Every artefact written by this notebook goes below this folder.
results_dir = Path("Results")
results_dir.mkdir(parents=True, exist_ok=True)

# Matplotlib config
# Start from the library defaults so settings left over from an earlier
# run of the kernel cannot leak into the figures.
plt.rcdefaults()

main_font_size = 7
label_font_size = 10

axis_color = "0.15"

# One shared colour for text, axis labels, ticks and spines.
plt.rcParams.update(
    {
        key: axis_color
        for key in (
            "text.color",
            "axes.labelcolor",
            "xtick.color",
            "ytick.color",
            "axes.edgecolor",
        )
    }
)

# One shared base size for every text element.
plt.rcParams.update(
    {
        key: main_font_size
        for key in (
            "font.size",
            "axes.labelsize",
            "axes.titlesize",
            "xtick.labelsize",
            "ytick.labelsize",
            "legend.fontsize",
            "figure.labelsize",
            "figure.titlesize",
        )
    }
)

plt.rcParams.update(
    {
        # Drop the top and right spines.
        "axes.spines.top": False,
        "axes.spines.right": False,
        # Saved figures get a transparent background.
        "savefig.transparent": True,
        # Font embedding options for vector output (PDF / PS / SVG).
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "svg.fonttype": "none",
    }
)

In [None]:
def svm_train_test(
    train_data, train_y, test_data, verbose=False, clf_kwargs=None
):
    """Oversample the training set with SMOTE, fit an SVC and score the test set.

    Parameters
    ----------
    train_data, train_y
        Training features and labels. Both are passed through
        ``SMOTE.fit_resample`` and the resampled pair is used for fitting.
    test_data
        Features to score. They are never resampled; they go straight to
        ``predict`` and ``predict_proba``.
    verbose : bool, default False
        When true, print the list of per-sample probabilities.
    clf_kwargs : dict or None, default None
        Extra keyword arguments forwarded to ``svm.SVC``. ``None`` forwards
        nothing. ``probability`` and ``random_state`` are fixed by this
        function, so supplying either of them here raises ``TypeError``.

    Returns
    -------
    clf
        The fitted classifier.
    predictions
        Result of ``clf.predict(test_data)``.
    new_pd : pandas.DataFrame
        One column (named ``0``) holding column 1 of
        ``clf.predict_proba(test_data)``, one row per test sample. In
        scikit-learn that column is the probability of ``clf.classes_[1]``.
    """
    # A ``None`` sentinel replaces the former ``{}`` default: a mutable default
    # is created once and shared by every call that relies on it.
    extra_svc_kwargs = {} if clf_kwargs is None else clf_kwargs

    # Balance the classes on the training data only.
    sm = SMOTE(random_state=432)
    train_data, train_y = sm.fit_resample(train_data, train_y)

    # probability=True is required for predict_proba below.
    clf = svm.SVC(probability=True, random_state=2137, **extra_svc_kwargs)
    clf = clf.fit(train_data, train_y)

    predictions = clf.predict(test_data)
    probs = clf.predict_proba(test_data)
    # Keep only the second probability column.
    probSV = [row[1] for row in probs]
    if verbose:
        print(probSV)
    new_pd = pd.DataFrame(probSV)
    return clf, predictions, new_pd

In [None]:
# Params

# dimension_of_embedding_vectors = 149  # best Peptide level
# dimension_of_embedding_vectors = 327  # best Contig level
# dimension_of_embedding_vectors = 77  # best Scaffold level
# dimension_of_embedding_vectors = 21  # best Baseline

clf_kwargs = {"kernel": "rbf"}  # The default

# dimension_of_embedding_vectors = 243  # best Peptide level
# clf_kwargs = {"C": 5, "gamma": "scale", "kernel": "rbf"}
# dimension_of_embedding_vectors = 212  # best Contig level
# clf_kwargs = {"C": 5, "gamma": "scale", "kernel": "rbf"}
# dimension_of_embedding_vectors = 41  # best Scaffold level
# clf_kwargs = {"C": 50, "gamma": "scale", "kernel": "rbf"}
# dimension_of_embedding_vectors = 112  # best Baseline
# clf_kwargs = {"C": 5, "gamma": "scale", "kernel": "rbf"}

# Key under which each configuration stores its embedding dimension.
mut_vec_len_label = "mut_vec_len"

# A single axes collects one ROC curve per configuration.
fig_roc, ax_roc = plt.subplots(figsize=(4, 3), layout="constrained")

# Plain-text metric report, extended once per configuration and printed last.
test_scores_str = ""

for config in tqdm(cfg.configurations, leave=False):
    dimension_of_embedding_vectors = config[mut_vec_len_label]

    # The sixth return value is not used here.
    (
        train_data,
        train_y,
        test_data,
        test_y,
        test_pfs,
        _,
    ) = ds.transforming_Braun_dataset(config, dimension_of_embedding_vectors)

    # Despite its name, `svm_linear_preds` comes from whatever kernel
    # `clf_kwargs` selects.
    model, svm_linear_preds, svm_prob = svm_train_test(
        train_data, train_y, test_data, clf_kwargs=clf_kwargs
    )

    # Label used in the report, the legends and the output file names.
    plot_label = config["plot_label"]
    file_stem = plot_label.replace(" ", "_")

    # --- Classification metrics on the test split ---
    probs = svm_prob.to_numpy().flatten()
    fpr, tpr, _ = roc_curve(test_y, probs)
    roc_auc = auc(fpr, tpr)

    prec = precision_score(test_y, svm_linear_preds)
    rec = recall_score(test_y, svm_linear_preds)
    f1 = f1_score(test_y, svm_linear_preds)
    acc = accuracy_score(test_y, svm_linear_preds)
    rauc = roc_auc_score(test_y, probs)

    test_scores_str += (
        f"{plot_label}:\n"
        f"  Accuracy: {acc:.2f}\n"
        f"  Precision: {prec:.2f}\n"
        f"  Recall: {rec:.2f}\n"
        f"  F1 Score: {f1:.2f}\n"
        f"  ROC AUC: {rauc:.2f}\n"
    )

    _ = ax_roc.plot(fpr, tpr, label=f"{plot_label} (AUC={roc_auc:.2f})")

    # --- Persist the fitted model ---
    model_path = results_dir / f"svm-model-{file_stem}.pkl"
    with model_path.open("wb") as model_file:
        pickle.dump(model, model_file)

    # --- Survival prediction ---
    # NOTE(review): the censoring flags are derived from the model's
    # predictions (1 where the prediction equals -1) while the group labels
    # come from the true `test_y` - confirm this pairing is intended.
    time_event = test_pfs.reset_index(drop=True)
    censoring = pd.Series([int(pred == -1) for pred in svm_linear_preds])
    labx = pd.Series(
        ["Responders" if truth == 1 else "Non-responders" for truth in test_y]
    )
    surv_pred = km.fit(time_event, censoring, labx)
    fig_surv, ax_surv = km.plot(surv_pred, figsize=(4, 3), fontsize=main_font_size)

    # Swap the generic heading produced by km.plot for the configuration label.
    old_title = ax_surv.get_title()
    new_title = old_title.replace("Survival function", plot_label)
    ax_surv.set_title(new_title, {"fontsize": label_font_size})
    ax_surv.tick_params(length=3.5)
    for legend_text in ax_surv.get_legend().get_texts():
        legend_text.set_fontsize(main_font_size)

    surv_path = results_dir / f"response-prediction-{file_stem}.pdf"
    fig_surv.savefig(surv_path, bbox_inches="tight", pad_inches=0.01)

# Finish and save the shared ROC figure.
ax_roc.set(xlabel="False Positive Rate", ylabel="True Positive Rate")
ax_roc.set_title("ROC Curves for different levels", {"fontsize": label_font_size})
ax_roc.legend(loc="lower right")
fig_roc.savefig(results_dir / "roc-curves.pdf")

print(test_scores_str)