# Machine Learning: Support Vector Classification

## 0 Imports

In [6]:
import time
import warnings
from pathlib import Path

# Suppress all warnings before the third-party imports below.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, matthews_corrcoef, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

In [7]:
# Read the cleaned DIPS change data (comma-separated) into a DataFrame.
# low_memory = False makes pandas parse the file in one pass rather than in chunks.
data_change = pd.read_csv(
    "../../dat/dips/DIPS_Data_cleaned_change.csv",
    sep = ",",
    low_memory = False,
)

## 1 Support Vector Classification Model

In [8]:
# Target: the "hpi" column; predictors: every remaining column.
y = data_change["hpi"]
X = data_change.drop(columns = ["hpi"])

# Stratified 80/20 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
    stratify = y,
)

# Report each subset's shape and its share of the full data set.
split_report = [
    field
    for label, part, whole in (
        ("X_train shape: ", X_train, X),
        ("X_test shape: ", X_test, X),
        ("y_train shape: ", y_train, y),
        ("y_test shape: ", y_test, y),
    )
    for field in (label, part.shape, round(part.shape[0] / len(whole), 2), "\n")
]
print("SVC \n", *split_report)

SVC 
 X_train shape:  (983, 22) 0.8 
 X_test shape:  (246, 22) 0.2 
 y_train shape:  (983,) 0.8 
 y_test shape:  (246,) 0.2 



In [11]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


def sklearn_support_vector_classification(x_test_svc, x_train_svc, y_train_svc, y_test_svc):
    """Tune an SVC by exhaustive grid search and evaluate it on a test set.

    Note the argument order: the TEST predictors come first.

    Parameters
    ----------
    x_test_svc : array-like
        Predictors of the test set.
    x_train_svc : array-like
        Predictors of the training set (used for cross-validated tuning and fitting).
    y_train_svc : array-like
        Target values of the training set.
    y_test_svc : array-like
        Target values of the test set.

    Returns
    -------
    y_pred_s : numpy.ndarray
        Predicted labels for ``x_test_svc``.
    metrics_s : dict
        Keys ``"accuracy"``, ``"macro_f1"``, ``"micro_f1"``, ``"mcc"``,
        ``"precision"``, ``"recall"`` (all rounded to 4 decimals) and
        ``"confusion_matrix"`` (the raw matrix). ``"precision"`` and
        ``"recall"`` refer to the class at index 1 of the fitted model's
        ``classes_`` and are 0.0 when their denominator is zero.
    """
    # 1: GRID SEARCH
    # 10 * 4 * 4 * 6 * 5 * 4 = 19,200 parameter combinations, each evaluated
    # with 10-fold cross-validation (192,000 fits) -- expect a long runtime.
    param_grid = {
        'C': np.logspace(-4, 4, 10),
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'degree': [2, 3, 4, 5],
        'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001],
        'max_iter': [100, 500, 1000, 2500, 5000],
        'tol': [1e-4, 1e-3, 1e-2, 1e-1]
    }

    grid_search = GridSearchCV(
        estimator = SVC(random_state = 42),
        param_grid = param_grid,
        cv = 10
    )
    grid_search.fit(x_train_svc, y_train_svc)
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FINAL MODEL
    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on the whole training set, so fitting a second SVC with
    # the same hyperparameters would only repeat that work.
    model = grid_search.best_estimator_

    # NOTE: feature weights are not estimated here. Options would be
    # ``model.coef_`` for a linear kernel or permutation importance otherwise.

    # 3: COMPUTE TEST SET PREDICTIONS
    y_pred_s = model.predict(x_test_svc)

    # 4: COMPUTE METRICS
    accuracy_svc = model.score(x_test_svc, y_test_svc)
    # F1 scores must come from f1_score; recall_score would report recall instead.
    macro_f1_svc = f1_score(y_test_svc, y_pred_s, average = "macro")
    micro_f1_svc = f1_score(y_test_svc, y_pred_s, average = "micro")
    mcc_svc = matthews_corrcoef(y_test_svc, y_pred_s)

    # Pin the label order to the model's classes so the matrix keeps one
    # row/column per trained class even if a class is absent from the test data.
    cm_svc = confusion_matrix(y_test_svc, y_pred_s, labels = model.classes_)
    true_pos = cm_svc[1][1]
    false_pos = cm_svc[0][1]
    false_neg = cm_svc[1][0]
    precision_svc = _safe_ratio(true_pos, true_pos + false_pos)
    recall_svc = _safe_ratio(true_pos, true_pos + false_neg)

    # store metrics in a dictionary
    metrics_s = {
        "accuracy": round(accuracy_svc, 4),
        "macro_f1": round(macro_f1_svc, 4),
        "micro_f1": round(micro_f1_svc, 4),
        "mcc": round(mcc_svc, 4),
        "precision": round(precision_svc, 4),
        "recall": round(recall_svc, 4),
        "confusion_matrix": cm_svc
    }

    return y_pred_s, metrics_s

In [1]:
# Run the SVC pipeline and time the whole call.
# Requires the imports cell to have been executed in this kernel first;
# running this cell on a fresh kernel raises NameError for `time`.
start = time.time()

y_pred_array, metrics_svc = sklearn_support_vector_classification(X_test, X_train, y_train, y_test)

end = time.time()
time_svc = end - start
# Format the elapsed time directly; wrapping it in {...} outside an f-string
# builds a set and prints e.g. "{12.3}".
print(f"Execution time: {time_svc:.2f} seconds")
time_svc_df = pd.DataFrame({"time": [time_svc]})
# time_svc_df.to_csv("times_ML/time_svc.csv", sep = ",", index = False)

# save y_pred_ML
y_pred_path = Path("y_pred_ML/y_pred_svc.csv")
# Create the output folder if needed so to_csv does not fail on a fresh checkout.
y_pred_path.parent.mkdir(parents = True, exist_ok = True)
y_pred_svc = pd.DataFrame(y_pred_array, columns = ["y_pred"])
y_pred_svc.to_csv(y_pred_path, sep = ",", index = False)

NameError: name 'time' is not defined

In [None]:
# Display the metrics dictionary returned by sklearn_support_vector_classification.
metrics_svc