# Machine Learning: Support Vector Classification

## 0 Imports

In [17]:
import pandas as pd
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance

In [18]:
data = pd.read_csv("../dat/dips/DIPS_Data_cleaned.csv", sep = ",", low_memory = False)
data_pred = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred.csv", sep = ",", low_memory = False)
data_pred_y = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred_y.csv", sep = ",", low_memory = False)

## 1 Support Vector Classification Model

In [19]:
# Predictors
X = data
X = X.drop(["hpi"], axis = 1)

# Target
y = data["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print("Logistic Regression \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

Logistic Regression 
 X_train shape:  (1005, 23) 0.8 
 X_test shape:  (252, 23) 0.2 
 y_train shape:  (1005,) 0.8 
 y_test shape:  (252,) 0.2 



In [20]:
def sklearn_support_vector_classification(x_test_svc, x_train_svc, y_train_svc, y_test_svc):
    """Computes OLS weights for linear regression without regularization using the sklearn library on the training set and
       returns weights, testset predictions and metrics.
    """

    # 1: GRID SEARCH
    svc_model = SVC(random_state = 42)

    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }

    grid_search = GridSearchCV(estimator = svc_model, param_grid = param_grid, cv = 10)
    grid_search.fit(x_train_svc, y_train_svc)
    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    model = SVC(C = best_model.C,
                kernel = best_model.kernel,
                gamma = best_model.gamma,
                random_state = 42)

    model.fit(x_train_svc, y_train_svc)

    # 3: ESTIMATING WEIGHTS

    if model.kernel == "linear":
        # for linear kernel, we can get the weights directly
        weights_s = model.coef_
    else:
        # for non-linear kernels, we cannot get the weights directly
        # but we can use the dual coefficients
        weights_s = model.dual_coef_

        result = permutation_importance(svc_model, X_test, y_test, n_repeats=30, random_state=42)
        weights_s = result.importances_mean

    features_s = model.feature_names_in_

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred_s = model.predict(x_test_svc)

    # 5: COMPUTE METRICS
    accuracy_svc = model.score(x_test_svc, y_test_svc)
    macro_f1_svc = recall_score(y_test_svc, y_pred_s, average = "macro")
    micro_f1_svc = recall_score(y_test_svc, y_pred_s, average = "micro")
    mcc_svc = matthews_corrcoef(y_test_svc, y_pred_s)

    cm_svc = confusion_matrix(y_test_svc, y_pred_s)
    precision_svc = cm_svc[1][1] / (cm_svc[1][1] + cm_svc[0][1])
    recall_svc = cm_svc[1][1] / (cm_svc[1][1] + cm_svc[1][0])

    # store metrics in a dictionary
    metrics_s = {
        "accuracy": round(accuracy_svc, 4),
        "macro_f1": round(macro_f1_svc, 4),
        "micro_f1": round(micro_f1_svc, 4),
        "mcc": round(mcc_svc, 4),
        "precision": round(precision_svc, 4),
        "recall": round(recall_svc, 4),
        "confusion_matrix": cm_svc
    }

    return weights_s, y_pred_s, features_s, metrics_s

In [21]:
weights_svc, y_pred_svc, features_svc, metrics_svc = sklearn_support_vector_classification(X_test, X_train, y_train, y_test)

# save weights and predictions
weights_svc_df = pd.DataFrame(weights_svc, columns = features_svc)
weights_svc_df.to_csv("../exp/weights/weights_svc.csv", sep = ",", index = False)

y_pred_svc = pd.DataFrame(y_pred_svc, columns = ["y_pred"])
y_pred_svc.to_csv("../exp/predictions/y_pred_svc.csv", sep = ",", index = False)

Best parameters:  {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Best score:  0.7363366336633663


ValueError: Shape of passed values is (1, 689), indices imply (1, 23)

In [11]:
metrics_svc

{'accuracy': 0.6905,
 'macro_f1': 0.5218,
 'micro_f1': 0.6905,
 'mcc': np.float64(0.1458),
 'precision': np.float64(0.8),
 'recall': np.float64(0.0494),
 'confusion_matrix': array([[170,   1],
        [ 77,   4]])}

In [22]:
weights_svc

array([[-0.58162391, -1.        , -1.        , -1.        , -0.28881196,
        -0.80665911, -1.        , -0.33134597, -0.96260098, -0.75793511,
        -0.13222992, -0.77643496, -0.66611852, -0.45941541, -0.74936642,
        -0.40853135, -0.14901545, -1.        , -1.        , -0.3066804 ,
        -0.22897846, -1.        , -0.69256039, -1.        , -0.54456652,
        -0.49390007, -0.28639805, -0.78651109, -0.63889542, -1.        ,
        -1.        , -1.        , -1.        , -0.36231284, -0.74190484,
        -0.44656755, -0.35749087, -0.28671639, -0.49221019, -0.78053682,
        -0.68501997, -1.        , -0.14480744, -0.57636347, -0.69279426,
        -0.53292516, -0.35223857, -1.        , -0.55775501, -0.08664568,
        -0.5414422 , -0.16995881, -1.        , -0.41455889, -0.17447951,
        -0.02941003, -0.27461278, -0.21699459, -1.        , -0.27955916,
        -1.        , -0.16151456, -0.03724407, -1.        , -0.33153877,
        -1.        , -0.50014965, -0.0502042 , -1. 