# Machine Learning: Support Vector Classification

## 0 Imports

In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance

In [2]:
data = pd.read_csv("../dat/dips/DIPS_Data_cleaned.csv", sep = ",", low_memory = False)
data_pred = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred.csv", sep = ",", low_memory = False)
data_pred_y = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred_y.csv", sep = ",", low_memory = False)

## 1 Support Vector Classification Model

In [3]:
# Predictors
X = data
X = X.drop(["hpi"], axis = 1)

# Target
y = data["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print("SVC \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

Logistic Regression 
 X_train shape:  (1005, 23) 0.8 
 X_test shape:  (252, 23) 0.2 
 y_train shape:  (1005,) 0.8 
 y_test shape:  (252,) 0.2 



In [6]:
def sklearn_support_vector_classification(x_test_svc, x_train_svc, y_train_svc, y_test_svc):
    """Computes OLS weights for linear regression without regularization using the sklearn library on the training set and
       returns weights, testset predictions and metrics.
    """

    # 1: GRID SEARCH
    svc_model = SVC(random_state = 42)

    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }

    grid_search = GridSearchCV(estimator = svc_model, param_grid = param_grid, cv = 10)
    grid_search.fit(x_train_svc, y_train_svc)
    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    model = SVC(C = best_model.C,
                kernel = best_model.kernel,
                gamma = best_model.gamma,
                random_state = 42)

    model.fit(x_train_svc, y_train_svc)

    # 3: ESTIMATING WEIGHTS
    if model.kernel == "linear":
        weights_s = model.coef_
        features_s = model.feature_names_in_
    else:
        weights_s = model.dual_coef_

        # get importance of features
        result = permutation_importance(model, x_train_svc, y_train_svc, n_repeats = 30, random_state = 42)
        sorted_idx = result.importances_mean.argsort()[::-1]
        weights_s = result.importances_mean[sorted_idx]
        # get feature names
        features_s = x_train_svc.columns[sorted_idx]

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred_s = model.predict(x_test_svc)

    # 5: COMPUTE METRICS
    accuracy_svc = model.score(x_test_svc, y_test_svc)
    macro_f1_svc = recall_score(y_test_svc, y_pred_s, average = "macro")
    micro_f1_svc = recall_score(y_test_svc, y_pred_s, average = "micro")
    mcc_svc = matthews_corrcoef(y_test_svc, y_pred_s)

    cm_svc = confusion_matrix(y_test_svc, y_pred_s)
    precision_svc = cm_svc[1][1] / (cm_svc[1][1] + cm_svc[0][1])
    recall_svc = cm_svc[1][1] / (cm_svc[1][1] + cm_svc[1][0])

    # store metrics in a dictionary
    metrics_s = {
        "accuracy": round(accuracy_svc, 4),
        "macro_f1": round(macro_f1_svc, 4),
        "micro_f1": round(micro_f1_svc, 4),
        "mcc": round(mcc_svc, 4),
        "precision": round(precision_svc, 4),
        "recall": round(recall_svc, 4),
        "confusion_matrix": cm_svc
    }

    return weights_s, y_pred_s, features_s, metrics_s

In [7]:
weights_svc, y_pred_svc, features_svc, metrics_svc = sklearn_support_vector_classification(X_test, X_train, y_train, y_test)

# save weights and predictions
weights_svc_df = pd.DataFrame([weights_svc], columns = features_svc)
weights_svc_df.to_csv("../exp/weights/weights_svc.csv", sep = ",", index = False)

y_pred_svc = pd.DataFrame(y_pred_svc, columns = ["y_pred"])
y_pred_svc.to_csv("../exp/predictions/y_pred_svc.csv", sep = ",", index = False)

Best parameters:  {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Best score:  0.7363366336633663


ValueError: Shape of passed values is (23, 1), indices imply (23, 23)

In [8]:
metrics_svc

{'accuracy': 0.6944,
 'macro_f1': 0.5442,
 'micro_f1': 0.6944,
 'mcc': np.float64(0.1692),
 'precision': np.float64(0.625),
 'recall': np.float64(0.1235),
 'confusion_matrix': array([[165,   6],
        [ 71,  10]])}

In [11]:
weights_svc_df

Unnamed: 0,scl_gsiy,bsq_gesy,alter,scl_gsi,soz_gesy,bsq_ges,gke_gesy,whi_ges,woc_gese,lzh_ges,...,woc_gesp,asi_ges,soz_ges,asi_gesy,das_gesy,ile,das_ges,ses_kom,bild,bmi_kat
0,0.029884,0.017479,0.017048,0.013831,0.012736,0.012504,0.012106,0.011177,0.011045,0.010415,...,0.007496,0.00733,0.006667,0.006667,0.006633,0.006501,0.004179,0.003184,0.002952,0.002786
