# Support Vector Machine

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("seaborn")
plt.rcParams["figure.figsize"] = (6.4, 4.8)
plt.rcParams["figure.dpi"] = 300
plt.rcParams["figure.titleweight"] = "bold"
plt.rcParams["axes.titleweight"] = "bold"
plt.rcParams["axes.titlepad"] = 10.0
plt.rcParams["axes.titlelocation"] = "left"
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("svg")
import seaborn as sns
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix

In [29]:
X_train_basic = np.load("../data/X_train_basic.npy")
X_train_freq = np.load("../data/X_train_freq.npy")
X_train_ite = np.load("../data/X_train_ite.npy")
y_train = np.load("../data/y_train.npy")
X_train = X_train_freq

## Linear SVM

In [53]:
linear_sgd = SGDClassifier(loss="hinge", class_weight="balanced", random_state=42)
param_grid = [
    {"penalty": ["l2"],
     "alpha": [0.5, 0.35, 0.1],
     "learning_rate": ["optimal"]}
    ]
grid_search = GridSearchCV(linear_sgd,
                           param_grid,
                           scoring="average_precision",
                           return_train_score=True,
                           cv=5,
                           n_jobs=-1)
grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_

grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_
grid_best_score = grid_search.best_score_

print(f"best parameters found: {grid_best_params}, with mean test score: {grid_best_score}")

for test_score, train_score, params in zip(grid_results["mean_test_score"],
                                           grid_results["mean_test_score"],
                                           grid_results["params"]):
    print(f"mean test score: {test_score}, mean train score: {train_score}, for {params}.")

best parameters found: {'alpha': 0.35, 'learning_rate': 'optimal', 'penalty': 'l2'}, with mean test score: 0.42034961790682274
mean test score: 0.4195851195638352, mean train score: 0.4195851195638352, for {'alpha': 0.5, 'learning_rate': 'optimal', 'penalty': 'l2'}.
mean test score: 0.42034961790682274, mean train score: 0.42034961790682274, for {'alpha': 0.35, 'learning_rate': 'optimal', 'penalty': 'l2'}.
mean test score: 0.4152766490200264, mean train score: 0.4152766490200264, for {'alpha': 0.1, 'learning_rate': 'optimal', 'penalty': 'l2'}.


In [55]:
linear_svm = LinearSVC(loss="squared_hinge", dual=False, class_weight="balanced", random_state=42)
param_grid = [
    {"C": [60, 65, 50, 45, 40],
     "tol": [1, 0.7, 0.5, 0.2, 0.1]}
    ]
grid_search = GridSearchCV(linear_svm,
                           param_grid,
                           scoring="average_precision",
                           return_train_score=True,
                           cv=5,
                           n_jobs=-1)
grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_
grid_best_score = grid_search.best_score_

print(f"best parameters found: {grid_best_params}, with mean test score: {grid_best_score}")

for test_score, train_score, params in zip(grid_results["mean_test_score"],
                                           grid_results["mean_test_score"],
                                           grid_results["params"]):
    print(f"mean test score: {test_score}, mean train score: {train_score}, for {params}.")

best parameters found: {'C': 50, 'tol': 0.5}, with mean test score: 0.4297189896717807
mean test score: 0.4235122568845678, mean train score: 0.4235122568845678, for {'C': 60, 'tol': 1}.
mean test score: 0.4235122568845678, mean train score: 0.4235122568845678, for {'C': 60, 'tol': 0.7}.
mean test score: 0.42971792525818164, mean train score: 0.42971792525818164, for {'C': 60, 'tol': 0.5}.
mean test score: 0.42927445021847654, mean train score: 0.42927445021847654, for {'C': 60, 'tol': 0.2}.
mean test score: 0.4280384167262337, mean train score: 0.4280384167262337, for {'C': 60, 'tol': 0.1}.
mean test score: 0.4235122568845678, mean train score: 0.4235122568845678, for {'C': 65, 'tol': 1}.
mean test score: 0.4235122568845678, mean train score: 0.4235122568845678, for {'C': 65, 'tol': 0.7}.
mean test score: 0.42971792525818164, mean train score: 0.42971792525818164, for {'C': 65, 'tol': 0.5}.
mean test score: 0.42927445021847654, mean train score: 0.42927445021847654, for {'C': 65, 'tol

## Non-Linear SVM

In [57]:
poly_svm = SVC(kernel="poly", class_weight="balanced", random_state=42)
param_grid = [
    {"C": [100, 10, 1],
     "degree": [3],
     "tol": [0.001]}
    ]
grid_search = GridSearchCV(poly_svm,
                           param_grid,
                           scoring="average_precision",
                           return_train_score=True,
                           cv=5,
                           n_jobs=-1)
grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_
grid_best_score = grid_search.best_score_

print(f"best parameters found: {grid_best_params}, with mean test score: {grid_best_score}")

for test_score, train_score, params in zip(grid_results["mean_test_score"],
                                           grid_results["mean_test_score"],
                                           grid_results["params"]):
    print(f"mean test score: {test_score}, mean train score: {train_score}, for {params}.")