In [1]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import shap
from scipy.sparse import load_npz
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
import pickle
from sklearn.model_selection import train_test_split



In [2]:
# Load the train / pretest / test splits and derive a 10k-row stratified
# sub-sample of each for the kernel SVMs, which are too slow on the full data.
SVM_SAMPLE_SIZE = 10000      # rows per split used by the non-fast (kernel) SVMs
SAMPLING_RANDOM_STATE = 42   # fixed seed so the sub-samples are reproducible


def _combine_features(sparse_part, dense_part):
    """Return one dense DataFrame holding the sparse columns followed by the dense ones.

    Parameters
    ----------
    sparse_part : scipy sparse matrix loaded from a ``*_sparse.npz`` file.
    dense_part : pandas.DataFrame loaded from a ``*_dense.csv`` file.

    Returns
    -------
    pandas.DataFrame with integer column labels; the sparse columns come first.
    """
    # NOTE(review): `.toarray()` materialises the whole matrix in memory.
    # Kept because later cells receive a DataFrame; revisit if memory gets tight.
    return pd.DataFrame(hstack([sparse_part, csr_matrix(dense_part.values)]).toarray())


def _load_labels(csv_path):
    """Read a label CSV and flatten it to a 1-D numpy array."""
    return np.ravel(pd.read_csv(csv_path))


def _stratified_subsample(features, labels, n_rows=SVM_SAMPLE_SIZE, random_state=SAMPLING_RANDOM_STATE):
    """Return a stratified random subset of ``n_rows`` rows as ``(features, labels)``.

    If the split has ``n_rows`` rows or fewer it is returned unchanged:
    ``train_test_split`` rejects a ``train_size`` that is not smaller than the
    number of samples, and there would be nothing to drop anyway.
    """
    if len(labels) <= n_rows:
        return features, labels
    kept_x, _dropped_x, kept_y, _dropped_y = train_test_split(
        features,
        labels,
        train_size=n_rows,
        stratify=labels,
        random_state=random_state,
    )
    return kept_x, kept_y


# Raw inputs: sparse and dense feature parts of every split.
train_data_x_sparse = load_npz('train_data_x_sparse.npz')
train_data_x_dense = pd.read_csv('train_data_x_dense.csv')
pretest_data_x_sparse = load_npz('pretest_data_x_sparse.npz')
pretest_data_x_dense = pd.read_csv('pretest_data_x_dense.csv')
test_data_x_sparse = load_npz('test_data_x_sparse.npz')
test_data_x_dense = pd.read_csv('test_data_x_dense.csv')

# Combined feature matrices (sparse columns first, then dense columns).
train_data_x = _combine_features(train_data_x_sparse, train_data_x_dense)
pretest_data_x = _combine_features(pretest_data_x_sparse, pretest_data_x_dense)
test_data_x = _combine_features(test_data_x_sparse, test_data_x_dense)

# Targets, flattened to 1-D arrays.
train_data_y = _load_labels('train_data_y.csv')
pretest_data_y = _load_labels('pretest_data_y.csv')
test_data_y = _load_labels('test_data_y.csv')

# 10k samples for training and testing the non-fast SVMs
sampled_train_data_x, sampled_train_data_y = _stratified_subsample(train_data_x, train_data_y)
sampled_pretest_data_x, sampled_pretest_data_y = _stratified_subsample(pretest_data_x, pretest_data_y)
sampled_test_data_x, sampled_test_data_y = _stratified_subsample(test_data_x, test_data_y)

# Names and column positions of the two feature groups in the combined matrix.
#
# The combined matrices are built by stacking the sparse (TF-IDF) columns first
# and the dense (engineered) columns after them, so the column positions follow
# directly from the group sizes. Looking positions up with `list.index(name)`
# would return the FIRST match, which silently points an engineered feature at
# a TF-IDF column whenever the two share a name (and is quadratic in the number
# of features).
#
# NOTE(security): `pickle.load` can execute arbitrary code contained in the
# file. Only load pickles produced by this project.
with open('sparse_matrices_feature_names.pkl', 'rb') as f:
    sparse_matrices_feature_names = pickle.load(f)

sparse_feature_names = list(sparse_matrices_feature_names) # TF-IDF features
dense_feature_names = list(train_data_x_dense.columns) # Engineered features
all_feature_names = sparse_feature_names + dense_feature_names

# Assumes the pickled name list has exactly one entry per sparse column -- TODO confirm.
sparse_feature_indices = list(range(len(sparse_feature_names)))
dense_feature_indices = list(range(len(sparse_feature_names), len(all_feature_names)))

In [11]:
# Faster linear SVM, works with a lot of data (fitted on the full training set).
#
# random_state is pinned to the same seed the data sampling uses so a
# Restart & Run All reproduces the same model. It only matters when the dual
# solver is selected; per the scikit-learn docs the primal solver is not random.
fast_linear_svm_classifier = LinearSVC(dual="auto", random_state=42)

fast_linear_svm_classifier.fit(train_data_x, train_data_y)



In [3]:
# Linear-kernel SVC, fitted on the sampled training data rather than the
# full training set.
linear_svm_classifier = SVC(kernel='linear').fit(
    sampled_train_data_x,
    sampled_train_data_y,
)
linear_svm_classifier  # cell output: the fitted estimator

In [7]:
# Polynomial-kernel (degree 3) SVC, fitted on the sampled training data.
polynomial_svm_classifier = SVC(kernel='poly', degree=3).fit(
    sampled_train_data_x,
    sampled_train_data_y,
)
polynomial_svm_classifier  # cell output: the fitted estimator


In [8]:
# Radial Basis Function (RBF) kernel SVC, fitted on the sampled training data.
rbf_svm_classifier = SVC(kernel='rbf').fit(
    sampled_train_data_x,
    sampled_train_data_y,
)
rbf_svm_classifier  # cell output: the fitted estimator


In [9]:
# Sigmoid-kernel SVC, fitted on the sampled training data.
sigmoid_svm_classifier = SVC(kernel='sigmoid').fit(
    sampled_train_data_x,
    sampled_train_data_y,
)
sigmoid_svm_classifier  # cell output: the fitted estimator



In [15]:
def _f1_on_splits(classifier, splits):
    """Return the F1 score of `classifier` on each (features, labels) pair, in order."""
    return [f1_score(labels, classifier.predict(features)) for features, labels in splits]


# The fast linear SVM is scored on the full splits; the other four models are
# scored on the sampled splits. Order everywhere: train, pretest, test.
full_splits = [
    (train_data_x, train_data_y),
    (pretest_data_x, pretest_data_y),
    (test_data_x, test_data_y),
]
sampled_splits = [
    (sampled_train_data_x, sampled_train_data_y),
    (sampled_pretest_data_x, sampled_pretest_data_y),
    (sampled_test_data_x, sampled_test_data_y),
]

fast_linear_svm_f1_train, fast_linear_svm_f1_pretest, fast_linear_svm_f1_test = _f1_on_splits(
    fast_linear_svm_classifier, full_splits)
linear_svm_f1_train, linear_svm_f1_pretest, linear_svm_f1_test = _f1_on_splits(
    linear_svm_classifier, sampled_splits)
polynomial_svm_f1_train, polynomial_svm_f1_pretest, polynomial_svm_f1_test = _f1_on_splits(
    polynomial_svm_classifier, sampled_splits)
rbf_svm_f1_train, rbf_svm_f1_pretest, rbf_svm_f1_test = _f1_on_splits(
    rbf_svm_classifier, sampled_splits)
sigmoid_svm_f1_train, sigmoid_svm_f1_pretest, sigmoid_svm_f1_test = _f1_on_splits(
    sigmoid_svm_classifier, sampled_splits)

# The original print formatting was produced with GitHub Copilot; the layout
# below reproduces it character for character.
# Each header cell: (title, width of the '|' field, width of the title field).
# The Polynomial cell uses 8/27 instead of 12/23, exactly as the original did.
header_cells = [
    ('Fast linear SVM', 12, 23),
    ('Linear SVM', 12, 23),
    ('Polynomial SVM', 8, 27),
    ('RBF SVM', 12, 23),
    ('Sigmoid SVM', 12, 23),
]
header_line = ''.ljust(20) + ''.join(
    '|'.ljust(bar_width) + title.ljust(title_width)
    for title, bar_width, title_width in header_cells
) + '|'
rule_line = '-' * 20 + '|' + '+'.join(['-' * 17, '-' * 16] * 5) + '|'

score_rows = [
    ('F1 score train', [fast_linear_svm_f1_train, linear_svm_f1_train, polynomial_svm_f1_train,
                        rbf_svm_f1_train, sigmoid_svm_f1_train]),
    ('F1 score pretest', [fast_linear_svm_f1_pretest, linear_svm_f1_pretest, polynomial_svm_f1_pretest,
                          rbf_svm_f1_pretest, sigmoid_svm_f1_pretest]),
    ('F1 score test', [fast_linear_svm_f1_test, linear_svm_f1_test, polynomial_svm_f1_test,
                       rbf_svm_f1_test, sigmoid_svm_f1_test]),
]

print(header_line)
print(rule_line)
for row_label, row_scores in score_rows:
    print(row_label.ljust(20) + ''.join('|'.ljust(15) + format(score, '<20.4f') for score in row_scores) + '|')

                    |           Fast linear SVM        |           Linear SVM             |       Polynomial SVM             |           RBF SVM                |           Sigmoid SVM            |
--------------------|-----------------+----------------+-----------------+----------------+-----------------+----------------+-----------------+----------------+-----------------+----------------|
F1 score train      |              0.9123              |              0.9099              |              0.8940              |              0.8785              |              0.8685              |
F1 score pretest    |              0.9021              |              0.8817              |              0.8770              |              0.8732              |              0.8670              |
F1 score test       |              0.8808              |              0.8194              |              0.7963              |              0.7889              |              0.7919              |


In [None]:
# Rows drawn from the training features, used for calculating SHAP values.
# TODO: 100 is a small placeholder -- should be increased later.
SHAP_SAMPLE_COUNT = 100

shap_samples = shap.sample(train_data_x, SHAP_SAMPLE_COUNT)