In [17]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import shap


In [18]:
# Load the pre-processed train / pretest / test splits.
train_data = pd.read_csv("data/baseline_processed_train_data.csv")
pretest_data = pd.read_csv("data/baseline_processed_pretest_data.csv")
test_data = pd.read_csv("data/baseline_processed_test_data.csv")

print(test_data.head())

sample_size = 10000

# Down-sample with a fixed seed so the subsets are repeatable. The requested
# size is capped at the frame's length because DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
train_data = train_data.sample(n=min(sample_size, len(train_data)), random_state=42)
pretest_data = pretest_data.sample(n=min(sample_size, len(pretest_data)), random_state=42)
# The test subset is stored under its own name: `test_data` stays at full size,
# so any evaluation that uses `test_data` covers the whole test split.
sampled_test_data = test_data.sample(n=min(sample_size, len(test_data)), random_state=42)



   Unnamed: 0                                             tokens  label
0           0  phones modern humans today are always on their...      0
1           1  this essay will explain if drivers should or s...      0
2           2  driving while the use of cellular devices toda...      0
3           3  phones & driving drivers should not be able to...      0
4           4  cell phone operation while driving the ability...      0


In [19]:
# Faster linear SVM, works with a lot of data

# Seeded for reproducibility: LinearSVC exposes `random_state` because its
# solver can shuffle the data, so an unseeded model may score slightly
# differently from run to run.
fast_linear_svm_classifier = LinearSVC(random_state=42)

def get_text(row):
    """Return the text stored in a row's 'tokens' field.

    Not called in this cell any more; kept defined in case another cell
    still uses it.
    """
    return row['tokens']

# Select the text column directly instead of calling get_text once per row;
# the values are the same and it avoids a slow row-wise DataFrame.apply.
X_train = train_data['tokens']
y_train = train_data['label']
X_pretest = pretest_data['tokens']
y_pretest = pretest_data['label']
X_test = test_data['tokens']
y_test = test_data['label']

# TF-IDF features feed straight into the linear SVM, so the vectorizer's
# vocabulary is learned from the training texts only.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', fast_linear_svm_classifier),
])


pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)

y_pretest_pred = pipeline.predict(X_pretest)

y_test_pred = pipeline.predict(X_test)

train_f1_score = f1_score(y_train, y_train_pred)

pretest_f1_score = f1_score(y_pretest, y_pretest_pred)

test_f1_score = f1_score(y_test, y_test_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)

print("F1 score on training data:", train_f1_score)
print("F1 score on pretest data:", pretest_f1_score)
print("F1 score on test data:", test_f1_score)
print("Accuracy on test data:", test_accuracy)
print("Recall on test data:", test_recall)
print("Precision on test data:", test_precision)



F1 score on training data: 0.9722431549861844
F1 score on pretest data: 0.8959990136243141
F1 score on test data: 0.8596628446855602
Accuracy on test data: 0.7902343167006617
Recall on test data: 0.9887077017302097
Precision on test data: 0.7604145199752701


In [13]:
# SVM with a linear kernel (SVC implementation)
# Will need to be trained on a smaller dataset!

linear_svm_classifier = SVC(kernel="linear")



In [14]:
# SVM with a polynomial kernel of degree 3
# Will need to be trained on a smaller dataset!

polynomial_svm_classifier = SVC(kernel="poly", degree=3)

In [15]:
# SVM with a radial basis function (RBF) kernel
# Will need to be trained on a smaller dataset!

rbf_svm_classifier = SVC(kernel="rbf")

In [16]:
# SVM with a sigmoid kernel
# Will need to be trained on a smaller dataset!

sigmoid_svm_classifier = SVC(kernel="sigmoid")

In [17]:
# SHAP explanations for fast linear SVM
# The classifier was fitted on the pipeline's TF-IDF features, so the
# background data must be TF-IDF vectors too (not raw text, and not a
# placeholder string). KernelExplainer evaluates the model on the background
# set repeatedly, so a small seeded sample of the training matrix is used.
tfidf_train_features = pipeline.named_steps['tfidf'].transform(X_train)
shap_background = shap.sample(tfidf_train_features, 100, random_state=42)
fast_linear_svm_explainer = shap.KernelExplainer(fast_linear_svm_classifier.predict, shap_background)


  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# SHAP explanations for linear SVM
# NOTE(review): the second argument is still a placeholder string, not data,
# and no cell visible here fits linear_svm_classifier -- resolve both before running.

linear_svm_explainer = shap.Explainer(
    linear_svm_classifier,
    """ Input training data? """,
)

In [None]:
# SHAP explanations for polynomial SVM
# NOTE(review): the second argument is still a placeholder string, not data,
# and no cell visible here fits polynomial_svm_classifier -- resolve both before running.

polynomial_svm_explainer = shap.Explainer(
    polynomial_svm_classifier,
    """ Input training data? """,
)

In [None]:
# SHAP explanations for RBF SVM
# NOTE(review): the second argument is still a placeholder string, not data,
# and no cell visible here fits rbf_svm_classifier -- resolve both before running.

rbf_svm_explainer = shap.Explainer(
    rbf_svm_classifier,
    """ Input training data? """,
)

In [None]:
# SHAP explanations for sigmoid SVM
# NOTE(review): the second argument is still a placeholder string, not data,
# and no cell visible here fits sigmoid_svm_classifier -- resolve both before running.

sigmoid_svm_explainer = shap.Explainer(
    sigmoid_svm_classifier,
    """ Input training data? """,
)