# SVM

## Imports

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

## Import Dataset

In [26]:
# Notebook-wide configuration.
# DATASET_PATH: CSV read below; the later cells treat every column except
#               'cyberbullying' as a TF-IDF feature and 'cyberbullying' as the label.
# RNG:          seed handed to every seeded scikit-learn component in this notebook.
DATASET_PATH = "../data/output/tfidf_dataset.csv"
RNG = 42

# Read the whole table into memory.
df = pd.read_csv(DATASET_PATH)

# Class balance check: number of rows per label value.
label_counts = df['cyberbullying'].value_counts()
print(label_counts)

# Preview of the first ten rows (rich display as the cell's last expression).
df.head(10)

cyberbullying
0    16225
1    16225
Name: count, dtype: int64


Unnamed: 0,10th,12th,15th,16th,1960s,19th,19th century,1px,1px solid,1st,...,zealand,zero,zionist,zoe,zone,zoo,zuck,zuckerberg,ŷour,cyberbullying
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Fitting and Evaluation

In [27]:
K = 5  # Number of stratified folds used for cross-validation

# Target vector and feature matrix: the label column is split off, and every
# remaining column of `df` is used as a model input.
y = df['cyberbullying']
X = df.drop(columns=['cyberbullying'])

# Shuffled, stratified K-fold splitter. Seeding it with RNG keeps the fold
# assignment identical across re-runs.
cv = StratifiedKFold(n_splits=K, shuffle=True, random_state=RNG)

# Linear support-vector classifier; max_iter is set explicitly to 5000.
svm_model = LinearSVC(random_state=RNG, max_iter=5000)

# Out-of-fold predictions: each sample is predicted exactly once, by a model
# fitted on the K-1 folds that do not contain that sample.
y_pred = cross_val_predict(svm_model, X, y, cv=cv)

# Score the out-of-fold predictions against the true labels.
print("Confusion Matrix:", confusion_matrix(y, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y, y_pred), sep="\n")

Confusion Matrix:
[[14849  1376]
 [ 2023 14202]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.92      0.90     16225
           1       0.91      0.88      0.89     16225

    accuracy                           0.90     32450
   macro avg       0.90      0.90      0.90     32450
weighted avg       0.90      0.90      0.90     32450

