Imports

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

Load Dataset

In [None]:
from pathlib import Path

# Locate the TF-IDF feature table relative to the parent of the current
# working directory.
# NOTE(review): this depends on where the kernel was started; it presumes the
# notebook runs from a direct subfolder of the project root — confirm.
project_root = Path.cwd().parent
csv_path = project_root.joinpath("data", "output", "tfidf_dataset.csv")

# Read the full table into memory.
df = pd.read_csv(csv_path)

# The "cyberbullying" column is the target; every other column is a feature.
y = df["cyberbullying"]
X = df.drop(columns=["cyberbullying"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Find Best Model (optimal k value)

In [None]:
# Pipeline: project the features onto 100 truncated-SVD components, then
# classify with distance-weighted k-nearest neighbours.
pipeline = Pipeline([
    ("svd", TruncatedSVD(n_components=100, random_state=42)),
    ("knn", KNeighborsClassifier(weights='distance'))
])

# Inclusive bounds of the k search. range() excludes its stop value, so the
# grid is built with K_MAX + 1; the searched values are exactly K_MIN..K_MAX.
K_MIN, K_MAX = 1, 25
param_grid = {
    "knn__n_neighbors": list(range(K_MIN, K_MAX + 1))
}

# Shuffled, stratified 5-fold CV with a fixed seed so fold assignment is
# repeatable across runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Select k by macro-averaged F1, which weights both classes equally
# regardless of how many samples each has.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=1
)

# Fit on the training split only; the test split is kept for the final
# evaluation.
grid_search.fit(X_train, y_train)

# best_estimator_ is the pipeline refit on all of X_train with the winning k
# (GridSearchCV's default refit=True).
best_model = grid_search.best_estimator_
print(f"Best k: {grid_search.best_params_['knn__n_neighbors']}")
print(f"Best CV F1 (macro): {grid_search.best_score_:.4f}")

Evaluate Best Model on Test Set

In [None]:
# Predict the held-out test split with the tuned pipeline.
y_pred = best_model.predict(X_test)

# Display names for the two classes, shared by the report and the heatmap.
# NOTE(review): these are matched to the labels in sorted order, so this
# assumes the "not cyberbullying" label sorts first (e.g. 0/1) — confirm.
class_labels = ["Not CB", "Cyberbullying"]

print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=class_labels)
print(report)

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("KNN (Weighted, SVD) - Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
plt.show()