# In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from libsvm.svmutil import *

# Load the dataset.
df = pd.read_csv("dataset.csv")

# Extract text and labels.
# Drop rows that are missing either column: an unlabeled row cannot be used
# for supervised training, and a missing comment would otherwise be turned
# into the literal string "nan" by astype(str) below and then be vectorized
# as if it were real text.
df = df.dropna(subset=['Comment', 'Toxicity'])
texts = df['Comment'].astype(str).tolist()
labels = df['Toxicity'].tolist()

# Split data: hold out 20% of the rows for testing. The fixed random_state
# keeps the split reproducible between runs.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=20
)

# ----------- Method 1: TF-IDF + LIBSVM -----------
print("\n=== TF-IDF + LIBSVM ===")


def _csr_to_libsvm_rows(matrix):
    """Convert a SciPy sparse matrix into LIBSVM's list-of-dicts input.

    Each returned dict maps a 1-based feature index to that feature's value
    for one row of *matrix*; only explicitly stored entries are included.

    The CSR buffers are sliced directly via ``indptr`` instead of iterating
    the matrix row by row, which would build a temporary sparse matrix for
    every row. ``tolist()`` yields native Python ints/floats rather than
    NumPy scalars.
    """
    csr = matrix.tocsr()
    feature_ids = (csr.indices + 1).tolist()  # LIBSVM feature indices start at 1
    values = csr.data.tolist()
    bounds = csr.indptr.tolist()  # row i occupies [bounds[i], bounds[i + 1])
    return [
        dict(zip(feature_ids[start:stop], values[start:stop]))
        for start, stop in zip(bounds, bounds[1:])
    ]


# Fit the vocabulary/IDF weights on the training texts only, then apply the
# same fitted transform to the test texts.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_texts)
X_test_tfidf = vectorizer.transform(X_test_texts)

# Convert to LIBSVM format
X_train_tfidf_libsvm = _csr_to_libsvm_rows(X_train_tfidf)
X_test_tfidf_libsvm = _csr_to_libsvm_rows(X_test_tfidf)

# Train and predict using LIBSVM
prob1 = svm_problem(y_train, X_train_tfidf_libsvm)
param1 = svm_parameter('-t 0 -c 1')  # Linear kernel (-t 0), cost C = 1 (-c 1)
model1 = svm_train(prob1, param1)
# svm_predict returns three values; only the predicted labels are used here.
p_label1, _, _ = svm_predict(y_test, X_test_tfidf_libsvm, model1)

# Evaluate
# NOTE(review): f1_score is called with its default averaging, which presumes
# Toxicity is a two-class label whose positive class is 1 — confirm against
# the dataset.
f1_tfidf = f1_score(y_test, p_label1)
acc_tfidf = accuracy_score(y_test, p_label1)
print(f"F1 Score (TF-IDF): {f1_tfidf:.4f}")
print(f"Accuracy (TF-IDF): {acc_tfidf:.4f}")