In [3]:
import nltk
import pandas as pd
import numpy as np
import random
import re
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes, svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from langdetect import detect

In [4]:
# Set random seeds for reproducibility
SEED = 500
random.seed(SEED)
np.random.seed(SEED)

In [5]:
# Load JSONL file
file_path = "DUTA10K_final.jsonl"
df = pd.read_json(file_path, lines=True)

In [6]:
# Get unique languages from the 'lang' column to build a multi-language stopword list
langs = df["lang"].unique().tolist()
multi_stops = set()
for lg in langs:
    try:
        multi_stops |= set(stopwords.words(lg))
    except OSError:
        pass

In [7]:
# Define a function to clean text by removing stopwords
def clean_multilang(text):
    tokens = re.findall(r"\b\w+\b", text.lower())
    return " ".join(w for w in tokens if w not in multi_stops)

# Apply the cleaning function to the 'text' column to create a new 'clean_text' column
df["clean_text"] = df["text"].map(clean_multilang)

In [8]:
# Split the data into training and testing sets
# X is the cleaned text, y is the label (binary classification: 0 or 1)
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.3,
    random_state=500,
    stratify=df["label"]    # keep the 0/1 balance
)

In [9]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000) # Initialize TF-IDF Vectorizer
X_train_tfidf = tfidf.fit_transform(X_train) # Fit the TF-IDF vectorizer on the training data and transform it
X_test_tfidf  = tfidf.transform(X_test) # Transform the test data using the fitted TF-IDF vectorizer

In [16]:
# Define a function to evaluate binary classification models
def evaluate_classic_binary(y_true, y_pred,
                            labels=(0,1),
                            target_names=("non-illicit","illicit")):
    acc = accuracy_score(y_true, y_pred)
    print(f"Test Accuracy: {acc:.4f}\n")
    
    print("=== Classification Report ===")
    print(classification_report(
        y_true,
        y_pred,
        labels=labels,
        target_names=target_names,
        zero_division=0,
        digits=4
    ))
    
    print("=== Confusion Matrix ===")
    print(confusion_matrix(
        y_true,
        y_pred,
        labels=labels
    ))

In [13]:
# Train and evaluate Naive Bayes model
nb = naive_bayes.MultinomialNB()
nb.fit(X_train_tfidf, y_train)
pred_nb = nb.predict(X_test_tfidf)

In [17]:
print("=== Naive Bayes (TF-IDF + multi-stopwords) ===")
evaluate_classic_binary(y_test, pred_nb)

=== Naive Bayes (TF-IDF + multi-stopwords) ===
Accuracy: 0.8636
Test Accuracy: 0.8636

=== Classification Report ===
              precision    recall  f1-score   support

 non-illicit     0.8756    0.9695    0.9201      1016
     illicit     0.7597    0.4118    0.5341       238

    accuracy                         0.8636      1254
   macro avg     0.8176    0.6906    0.7271      1254
weighted avg     0.8536    0.8636    0.8469      1254

=== Confusion Matrix ===
[[985  31]
 [140  98]]


In [18]:
# Train and evaluate Support Vector Machine (SVM) model
svm_clf = svm.SVC(kernel="linear", random_state=500)
svm_clf.fit(X_train_tfidf, y_train)
pred_svm = svm_clf.predict(X_test_tfidf)

In [19]:
print("\n--- SVM ---")
evaluate_classic_binary(y_test, pred_svm)


--- SVM ---
Test Accuracy: 0.8987

=== Classification Report ===
              precision    recall  f1-score   support

 non-illicit     0.9037    0.9793    0.9400      1016
     illicit     0.8627    0.5546    0.6752       238

    accuracy                         0.8987      1254
   macro avg     0.8832    0.7670    0.8076      1254
weighted avg     0.8959    0.8987    0.8897      1254

=== Confusion Matrix ===
[[995  21]
 [106 132]]
