In [20]:
import nltk
import pandas as pd
import numpy as np
import random
import re # For regular expressions in text preprocessing
from nltk.corpus import stopwords # For stopword removal
from sklearn.model_selection import train_test_split # For splitting data
from sklearn.preprocessing import LabelEncoder # For encoding categorical labels
from sklearn.feature_extraction.text import TfidfVectorizer # For TF-IDF feature extraction
from sklearn import naive_bayes, svm # For Naive Bayes and SVM models
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # For evaluation metrics

In [4]:
# Seed Python's and NumPy's global random number generators with one shared
# value so that repeated runs draw the same random numbers.
SEED = 500
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

In [5]:
# Load the dataset from a JSON Lines file (one JSON record per line)
file_path = "DUTA10K_final.jsonl"
df = pd.read_json(file_path, lines=True)

# Fail fast if a column the later cells rely on is absent, rather than
# surfacing a bare KeyError several cells further down.
required_columns = {"text", "lang", "category"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{file_path} is missing expected column(s): {sorted(missing_columns)}"
    )

In [6]:
# Keep only the rows whose "text" is present (non-null), then renumber the
# index from 0. Note: empty strings are not null, so they are kept.
has_text = df["text"].notna()
df = df[has_text].reset_index(drop=True)

In [7]:
# Normalise case: replace the "text" column with its lower-cased version
df = df.assign(text=df["text"].str.lower())

In [8]:
# Build one combined stopword set covering every language tagged in the data.
# Rows with a missing "lang" are skipped: NaN is not a usable corpus name.
langs = df["lang"].dropna().unique().tolist()
multilingual_stops = set()
unsupported_langs = []
for lg in langs:
    try:
        multilingual_stops |= set(stopwords.words(lg))
    except OSError:
        # NLTK has no stopword list under this name. Keep going (best effort),
        # but remember the name so the gap is reported instead of hidden.
        unsupported_langs.append(lg)

if unsupported_langs:
    # NOTE(review): NLTK appears to name its lists by full language name
    # (e.g. "english"); if "lang" holds ISO codes they would need mapping
    # first -- confirm against the data.
    print(
        f"No NLTK stopword list for: {unsupported_langs} "
        "-- stopwords of these languages are not removed"
    )

In [9]:
# Define a function to preprocess text by removing stopwords
def preprocess_multilang(doc, stops=None):
    """
    Lower-case `doc`, split it into word tokens and drop stopwords.

    Parameters
    ----------
    doc : str
        Raw document text. Anything that is not a string (e.g. None or NaN
        left behind by a missing value) is treated as an empty document.
    stops : collection of str, optional
        Words to remove. Defaults to the notebook-level `multilingual_stops`.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Guard against non-text cells so one bad row cannot abort the whole map
    if not isinstance(doc, str):
        return ""
    if stops is None:
        stops = multilingual_stops
    # basic tokenization: runs of word characters delimited by word boundaries
    tokens = re.findall(r"\b\w+\b", doc.lower())
    # Join the words that are not in the stopword collection
    return " ".join(w for w in tokens if w not in stops)

# Derive the 'clean_text' column by running every document in 'text'
# through the preprocessing function
df["clean_text"] = df["text"].apply(preprocess_multilang)

In [12]:
# Hold out 30% of the rows for testing.
# X is the cleaned text, y is the original 'category' (multiclass target).
# stratify=None means a plain random split: class proportions are NOT
# enforced, so rare categories may be spread unevenly between train and test.
features, targets = df["clean_text"], df["category"]
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.3, random_state=SEED, stratify=None
)

In [13]:
# Encode categorical labels to numerical format using LabelEncoder
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)

In [14]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000) # Limits the vocabulary size to 5000
X_train_tfidf = tfidf.fit_transform(X_train) # Fit the TF-IDF vectorizer on the training data and transform it
X_test_tfidf  = tfidf.transform(X_test) # Transform the test data using the fitted TF-IDF vectorizer

In [18]:
# Define a function to evaluate classic multiclass classification models
def evaluate_classic(y_true, y_pred, label_encoder):
    """
    Print a classification report and a confusion matrix restricted to the
    classes that actually occur in `y_true`.

    Parameters
    ----------
    y_true : array-like of int
        Encoded ground-truth labels (NumPy array, pandas Series or list).
    y_pred : array-like of int
        Encoded predicted labels, aligned with `y_true`.
    label_encoder : fitted encoder exposing `classes_`
        Used to map each integer label back to its original name.

    Returns
    -------
    None
        Both reports are printed.
    """
    # Sorted distinct integer labels present in the ground truth. np.unique
    # accepts any array-like, so plain lists work as well as arrays.
    present_labels = np.unique(y_true).tolist()

    # Original class name for each of those labels, in the same order
    present_names = [label_encoder.classes_[i] for i in present_labels]

    # Print the report for only those classes
    print("=== Classification Report ===")
    print(classification_report(
        y_true,
        y_pred,
        labels=present_labels,
        target_names=present_names,
        zero_division=0,
        digits=4
    ))

    # The corresponding confusion matrix; rows and columns follow
    # `present_labels`, so a predicted class that never occurs in y_true
    # has no column of its own.
    print("=== Confusion Matrix ===")
    print(confusion_matrix(
        y_true,
        y_pred,
        labels=present_labels
    ))

In [16]:
# Fit a Multinomial Naive Bayes classifier on the training TF-IDF features,
# then predict encoded labels for the test split
nb = naive_bayes.MultinomialNB().fit(X_train_tfidf, y_train_enc)
pred_nb = nb.predict(X_test_tfidf)

In [21]:
# Report Naive Bayes performance on the held-out test split, under a header
# in the same format as the SVM report so the two outputs are easy to tell apart
print("\n--- Naive Bayes ---")
evaluate_classic(y_test_enc, pred_nb, le)

=== Classification Report ===
                                              precision    recall  f1-score   support

                                   Art_Music     0.0000    0.0000    0.0000         2
                             Casino_Gambling     0.0000    0.0000    0.0000         8
                    Counterfeit Credit-Cards     0.8400    0.7368    0.7850        57
                           Counterfeit Money     0.0000    0.0000    0.0000        17
      Counterfeit Personal-Identification_ID     0.0000    0.0000    0.0000         2
Counterfeit Personal-Identification_Passport     0.0000    0.0000    0.0000        10
                              Cryptocurrency     0.4178    0.9639    0.5829       166
                                Cryptolocker     1.0000    0.9000    0.9474        60
                               Drugs_Illegal     0.6875    0.6471    0.6667        68
                                 Drugs_Legal     0.0000    0.0000    0.0000         1
                       

In [23]:
# Train a linear-kernel Support Vector Machine and predict on the test split.
# Only C is set: `degree` and `gamma` apply to the non-linear kernels and have
# no effect when kernel="linear", so they are deliberately not passed.
svm_clf = svm.SVC(C=1.0, kernel="linear", random_state=SEED)
svm_clf.fit(X_train_tfidf, y_train_enc)
pred_svm = svm_clf.predict(X_test_tfidf)

In [24]:
# Report SVM performance on the held-out test split
print("\n--- SVM ---")
evaluate_classic(y_true=y_test_enc, y_pred=pred_svm, label_encoder=le)


--- SVM ---
=== Classification Report ===
                                              precision    recall  f1-score   support

                                   Art_Music     0.0000    0.0000    0.0000         2
                             Casino_Gambling     1.0000    0.5000    0.6667         8
                    Counterfeit Credit-Cards     0.9153    0.9474    0.9310        57
                           Counterfeit Money     1.0000    0.7647    0.8667        17
      Counterfeit Personal-Identification_ID     0.0000    0.0000    0.0000         2
Counterfeit Personal-Identification_Passport     1.0000    0.8000    0.8889        10
                              Cryptocurrency     0.9625    0.9277    0.9448       166
                                Cryptolocker     1.0000    0.9667    0.9831        60
                               Drugs_Illegal     0.6932    0.8971    0.7821        68
                                 Drugs_Legal     0.0000    0.0000    0.0000         1
          