In [None]:
import re

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_recall_fscore_support,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [None]:
def clean_kazakh_text(text):
    """Normalize a raw comment before TF-IDF vectorization.

    The text is lowercased, every character that is not a Latin letter, a
    Cyrillic letter (including the Kazakh-specific ones) or whitespace is
    replaced by a space, and runs of whitespace are collapsed.

    Args:
        text: Raw comment. ``None`` and NaN are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # Missing values (None / float NaN) become empty text instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    # 'ё' is listed explicitly because it lies outside the а-я code-point
    # range. Digits are simply absent from the allow-list, so numbers are
    # blanked in the same pass.
    text = re.sub(r'[^a-zа-яёәғқңөүұһі\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
# Load the merged corpus; the code below relies on "text" and "label" columns.
df = pd.read_csv("/content/merged.csv")

# Normalize label spelling (surrounding whitespace and case) so that variants
# such as " normal" collapse onto a single class.
df["label"] = df["label"].str.strip().str.upper()

# Drop rows whose label is the literal word "label" (presumably header lines
# left over from concatenating several CSV files — confirm against the data),
# then drop rows with a missing text or label.
is_header_row = df["label"] == "LABEL"
df = df[~is_header_row].dropna(subset=["text", "label"]).copy()

# astype(str) protects the cleaner from cells that pandas parsed as numbers.
df["clean"] = df["text"].astype(str).apply(clean_kazakh_text)


In [None]:

# --- Class balance ---------------------------------------------------------
label_counts = df["label"].value_counts()
print("\nLabel counts:")
print(label_counts)

label_share = (label_counts / len(df)).round(3)
print("\nLabel proportions:")
print(label_share)

# How many times larger the most frequent class is than the rarest one.
largest_class, smallest_class = label_counts.max(), label_counts.min()
imbalance_ratio = largest_class / smallest_class
print(f"\nImbalance ratio (max/min): {imbalance_ratio:.2f}")

# --- Comment length, counted in whitespace-separated tokens ----------------
raw_text = df["text"].astype(str)
df["text_len"] = raw_text.map(lambda comment: len(comment.split()))
print("\nText length stats (in tokens):")
print(df["text_len"].describe())


Label counts:
label
NORMAL         1052
HATE_SPEECH     498
OFFENSIVE       396
Name: count, dtype: int64

Label proportions:
label
NORMAL         0.541
HATE_SPEECH    0.256
OFFENSIVE      0.203
Name: count, dtype: float64

Imbalance ratio (max/min): 2.66

Text length stats (in tokens):
count    1946.000000
mean        9.843782
std        11.683234
min         1.000000
25%         4.000000
50%         7.000000
75%        11.000000
max       201.000000
Name: text_len, dtype: float64


In [None]:
# TF-IDF over word unigrams and bigrams, capped at 30k features, with
# sublinear (1 + log tf) term-frequency scaling.
# dtype=float32 halves the memory of the dense matrix built below; the
# features are converted to float32 tensors for the network anyway.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so the vocabulary and IDF weights also see test documents.
# Fitting on the training texts only would give a cleaner estimate.
vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    sublinear_tf=True,
    dtype=np.float32,
)

# Dense array of shape (n_documents, n_features).
X = vectorizer.fit_transform(df["clean"]).toarray()


In [None]:
# Integer class ids used as training targets.
labels_map = {"NORMAL": 0, "OFFENSIVE": 1, "HATE_SPEECH": 2}

encoded_labels = df["label"].map(labels_map)

# Series.map yields NaN for any label missing from labels_map; fail loudly
# here instead of letting NaN targets reach the split and the model.
unmapped = encoded_labels.isna()
if unmapped.any():
    unknown = sorted(df.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(f"Labels missing from labels_map: {unknown}")

y = encoded_labels.astype("int64").values


In [None]:
# Stratified 80/20 hold-out split; the fixed seed keeps the partition stable
# from run to run.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [None]:
class TfidfDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Args:
        features: Array-like of per-sample feature rows; copied into a
            float32 tensor stored as ``self.X``.
        labels: Array-like of per-sample class ids; copied into an int64
            tensor stored as ``self.y``.

    Raises:
        ValueError: If ``features`` and ``labels`` do not contain the same
            number of samples.
    """

    def __init__(self, features, labels):
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.long)

        # A length mismatch would otherwise surface later as an index error or
        # as silently ignored samples, because __len__ follows the labels.
        if self.X.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                f"features and labels disagree on sample count: "
                f"{tuple(self.X.shape)} vs {tuple(self.y.shape)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]


In [None]:
batch_size = 64

# Wrap each split in a dataset first, then build the loaders on top.
train_dataset = TfidfDataset(X_train, y_train)
test_dataset = TfidfDataset(X_test, y_test)

# Only the training loader shuffles; the test loader keeps its order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
class MLPClassifier(nn.Module):
    """Feed-forward classifier with two hidden layers.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout ->
    Linear. The network returns raw logits (no softmax).

    Args:
        input_dim: Number of input features per sample.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after each hidden activation.
            Defaults to 0.3, the value that used to be hard-coded.
    """

    def __init__(self, input_dim, hidden1=256, hidden2=128, num_classes=3,
                 dropout=0.3):
        super().__init__()
        # The layer order inside the Sequential is kept fixed so state_dict
        # keys ("model.0.weight", "model.3.weight", ...) stay loadable.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden2, num_classes)
        )

    def forward(self, x):
        """Return class logits for the batch ``x``."""
        return self.model(x)


In [None]:
# Seed torch's global RNG before the model is built so that weight
# initialization, dropout masks and DataLoader shuffling repeat when the
# notebook is re-run top to bottom. Some GPU kernels can still be
# non-deterministic, so bit-exact results across devices are not guaranteed.
torch.manual_seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = MLPClassifier(input_dim=X_train.shape[1]).to(device)

# CrossEntropyLoss expects raw logits and integer class targets.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
epochs = 50

In [None]:
# Plain mini-batch training. The figure reported per epoch is the SUM of the
# per-batch losses, so it scales with the number of batches.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)

        # Standard step: reset gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.3f}")


Epoch 1/50 - Loss: 26.660
Epoch 2/50 - Loss: 21.547
Epoch 3/50 - Loss: 14.564
Epoch 4/50 - Loss: 8.388
Epoch 5/50 - Loss: 2.125
Epoch 6/50 - Loss: 0.387
Epoch 7/50 - Loss: 0.240
Epoch 8/50 - Loss: 0.129
Epoch 9/50 - Loss: 0.125
Epoch 10/50 - Loss: 0.104
Epoch 11/50 - Loss: 0.118
Epoch 12/50 - Loss: 0.105
Epoch 13/50 - Loss: 0.114
Epoch 14/50 - Loss: 0.098
Epoch 15/50 - Loss: 0.107
Epoch 16/50 - Loss: 0.087
Epoch 17/50 - Loss: 0.084
Epoch 18/50 - Loss: 0.062
Epoch 19/50 - Loss: 0.066
Epoch 20/50 - Loss: 0.088
Epoch 21/50 - Loss: 0.114
Epoch 22/50 - Loss: 0.086
Epoch 23/50 - Loss: 0.092
Epoch 24/50 - Loss: 0.095
Epoch 25/50 - Loss: 0.064
Epoch 26/50 - Loss: 0.074
Epoch 27/50 - Loss: 0.087
Epoch 28/50 - Loss: 0.074
Epoch 29/50 - Loss: 0.086
Epoch 30/50 - Loss: 0.081
Epoch 31/50 - Loss: 0.093
Epoch 32/50 - Loss: 0.087
Epoch 33/50 - Loss: 0.074
Epoch 34/50 - Loss: 0.085
Epoch 35/50 - Loss: 0.062
Epoch 36/50 - Loss: 0.072
Epoch 37/50 - Loss: 0.066
Epoch 38/50 - Loss: 0.096
Epoch 39/50 - Loss

In [None]:
# Inference pass over the test loader: dropout is disabled via eval() and no
# gradients are tracked. Predictions and true labels are collected as flat
# lists of numpy integers.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for features, targets in test_loader:
        logits = model(features.to(device))
        predicted = logits.argmax(dim=1).cpu().numpy()

        all_preds.extend(predicted)
        all_labels.extend(targets.numpy())


In [None]:
# Overall accuracy plus support-weighted precision / recall / F1 computed
# from the collected test predictions.
accuracy = accuracy_score(all_labels, all_preds)

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=all_labels,
    y_pred=all_preds,
    average="weighted",
)

print("\nEvaluation:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")



Evaluation:
Accuracy:  0.7949
Precision: 0.8193
Recall:    0.7949
F1 Score:  0.7783


In [None]:
# Numeric id -> label name, derived from labels_map so the two mappings
# cannot drift apart.
inv_label_map = {class_id: name for name, class_id in labels_map.items()}

# Convert numeric predictions and targets back to label names.
pred_names = [inv_label_map[p] for p in all_preds]
true_names = [inv_label_map[t] for t in all_labels]

# The data scored here is the hold-out split produced by train_test_split,
# and the weights are those left after the last training epoch (no best-epoch
# selection takes place), so the header states exactly that.
print("\nFinal evaluation on held-out test set (final-epoch model):\n")

print(classification_report(
    true_names,
    pred_names,
    digits=2
))



Final evaluation on validation set (best epoch metrics above):

              precision    recall  f1-score   support

 HATE_SPEECH       0.81      0.78      0.80       100
      NORMAL       0.77      0.95      0.85       211
   OFFENSIVE       0.97      0.41      0.57        79

    accuracy                           0.79       390
   macro avg       0.85      0.71      0.74       390
weighted avg       0.82      0.79      0.78       390



In [None]:
# Persist the trained weights. Only the state_dict is stored, so reloading
# requires constructing MLPClassifier with the same dimensions first and then
# calling load_state_dict.
torch.save(model.state_dict(), "kazakh_toxicity_mlp.pt")

# The fitted vectorizer is saved alongside the weights because it holds the
# vocabulary and IDF weights that define the model's input features.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(vectorizer, "kazakh_vectorizer.pkl")

print("Model & vectorizer saved.")


Model & vectorizer saved.
