3. Задача класифікації текстів

In [4]:
# === Завантаження і підготовка GloVe 6B 100d для Colab ===
import os
import gdown
import zipfile

# Створимо папку для GloVe
glove_dir = "/content/glove"
os.makedirs(glove_dir, exist_ok=True)

# URL на GloVe 6B на офіційному сайті (можемо скачати через gdown або wget)
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
zip_path = os.path.join(glove_dir, "glove.6B.zip")

# Завантажуємо, якщо ще не скачано
if not os.path.exists(zip_path):
    !wget -O {zip_path} {glove_url}

# Розпаковуємо
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(glove_dir)

# Шлях до потрібного файлу 100d
glove_path = os.path.join(glove_dir, "glove.6B.100d.txt")
print("GloVe 100d готовий за шляхом:", glove_path)


--2025-12-18 22:26:11--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-12-18 22:26:12--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-12-18 22:26:12--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/content/glove/glov

In [6]:

# === Імпорт ====================================
import pandas as pd
import numpy as np
import re
import string
import time
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# === Завантаження датасету =====================
url = "https://raw.githubusercontent.com/ilchukjulia059-cyber/NeuroLabs/refs/heads/main/Lab2/df_file.csv"
df = pd.read_csv(url)

# Встановлюємо назви колонок
df.columns = ['text', 'label']

# === Очищення тексту ==========================
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = [w for w in text.split() if len(w) > 2]  # простий токенізатор
    return tokens

df['tokens'] = df['text'].apply(clean_text)

# === Створення словника ========================
MAX_VOCAB = 20000
specials = ["[PAD]", "[UNK]"]
counter = Counter()
for tok_list in df["tokens"]:
    counter.update(tok_list)

most_common = counter.most_common(MAX_VOCAB - len(specials))
itos = specials + [w for w, _ in most_common]
stoi = {w: i for i, w in enumerate(itos)}
PAD_IDX = stoi["[PAD]"]
UNK_IDX = stoi["[UNK]"]

def encode(tokens):
    return [stoi.get(t, UNK_IDX) for t in tokens]

MAX_LEN = 100
def pad_sequence(seq):
    seq = seq[:MAX_LEN] + [PAD_IDX] * max(0, MAX_LEN - len(seq))
    return torch.tensor(seq, dtype=torch.long)

df['encoded'] = df['tokens'].apply(encode)
X = torch.stack(df['encoded'].apply(pad_sequence).tolist())
y = torch.tensor(df['label'].values, dtype=torch.long)

# === Dataset / DataLoader ======================
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X, y)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# === Модель BiLSTM =============================
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_classes, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
    def forward(self, x):
        x = self.embedding(x)
        out, (h, c) = self.lstm(x)
        h_cat = torch.cat([h[-2], h[-1]], dim=1)
        return self.fc(h_cat)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def evaluate_accuracy(model, loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb).argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    return 100 * correct / total

# === Модель A: випадкові embedding ==============
vocab_size = len(stoi)
num_classes = df['label'].nunique()
model_a = BiLSTM(vocab_size, emb_dim=100, hidden_dim=64, num_classes=num_classes, pad_idx=PAD_IDX).to(device)
criterion = nn.CrossEntropyLoss()
optimizer_a = torch.optim.Adam(model_a.parameters(), lr=1e-3)

train_loss_a, val_loss_a, accs_a = [], [], []
start_a = time.time()
for epoch in range(5):
    model_a.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer_a.zero_grad()
        out = model_a(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer_a.step()
        total_loss += loss.item()
    train_loss_a.append(total_loss / len(train_loader))
    val_acc = evaluate_accuracy(model_a, val_loader)
    accs_a.append(val_acc)
    print(f"[A {epoch+1}] train_loss={train_loss_a[-1]:.4f} | val_acc={val_acc:.2f}%")
time_a = time.time() - start_a
acc_a = evaluate_accuracy(model_a, val_loader)
print(f"\nТочність моделі A (random embedding): {acc_a:.2f}%")

# === Модель B: GloVe embedding =================
# Завантажити свій GloVe файл (наприклад 100d)
glove_path = "/content/glove/glove.6B.100d.txt"  # замініть на свій шлях
def load_glove(path):
    vectors = {}
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word = parts[0]
            vec = np.asarray(parts[1:], dtype=np.float32)
            vectors[word] = vec
    dim = len(next(iter(vectors.values())))
    return vectors, dim

# Якщо GloVe є
glove, emb_dim = load_glove(glove_path)
emb_matrix = np.random.normal(scale=0.1, size=(len(stoi), emb_dim)).astype(np.float32)
emb_matrix[PAD_IDX] = 0.0
hit = 0
for w, idx in stoi.items():
    v = glove.get(w)
    if v is not None:
        emb_matrix[idx] = v
        hit += 1
print(f"GloVe coverage: {hit}/{len(stoi)} = {hit/len(stoi):.1%}")

pretrained_emb = nn.Embedding.from_pretrained(torch.tensor(emb_matrix), freeze=False, padding_idx=PAD_IDX).to(device)
model_b = BiLSTM(len(stoi), emb_dim, 64, num_classes, PAD_IDX).to(device)
model_b.embedding = pretrained_emb
optimizer_b = torch.optim.Adam(model_b.parameters(), lr=1e-3)

train_loss_b, accs_b = [], []
start_b = time.time()
for epoch in range(5):
    model_b.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer_b.zero_grad()
        out = model_b(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer_b.step()
        total_loss += loss.item()
    train_loss_b.append(total_loss / len(train_loader))
    val_acc = evaluate_accuracy(model_b, val_loader)
    accs_b.append(val_acc)
    print(f"[B {epoch+1}] train_loss={train_loss_b[-1]:.4f} | val_acc={val_acc:.2f}%")
time_b = time.time() - start_b
acc_b = evaluate_accuracy(model_b, val_loader)
print(f"\nТочність моделі B (GloVe embedding): {acc_b:.2f}%")

# === Порівняння =================================
print("\n--- Порівняння моделей ---")
print(f"{'Метод':<20} {'Точність (%)':<15} {'Час навчання (с)':<18}")
print("-" * 55)
print(f"{'A: Random Embedding':<20} {acc_a:.2f}           {time_a:.2f}")
print(f"{'B: GloVe Embedding':<20} {acc_b:.2f}           {time_b:.2f}")


[A 1] train_loss=1.5933 | val_acc=30.56%
[A 2] train_loss=1.4892 | val_acc=34.38%
[A 3] train_loss=1.3090 | val_acc=47.87%
[A 4] train_loss=0.9255 | val_acc=64.72%
[A 5] train_loss=0.5221 | val_acc=70.56%

Точність моделі A (random embedding): 70.56%
GloVe coverage: 19600/20000 = 98.0%
[B 1] train_loss=1.3799 | val_acc=79.78%
[B 2] train_loss=0.5184 | val_acc=90.34%
[B 3] train_loss=0.2662 | val_acc=88.09%
[B 4] train_loss=0.2187 | val_acc=95.28%
[B 5] train_loss=0.1286 | val_acc=95.96%

Точність моделі B (GloVe embedding): 95.96%

--- Порівняння моделей ---
Метод                Точність (%)    Час навчання (с)  
-------------------------------------------------------
A: Random Embedding  70.56           25.70
B: GloVe Embedding   95.96           23.36
