# Notebook 2: Huấn luyện Mô hình N-gram và LSTM
## Dự đoán từ tiếp theo - Next Word Prediction

Notebook này thực hiện:
1. Load dữ liệu đã tiền xử lý
2. Xây dựng mô hình N-gram từ đầu (không dùng thư viện)
3. Xây dựng mô hình LSTM từ đầu (không dùng PyTorch/TensorFlow)
4. Training và đánh giá
5. Lưu mô hình dưới dạng file .pkl

## 1. Import thư viện và Load dữ liệu

In [1]:
import heapq
import json
import os
import pickle
from collections import Counter, defaultdict

import numpy as np

print("Thư viện đã được import!")

Thư viện đã được import!


In [2]:
# Read every preprocessed artifact produced by the previous notebook.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load artifacts that this project generated itself.

# --- Vocabulary ---
print("Đang load dữ liệu...")
with open('../data/processed/vocabulary.pkl', 'rb') as fh:
    vocab_data = pickle.load(fh)
vocab = vocab_data['vocab']
word2idx = vocab_data['word2idx']
idx2word = vocab_data['idx2word']
vocab_size = vocab_data['vocab_size']

print(f"✓ Vocabulary size: {vocab_size}")

# --- (context, next-word) training pairs ---
with open('../data/processed/training_data.pkl', 'rb') as fh:
    train_data = pickle.load(fh)
X = train_data['X']
y = train_data['y']
max_seq_len = train_data['max_seq_len']

print(f"✓ Training data - X shape: {X.shape}, y shape: {y.shape}")

# --- Tokenized texts (input of the n-gram model) ---
with open('../data/processed/tokenized_texts.pkl', 'rb') as fh:
    tokenized_texts = pickle.load(fh)

print(f"✓ Tokenized texts: {len(tokenized_texts)} documents")

# --- Preprocessing configuration ---
with open('../data/processed/config.pkl', 'rb') as fh:
    config = pickle.load(fh)

print(f"\nConfig: {config}")

Đang load dữ liệu...
✓ Vocabulary size: 54822
✓ Training data - X shape: (4199877, 10), y shape: (4199877,)
✓ Tokenized texts: 210509 documents

Config: {'num_documents': 10000, 'num_cleaned_documents': 10000, 'num_sentences': 210509, 'num_sequences': 210509, 'vocab_size': 54822, 'max_seq_len': 10, 'min_word_freq': 2, 'num_training_pairs': 4199877, 'preprocessing_method': 'hierarchical'}


## 2. Mô hình N-gram (Tự xây dựng)

In [3]:
class NgramModel:
    """Count-based n-gram language model with additive smoothing.

    The model counts how often each word follows each (n-1)-word context and
    turns those counts into probabilities with additive ("add-k") smoothing,
    where k is ``smoothing``. With ``smoothing=1`` this is Laplace smoothing.

    Attributes:
        n: Order of the model (3 = trigram). Must be >= 1.
        smoothing: Constant added to every count before normalising.
        ngram_counts: context tuple -> {next word -> count}.
        context_counts: context tuple -> number of times the context was seen.
        vocab: Set of every token seen during training.
    """

    def __init__(self, n=3, smoothing=0.01):
        if n < 1:
            raise ValueError(f"n must be >= 1, got {n}")
        self.n = n
        self.smoothing = smoothing
        self.ngram_counts = defaultdict(lambda: defaultdict(int))
        self.context_counts = defaultdict(int)
        self.vocab = set()

    def train(self, tokenized_texts):
        """Accumulate n-gram counts from an iterable of token lists.

        Calling ``train`` again adds to the existing counts. Texts shorter
        than ``n`` tokens only contribute to the vocabulary.
        """
        print(f"Training {self.n}-gram model...")

        context_len = self.n - 1
        for text in tokenized_texts:
            self.vocab.update(text)

            # Slide a window of n tokens: the first n-1 are the context,
            # the last one is the word to predict.
            for start in range(len(text) - context_len):
                context = tuple(text[start:start + context_len])
                next_word = text[start + context_len]
                self.ngram_counts[context][next_word] += 1
                self.context_counts[context] += 1

        print(f"Trained on {len(tokenized_texts)} documents")
        print(f"Vocabulary size: {len(self.vocab)}")
        print(f"Unique contexts: {len(self.context_counts)}")

    def get_probabilities(self, context):
        """Return ``{word: P(word | context)}`` over the whole vocabulary.

        Args:
            context: Sequence of exactly the n-1 preceding tokens.

        Returns:
            A dict with one entry per vocabulary word. For a context seen in
            training the value is
            ``(count + smoothing) / (context_count + smoothing * |V|)``;
            for an unseen context it is the uniform ``1 / |V|``. An untrained
            model (empty vocabulary) returns an empty dict.
        """
        context = tuple(context)
        vocab_size = len(self.vocab)
        if vocab_size == 0:
            return {}

        # Use .get() rather than indexing: the count tables are defaultdicts,
        # and indexing would insert a zero-count entry for every unseen
        # context that is queried (growing memory and the saved model).
        context_count = self.context_counts.get(context, 0)

        if context_count > 0:
            followers = self.ngram_counts.get(context, {})
            denominator = context_count + self.smoothing * vocab_size
            return {
                word: (followers.get(word, 0) + self.smoothing) / denominator
                for word in self.vocab
            }

        # Unseen context: fall back to a uniform distribution.
        return dict.fromkeys(self.vocab, 1.0 / vocab_size)

    def predict_top_k(self, context, k=5):
        """Return the ``k`` most probable next words as ``(word, prob)`` pairs.

        Only the last n-1 tokens of ``context`` are used. A shorter context is
        passed through unchanged, which makes it an unseen context and yields
        the uniform fallback. Pairs are ordered by decreasing probability.
        """
        context_len = self.n - 1
        if len(context) >= context_len:
            # Slice from an explicit start index: ``context[-context_len:]``
            # would return the *whole* context when context_len == 0 (n=1).
            context = context[len(context) - context_len:]

        probs = self.get_probabilities(context)

        # nlargest is documented as equivalent to
        # sorted(..., reverse=True)[:k] but avoids sorting the full vocabulary.
        return heapq.nlargest(k, probs.items(), key=lambda item: item[1])

    def save(self, filepath):
        """Pickle the model's hyper-parameters, counts and vocabulary."""
        model_data = {
            'n': self.n,
            'smoothing': self.smoothing,
            # dict() drops the lambda default factory, which cannot be pickled.
            'ngram_counts': dict(self.ngram_counts),
            'context_counts': dict(self.context_counts),
            'vocab': list(self.vocab)
        }
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {filepath}")

    @staticmethod
    def load(filepath):
        """Rebuild a model from a file written by :meth:`save`.

        NOTE(review): ``pickle.load`` can execute arbitrary code; only load
        files from a trusted source.
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        model = NgramModel(n=model_data['n'], smoothing=model_data['smoothing'])
        model.ngram_counts = defaultdict(lambda: defaultdict(int), model_data['ngram_counts'])
        model.context_counts = defaultdict(int, model_data['context_counts'])
        model.vocab = set(model_data['vocab'])

        return model

In [4]:
# Fit a trigram (n=3) model on the tokenized corpus.
print("=" * 60, "TRAINING N-GRAM MODEL", "=" * 60, sep="\n")

ngram_model = NgramModel(n=3, smoothing=0.01)
ngram_model.train(tokenized_texts)

print("\n✓ N-gram model trained successfully!")

TRAINING N-GRAM MODEL
Training 3-gram model...
Trained on 210509 documents
Vocabulary size: 125859
Unique contexts: 1265991

✓ N-gram model trained successfully!


In [7]:
# Smoke-test the n-gram model on two hand-written contexts.
print("\nTest N-gram model:")
test_contexts = [['tôi', 'đi'], ['bài', 'báo', 'này', 'nói']]

for context in test_contexts:
    predictions = ngram_model.predict_top_k(context, k=5)
    print(f"\nContext: {context}")
    print("Top 5 predictions:")
    for word, prob in predictions:
        print(f"  {word}: {prob:.4f}")


Test N-gram model:

Context: ['tôi', 'đi']
Top 5 predictions:
  làm: 0.0059
  ăn: 0.0037
  dạo: 0.0022
  qua: 0.0022
  theo: 0.0022

Context: ['bài', 'báo', 'này', 'nói']
Top 5 predictions:
  rằng: 0.0069
  với: 0.0031
  thêm: 0.0031
  về: 0.0023
  và: 0.0015


## 3. Mô hình LSTM (Tự xây dựng bằng NumPy)

In [None]:
def sigmoid(x):
    """Return the element-wise logistic sigmoid of ``x``.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` stays finite in float64 for large-magnitude inputs.
    """
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))


def tanh(x):
    """Return the element-wise hyperbolic tangent of ``x``."""
    return np.tanh(x)


def softmax(x):
    """Return the softmax of ``x`` along its last axis.

    The per-row maximum is subtracted first, which leaves the result
    unchanged mathematically but keeps the exponentials from overflowing.
    """
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


class SimpleLSTM:
    """Single-layer LSTM next-word classifier implemented with NumPy only.

    Pipeline: embedding lookup -> LSTM over the sequence -> linear layer on
    the final hidden state -> softmax over the vocabulary.

    ``train_step`` performs one step of plain SGD using gradients from full
    backpropagation through time, so the embedding, all four gates and the
    output layer are trained.
    """

    # Every trainable array held as an attribute of the model.
    _PARAM_NAMES = (
        'embedding',
        'Wf', 'bf', 'Wi', 'bi', 'Wc', 'bc', 'Wo', 'bo',
        'Wy', 'by',
    )
    # (weight, bias) attribute names of the four gates, in cache order.
    _GATES = (('Wf', 'bf'), ('Wi', 'bi'), ('Wc', 'bc'), ('Wo', 'bo'))

    def __init__(self, vocab_size, embedding_dim=50, hidden_dim=128, max_seq_len=10):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len

        self._initialize_weights()

    def _initialize_weights(self):
        """Initialise weights as 0.01 * N(0, 1) and biases as zeros.

        NOTE: this reseeds NumPy's *global* random generator with 42, so
        every construction (including the one inside ``load``) resets the
        global random stream used by the rest of the program.
        """
        np.random.seed(42)
        gate_in = self.embedding_dim + self.hidden_dim

        # Embedding table: one row per vocabulary index.
        self.embedding = np.random.randn(self.vocab_size, self.embedding_dim) * 0.01

        # Each gate maps the concatenated [x_t, h_prev] to hidden_dim units.
        # Forget gate
        self.Wf = np.random.randn(gate_in, self.hidden_dim) * 0.01
        self.bf = np.zeros((1, self.hidden_dim))

        # Input gate
        self.Wi = np.random.randn(gate_in, self.hidden_dim) * 0.01
        self.bi = np.zeros((1, self.hidden_dim))

        # Candidate cell state
        self.Wc = np.random.randn(gate_in, self.hidden_dim) * 0.01
        self.bc = np.zeros((1, self.hidden_dim))

        # Output gate
        self.Wo = np.random.randn(gate_in, self.hidden_dim) * 0.01
        self.bo = np.zeros((1, self.hidden_dim))

        # Output projection from hidden state to vocabulary logits.
        self.Wy = np.random.randn(self.hidden_dim, self.vocab_size) * 0.01
        self.by = np.zeros((1, self.vocab_size))

    def _gates(self, x_t, h_prev, c_prev):
        """Run one LSTM timestep and also return what backprop needs.

        Returns:
            ``(h_t, c_t, cache)`` where ``cache`` is the tuple
            ``(combined, ft, it, c_tilde, ot, c_prev, tanh_c)``.
        """
        combined = np.concatenate([x_t, h_prev], axis=1)

        ft = sigmoid(np.dot(combined, self.Wf) + self.bf)  # forget gate
        it = sigmoid(np.dot(combined, self.Wi) + self.bi)  # input gate
        c_tilde = tanh(np.dot(combined, self.Wc) + self.bc)  # candidate cell
        c_t = ft * c_prev + it * c_tilde
        ot = sigmoid(np.dot(combined, self.Wo) + self.bo)  # output gate
        tanh_c = tanh(c_t)
        h_t = ot * tanh_c

        return h_t, c_t, (combined, ft, it, c_tilde, ot, c_prev, tanh_c)

    def forward_step(self, x_t, h_prev, c_prev):
        """Run one LSTM timestep.

        Args:
            x_t: Input embeddings, shape (batch_size, embedding_dim).
            h_prev: Previous hidden state, shape (batch_size, hidden_dim).
            c_prev: Previous cell state, shape (batch_size, hidden_dim).

        Returns:
            ``(h_t, c_t)``, the new hidden and cell states.
        """
        h_t, c_t, _ = self._gates(x_t, h_prev, c_prev)
        return h_t, c_t

    def _run_sequence(self, X, keep_cache):
        """Run the LSTM over ``X`` and return ``(probs, h_last, caches)``.

        ``caches`` holds one per-timestep cache when ``keep_cache`` is true
        and is empty otherwise. A 1-D ``X`` is treated as a batch of one.
        """
        X = np.atleast_2d(np.asarray(X))
        batch_size, seq_len = X.shape

        h = np.zeros((batch_size, self.hidden_dim))
        c = np.zeros((batch_size, self.hidden_dim))
        caches = []

        for t in range(seq_len):
            x_t = self.embedding[X[:, t]]  # (batch_size, embedding_dim)
            h, c, cache = self._gates(x_t, h, c)
            if keep_cache:
                caches.append(cache)

        logits = np.dot(h, self.Wy) + self.by
        return softmax(logits), h, caches

    def forward(self, X):
        """Run the full forward pass.

        Args:
            X: Integer word indices, shape (batch_size, seq_len). A 1-D
                sequence is treated as a batch of one.

        Returns:
            ``(probs, h)``: next-word probabilities of shape
            (batch_size, vocab_size) and the final hidden state of shape
            (batch_size, hidden_dim).
        """
        probs, h, _ = self._run_sequence(X, keep_cache=False)
        return probs, h

    def predict(self, X):
        """Return the index of the most probable next word for each row."""
        probs, _ = self.forward(X)
        return np.argmax(probs, axis=1)

    def predict_top_k(self, X, k=5):
        """Return the ``k`` most probable next-word indices per row.

        Returns:
            ``(top_k_indices, top_k_probs)``, both of shape
            (batch_size, min(k, vocab_size)), ordered by decreasing
            probability. ``k <= 0`` yields arrays with zero columns.
        """
        probs, _ = self.forward(X)

        # Reverse first, then take the first k columns: the earlier form
        # ``[:, -k:]`` returned *every* column for k == 0.
        k = max(int(k), 0)
        top_k_indices = np.argsort(probs, axis=1)[:, ::-1][:, :k]
        top_k_probs = np.take_along_axis(probs, top_k_indices, axis=1)

        return top_k_indices, top_k_probs

    def compute_loss(self, X, y):
        """Return the mean cross-entropy of targets ``y`` given inputs ``X``."""
        probs, _ = self.forward(X)
        y = np.asarray(y).reshape(-1)

        correct_probs = probs[np.arange(probs.shape[0]), y]
        # The small constant guards against log(0).
        return -np.mean(np.log(correct_probs + 1e-10))

    def _backward(self, X, y, probs, h_last, caches):
        """Backpropagate the mean cross-entropy loss through time.

        Args:
            X: Input indices, shape (batch_size, seq_len).
            y: Target indices, shape (batch_size,).
            probs: Softmax output of the forward pass.
            h_last: Final hidden state of the forward pass.
            caches: Per-timestep caches produced by ``_gates``.

        Returns:
            Dict mapping each name in ``_PARAM_NAMES`` to the gradient of the
            loss with respect to that parameter.
        """
        batch_size = probs.shape[0]
        emb_dim = self.embedding_dim

        # Gradient of mean cross-entropy w.r.t. the logits: softmax - one_hot.
        d_logits = probs.copy()
        d_logits[np.arange(batch_size), y] -= 1
        d_logits /= batch_size

        grads = {
            'Wy': np.dot(h_last.T, d_logits),
            'by': np.sum(d_logits, axis=0, keepdims=True),
            'embedding': np.zeros_like(self.embedding),
        }
        for w_name, b_name in self._GATES:
            grads[w_name] = np.zeros_like(getattr(self, w_name))
            grads[b_name] = np.zeros_like(getattr(self, b_name))

        dh = np.dot(d_logits, self.Wy.T)
        dc = np.zeros_like(dh)

        for t in range(len(caches) - 1, -1, -1):
            combined, ft, it, c_tilde, ot, c_prev, tanh_c = caches[t]

            # h_t = ot * tanh(c_t): route dh into the cell state.
            dc = dc + dh * ot * (1.0 - tanh_c ** 2)

            # Gradients w.r.t. each gate's pre-activation.
            dz_f = dc * c_prev * ft * (1.0 - ft)
            dz_i = dc * c_tilde * it * (1.0 - it)
            dz_c = dc * it * (1.0 - c_tilde ** 2)
            dz_o = dh * tanh_c * ot * (1.0 - ot)

            d_combined = np.zeros_like(combined)
            for (w_name, b_name), dz in zip(self._GATES, (dz_f, dz_i, dz_c, dz_o)):
                grads[w_name] += np.dot(combined.T, dz)
                grads[b_name] += np.sum(dz, axis=0, keepdims=True)
                d_combined += np.dot(dz, getattr(self, w_name).T)

            # combined = [x_t, h_prev]: split the gradient back apart.
            # add.at accumulates correctly when a word repeats in the batch.
            np.add.at(grads['embedding'], X[:, t], d_combined[:, :emb_dim])
            dh = d_combined[:, emb_dim:]
            dc = dc * ft

        return grads

    def train_step(self, X_batch, y_batch, learning_rate=0.01, grad_clip=5.0):
        """Perform one SGD update using full backpropagation through time.

        Args:
            X_batch: Input indices, shape (batch_size, seq_len).
            y_batch: Target indices, shape (batch_size,).
            learning_rate: SGD step size.
            grad_clip: Maximum global L2 norm of the gradient; larger
                gradients are rescaled to this norm. ``None`` disables
                clipping.

        Returns:
            The mean cross-entropy loss of the batch, measured *before* the
            parameter update (no second forward pass is needed).
        """
        X_batch = np.atleast_2d(np.asarray(X_batch))
        y_batch = np.asarray(y_batch).reshape(-1)

        probs, h_last, caches = self._run_sequence(X_batch, keep_cache=True)
        rows = np.arange(probs.shape[0])
        loss = -np.mean(np.log(probs[rows, y_batch] + 1e-10))

        grads = self._backward(X_batch, y_batch, probs, h_last, caches)

        # The previous implementation skipped index 0 as padding when
        # touching embeddings; keep that row frozen.
        # NOTE(review): assumes index 0 is the <PAD> token - confirm.
        if self.vocab_size > 0:
            grads['embedding'][0] = 0.0

        if grad_clip is not None:
            total_norm = np.sqrt(sum(float(np.vdot(g, g)) for g in grads.values()))
            if total_norm > grad_clip:
                scale = grad_clip / total_norm
                for grad in grads.values():
                    grad *= scale

        for name, grad in grads.items():
            param = getattr(self, name)
            param -= learning_rate * grad  # in-place update of the weights

        return loss

    def save(self, filepath):
        """Pickle the hyper-parameters and all weight arrays to ``filepath``."""
        model_data = {
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim,
            'hidden_dim': self.hidden_dim,
            'max_seq_len': self.max_seq_len,
        }
        for name in self._PARAM_NAMES:
            model_data[name] = getattr(self, name)

        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {filepath}")

    @staticmethod
    def load(filepath):
        """Rebuild a model from a file written by :meth:`save`.

        NOTE(review): ``pickle.load`` can execute arbitrary code; only load
        files from a trusted source.
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        model = SimpleLSTM(
            vocab_size=model_data['vocab_size'],
            embedding_dim=model_data['embedding_dim'],
            hidden_dim=model_data['hidden_dim'],
            max_seq_len=model_data['max_seq_len']
        )

        # Replace the freshly initialised weights with the stored ones.
        for name in SimpleLSTM._PARAM_NAMES:
            setattr(model, name, model_data[name])

        return model

## 4. Training LSTM Model

In [None]:
print("=" * 60)
print("TRAINING LSTM MODEL")
print("=" * 60)

# Instantiate the model; vocab_size and max_seq_len come from the
# preprocessed data loaded earlier in the notebook.
lstm_model = SimpleLSTM(
    vocab_size=vocab_size,
    embedding_dim=50,
    hidden_dim=128,
    max_seq_len=max_seq_len,
)

# Report the sizes stored on the model itself so the printout cannot drift
# from the constructor arguments above.
print("\nModel initialized:")
print(f"  Vocab size: {lstm_model.vocab_size}")
print(f"  Embedding dim: {lstm_model.embedding_dim}")
print(f"  Hidden dim: {lstm_model.hidden_dim}")
print(f"  Max sequence length: {lstm_model.max_seq_len}")

In [None]:
# Hold out the last 10% of the pairs (in stored order, no shuffling)
# for validation; the first 90% are used for training.
train_size = int(0.9 * len(X))

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:], y[train_size:]

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")

In [None]:
# Mini-batch training loop for the LSTM.
epochs = 5
batch_size = 128
learning_rate = 0.01

print(f"\nTraining for {epochs} epochs...")
print(f"Batch size: {batch_size}")
print(f"Learning rate: {learning_rate}")

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")

    # Draw a fresh random order of the training pairs for this epoch.
    indices = np.random.permutation(len(X_train))
    X_train_shuffled = X_train[indices]
    y_train_shuffled = y_train[indices]

    # Only full batches are used; a trailing partial batch is dropped.
    num_batches = len(X_train) // batch_size
    total_loss = 0

    for batch_no in range(num_batches):
        i = batch_no * batch_size
        X_batch = X_train_shuffled[i:i+batch_size]
        y_batch = y_train_shuffled[i:i+batch_size]

        loss = lstm_model.train_step(X_batch, y_batch, learning_rate)
        total_loss += loss

        # Progress report every 50 batches.
        if batch_no % 50 == 0:
            print(f"  Batch {i//batch_size}/{num_batches}, Loss: {loss:.4f}")

    avg_loss = total_loss / num_batches

    # Validation loss is estimated on the first 1000 held-out pairs only.
    val_loss = lstm_model.compute_loss(X_val[:1000], y_val[:1000])

    print(f"  Train Loss: {avg_loss:.4f}, Val Loss: {val_loss:.4f}")

print("\n✓ LSTM model trained successfully!")

In [None]:
# Smoke-test the LSTM on the first five validation samples.
print("\nTest LSTM model:")
test_samples = X_val[:5]

for i, sample in enumerate(test_samples):
    # Turn the index sequence back into words, dropping padding positions.
    pad_idx = word2idx['<PAD>']
    input_words = [idx2word[idx] for idx in sample if idx != pad_idx]

    # The model expects a batch, so wrap the single sample as shape (1, seq_len).
    batch_of_one = sample.reshape(1, -1)
    top_k_indices, top_k_probs = lstm_model.predict_top_k(batch_of_one, k=5)

    print(f"\nSample {i+1}:")
    print(f"Input: {' '.join(input_words)}")
    print("Top 5 predictions:")
    for idx, prob in zip(top_k_indices[0], top_k_probs[0]):
        print(f"  {idx2word[idx]}: {prob:.4f}")

## 5. Đánh giá mô hình

In [None]:
def calculate_accuracy_top_k(model, X, y, k=5, model_type='lstm'):
    """Return the fraction of rows whose true next word is in the top ``k``.

    Args:
        model: Object exposing ``predict_top_k(X_batch, k=k)`` that returns
            ``(indices, probs)`` with one row of indices per input row.
        X: Input sequences; sliced in batches along the first axis.
        y: True next-word indices, one per row of ``X``.
        k: Number of predictions considered per row.
        model_type: Only ``'lstm'`` is implemented.

    Returns:
        Accuracy in [0, 1]; ``0.0`` when ``y`` is empty.

    Raises:
        ValueError: If ``model_type`` is not ``'lstm'``. Previously such
            calls silently reported 0.0 accuracy without evaluating anything.
    """
    if model_type != 'lstm':
        raise ValueError(
            f"Unsupported model_type: {model_type!r} (only 'lstm' is implemented)"
        )

    total = len(y)
    if total == 0:
        return 0.0

    # Evaluate in small batches to bound the size of the probability matrix.
    batch_size = 100
    correct = 0

    for start in range(0, len(X), batch_size):
        X_batch = X[start:start + batch_size]
        y_batch = np.asarray(y[start:start + batch_size])

        top_k_indices, _ = model.predict_top_k(X_batch, k=k)
        top_k_indices = np.asarray(top_k_indices)[:len(y_batch)]

        # A row is a hit when its true index equals any of its top-k indices.
        hits = (top_k_indices == y_batch[:, None]).any(axis=1)
        correct += int(hits.sum())

    return correct / total

# Report top-1/3/5 accuracy of the LSTM on the first `val_subset`
# validation pairs.
print("\nĐánh giá LSTM model:")
val_subset = 1000
X_eval, y_eval = X_val[:val_subset], y_val[:val_subset]
for k in (1, 3, 5):
    acc = calculate_accuracy_top_k(lstm_model, X_eval, y_eval, k=k, model_type='lstm')
    print(f"  Top-{k} Accuracy: {acc*100:.2f}%")

## 6. Lưu mô hình

In [None]:
# Make sure the output directory exists before writing the models.
os.makedirs('../models', exist_ok=True)

print("Đang lưu mô hình...")

# Persist the n-gram model.
ngram_model.save('../models/ngram_model.pkl')
print("✓ N-gram model saved")

# Persist the LSTM model.
lstm_model.save('../models/lstm_model.pkl')
print("✓ LSTM model saved")

# Closing summary, printed one line at a time.
summary_lines = (
    "\n" + "="*60,
    "HOÀN THÀNH TRAINING!",
    "="*60,
    "Các mô hình đã được lưu trong thư mục: ../models/",
    "\nCác file:",
    "  - ngram_model.pkl: Mô hình N-gram",
    "  - lstm_model.pkl: Mô hình LSTM",
    "\nBạn có thể chuyển sang notebook tiếp theo để test mô hình!",
)
for line in summary_lines:
    print(line)