# =====================================================

# Phase 4 - BiLSTM Evaluation Notebook

# =====================================================


In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import sys
import os

# ----------------------------

# 1. Load Dataset + Vocab

# ----------------------------


In [2]:
sys.path.append('..')

df = pd.read_csv("../data/clean_dataset.csv")
sample_df = df.sample(100, random_state=42)

# ----------------------------
# 2. Load BPE Tokenizer
# (re-train small tokenizer here if needed)
# ----------------------------
class BPETokenizer:
    def __init__(self):
        pass
    def encode(self, text):
        return list(text)   # simple char-level for demo
    def decode(self, tokens):
        return "".join(tokens)

bpe = BPETokenizer()
sequences = [bpe.encode(c) for c in sample_df['code'].astype(str)]

# Build vocab
all_tokens = [t for seq in sequences for t in seq]
vocab = list(set(all_tokens))
word_to_idx = {w: i for i, w in enumerate(vocab)}
idx_to_word = {i: w for w, i in word_to_idx.items()}

print("Vocab size:", len(vocab))
sequences = [bpe.encode(c) for c in sample_df['code'].astype(str)]

# Build vocab
all_tokens = [t for seq in sequences for t in seq]
vocab = list(set(all_tokens))
word_to_idx = {w: i for i, w in enumerate(vocab)}
idx_to_word = {i: w for w, i in word_to_idx.items()}

print("Vocab size:", len(vocab))

Vocab size: 99
Vocab size: 99


# ----------------------------

# 2. Define BiLSTM Model

# ----------------------------


In [3]:
class BiLSTMLM(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, vocab_size)
    def forward(self, x):
        x = self.embed(x)
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out

device = "cuda" if torch.cuda.is_available() else "cpu"

model = BiLSTMLM(len(vocab)).to(device)
model.load_state_dict(torch.load("../bilstm/bilstm_model.pt", map_location=device))
model.eval()

print("✅ Model loaded")

RuntimeError: Error(s) in loading state_dict for BiLSTMLM:
	size mismatch for embed.weight: copying a param with shape torch.Size([677, 100]) from checkpoint, the shape in current model is torch.Size([99, 100]).
	size mismatch for fc.weight: copying a param with shape torch.Size([677, 256]) from checkpoint, the shape in current model is torch.Size([99, 256]).
	size mismatch for fc.bias: copying a param with shape torch.Size([677]) from checkpoint, the shape in current model is torch.Size([99]).

# ----------------------------

# 3. Evaluate Perplexity

# ----------------------------


In [None]:
def compute_perplexity(model, sequences, seq_len=20):
    model.eval()
    losses = []
    criterion = nn.CrossEntropyLoss()
    for seq in sequences[:200]:  # limit to 200 samples for speed
        idxs = [word_to_idx[t] for t in seq if t in word_to_idx]
        for i in range(len(idxs) - seq_len):
            X = torch.tensor(idxs[i:i+seq_len], dtype=torch.long).unsqueeze(0).to(device)
            Y = torch.tensor(idxs[i+1:i+seq_len+1], dtype=torch.long).unsqueeze(0).to(device)
            with torch.no_grad():
                out = model(X)
                loss = criterion(out.view(-1, len(vocab)), Y.view(-1))
                losses.append(loss.item())
    avg_loss = np.mean(losses)
    return np.exp(avg_loss)

ppl = compute_perplexity(model, sequences)
print(f"\nModel Perplexity: {ppl:.2f}")


# ----------------------------

# 4. Text Generation

# ----------------------------


In [None]:
def generate_text(model, start_token="d", length=100):
    model.eval()
    tokens = [word_to_idx.get(start_token, 0)]
    for _ in range(length):
        inp = torch.tensor(tokens[-20:], dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            out = model(inp)
            next_token = torch.argmax(out[0, -1]).item()
        tokens.append(next_token)
    return bpe.decode([idx_to_word[i] for i in tokens])

print("\nGenerated text sample:\n")
print(generate_text(model, start_token="d", length=200))
