# SNLI with spaCy & ESIM
Load the SNLI dataset, tokenize it with spaCy, build the embedding matrix, pad the sequences, and then train and evaluate the ESIM model.

In [None]:
import spacy
import numpy as np
import spacy
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.amp import autocast, GradScaler
from models.esim import ESIM
from tqdm.notebook import tqdm

from utils import load_nli_data

# Load the spaCy pipeline with all of the listed components excluded. The
# cells below only call nlp.pipe() for tokenization and read
# nlp.vocab.vectors for the word vectors.
nlp = spacy.load(
    "en_core_web_lg",
    exclude=[
        "parser",
        "tagger",
        "ner",
        "textcat",
        "lemmatizer",
        "attribute_ruler",
        "tok2vec",
    ],
)
# Reports len() of the spaCy vector table.
print("unique vector size", len(nlp.vocab.vectors))

# Hyper-parameters
MAX_LEN = 64  # default max_length used when the splits are tokenized and saved
BATCH_SIZE = 128  # batch size of the train/dev/test DataLoaders
EPOCHS = 3  # number of passes of the training loop
HIDDEN = 512  # passed to ESIM as hidden_size
NUM_CLASSES = 3  # passed to ESIM as num_classes
LR = 1e-3  # Adam learning rate
NR_UNK = 100  # number of OOV buckets (ids 1..NR_UNK) in the id layout

# Device: use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Using device:", device)

In [None]:
# Read the three SNLI splits with the project helper load_nli_data.
# NOTE(review): later cells index the result with "sentence1", "sentence2"
# and "label", so it is presumably a DataFrame-like object — confirm in utils.
snli_train = load_nli_data("data/snli_1.0_train.jsonl")
snli_dev = load_nli_data("data/snli_1.0_dev.jsonl")
snli_test = load_nli_data("data/snli_1.0_test.jsonl")

# 1-Tokenization and Preprocessing NLI Pairs
* The following function tokenizes the premise and hypothesis sentences with spaCy and maps each token to an integer id based on the spaCy vector table.

In [None]:
def tokenize_to_ids(texts, max_length=64, nr_unk=100):
    """
    Convert texts to padded rows of token IDs using the spaCy vocabulary.

    ID layout: 0=PAD, 1..nr_unk=OOV buckets, nr_unk+1..=vector tokens
    (the spaCy vector-table row index shifted by nr_unk + 1).

    Args:
        texts: sized iterable of strings; len(texts) fixes the number of
            output rows and the progress-bar total.
        max_length: number of IDs per row. Longer documents are truncated,
            shorter ones are right-padded with 0.
        nr_unk: number of OOV buckets.

    Returns:
        np.ndarray of shape (len(texts), max_length) and dtype int32.
    """
    vec_key2row = nlp.vocab.vectors.key2row  # dict: lex_id -> row_index
    # Rows start out as all-PAD, so only the real tokens need to be written.
    all_ids = np.zeros((len(texts), max_length), dtype=np.int32)

    docs = nlp.pipe(texts, n_process=-1)
    for i, doc in enumerate(tqdm(docs, total=len(texts), desc="Tokenizing")):
        for j, token in enumerate(doc[:max_length]):
            row = vec_key2row.get(token.orth)
            if row is not None and token.vector_norm > 0:
                all_ids[i, j] = row + nr_unk + 1
            else:
                # Bucket OOV tokens with spaCy's own string hash (token.orth).
                # The built-in hash() of a str is salted per interpreter
                # process, so it would send the same word to different
                # buckets in different sessions and make saved splits
                # inconsistent with each other.
                all_ids[i, j] = 1 + (token.orth % nr_unk)

    return all_ids

* The following function uses `tokenize_to_ids` to convert both sentence columns into token ids and saves the result. The process is repeated for every NLI split.

In [None]:
def process_and_save_nli_data(df, name, max_length=MAX_LEN, nr_unk=NR_UNK):
    """
    Tokenize one NLI split and store it as a compressed .npz archive.

    The "sentence1" and "sentence2" columns of df are converted with
    tokenize_to_ids and written, together with the "label" column, to
    data/{name}.npz under the keys "sentence1_tokens", "sentence2_tokens"
    and "label".

    Args:
        df: table indexable by "sentence1", "sentence2" and "label".
        name: file stem of the archive inside data/.
        max_length: forwarded to tokenize_to_ids.
        nr_unk: forwarded to tokenize_to_ids.
    """
    output_path = f"data/{name}.npz"

    # Tokenize the two sentence columns one after the other.
    first_ids = tokenize_to_ids(
        df["sentence1"], max_length=max_length, nr_unk=nr_unk
    )
    second_ids = tokenize_to_ids(
        df["sentence2"], max_length=max_length, nr_unk=nr_unk
    )

    # Save the processed data
    np.savez_compressed(
        output_path,
        sentence1_tokens=first_ids,
        sentence2_tokens=second_ids,
        label=df["label"],
    )
    print(f"Saved {output_path}")


# Tokenize and save every split, in train -> dev -> test order.
for split_name, split_df in (
    ("train", snli_train),
    ("dev", snli_dev),
    ("test", snli_test),
):
    process_and_save_nli_data(split_df, split_name)

# 2-Extract the Embedding Matrix from the spaCy Pipeline

In [None]:
def get_embeddings_spacy3(nlp, nr_unk=100, seed=None):
    """
    Build (PAD + OOV + spaCy) matrix for ESIM.
      PAD row = 0 (all zeros)
      next nr_unk rows = random unit-norm OOV vectors
      then the spaCy vector-table rows, shifted by nr_unk + 1

    Args:
        nlp: object exposing nlp.vocab.vectors with .shape, .data and
            .key2row (a mapping key -> row index).
        nr_unk: number of OOV rows.
        seed: optional int. When given, the OOV rows are drawn from a
            private RandomState(seed), so the matrix is reproducible.
            None (the default) keeps the previous behaviour of drawing from
            NumPy's global random state.

    Returns:
        float32 array of shape (1 + nr_unk + n_rows, dim).
    """
    vecs = nlp.vocab.vectors
    n_rows, dim = vecs.shape
    offset = nr_unk + 1  # first row that holds a spaCy vector

    # init (row 0 stays all-zero for PAD)
    emb = np.zeros((offset + n_rows, dim), dtype="float32")

    # random OOV vectors (unit-norm)
    rng = np.random if seed is None else np.random.RandomState(seed)
    oov = rng.normal(size=(nr_unk, dim)).astype("float32")
    oov /= np.linalg.norm(oov, axis=1, keepdims=True)
    emb[1:offset] = oov

    # Copy every table row that at least one key points at, in a single
    # vectorised assignment instead of one Python-level copy per key
    # (several keys can share a row). Rows that no key refers to stay zero.
    rows = np.array(sorted(set(vecs.key2row.values())), dtype=np.intp)
    emb[offset + rows] = vecs.data[rows]

    return emb

In [None]:
# Build the (PAD + OOV + spaCy) embedding matrix and persist it so the
# training cells can reload it from disk.
embedding_matrix = get_embeddings_spacy3(nlp, NR_UNK)
np.save("data/embedding_matrix.npy", embedding_matrix)
# Report the path that was actually written; the message previously named
# "emb_matrix.npy", which is not the file np.save creates.
print("Saved data/embedding_matrix.npy with shape", embedding_matrix.shape)

# 3-Train the ESIM Model
* Load the preprocessed token-id arrays and the embedding matrix.

In [None]:
# Load the data
# Each archive holds the arrays written by process_and_save_nli_data under
# the keys "sentence1_tokens", "sentence2_tokens" and "label".
train_data = np.load("data/train.npz")
dev_data = np.load("data/dev.npz")
test = np.load("data/test.npz")

# Token ids and labels are converted to int64 tensors.
# NOTE(review): assumes "label" was stored as a numeric array — confirm
# against what load_nli_data returns.
x1_train = torch.tensor(train_data["sentence1_tokens"], dtype=torch.long)
x2_train = torch.tensor(train_data["sentence2_tokens"], dtype=torch.long)
y_train = torch.tensor(train_data["label"], dtype=torch.long)

x1_dev = torch.tensor(dev_data["sentence1_tokens"], dtype=torch.long)
x2_dev = torch.tensor(dev_data["sentence2_tokens"], dtype=torch.long)
y_dev = torch.tensor(dev_data["label"], dtype=torch.long)

x1_test = torch.tensor(test["sentence1_tokens"], dtype=torch.long)
x2_test = torch.tensor(test["sentence2_tokens"], dtype=torch.long)
y_test = torch.tensor(test["label"], dtype=torch.long)

# Datasets & loaders
# Every batch is a (sentence1_ids, sentence2_ids, label) triple.
train_ds = TensorDataset(x1_train, x2_train, y_train)
dev_ds = TensorDataset(x1_dev, x2_dev, y_dev)

# Only the training loader shuffles; dev and test keep the stored order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_ds, batch_size=BATCH_SIZE)
test_loader = DataLoader(TensorDataset(x1_test, x2_test, y_test), batch_size=BATCH_SIZE)

# Load embedding matrix (the file saved by the embedding-extraction cell)
emb_mat = torch.tensor(np.load("data/embedding_matrix.npy"), dtype=torch.float32)

In [None]:
# Model, optimizer, scaler, loss
# ESIM comes from the project module models.esim. padding_idx=0 matches the
# PAD id documented in tokenize_to_ids.
model = ESIM(
    embedding_matrix=emb_mat,
    hidden_size=HIDDEN,
    num_classes=NUM_CLASSES,
    dropout=0.5,
    padding_idx=0,
).to(device)

# From here on `model` is the torch.compile wrapper, not the bare ESIM module.
model = torch.compile(model, backend="inductor")

opt = optim.Adam(model.parameters(), lr=LR)
# Gradient scaler used together with autocast in the training loop.
# NOTE(review): constructed with default arguments whatever `device` is —
# confirm the behaviour when running on CPU.
scaler = GradScaler()
crit = nn.CrossEntropyLoss()

In [None]:
def compute_lengths(x):
    """Return, for each row of x, how many entries are different from 0."""
    non_zero = x != 0
    return non_zero.sum(dim=1)


def evaluate(loader, return_loss=False):
    """
    Score the global `model` on every batch of a DataLoader.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must be an (x1, x2, y) triple; sequence lengths are derived
    with compute_lengths.

    Returns:
        accuracy when return_loss is False, otherwise (avg_loss, accuracy),
        where avg_loss is the per-sample mean of `crit` over the loader.
    """
    model.eval()
    n_seen = 0
    n_correct = 0
    loss_sum = 0.0

    with torch.no_grad():
        for batch in loader:
            x1, x2, y = (part.to(device) for part in batch)
            logits = model(x1, compute_lengths(x1), x2, compute_lengths(x2))

            batch_size = y.size(0)
            predicted = logits.argmax(dim=1)
            n_correct += (predicted == y).sum().item()
            n_seen += batch_size

            if return_loss:
                # crit returns the batch mean, so weight it by batch size.
                loss_sum += crit(logits, y).item() * batch_size

    accuracy = n_correct / n_seen
    if not return_loss:
        return accuracy
    return loss_sum / n_seen, accuracy

In [None]:
# Train for EPOCHS epochs with mixed precision (autocast + GradScaler) and
# report dev loss/accuracy after every epoch.
for epoch in range(1, EPOCHS+1):
    # evaluate() leaves the model in eval mode, so switch back every epoch.
    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False)
    # Running totals over the samples seen so far in this epoch.
    running_loss = 0
    running_correct = 0
    samples = 0

    for x1, x2, y in pbar:
        x1, x2, y = x1.to(device), x2.to(device), y.to(device)
        # Sequence lengths = number of non-PAD (non-zero) ids per row.
        l1, l2 = compute_lengths(x1), compute_lengths(x2)

        opt.zero_grad()
        # Forward pass and loss run under autocast for the current device type.
        with autocast(device_type=device.type):
            logits = model(x1, l1, x2, l2)
            loss   = crit(logits, y)
        # Backward on the scaled loss, then let the scaler step the optimizer
        # and update its scale factor.
        scaler.scale(loss).backward()
        scaler.step(opt); scaler.update()

        bs = y.size(0)
        # loss is a batch mean, so weight it by the batch size before summing.
        running_loss    += loss.item() * bs
        running_correct += (logits.argmax(1)==y).sum().item()
        samples         += bs

        # Per-sample averages since the start of the epoch, shown on the bar.
        avg_loss = running_loss / samples
        avg_acc  = running_correct / samples
        pbar.set_postfix(loss=f"{avg_loss:.4f}", acc=f"{avg_acc:.4f}")

    # End-of-epoch evaluation on the dev split
    dev_loss, dev_acc = evaluate(dev_loader, return_loss=True)
    print(f"→ Dev  loss: {dev_loss:.4f}, acc: {dev_acc:.4f}")

In [None]:
# Serialize the whole model object (not just a state_dict).
# NOTE(review): `model` is the torch.compile wrapper created earlier —
# confirm that it round-trips through torch.save/torch.load on the installed
# PyTorch version.
torch.save(model, "data/esim_nli_model.pt")

In [None]:
# Reload the full pickled model onto the current device.
# NOTE: weights_only=False lets torch.load unpickle arbitrary Python objects,
# so only load checkpoint files you trust.
model = torch.load("data/esim_nli_model.pt", map_location=device, weights_only=False)
model.eval()

In [None]:
# Final evaluation on the held-out test split; evaluate() uses the global
# `model`, i.e. the one reloaded from disk in the previous cell.
test_loss, test_acc = evaluate(test_loader, return_loss=True)
print(f"Test  loss: {test_loss:.4f},  acc: {test_acc:.4f}")