In [1]:
!pip install torch conllu tqdm

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvid

In [3]:
from google.colab import drive

# Mount the drive to /content/drive (change if needed)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence
from conllu import parse_incr
from sklearn.metrics import accuracy_score
from tqdm import tqdm, trange
import collections

# -----------------------------
# HYPERPARAMETERS (You can play with these)
# -----------------------------
EMBEDDING_DIM = 128    # Size of the word embedding vectors
HIDDEN_DIM = 256       # Size of the hidden dimension in the LSTM and MLP layers
BATCH_SIZE = 64       # Number of samples per training batch
EPOCHS = 10             # Training epochs
LEARNING_RATE = 0.001  # Initial learning rate

# Detect GPU (CUDA) or default to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ------------------------------------------------------------------
# DATASET CLASS (WORD-LEVEL) + VOCAB BUILDING
# ------------------------------------------------------------------
class DependencyDataset(Dataset):

    def __init__(self, data_file, word2idx=None, build_vocab=False):
        self.sentences = []
        self.heads = []
        self.word2idx = word2idx
        self.build_vocab = build_vocab
        self.word_counter = collections.Counter() if build_vocab else None
        self._load_data(data_file)
        if self.build_vocab:
            # Reserve index 0 for <PAD>, 1 for <UNK>
            self.word2idx = {"<PAD>": 0, "<UNK>": 1}
            for w, _ in self.word_counter.most_common():
                if w not in self.word2idx:
                    self.word2idx[w] = len(self.word2idx)

    def _load_data(self, data_file):
        with open(data_file, encoding="utf-8") as f:
            for sentence in parse_incr(f):
                tokens = [token["form"] for token in sentence]
                heads_raw = [token["head"] for token in sentence]

                if any(h is None for h in heads_raw):
                    continue

                if self.build_vocab:
                    self.word_counter.update(tokens)

                shifted_heads = []
                for h in heads_raw:
                    if h == 0:
                        shifted_heads.append(-1)   # root -> ignore index
                    else:
                        shifted_heads.append(h - 1)  # 1-based -> 0-based

                self.sentences.append(tokens)
                self.heads.append(shifted_heads)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = self.sentences[idx]
        heads = self.heads[idx]

        indexed_tokens = []
        for w in tokens:
            if self.word2idx is not None:
                indexed_tokens.append(self.word2idx.get(w, self.word2idx["<UNK>"]))
            else:
                indexed_tokens.append(0)

        indexed_tokens_tensor = torch.tensor(indexed_tokens, dtype=torch.long)
        heads_tensor = torch.tensor(heads, dtype=torch.long)
        return indexed_tokens_tensor, heads_tensor


# ------------------------------------------------------------------
# COLLATE FUNCTION FOR DATALOADER
# ------------------------------------------------------------------
def collate_fn(batch):
    """
    Pad sequences of different lengths.
    For word indices, we use 0 (PAD).
    For heads, we use -1 (ignored by the loss).
    """
    indexed_list = [item[0] for item in batch]
    heads_list = [item[1] for item in batch]

    padded_inputs = pad_sequence(indexed_list, batch_first=True, padding_value=0)
    padded_heads = pad_sequence(heads_list, batch_first=True, padding_value=-1)

    return {"input_ids": padded_inputs}, padded_heads


# ------------------------------------------------------------------
# MODEL: BILINEAR PARSER (TODO)
# ------------------------------------------------------------------
class BilinearParser(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(BilinearParser, self).__init__()
        # TODO: define your layers here
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
        self.mlp_head = nn.Linear(hidden_dim * 2, hidden_dim)
        self.mlp_dependent = nn.Linear(hidden_dim * 2, hidden_dim)
        self.bilinear = nn.Bilinear(hidden_dim, hidden_dim, 1)
        #pass

    def forward(self, x):
        """
        :param x: [B, T] tensor of word indices
        :return:  [B, T, T+1] arc scores where score[b, i, j] is how likely
                  it is for token i to have head j in batch b.
        """

        # TODO: implement the forward pass

        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)

        head_reps = self.mlp_head(lstm_out)  # [B, T, H]
        dependent_reps = self.mlp_dependent(lstm_out)  # [B, T, H]


        batch_size, seq_len, hidden_dim = dependent_reps.shape
        scores = torch.zeros(batch_size, seq_len, seq_len + 1, device=x.device)

        for i in range(seq_len):
            for j in range(seq_len+1):

                if j==0:
                    score = torch.tensor(0.0, device=x.device)
                else:
                    score = self.bilinear(dependent_reps[:, i, :], head_reps[:,j-1,:]).squeeze(-1)  # [B]


                scores[:, i, j] = score

        return scores

        #pass


# ------------------------------------------------------------------
# MAIN SCRIPT
# ------------------------------------------------------------------
if __name__ == "__main__":
    # Path to your UD file (update path if needed):
    data_file = "/content/drive/MyDrive/Bilinear_parser/en_ewt-ud-train.conllu"
    dataset = DependencyDataset(data_file)
    print(f"Loaded {len(dataset)} sentences.")

    # Build vocabulary
    print("Building vocabulary from dataset...")
    vocab_builder_ds = DependencyDataset(data_file, build_vocab=True)
    word2idx = vocab_builder_ds.word2idx
    print(f"Vocabulary size: {len(word2idx)}")

    # Load final dataset using the built vocabulary
    dataset = DependencyDataset(data_file, word2idx=word2idx, build_vocab=False)

    # Split data
    dataset_size = len(dataset)
    train_size = int(0.8 * dataset_size)
    val_size = int(0.1 * dataset_size)
    test_size = dataset_size - train_size - val_size
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size]
    )

    print(f"Dataset size: {dataset_size}")
    print(f"Train/Val/Test sizes: {train_size}/{val_size}/{test_size}")

    # DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             collate_fn=collate_fn)

    # Initialize Model, Optimizer, Loss (TODO)
    vocab_size = len(word2idx)
    model = BilinearParser(vocab_size=vocab_size, embedding_dim=128, hidden_dim=256).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)

    train_loader = DataLoader(dataset, batch_size=512, shuffle=True, collate_fn=collate_fn)

    # ---------------------------------------------------------
    # TRAINING LOOP (TODO)
    # ---------------------------------------------------------
    # Hints:
    #  - Iterate for EPOCHS
    #  - For each batch, forward pass, compute loss, backprop, step optimizer
    #  - Keep track of average training loss each epoch
    #  - Print or log training loss

    print("Starting training...")
    # TODO: Implement your training loop here

     # Training Loop
    def train_model(model, train_loader, optimizer, criterion, epochs):
        model.train()
        for epoch in range(epochs):
            total_loss = 0
            for batch in train_loader:
                batch_input, batch_heads = batch  # Unpack the tuple
                inputs = batch_input["input_ids"].to(device)
                heads = batch_heads.to(device)

                optimizer.zero_grad()
                scores = model(inputs)
                scores = scores.view(-1, scores.shape[-1])
                heads = heads.view(-1)
                valid_mask = heads != -1
                loss = criterion(scores[valid_mask], heads[valid_mask])
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")




    # ---------------------------------------------------------
    # (optional) Validation Loop (TODO)
    # ---------------------------------------------------------


    # ---------------------------------------------------------
    # TEST EVALUATION LOOP (TODO)
    # ---------------------------------------------------------

    print("Evaluating on test set...")
    # TODO: Implement your test loop here
    def evaluate_model(model, test_loader):
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for batch in test_loader:
                batch_input, batch_heads = batch  # Unpack the tuple
                inputs = batch_input["input_ids"].to(device)
                heads = batch_heads.to(device)

                scores = model(inputs)
                predictions = torch.argmax(scores, dim=2)
                mask = heads != -1
                correct += torch.sum((predictions == heads) & mask).item()
                total += torch.sum(mask).item()
        print(f"Test Accuracy: {correct / total:.4f}")




    train_model(model, train_loader, optimizer, criterion, EPOCHS)
    evaluate_model(model, test_loader)

    # ---------------------------------------------------------
    # SAMPLE PREDICTIONS
    # ---------------------------------------------------------
    # You can pick a couple of samples from test_dataset to manually
    # inspect the predicted heads vs. the gold heads.
    #

    # Example:
    idx2word = {v: k for k, v in word2idx.items()}
    for i in range(2):
          tokens_tensor, heads_tensor = test_dataset[i]
          input_ids = tokens_tensor.unsqueeze(0).to(device)
          logits = model(input_ids)
          preds = torch.argmax(logits, dim=2).squeeze(0).cpu().numpy()
          words = [idx2word[idx.item()] for idx in tokens_tensor]
          print(f"Sentence: {words}")
          print(f"True Heads: {heads_tensor.tolist()}")
          print(f"Predicted Heads: {preds.tolist()}")
          print("----")



Using device: cuda
Loaded 10324 sentences.
Building vocabulary from dataset...
Vocabulary size: 17384
Dataset size: 10324
Train/Val/Test sizes: 8259/1032/1033
Starting training...
Evaluating on test set...
Epoch 1: Loss = 3.1285
Epoch 2: Loss = 2.5313
Epoch 3: Loss = 1.8489
Epoch 4: Loss = 1.4490
Epoch 5: Loss = 1.2404
Epoch 6: Loss = 1.0698
Epoch 7: Loss = 0.9380
Epoch 8: Loss = 0.8198
Epoch 9: Loss = 0.7281
Epoch 10: Loss = 0.6247
Test Accuracy: 0.8162
Sentence: ['HEAVEN', 'ON', 'EARTHHHHHHH', '!!!!']
True Heads: [-1, 2, 0, 0]
Predicted Heads: [0, 4, 4, 4]
----
Sentence: ['You', 'may', 'also', 'want', 'to', 'avoid', 'Thanksgiving', 'week', 'and', 'the', 'Christmas', 'holidays', '.']
True Heads: [3, 3, 3, -1, 5, 3, 7, 5, 11, 11, 11, 7, 3]
Predicted Heads: [3, 3, 3, 3, 5, 3, 7, 3, 11, 11, 11, 11, 3]
----
