In [49]:
# Masked Word Prediction Challenge
# Authors: Lindy Bujak & Melia [Last Name if you want]
# Course: DATASCI 315 - Machine Learning in Python
# (Private) Kaggle Competition: https://www.kaggle.com/t/579b97152fa0478698f9574c589539a8

In [50]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import random

from collections import Counter # to build vocab easily

In [51]:
# Load text file
def load_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    return text

train_data = load_text("data/train_data.txt")
test_data = load_text("data/test_data.txt")

In [52]:
# Tokenization and Vocabulary - assuming everything is in the vocab from train # note: no option for pad or unknown token
def build_vocab(text, min_freq=1):
    tokens = text.split()
    counter = Counter(tokens)
    vocab = {word: idx for idx, (word, count) in enumerate(counter.items(), start=2) if count >= min_freq}
    vocab["<mask>"] = 0
    vocab["<unk>"] = 1
    return vocab

vocab = build_vocab(train_data)

In [53]:
# tokenizer
class Tokenizer:
    def __init__(self, vocab):
        self.vocab = vocab
        self.inv_vocab = {idx: word for word, idx in vocab.items()}

    def encode(self, text):
        text = text.lower().replace('.', ' .') # to ensure <mask>. can be spit
        special_tokens = {"<mask>"}  # Ensure special tokens are not split
        tokens = text.split()  # Use simple split() to preserve "<mask>" as a single token
        return [self.vocab.get(token, vocab["<unk>"]) for token in tokens]

    def decode(self, tokens):
        return ' '.join([self.inv_vocab.get(token, "<unk>") for token in tokens])

tokenizer = Tokenizer(vocab)

In [54]:
class BERTInspiredTransformer(nn.Module):
    def __init__(self, vocab_size, hidden_dim=256, num_layers=6, num_heads=8,
                 intermediate_size=1024, dropout=0.1, max_position_embeddings=512):
        self.setting = {
            'name': 'BERTInspiredTransformer',
            'vocab_size': vocab_size,
            'hidden_dim': hidden_dim,
            'num_layers': num_layers,
            'num_heads': num_heads,
            'intermediate_size': intermediate_size,
            'dropout': dropout,
            'max_position_embeddings': max_position_embeddings
        }
        super(BERTInspiredTransformer, self).__init__()

        # Word embeddings
        self.word_embeddings = nn.Embedding(vocab_size, hidden_dim)

        # Position embeddings (learnable instead of fixed)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_dim)

        # Layer normalization for embeddings
        self.LayerNorm = nn.LayerNorm(hidden_dim, eps=1e-12)
        self.dropout = nn.Dropout(dropout)

        # Register position ids buffer
        position_ids = torch.arange(max_position_embeddings).expand((1, -1))
        self.register_buffer('position_ids', position_ids)

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=intermediate_size,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True  # Pre-LN architecture
        )

        # Encoder with layer normalization
        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
            norm=nn.LayerNorm(hidden_dim)
        )

        # Special MLM prediction head - with transformation similar to BERT
        self.prediction_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, vocab_size)
        )

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """ Initialize the weights based on BERT initialization pattern """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly larger range for small vocabulary
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)
        position_ids = self.position_ids[:, :seq_length]

        # Get embeddings
        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        # Sum embeddings
        embeddings = words_embeddings + position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        # Pass through transformer encoder
        encoder_output = self.encoder(embeddings)

        # Apply prediction head for masked language modeling
        prediction_scores = self.prediction_head(encoder_output)

        return prediction_scores

In [55]:
class FocusedMaskedDataset(Dataset):
    def __init__(self, text, vocab, tokenizer, seq_length=32, mask_prob=0.15, random_seed=42):
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tokens = tokenizer.encode(text)
        self.seq_length = seq_length
        self.mask_prob = mask_prob
        self.mask_token_id = vocab["<mask>"]

        random.seed(random_seed)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        start_idx = idx
        tokens = self.tokens[start_idx:start_idx+self.seq_length]
        masked_tokens, mask, actual_tokens = self.mask_tokens(tokens)
        return torch.tensor(masked_tokens), torch.tensor(mask), torch.tensor(actual_tokens)

    def mask_tokens(self, tokens):
        masked_tokens = tokens.copy()
        mask = [0] * len(tokens)
        actual_tokens = tokens.copy()

        # Ensure at least one token is masked (80% of the time)
        if random.random() < 0.8:
            # Choose a position to definitely mask
            pos = random.randint(0, len(tokens) - 1)
            masked_tokens[pos] = self.mask_token_id
            mask[pos] = 1

        # Then apply regular masking with probability
        for i in range(len(tokens)):
            if mask[i] == 0 and random.random() < self.mask_prob:
                # BERT-style masking:
                # 80% of the time, replace with [MASK]
                # 10% of the time, replace with random word
                # 10% of the time, keep the word unchanged
                rand = random.random()
                if rand < 0.8:
                    masked_tokens[i] = self.mask_token_id
                elif rand < 0.9:
                    # Replace with random word
                    masked_tokens[i] = random.randint(2, len(self.vocab) - 1)  # Skip special tokens
                # else: keep the token unchanged
                mask[i] = 1

        return masked_tokens, mask, actual_tokens

# Modified training function with learning rate warmup and focus on masked tokens
def specialized_train(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=15):
    model.to(device)
    best_val_acc = 0
    best_model = None

    # For tracking metrics
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        correct_train = 0
        total_train = 0

        for masked_tokens, mask, actual_tokens in train_loader:
            masked_tokens, mask, actual_tokens = masked_tokens.to(device), mask.to(device), actual_tokens.to(device)

            optimizer.zero_grad()
            output = model(masked_tokens)

            # Calculate loss only on masked tokens
            masked_output = output.view(-1, output.size(-1))
            masked_targets = actual_tokens.view(-1)
            masked_loss = criterion(masked_output, masked_targets)

            # Backward and optimize
            masked_loss.backward()

            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_train_loss += masked_loss.item()

            # Calculate accuracy only on masked positions
            predicted_tokens = output.argmax(dim=-1)
            correct_train += ((predicted_tokens == actual_tokens) & (mask == 1)).sum().item()
            total_train += mask.sum().item()

        train_accuracy = (correct_train / total_train) * 100 if total_train > 0 else 0
        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        train_accs.append(train_accuracy)

        # Validation
        model.eval()
        total_val_loss = 0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for masked_tokens, mask, actual_tokens in val_loader:
                masked_tokens, mask, actual_tokens = masked_tokens.to(device), mask.to(device), actual_tokens.to(device)

                output = model(masked_tokens)

                # Loss on masked tokens only
                masked_output = output.view(-1, output.size(-1))
                masked_targets = actual_tokens.view(-1)
                masked_loss = criterion(masked_output, masked_targets)

                total_val_loss += masked_loss.item()

                # Calculate accuracy only on masked positions
                predicted_tokens = output.argmax(dim=-1)
                correct_val += ((predicted_tokens == actual_tokens) & (mask == 1)).sum().item()
                total_val += mask.sum().item()

        val_accuracy = (correct_val / total_val) * 100 if total_val > 0 else 0
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        val_accs.append(val_accuracy)

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
              f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

        # Save best model
        if val_accuracy > best_val_acc:
            best_val_acc = val_accuracy
            best_model = model.state_dict().copy()
            print(f"  New best model saved with validation accuracy: {best_val_acc:.2f}%")

    # Load best model
    if best_model is not None:
        model.load_state_dict(best_model)
        print(f"Loaded best model with validation accuracy: {best_val_acc:.2f}%")

    return model, {
        'train_loss': train_losses,
        'train_acc': train_accs,
        'val_loss': val_losses,
        'val_acc': val_accs
    }

# Training setup
def prepare_and_train():
    # Load data
    train_data = load_text("/content/train_data.txt")

    # Build vocabulary
    vocab = build_vocab(train_data)
    tokenizer = Tokenizer(vocab)
    vocab_size = len(vocab)
    print(f"Vocabulary size: {vocab_size}")

    # Create dataset with longer sequence length
    seq_length = 32  # Longer sequences for more context
    dataset = FocusedMaskedDataset(train_data, vocab, tokenizer, seq_length=seq_length, mask_prob=0.15)

    # Split dataset with fixed seed for reproducibility
    generator = torch.Generator().manual_seed(42)
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=generator)

    batch_size = 64
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    print(f"Training samples: {train_size}, Validation samples: {val_size}")

    model = BERTInspiredTransformer(
        vocab_size=vocab_size,
        hidden_dim=128,
        num_layers=4,
        num_heads=8,
        intermediate_size=512,
        dropout=0.1
    )

    optimizer = optim.AdamW(
        model.parameters(),
        lr=5e-4,
        weight_decay=0.01
    )

    # Loss function with label smoothing
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model, history = specialized_train(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        num_epochs=15 ###########
    )

    # Save the model
    torch.save({
        'state_dict': model.state_dict(),
        'setting': model.setting
    }, "/content/bert_inspired_transformer.pth")

    return model, tokenizer, vocab, history

In [56]:
# Specialized prediction function for BERT-inspired model
def predict_with_bert_model(model, text, vocab, tokenizer, seq_length, device):
    model.to(device)
    model.eval()

    # Tokenize the text
    tokenized = tokenizer.encode(text)
    mask_token_id = vocab["<mask>"]

    # Find mask positions
    mask_positions = [i for i, token in enumerate(tokenized) if token == mask_token_id]
    predicted_words = [""] * len(mask_positions)

    # For each mask position, create context window
    for i, pos in enumerate(mask_positions):
        # Create window centered on mask
        half_len = seq_length // 2
        start = max(0, pos - half_len)
        end = min(len(tokenized), start + seq_length)

        # Adjust window if at the edges
        if end - start < seq_length:
            if start == 0:
                end = min(len(tokenized), seq_length)
            else:
                start = max(0, end - seq_length)

        # Extract the window
        window = tokenized[start:end]

        # Calculate mask position in window
        window_mask_pos = pos - start

        # Convert to tensor
        input_tensor = torch.tensor([window], device=device)

        # Get predictions
        with torch.no_grad():
            outputs = model(input_tensor)

            # Get prediction for masked position
            if window_mask_pos < len(window):
                logits = outputs[0, window_mask_pos]

                # Get top prediction
                pred_id = torch.argmax(logits).item()
                pred_word = tokenizer.inv_vocab.get(pred_id, "<unk>")

                # Store prediction
                predicted_words[i] = pred_word

    # Check predictions
    missing = sum(1 for word in predicted_words if not word)
    if missing > 0:
        print(f"Warning: {missing} mask positions have no predictions.")

        # Fill missing predictions
        for i in range(len(predicted_words)):
            if not predicted_words[i]:
                predicted_words[i] = "the"  # Fallback to common word

    return predicted_words

# Full prediction and submission function
def generate_predictions_and_submit():
    # Load the trained model
    model_data = torch.load("/content/bert_inspired_transformer.pth")

    # Reload data and vocabulary
    train_data = load_text("/content/train_data.txt")
    test_data = load_text("/content/test_data.txt")
    vocab = build_vocab(train_data)
    tokenizer = Tokenizer(vocab)

    # Create model with the same architecture
    model = BERTInspiredTransformer(
        vocab_size=len(vocab),
        hidden_dim=128,
        num_layers=4,
        num_heads=8,
        intermediate_size=512,
        dropout=0.1
    )

    # Load state dictionary
    model.load_state_dict(model_data['state_dict'])

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Generate predictions
    predicted_words = predict_with_bert_model(
        model=model,
        text=test_data,
        vocab=vocab,
        tokenizer=tokenizer,
        seq_length=32,
        device=device
    )

    # Check predictions
    print(f"Total predictions: {len(predicted_words)}")
    print(f"First 10 predictions: {predicted_words[:10]}")
    print(f"Last 10 predictions: {predicted_words[-10:]}")

    # Save predictions to CSV
    df = pd.DataFrame({
        'id': range(len(predicted_words)),
        'prediction': predicted_words
    })

    # Save to CSV
    csv_path = "/content/bert_predictions.csv"
    df.to_csv(csv_path, index=False)
    print(f"Predictions saved to {csv_path}")

    return predicted_words

# Run everything
if __name__ == "__main__":
    import pandas as pd

    # Train model
    print("================= TRAINING MODEL =================")
    model, tokenizer, vocab, history = prepare_and_train()

    # Generate predictions
    print("\n=============== GENERATING PREDICTIONS ===============")
    predictions = generate_predictions_and_submit()

Vocabulary size: 22
Training samples: 96921, Validation samples: 10770
Using device: cuda
Epoch 1/15, Train Loss: 0.8410, Train Acc: 48.21%, Val Loss: 0.7634, Val Acc: 58.11%
  New best model saved with validation accuracy: 58.11%
Epoch 2/15, Train Loss: 0.7647, Train Acc: 58.31%, Val Loss: 0.7564, Val Acc: 60.01%
  New best model saved with validation accuracy: 60.01%
Epoch 3/15, Train Loss: 0.7588, Train Acc: 59.25%, Val Loss: 0.7525, Val Acc: 60.01%
  New best model saved with validation accuracy: 60.01%
Epoch 4/15, Train Loss: 0.7556, Train Acc: 59.79%, Val Loss: 0.7497, Val Acc: 60.85%
  New best model saved with validation accuracy: 60.85%
Epoch 5/15, Train Loss: 0.7534, Train Acc: 60.07%, Val Loss: 0.7489, Val Acc: 61.23%
  New best model saved with validation accuracy: 61.23%
Epoch 6/15, Train Loss: 0.7523, Train Acc: 60.40%, Val Loss: 0.7478, Val Acc: 61.32%
  New best model saved with validation accuracy: 61.32%
Epoch 7/15, Train Loss: 0.7513, Train Acc: 60.54%, Val Loss: 0.7