In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm

# Building BILSTM-CRF (pytorch)

In [2]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim=100, hidden_dim=128, num_layers=1, dropout=0.5):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        
        # Embedding layer
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim // 2,  
                           num_layers=num_layers, 
                           bidirectional=True,
                           dropout=dropout if num_layers > 1 else 0,
                           batch_first=True)
        
        # Maps the output of the LSTM into tag space
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        
        # Matrix of transition parameters
        # transitions[i, j] is the score of transitioning from j to i
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))
        
        # These two statements enforce constraints on the transitions:
        # 1. Don't transition to the padding tag
        # 2. Don't transition from the padding tag
        self.transitions.data[tag_to_ix["<PAD>"], :] = -10000
        self.transitions.data[:, tag_to_ix["<PAD>"]] = -10000
        
        self.dropout = nn.Dropout(dropout)
    
    def _get_lstm_features(self, input_ids, attention_mask):
        # Get sequence lengths from attention mask
        seq_lengths = attention_mask.sum(dim=1).cpu()
        
        # Embed the tokens
        embeds = self.word_embeds(input_ids)
        embeds = self.dropout(embeds)
        
        # Pack padded sequence for LSTM
        packed = pack_padded_sequence(embeds, seq_lengths, batch_first=True, enforce_sorted=False)
        
        # Pass through LSTM
        lstm_out, _ = self.lstm(packed)
        
        # Unpack sequence
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        
        # Apply dropout
        lstm_out = self.dropout(lstm_out)
        
        # Project to tag space
        lstm_feats = self.hidden2tag(lstm_out)
        
        return lstm_feats
    
    def _score_sentence(self, feats, tags, mask):
        # Gives the score of a provided tag sequence
        batch_size, seq_len, _ = feats.shape
        
        score = torch.zeros(batch_size, device=feats.device)
        
        # Add transition from start tag to first tag for each sequence
        start_tags = torch.full((batch_size, 1), self.tag_to_ix["<PAD>"], dtype=torch.long, device=feats.device)
        tags = torch.cat([start_tags, tags], dim=1)  # (batch_size, seq_len+1)
        
        for i in range(seq_len):
            # Get mask for current position (batch_size)
            mask_i = mask[:, i]
            
            # Emission score for current position
            emit_score = torch.zeros(batch_size, device=feats.device)
            emit_score[mask_i] = feats[mask_i, i, tags[mask_i, i+1]]
            
            # Transition score from previous to current tag
            trans_score = torch.zeros(batch_size, device=feats.device)
            trans_score[mask_i] = self.transitions[tags[mask_i, i+1], tags[mask_i, i]]
            
            # Add both scores
            score = score + emit_score + trans_score
        
        return score
    
    def _forward_alg(self, feats, mask):
        # Forward algorithm to compute partition function
        batch_size, seq_len, tagset_size = feats.shape
        
        # Initialize forward variables with -10000 (log-space)
        alphas = torch.full((batch_size, tagset_size), -10000.0, device=feats.device)
        # Start with all score from <PAD>
        alphas[:, self.tag_to_ix["<PAD>"]] = 0.
        
        for i in range(seq_len):
            # Get mask for current position (batch_size)
            mask_i = mask[:, i]
            
            # (batch_size, tagset_size, 1)
            alphas_t = alphas.unsqueeze(2)
            # (batch_size, 1, tagset_size)
            emit_scores = feats[:, i].unsqueeze(1)
            
            # (batch_size, tagset_size, tagset_size)
            next_tag_var = alphas_t + self.transitions + emit_scores
            
            # Get log sum exp over the tagset_size dimension
            next_tag_var = torch.logsumexp(next_tag_var, dim=1)
            
            # ✅ Convert mask to boolean before using in torch.where
            mask_i = mask_i.unsqueeze(1).expand_as(next_tag_var).bool()
            alphas = torch.where(mask_i, next_tag_var, alphas)
        
        # Add transition to STOP_TAG (using <PAD> as stop tag here)
        terminal_var = alphas + self.transitions[self.tag_to_ix["<PAD>"]]
        alphas = torch.logsumexp(terminal_var, dim=1)
        
        return alphas

    
    def neg_log_likelihood(self, input_ids, tags, attention_mask):
        # Get the emission scores from the BiLSTM
        feats = self._get_lstm_features(input_ids, attention_mask)
        
        # Find the best path, and the score of that path
        forward_score = self._forward_alg(feats, attention_mask)
        gold_score = self._score_sentence(feats, tags, attention_mask)
        
        # Return negative log likelihood
        return torch.mean(forward_score - gold_score)
    
    def _viterbi_decode(self, feats, mask):
        # Find the best path using Viterbi algorithm
        batch_size, seq_len, tagset_size = feats.shape
        
        # Initialize backpointers and viterbi variables
        backpointers = torch.zeros((batch_size, seq_len, tagset_size), dtype=torch.long, device=feats.device)
        
        # Initialize viterbi variables with -10000 (log-space)
        viterbi_vars = torch.full((batch_size, tagset_size), -10000.0, device=feats.device)
        viterbi_vars[:, self.tag_to_ix["<PAD>"]] = 0
        
        for i in range(seq_len):
            # Get mask for current position (batch_size)
            mask_i = mask[:, i]
            
            # (batch_size, tagset_size, 1)
            viterbi_vars_t = viterbi_vars.unsqueeze(2)
            # (batch_size, tagset_size, tagset_size)
            viterbi_scores = viterbi_vars_t + self.transitions
            
            # Find the best tag for each previous tag
            # (batch_size, tagset_size)
            best_tag_id = torch.argmax(viterbi_scores, dim=1)
            best_scores = torch.gather(viterbi_scores, 1, best_tag_id.unsqueeze(1)).squeeze(1)
            
            # Add emission scores
            best_scores = best_scores + feats[:, i]
            
            # Save backpointers and best scores
            backpointers[:, i, :] = best_tag_id
            
            # Set viterbi variables if mask is valid, otherwise keep previous value
            mask_i = mask_i.unsqueeze(1).expand_as(best_scores)
            viterbi_vars = torch.where(mask_i, best_scores, viterbi_vars)
        
        # Transition to STOP_TAG
        terminal_var = viterbi_vars + self.transitions[self.tag_to_ix["<PAD>"]]
        best_tag_id = torch.argmax(terminal_var, dim=1)
        
        # Follow the backpointers to decode the best path
        best_path = torch.zeros((batch_size, seq_len), dtype=torch.long, device=feats.device)
        
        # Start with the best tag for the last position
        best_path[:, -1] = best_tag_id
        
        # Follow the backpointers to find the best path
        for i in range(seq_len-2, -1, -1):
            # Get the best tag for the current position based on the next tag
            best_tag_id = torch.gather(
                backpointers[:, i+1, :], 
                1, 
                best_path[:, i+1].unsqueeze(1)
            ).squeeze(1)
            
            # Only update positions where mask is valid
            mask_i = mask[:, i+1]
            best_path[:, i] = torch.where(mask_i, best_tag_id, torch.zeros_like(best_tag_id))
        
        return best_path
    
    def forward(self, input_ids, attention_mask):
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(input_ids, attention_mask)
        
        # Find the best path using Viterbi algorithm
        tag_seq = self._viterbi_decode(lstm_feats, attention_mask)
        
        return tag_seq

# Training Function

In [4]:
def train_bilstm_crf(model, train_loader, val_loader, device, epochs=10, lr=0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    
    best_val_loss = float('inf')
    best_model = None
    
    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Train]')
        for batch in progress_bar:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            tags = batch['tags'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            loss = model.neg_log_likelihood(input_ids, tags, attention_mask)
            
            # Backward pass
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            # Update parameters
            optimizer.step()
            
            # Update progress bar
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': total_loss / (progress_bar.n + 1)})
        
        train_loss = total_loss / len(train_loader)
        
        # Validation
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            progress_bar = tqdm(val_loader, desc=f'Epoch {epoch+1}/{epochs} [Val]')
            for batch in progress_bar:
                # Move batch to device
                input_ids = batch['input_ids'].to(device)
                tags = batch['tags'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                
                # Forward pass
                loss = model.neg_log_likelihood(input_ids, tags, attention_mask)
                
                # Update progress bar
                val_loss += loss.item()
                progress_bar.set_postfix({'loss': val_loss / (progress_bar.n + 1)})
        
        val_loss = val_loss / len(val_loader)
        
        # Update learning rate
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict().copy()
            print(f'New best model saved with validation loss: {val_loss:.4f}')
    
    # Load best model
    model.load_state_dict(best_model)
    
    return model

# Model Training

In [8]:
from datasets import load_dataset

# Load NER dataset
ner_dataset = load_dataset("conll2003",trust_remote_code=True)

ner_train_dataset = NERDataset(ner_dataset['train'])
ner_val_dataset = NERDataset(ner_dataset['validation'])

# Create DataLoaders
ner_train_loader = DataLoader(ner_train_dataset, batch_size=32, shuffle=True)
ner_val_loader = DataLoader(ner_val_dataset, batch_size=32)

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = BiLSTM_CRF(
    vocab_size=len(ner_train_dataset.word2idx),
    tag_to_ix=ner_train_dataset.tag2idx,
    embedding_dim=100,
    hidden_dim=256,
    num_layers=2,
    dropout=0.5
).to(device)

# Train model
trained_model = train_bilstm_crf(
    model=model,
    train_loader=ner_train_loader,
    val_loader=ner_val_loader,
    device=device,
    epochs=5,
    lr=0.001
)

# Save the model
torch.save(trained_model.state_dict(), 'bilstm_crf_ner.pt')

Downloading data: 100%|██████████| 983k/983k [00:01<00:00, 758kB/s] 
Generating train split: 100%|██████████| 14041/14041 [00:01<00:00, 7184.17 examples/s]
Generating validation split: 100%|██████████| 3250/3250 [00:00<00:00, 6091.06 examples/s]
Generating test split: 100%|██████████| 3453/3453 [00:00<00:00, 7641.89 examples/s]
Building vocabulary: 100%|██████████| 14041/14041 [00:01<00:00, 9794.58it/s]
Building vocabulary: 100%|██████████| 3250/3250 [00:00<00:00, 10671.84it/s]


Using device: cpu


Epoch 1/5 [Train]: 100%|██████████| 439/439 [01:35<00:00,  4.60it/s, loss=5.18e+3]
Epoch 1/5 [Val]: 100%|██████████| 102/102 [00:06<00:00, 15.33it/s, loss=2.28e+3]


Epoch 1/5 - Train Loss: 5178.6807, Val Loss: 2261.7244
New best model saved with validation loss: 2261.7244


Epoch 2/5 [Train]: 100%|██████████| 439/439 [01:55<00:00,  3.82it/s, loss=3.62e+3]
Epoch 2/5 [Val]: 100%|██████████| 102/102 [00:06<00:00, 14.60it/s, loss=716]    


Epoch 2/5 - Train Loss: 3624.9354, Val Loss: 716.1565
New best model saved with validation loss: 716.1565


Epoch 3/5 [Train]: 100%|██████████| 439/439 [01:36<00:00,  4.55it/s, loss=2.08e+3]
Epoch 3/5 [Val]: 100%|██████████| 102/102 [00:06<00:00, 16.45it/s, loss=-781]    


Epoch 3/5 - Train Loss: 2076.3720, Val Loss: -773.2838
New best model saved with validation loss: -773.2838


Epoch 4/5 [Train]: 100%|██████████| 439/439 [01:36<00:00,  4.53it/s, loss=960]    
Epoch 4/5 [Val]: 100%|██████████| 102/102 [00:07<00:00, 14.32it/s, loss=-2.34e+3]


Epoch 4/5 - Train Loss: 960.2039, Val Loss: -2313.8004
New best model saved with validation loss: -2313.8004


Epoch 5/5 [Train]: 100%|██████████| 439/439 [01:57<00:00,  3.73it/s, loss=-612]   
Epoch 5/5 [Val]: 100%|██████████| 102/102 [00:07<00:00, 13.43it/s, loss=-3.82e+3]

Epoch 5/5 - Train Loss: -611.9151, Val Loss: -3817.7824
New best model saved with validation loss: -3817.7824



