In [39]:
import sys
print(sys.executable)

c:\Users\admin\AppData\Local\Programs\Python\Python312\python.exe


In [40]:
!{sys.executable} -m pip install torch transformers



# Libraries

In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import random
from typing import Dict, List, Tuple, Optional
import time

# BERT Implementation

In [42]:
class MultiHeadAttention(nn.Module):
    """Multi-Head Attention mechanism with improved numerical stability"""
    
    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        
        # Separate weights for Q, K, V projections
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.d_k)
        
    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, 
                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size, seq_len, _ = query.size()
        
        # Linear transformations and reshape for multi-head
        Q = self.w_q(query).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        K = self.w_k(key).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        V = self.w_v(value).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        
        # Compute attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        
        # Apply mask if provided
        if mask is not None:
            # Expand mask for multi-head attention
            mask = mask.unsqueeze(1).unsqueeze(1)
            scores.masked_fill_(mask == 0, -1e9)
        
        # Apply softmax
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        
        # Apply attention to values
        context = torch.matmul(attention_weights, V)
        
        # Concatenate heads
        context = context.transpose(1, 2).contiguous().view(
            batch_size, seq_len, self.d_model
        )
        
        # Final linear transformation
        output = self.w_o(context)
        
        return output

class PositionwiseFeedForward(nn.Module):
    """Position-wise Feed-Forward Network with GELU activation"""
    
    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear2(self.dropout(self.activation(self.linear1(x))))

class TransformerBlock(nn.Module):
    """Transformer block with pre-norm architecture"""
    
    def __init__(self, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model, eps=1e-12)
        self.norm2 = nn.LayerNorm(d_model, eps=1e-12)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Self-attention with residual connection and layer norm
        attn_output = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Feed-forward with residual connection and layer norm
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        
        return x

class BertEmbeddings(nn.Module):
    """BERT embeddings with token, position, and segment embeddings"""
    
    def __init__(self, vocab_size: int, d_model: int, max_position_embeddings: int = 512,
                 type_vocab_size: int = 2, dropout: float = 0.1):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.position_embeddings = nn.Embedding(max_position_embeddings, d_model)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, d_model)
        
        self.LayerNorm = nn.LayerNorm(d_model, eps=1e-12)
        self.dropout = nn.Dropout(dropout)
        
        # Initialize position ids
        self.register_buffer("position_ids", torch.arange(max_position_embeddings).expand((1, -1)))
        
    def forward(self, input_ids: torch.Tensor, 
                token_type_ids: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.Tensor] = None) -> torch.Tensor:
        seq_length = input_ids.size(1)
        
        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]
            
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        
        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        
        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        
        return embeddings

class BertEncoder(nn.Module):
    """Stack of Transformer blocks"""
    
    def __init__(self, n_layers: int, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff, dropout)
            for _ in range(n_layers)
        ])
        
    def forward(self, hidden_states: torch.Tensor, 
                attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        for layer in self.layers:
            hidden_states = layer(hidden_states, attention_mask)
        return hidden_states

class BertPooler(nn.Module):
    """Pooler for [CLS] token representation"""
    
    def __init__(self, d_model: int):
        super().__init__()
        self.dense = nn.Linear(d_model, d_model)
        self.activation = nn.Tanh()
        
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Take the hidden state corresponding to the first token ([CLS])
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

class BertPredictionHeadTransform(nn.Module):
    """Transform for MLM predictions"""
    
    def __init__(self, d_model: int):
        super().__init__()
        self.dense = nn.Linear(d_model, d_model)
        self.activation = nn.GELU()
        self.LayerNorm = nn.LayerNorm(d_model, eps=1e-12)
        
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states

class BertLMPredictionHead(nn.Module):
    """Language Model prediction head"""
    
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.transform = BertPredictionHeadTransform(d_model)
        self.decoder = nn.Linear(d_model, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states) + self.bias
        return hidden_states

class BertPreTrainingHeads(nn.Module):
    """Pre-training heads for MLM and NSP"""
    
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.predictions = BertLMPredictionHead(d_model, vocab_size)
        self.seq_relationship = nn.Linear(d_model, 2)
        
    def forward(self, sequence_output: torch.Tensor, 
                pooled_output: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score

class BertModel(nn.Module):
    """BERT model with improved architecture"""
    
    def __init__(self, config: Dict):
        super().__init__()
        self.config = config
        
        self.embeddings = BertEmbeddings(
            vocab_size=config['vocab_size'],
            d_model=config['hidden_size'],
            max_position_embeddings=config['max_position_embeddings'],
            type_vocab_size=config['type_vocab_size'],
            dropout=config['hidden_dropout_prob']
        )
        
        self.encoder = BertEncoder(
            n_layers=config['num_hidden_layers'],
            d_model=config['hidden_size'],
            n_heads=config['num_attention_heads'],
            d_ff=config['intermediate_size'],
            dropout=config['hidden_dropout_prob']
        )
        
        self.pooler = BertPooler(config['hidden_size'])
        
        # Initialize weights
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config['initializer_range'])
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config['initializer_range'])
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    def forward(self, input_ids: torch.Tensor,
                attention_mask: Optional[torch.Tensor] = None,
                token_type_ids: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        
        # Create attention mask if not provided
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        
        # Get embeddings
        embedding_output = self.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids
        )
        
        # Pass through encoder
        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=attention_mask
        )
        
        # Pool the [CLS] token
        pooled_output = self.pooler(encoder_outputs)
        
        return {
            'last_hidden_state': encoder_outputs,
            'pooler_output': pooled_output
        }

class BertForPreTraining(nn.Module):
    """BERT model with pre-training heads"""
    
    def __init__(self, config: Dict):
        super().__init__()
        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config['hidden_size'], config['vocab_size'])
        
        # Tie weights between input embeddings and output embeddings
        self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
        
    def forward(self, input_ids: torch.Tensor,
                attention_mask: Optional[torch.Tensor] = None,
                token_type_ids: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None,
                next_sentence_label: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids
        )
        
        sequence_output = outputs['last_hidden_state']
        pooled_output = outputs['pooler_output']
        
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
        
        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, prediction_scores.size(-1)), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
            
        return {
            'loss': total_loss,
            'prediction_logits': prediction_scores,
            'seq_relationship_logits': seq_relationship_score,
            'hidden_states': outputs['last_hidden_state'],
            'pooler_output': outputs['pooler_output']
        }

class BertForSequenceClassification(nn.Module):
    """BERT for sequence classification tasks"""
    
    def __init__(self, config: Dict, num_labels: int):
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config['hidden_dropout_prob'])
        self.classifier = nn.Linear(config['hidden_size'], num_labels)
        
    def forward(self, input_ids: torch.Tensor,
                attention_mask: Optional[torch.Tensor] = None,
                token_type_ids: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        pooled_output = outputs['pooler_output']
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        return {
            'loss': loss,
            'logits': logits,
            'hidden_states': outputs['last_hidden_state'],
            'pooler_output': outputs['pooler_output']
        }



In [43]:
# Configuration for BERT-base
def get_bert_config(model_size='base'):
    """Get BERT configuration"""
    if model_size == 'base':
        return {
            'vocab_size': 30522,
            'hidden_size': 768,
            'num_hidden_layers': 12,
            'num_attention_heads': 12,
            'intermediate_size': 3072,
            'hidden_dropout_prob': 0.1,
            'attention_probs_dropout_prob': 0.1,
            'max_position_embeddings': 512,
            'type_vocab_size': 2,
            'initializer_range': 0.02,
        }
    elif model_size == 'large':
        return {
            'vocab_size': 30522,
            'hidden_size': 1024,
            'num_hidden_layers': 24,
            'num_attention_heads': 16,
            'intermediate_size': 4096,
            'hidden_dropout_prob': 0.1,
            'attention_probs_dropout_prob': 0.1,
            'max_position_embeddings': 512,
            'type_vocab_size': 2,
            'initializer_range': 0.02,
        }
    else:
        raise ValueError(f"Unknown model size: {model_size}")


In [44]:
# Create BERT model
config = get_bert_config('base')
model = BertForPreTraining(config)

# Example input
batch_size = 2
seq_length = 128
input_ids = torch.randint(0, config['vocab_size'], (batch_size, seq_length))
attention_mask = torch.ones(batch_size, seq_length)
token_type_ids = torch.zeros(batch_size, seq_length, dtype=torch.long)

# Forward pass
outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

print(f"Model created successfully!")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Output shapes:")
print(f"  - Prediction logits: {outputs['prediction_logits'].shape}")
print(f"  - NSP logits: {outputs['seq_relationship_logits'].shape}")

Model created successfully!
Total parameters: 110,078,780
Output shapes:
  - Prediction logits: torch.Size([2, 128, 30522])
  - NSP logits: torch.Size([2, 2])


# Dataset

In [45]:
class BertDataset(Dataset):
    """Dataset for BERT pre-training with MLM and NSP tasks"""
    
    def __init__(self, 
                 texts: List[str],
                 tokenizer: BertTokenizer,
                 max_length: int = 512,
                 mlm_probability: float = 0.15,
                 short_seq_prob: float = 0.1):
        """
        Args:
            texts: List of text documents
            tokenizer: BERT tokenizer
            max_length: Maximum sequence length
            mlm_probability: Probability of masking tokens for MLM
            short_seq_prob: Probability of creating shorter sequences
        """
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mlm_probability = mlm_probability
        self.short_seq_prob = short_seq_prob
        
        # Pre-process texts into sentences
        self.documents = self._preprocess_texts()
        
    def _preprocess_texts(self) -> List[List[str]]:
        """Split texts into sentences"""
        documents = []
        for text in self.texts:
            # Simple sentence splitting (can be improved with NLTK or spaCy)
            sentences = [s.strip() for s in text.split('.') if s.strip()]
            if sentences:
                documents.append(sentences)
        return documents
    
    def _get_random_sentence(self, exclude_doc_idx: int) -> str:
        """Get a random sentence from a different document"""
        doc_idx = random.choice([i for i in range(len(self.documents)) if i != exclude_doc_idx])
        if self.documents[doc_idx]:
            return random.choice(self.documents[doc_idx])
        return ""
    
    def _create_training_instance(self, doc_idx: int) -> Tuple[str, str, int]:
        """Create a training instance with sentence A, sentence B, and NSP label"""
        document = self.documents[doc_idx]
        
        # Get sentence A
        sent_idx_a = random.randint(0, len(document) - 1)
        sent_a = document[sent_idx_a]
        
        # Create sentence B and NSP label
        if random.random() < 0.5 and sent_idx_a < len(document) - 1:
            # Next sentence (positive example)
            sent_b = document[sent_idx_a + 1]
            is_next = 1
        else:
            # Random sentence (negative example)
            sent_b = self._get_random_sentence(doc_idx)
            is_next = 0
            
        return sent_a, sent_b, is_next
    
    def _truncate_seq_pair(self, tokens_a: List[int], tokens_b: List[int], max_length: int):
        """Truncate sequence pair to fit max_length"""
        while len(tokens_a) + len(tokens_b) > max_length - 3:  # Account for [CLS], [SEP], [SEP]
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
    
    def _create_masked_lm_predictions(self, 
                                     tokens: List[int],
                                     mlm_probability: float) -> Tuple[List[int], List[int]]:
        """Create masked language model predictions"""
        output_tokens = tokens.copy()
        output_labels = [-100] * len(tokens)  # -100 is ignored by CrossEntropyLoss
        
        # Get candidates for masking (exclude [CLS], [SEP], [PAD])
        candidate_indices = []
        for i, token in enumerate(tokens):
            if token not in [self.tokenizer.cls_token_id, 
                           self.tokenizer.sep_token_id,
                           self.tokenizer.pad_token_id]:
                candidate_indices.append(i)
        
        # Sample indices to mask
        random.shuffle(candidate_indices)
        num_to_mask = max(1, int(len(candidate_indices) * mlm_probability))
        mask_indices = candidate_indices[:num_to_mask]
        
        for idx in mask_indices:
            # 80% of the time, replace with [MASK]
            if random.random() < 0.8:
                output_tokens[idx] = self.tokenizer.mask_token_id
            else:
                # 10% of the time, replace with random token
                if random.random() < 0.5:
                    output_tokens[idx] = random.randint(0, self.tokenizer.vocab_size - 1)
                # 10% of the time, keep original token
                
            output_labels[idx] = tokens[idx]
            
        return output_tokens, output_labels
    
    def __len__(self) -> int:
        return len(self.documents) * 100  # Create multiple instances per document
    
    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        # Get document index
        doc_idx = idx % len(self.documents)
        
        # Create training instance
        sent_a, sent_b, is_next = self._create_training_instance(doc_idx)
        
        # Tokenize sentences
        tokens_a = self.tokenizer.tokenize(sent_a)
        tokens_b = self.tokenizer.tokenize(sent_b) if sent_b else []
        
        # Truncate to fit max_length
        self._truncate_seq_pair(tokens_a, tokens_b, self.max_length)
        
        # Build input sequence: [CLS] A [SEP] B [SEP]
        tokens = [self.tokenizer.cls_token_id]
        segment_ids = [0]
        
        for token in tokens_a:
            tokens.append(self.tokenizer.convert_tokens_to_ids(token))
            segment_ids.append(0)
            
        tokens.append(self.tokenizer.sep_token_id)
        segment_ids.append(0)
        
        if tokens_b:
            for token in tokens_b:
                tokens.append(self.tokenizer.convert_tokens_to_ids(token))
                segment_ids.append(1)
                
            tokens.append(self.tokenizer.sep_token_id)
            segment_ids.append(1)
        
        # Create attention mask
        attention_mask = [1] * len(tokens)
        
        # Pad sequences
        padding_length = self.max_length - len(tokens)
        tokens.extend([self.tokenizer.pad_token_id] * padding_length)
        segment_ids.extend([0] * padding_length)
        attention_mask.extend([0] * padding_length)
        
        # Create MLM predictions
        masked_tokens, mlm_labels = self._create_masked_lm_predictions(tokens, self.mlm_probability)
        
        return {
            'input_ids': torch.tensor(masked_tokens, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(segment_ids, dtype=torch.long),
            'labels': torch.tensor(mlm_labels, dtype=torch.long),
            'next_sentence_label': torch.tensor(is_next, dtype=torch.long)
        }

class BertTrainer:
    """Trainer class for BERT pre-training"""
    
    def __init__(self,
                 model: torch.nn.Module,
                 train_dataloader: DataLoader,
                 val_dataloader: Optional[DataLoader] = None,
                 learning_rate: float = 1e-4,
                 warmup_steps: int = 10000,
                 weight_decay: float = 0.01,
                 max_grad_norm: float = 1.0,
                 device: Optional[str] = None):
        
        # Auto-detect device if not specified
        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device
            
        self.model = model.to(self.device)
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.max_grad_norm = max_grad_norm
        
        # Optimizer with weight decay
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters() 
                          if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay,
            },
            {
                'params': [p for n, p in model.named_parameters() 
                          if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            }
        ]
        
        self.optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)
        
        # Learning rate scheduler with warmup
        total_steps = len(train_dataloader) * 10  # Assuming 10 epochs
        self.scheduler = self._get_linear_schedule_with_warmup(
            self.optimizer, warmup_steps, total_steps
        )
        
        self.train_losses = []
        self.val_losses = []
        
    def _get_linear_schedule_with_warmup(self, optimizer, num_warmup_steps, num_training_steps):
        """Create a schedule with a learning rate that decreases linearly after warmup"""
        def lr_lambda(current_step):
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            return max(
                0.0, float(num_training_steps - current_step) / 
                float(max(1, num_training_steps - num_warmup_steps))
            )
        return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    
    def train_epoch(self) -> float:
        """Train for one epoch"""
        self.model.train()
        total_loss = 0
        
        for batch_idx, batch in enumerate(self.train_dataloader):
            # Move batch to device
            batch = {k: v.to(self.device) for k, v in batch.items()}
            
            # Forward pass
            outputs = self.model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                token_type_ids=batch['token_type_ids'],
                labels=batch['labels'],
                next_sentence_label=batch['next_sentence_label']
            )
            
            loss = outputs['loss']
            
            # Backward pass
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            
            # Update weights
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
            
            total_loss += loss.item()
            
            # Log progress
            if batch_idx % 100 == 0:
                print(f"Batch {batch_idx}/{len(self.train_dataloader)}, "
                      f"Loss: {loss.item():.4f}, "
                      f"LR: {self.scheduler.get_last_lr()[0]:.6f}")
        
        avg_loss = total_loss / len(self.train_dataloader)
        self.train_losses.append(avg_loss)
        return avg_loss
    
    def validate(self) -> float:
        """Validate the model"""
        if self.val_dataloader is None:
            return 0.0
            
        self.model.eval()
        total_loss = 0
        
        with torch.no_grad():
            for batch in self.val_dataloader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                
                outputs = self.model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    token_type_ids=batch['token_type_ids'],
                    labels=batch['labels'],
                    next_sentence_label=batch['next_sentence_label']
                )
                
                total_loss += outputs['loss'].item()
        
        avg_loss = total_loss / len(self.val_dataloader)
        self.val_losses.append(avg_loss)
        return avg_loss
    
    def train(self, num_epochs: int):
        """Main training loop"""
        print(f"Starting training for {num_epochs} epochs...")
        
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")
            
            # Train
            train_loss = self.train_epoch()
            print(f"Average training loss: {train_loss:.4f}")
            
            # Validate
            if self.val_dataloader:
                val_loss = self.validate()
                print(f"Average validation loss: {val_loss:.4f}")
            
            # Save checkpoint
            self.save_checkpoint(epoch + 1)
    
    def save_checkpoint(self, epoch: int):
        """Save model checkpoint"""
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'train_losses': self.train_losses,
            'val_losses': self.val_losses,
        }
        torch.save(checkpoint, f'bert_checkpoint_epoch_{epoch}.pt')
        print(f"Checkpoint saved: bert_checkpoint_epoch_{epoch}.pt")

In [46]:
def demo_bert_training():
    """Demonstrate BERT training pipeline"""
    
    # Sample texts for training
    texts = [
        "BERT is a transformer-based machine learning technique for natural language processing. "
        "It was developed by Google and introduced in 2018. BERT stands for Bidirectional Encoder "
        "Representations from Transformers.",
        
        "Natural language processing is a subfield of linguistics, computer science, and artificial "
        "intelligence concerned with the interactions between computers and human language. "
        "In particular, it focuses on programming computers to process and analyze large amounts of natural language data.",
        
        "The transformer architecture relies entirely on self-attention mechanisms to compute "
        "representations of its input and output. Unlike recurrent neural networks, transformers "
        "do not require sequential processing and can process all positions in parallel.",
    ]
    
    # Initialize tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    # Create dataset and dataloader
    dataset = BertDataset(texts, tokenizer, max_length=128)
    dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
    
    # Create model
    config = get_bert_config('base')
    model = BertForPreTraining(config)
    
    # Create trainer
    trainer = BertTrainer(
        model=model,
        train_dataloader=dataloader,
        learning_rate=5e-5,
        warmup_steps=100,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )
    
    # Train for 1 epoch (for demonstration)
    trainer.train(num_epochs=1)
    
    print("\nTraining completed!")

In [47]:
# demo_bert_training()

# Usage

In [48]:
# # Sample texts
# texts = [
#     """BERT is a transformer-based machine learning technique for natural language processing. 
#     It was developed by Google and introduced in 2018. BERT stands for Bidirectional Encoder 
#     Representations from Transformers. The model is designed to pre-train deep bidirectional 
#     representations from unlabeled text by jointly conditioning on both left and right context.""",
    
#     """Natural language processing is a subfield of linguistics, computer science, and artificial 
#     intelligence. It focuses on the interactions between computers and human language. In particular, 
#     how to program computers to process and analyze large amounts of natural language data. 
#     The goal is a computer capable of understanding the contents of documents.""",
    
#     """The transformer architecture relies entirely on self-attention mechanisms. It computes 
#     representations of its input and output without using sequence-aligned RNNs or convolution. 
#     Unlike recurrent neural networks, transformers do not require sequential processing. They can 
#     process all positions in parallel, making them much more efficient for training.""",
    
#     """Machine learning is a method of data analysis that automates analytical model building. 
#     It is a branch of artificial intelligence based on the idea that systems can learn from data. 
#     They can identify patterns and make decisions with minimal human intervention. Machine learning 
#     algorithms build a model based on sample data, known as training data.""",
    
#     """Deep learning is part of a broader family of machine learning methods. It is based on 
#     artificial neural networks with representation learning. Learning can be supervised, 
#     semi-supervised or unsupervised. Deep learning architectures have been applied to fields 
#     including computer vision, speech recognition, and natural language processing.""",
    
#     """Attention mechanisms allow models to focus on specific parts of the input when producing output. 
#     In the context of neural networks, attention helps the model learn which parts of the input 
#     are most relevant for the current task. Self-attention, also known as intra-attention, relates 
#     different positions of a single sequence to compute a representation of the sequence.""",
    
#     """Pre-training in deep learning refers to training a model on a large dataset before fine-tuning 
#     it on a smaller, task-specific dataset. This approach has been particularly successful in NLP. 
#     Models like BERT are pre-trained on massive text corpora using self-supervised objectives. 
#     They can then be fine-tuned on downstream tasks with relatively small amounts of labeled data.""",
    
#     """The masked language model is a training technique where some tokens in the input are randomly 
#     masked. The model must predict the original tokens based on the context. This allows BERT to 
#     learn bidirectional representations, as it can use information from both left and right context. 
#     The masking strategy includes replacing tokens with [MASK], random tokens, or keeping them unchanged."""
# ]

# # Initialize BERT tokenizer
# # Using the pre-trained tokenizer from Hugging Face for consistency
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [49]:
# # 1. Tạo model
# config = get_bert_config('base')
# model = BertForPreTraining(config)

# # 2. Chuẩn bị data
# dataset = BertDataset(texts, tokenizer, max_length=128)
# dataloader = DataLoader(dataset, batch_size=16)

# # 3. Training
# trainer = BertTrainer(model, dataloader, device='cpu')
# trainer.train(num_epochs=10)

# # 4. Fine-tuning cho classification
# classifier = BertForSequenceClassification(config, num_labels=2)

In [50]:
def quick_demo():
    """Quick demo that shows BERT working without full training"""
    
    # Minimal texts for demo
    texts = [
        "BERT uses masked language modeling to learn bidirectional representations.",
        "The transformer architecture enables parallel processing of sequences.",
        "Self-attention mechanisms capture long-range dependencies in text."
    ]
    
    # Setup
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    # Create small dataset - only 3 documents × 5 = 15 samples
    dataset = BertDataset(texts, tokenizer, max_length=64)  # Shorter sequences
    dataset.__len__ = lambda: 15  # Override to create fewer samples
    
    # Small batch size for CPU
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    
    # Create model
    config = get_bert_config('base')
    # Smaller model for demo
    config['num_hidden_layers'] = 2  # Only 2 layers instead of 12
    config['num_attention_heads'] = 4  # Fewer attention heads
    config['hidden_size'] = 256  # Smaller hidden size
    config['intermediate_size'] = 1024  # Smaller FFN
    
    model = BertForPreTraining(config)
    
    print(f"Demo model created with {sum(p.numel() for p in model.parameters()):,} parameters")
    print(f"Dataset size: {len(dataset)} samples")
    print(f"Batch size: 2")
    print(f"Number of batches: {len(dataloader)}")
    
    # Quick training demo
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    
    print(f"\nRunning 3 training steps on {device}...")
    model.train()
    
    for i, batch in enumerate(dataloader):
        if i >= 3:  # Only 3 steps for demo
            break
            
        start_time = time.time()
        
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs['loss']
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        step_time = time.time() - start_time
        print(f"Step {i+1}: Loss = {loss.item():.4f}, Time = {step_time:.2f}s")
    
    print("\nDemo completed!")
    print("\nFor full training, consider:")
    print("- Using GPU (currently on CPU)" if device == 'cpu' else "- GPU detected ✓")
    print("- Using pre-trained weights instead of training from scratch")
    print("- Using HuggingFace's optimized implementation for production")
    
    return model

def test_model_outputs(model, tokenizer):
    """Test the model with a simple example"""
    device = next(model.parameters()).device
    
    # Create a simple masked input
    text = "The capital of France is [MASK]."
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Add dummy labels for demo
    inputs['labels'] = torch.full_like(inputs['input_ids'], -100)
    inputs['next_sentence_label'] = torch.tensor([1])  # Dummy NSP label
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get predictions for [MASK] token
    mask_token_index = (inputs['input_ids'] == tokenizer.mask_token_id).nonzero()[0, 1]
    mask_token_logits = outputs['prediction_logits'][0, mask_token_index]
    
    # Top 5 predictions
    top_5_tokens = torch.topk(mask_token_logits, 5).indices
    print("\nTop 5 predictions for [MASK]:")
    for i, token_id in enumerate(top_5_tokens):
        token = tokenizer.decode([token_id])
        print(f"{i+1}. {token}")


In [51]:
if __name__ == "__main__":
    # Run quick demo
    model = quick_demo()
    
    # Test the model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    test_model_outputs(model, tokenizer)

Demo model created with 9,686,844 parameters
Dataset size: 300 samples
Batch size: 2
Number of batches: 150

Running 3 training steps on cpu...
Step 1: Loss = 10.8948, Time = 0.18s
Step 2: Loss = 11.1435, Time = 0.12s
Step 3: Loss = 10.9053, Time = 0.18s

Demo completed!

For full training, consider:
- Using GPU (currently on CPU)
- Using pre-trained weights instead of training from scratch
- Using HuggingFace's optimized implementation for production

Top 5 predictions for [MASK]:
1. rich
2. treating
3. crews
4. ##rgh
5. dissatisfied
