In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import json

class SpellingDataset(Dataset):
    """Map-style dataset of (misspelling, correct spelling) training pairs.

    Every misspelling listed under a correct word becomes one example. Text is
    tokenized lazily in ``__getitem__`` and padded/truncated to ``max_length``
    so that examples can be stacked into batches.

    Args:
        spelling_dict: Mapping from a correctly spelled word to an iterable of
            its misspellings.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            max_length=..., truncation=True, return_tensors='pt')``. The result
            must be indexable by ``'input_ids'`` and ``'attention_mask'`` and
            hold tensors with a leading batch dimension of size 1.
        max_length: Token length each example is padded/truncated to.
    """

    def __init__(self, spelling_dict, tokenizer, max_length=128):
        # Flatten {correct: [misspelling, ...]} into (misspelling, correct).
        self.pairs = [
            (misspelling, correct)
            for correct, misspellings in spelling_dict.items()
            for misspelling in misspellings
        ]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (misspelling, correct) pairs."""
        return len(self.pairs)

    def _encode(self, text):
        """Tokenize ``text`` to a fixed-length, batch-of-one encoding."""
        return self.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` for the
            misspelling and ``'labels'`` holding the token ids of the correct
            spelling, each with the tokenizer's batch dimension removed.
        """
        misspelling, correct = self.pairs[idx]
        misspelling_encoding = self._encode(misspelling)
        correct_encoding = self._encode(correct)

        # squeeze(0) removes only the batch dimension. A bare squeeze() would
        # also drop the sequence dimension when max_length == 1, yielding 0-d
        # tensors that no longer line up with the model's (batch, seq) input.
        return {
            'input_ids': misspelling_encoding['input_ids'].squeeze(0),
            'attention_mask': misspelling_encoding['attention_mask'].squeeze(0),
            'labels': correct_encoding['input_ids'].squeeze(0),
        }

class SpellingCorrector(nn.Module):
    """BERT encoder followed by one Transformer decoder layer and a vocab head.

    The model emits one vocabulary distribution per input position, so a
    prediction is read off position-by-position from the input tokens.

    Args:
        vocab_size: Number of output classes of the final linear layer.
        hidden_size: Width of the decoder layer and input width of the output
            head. Must equal the hidden size of the pretrained encoder
            (768 for ``bert-base-uncased``).
    """

    def __init__(self, vocab_size, hidden_size=768):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # BERT returns hidden states as (batch, seq, hidden). The decoder layer
        # defaults to (seq, batch, hidden); without batch_first=True attention
        # would run across the batch axis and mix unrelated examples.
        self.decoder = nn.TransformerDecoderLayer(
            hidden_size, nhead=8, batch_first=True
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, attention_mask):
        """Compute per-position vocabulary logits.

        Args:
            input_ids: Token ids, shape (batch, seq).
            attention_mask: Same shape as ``input_ids``; non-zero marks real
                tokens and zero marks padding. May be ``None`` when the batch
                contains no padding.

        Returns:
            Logits of shape (batch, seq, vocab_size).
        """
        encoder_output = self.bert(input_ids, attention_mask=attention_mask)[0]

        # True marks positions the decoder must not attend to. Each row is
        # expected to contain at least one real token (BERT inputs always
        # carry [CLS]/[SEP]); a fully masked row would produce NaNs.
        padding_mask = None if attention_mask is None else attention_mask == 0

        decoder_output = self.decoder(
            encoder_output,
            encoder_output,
            tgt_key_padding_mask=padding_mask,
            memory_key_padding_mask=padding_mask,
        )
        return self.fc(decoder_output)

def train_model(data_path='spelling_dictionary.json', num_epochs=10,
                batch_size=32, learning_rate=1e-3, device=None):
    """Train a ``SpellingCorrector`` on a JSON spelling dictionary.

    Called with no arguments this reads ``spelling_dictionary.json`` and runs
    10 epochs of Adam (lr 1e-3) with batch size 32 on the model's default
    device.

    Args:
        data_path: Path to a UTF-8 JSON file mapping each correct word to a
            list of its misspellings.
        num_epochs: Number of passes over the dataset.
        batch_size: Examples per optimisation step.
        learning_rate: Adam step size.
            NOTE(review): 1e-3 is Adam's default and is kept for backward
            compatibility; fine-tuning a pretrained BERT is usually done with
            a much smaller value - confirm before relying on results.
        device: Optional ``torch.device`` or device string. When given, the
            model and every batch are moved there, and the returned model
            stays on that device. ``None`` leaves everything where PyTorch
            created it.

    Returns:
        Tuple ``(model, tokenizer)``: the trained model and the tokenizer used
        to build its inputs.

    Raises:
        ValueError: If the dictionary yields no (misspelling, correct) pairs.
    """
    # Load data
    with open(data_path, 'r', encoding='utf-8') as f:
        spelling_dict = json.load(f)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Build the dataset before the model so that an empty dictionary fails
    # fast, before the pretrained encoder weights are loaded.
    dataset = SpellingDataset(spelling_dict, tokenizer)
    if len(dataset) == 0:
        raise ValueError(
            f"No training pairs found in {data_path!r}: expected a mapping "
            "from correct words to non-empty lists of misspellings."
        )
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = SpellingCorrector(tokenizer.vocab_size)
    if device is not None:
        model.to(device)

    # Padding positions in the labels carry no signal and are excluded.
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

            optimizer.zero_grad()
            output = model(input_ids, attention_mask)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab). reshape copes
            # with non-contiguous tensors, and the class count is taken from
            # the logits themselves rather than assumed from the tokenizer.
            loss = criterion(
                output.reshape(-1, output.size(-1)), labels.reshape(-1)
            )
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Report the epoch mean; the last batch alone is a noisy indicator.
        print(f"Epoch {epoch+1}, Loss: {running_loss / len(dataloader):.4f}")

    return model, tokenizer

def correct_spelling(model, tokenizer, text):
    """Return the model's corrected version of ``text``.

    The model is switched to evaluation mode (and left in it), run once
    without gradient tracking, and the highest-scoring token at each position
    is decoded back to a string.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids,
            attention_mask)`` and returning logits whose last dimension
            indexes the vocabulary.
        tokenizer: Callable producing ``'input_ids'`` and ``'attention_mask'``
            tensors, and providing ``decode(ids, skip_special_tokens=True)``.
        text: Text to correct. Only the first sequence of the tokenized batch
            is decoded.

    Returns:
        The decoded prediction with special tokens removed.
    """
    model.eval()

    # Inputs must live where the weights live; otherwise a model that was
    # moved to an accelerator fails with a device-mismatch error.
    first_param = next(model.parameters(), None)

    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        if first_param is not None:
            input_ids = input_ids.to(first_param.device)
            attention_mask = attention_mask.to(first_param.device)

        outputs = model(input_ids, attention_mask)
        predicted_tokens = torch.argmax(outputs, dim=-1)
        corrected_text = tokenizer.decode(predicted_tokens[0], skip_special_tokens=True)
    return corrected_text

# Script entry point: train a model, save its weights, then print the model's
# output for a few commonly misspelled words as a quick smoke test.
if __name__ == "__main__":
    model, tokenizer = train_model()

    # Save only the learned parameters (state dict), not the whole module;
    # loading them back requires constructing a SpellingCorrector first.
    torch.save(model.state_dict(), 'spelling_corrector.pth')

    # Print "<input> -> <prediction>" for each sample misspelling.
    test_words = ["recieve", "seperate", "accomodate"]
    for word in test_words:
        corrected = correct_spelling(model, tokenizer, word)
        print(f"{word} -> {corrected}")

2025-02-10 14:38:51.079113: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739194731.145880  100786 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739194731.166153  100786 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-10 14:38:51.302023: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

: 