In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import string
import random
from tqdm import tqdm

In [None]:
print("--- Part 1: Training and Saving the Model ---")

# ------------------------------------------------------------------------------
# 1. Hyper-parameters, paths and the character vocabulary
# ------------------------------------------------------------------------------

# Sequence / architecture sizes.
MAX_LEN = 35
HIDDEN_SIZE = 128
NHEAD = 4
NUM_LAYERS = 4
# One slot for the normalised number of lives left, then one slot per letter
# flagging letters that were guessed incorrectly.
GAME_STATE_SIZE = 27

# Optimisation settings.
BATCH_SIZE = 64
EPOCHS = 3  # Raise this for better results on a larger word list.
LEARNING_RATE = 1e-4

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# File locations.
MODEL_SAVE_PATH = 'character_bert_hangman_final.pth'
WORDLIST_PATH = 'words.txt'  # This file must exist before training starts.

# Token ids: 0 is padding, 1 is the mask token, 2..27 are the letters 'a'..'z'.
VOCAB = {
    token: index
    for index, token in enumerate(['[PAD]', '[MASK]', *string.ascii_lowercase])
}
VOCAB_SIZE = len(VOCAB)

# ==============================================================================
# 2. MODEL AND DATASET DEFINITION
# ==============================================================================

class CharacterBERT(nn.Module):
    """Character-level Transformer encoder that scores a letter for every position.

    Each token embedding is the sum of a character embedding, a learned
    position embedding and a linear projection of the game-state vector
    (the same projection is added to every position). The encoded sequence
    is mapped to per-position vocabulary logits by ``mlm_head``.

    Args:
        vocab_size: Number of token ids (size of the output distribution).
        max_len: Longest sequence supported by the position embedding.
        hidden_size: Model width (``d_model``).
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        game_state_size: Width of the game-state feature vector.
        pad_token_id: Token id treated as padding and hidden from attention.
            Defaults to 0, the id the vocabulary assigns to ``'[PAD]'``.
    """

    def __init__(self, vocab_size, max_len, hidden_size, nhead, num_layers, game_state_size,
                 pad_token_id=0):
        super().__init__()
        self.pad_token_id = pad_token_id
        # Sub-modules are created in the original order and under the original
        # attribute names so seeded initialisation and state-dict keys are unchanged.
        self.char_embedding = nn.Embedding(vocab_size, hidden_size)
        self.position_embedding = nn.Embedding(max_len, hidden_size)
        self.game_state_encoder = nn.Linear(game_state_size, hidden_size)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=nhead, batch_first=True)
        # Nested-tensor conversion is disabled so the output always keeps the
        # full (batch, seq_len, hidden) shape when a padding mask is supplied.
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, enable_nested_tensor=False
        )
        self.mlm_head = nn.Linear(hidden_size, vocab_size)
        self.register_buffer('positions', torch.arange(max_len))

    def forward(self, input_ids, game_state_vector):
        """Return per-position vocabulary logits.

        Args:
            input_ids: Long tensor of token ids, shape ``(batch, seq_len)``;
                ``seq_len`` must not exceed ``max_len``.
            game_state_vector: Float tensor, shape ``(batch, game_state_size)``.

        Returns:
            Float tensor of logits, shape ``(batch, seq_len, vocab_size)``.
            Logits at padding positions carry no meaning.

        Note:
            A row made up entirely of padding has nothing to attend to and
            yields NaNs; every row needs at least one non-padding token.
        """
        seq_len = input_ids.size(1)
        # True where the token is padding; those keys are excluded from
        # attention so predictions do not depend on the amount of padding.
        padding_mask = input_ids == self.pad_token_id
        x = self.char_embedding(input_ids) + self.position_embedding(self.positions[:seq_len])
        # Broadcast the game-state projection over the sequence dimension.
        x = x + self.game_state_encoder(game_state_vector).unsqueeze(1)
        encoded = self.transformer_encoder(x, src_key_padding_mask=padding_mask)
        logits = self.mlm_head(encoded)
        return logits

class HangmanDataset(Dataset):
    """Map-style dataset that synthesises random Hangman positions from a word list.

    Every item is a freshly sampled game state for one word: a random subset
    of the word's letters is revealed, a random set of letters that are not in
    the word is marked as already guessed wrong, and the hidden letters become
    the prediction targets.

    Args:
        word_list: Iterable of candidate words. Words are lower-cased; only
            non-empty words of at most ``max_len`` characters consisting
            solely of ASCII letters a-z are kept.
        max_len: Length every sequence is padded to.
        vocab: Optional token-to-id mapping containing ``'[PAD]'``,
            ``'[MASK]'`` and the 26 lowercase letters. Defaults to the
            module-level ``VOCAB``.
    """

    VARIATIONS_PER_WORD = 20  # Random game states generated per word per epoch.
    MAX_LIVES = 6             # Wrong guesses allowed before the game is lost.
    MAX_INCORRECT = 5         # At most this many wrong guesses are simulated.

    def __init__(self, word_list, max_len, vocab=None):
        self.vocab = VOCAB if vocab is None else vocab
        self.max_len = max_len
        self.alphabet = string.ascii_lowercase
        allowed = set(self.alphabet)
        lowered = (w.lower() for w in word_list)
        # str.isalpha() also accepts non-ASCII letters, which have no id in the
        # vocabulary; restrict to a-z so the token lookup can never fail.
        self.word_list = [
            w for w in lowered
            if w and len(w) <= max_len and allowed.issuperset(w)
        ]

    def __len__(self):
        return len(self.word_list) * self.VARIATIONS_PER_WORD

    def __getitem__(self, idx):
        """Return one random game state for the word selected by ``idx``.

        Returns:
            dict with
            ``'input_ids'`` (long, ``max_len``): revealed letters, ``[MASK]``
            for hidden letters, ``[PAD]`` after the word;
            ``'labels'`` (long, ``max_len``): letter id at hidden positions,
            ``-100`` elsewhere;
            ``'game_state_vector'`` (float32, 27): remaining lives divided by
            6, followed by a 26-way indicator of incorrectly guessed letters.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            # Needed so plain iteration over the dataset terminates.
            raise IndexError(f"index {idx} out of range for dataset of size {total}")

        # Cycle through the list so each word is used exactly
        # VARIATIONS_PER_WORD times per epoch.
        word = self.word_list[idx % len(self.word_list)]
        # Sorted so a seeded `random` gives the same samples on every run
        # (set iteration order varies with string hash randomisation).
        unique_letters = sorted(set(word))

        # Reveal a random subset, always leaving at least one letter hidden.
        num_to_reveal = random.randint(0, len(unique_letters) - 1)
        correct_guesses = set(random.sample(unique_letters, num_to_reveal))

        # Incorrect guesses are drawn only from letters NOT in the word.
        possible_incorrect = [c for c in self.alphabet if c not in word]
        # Never request more wrong letters than actually exist.
        num_incorrect = random.randint(0, min(self.MAX_INCORRECT, len(possible_incorrect)))
        incorrect_guesses = set(random.sample(possible_incorrect, num_incorrect))

        mask_id = self.vocab['[MASK]']
        input_pattern = [self.vocab[c] if c in correct_guesses else mask_id for c in word]
        labels = [self.vocab[c] if c not in correct_guesses else -100 for c in word]

        # Pad both sequences to the fixed length; padding is never a target.
        padding_len = self.max_len - len(word)
        input_ids = input_pattern + [self.vocab['[PAD]']] * padding_len
        labels = labels + [-100] * padding_len

        # The game-state vector encodes only the INCORRECT guesses.
        lives_remaining = self.MAX_LIVES - len(incorrect_guesses)
        incorrect_mask = [1.0 if c in incorrect_guesses else 0.0 for c in self.alphabet]
        game_state_vector = [lives_remaining / float(self.MAX_LIVES)] + incorrect_mask

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
            'game_state_vector': torch.tensor(game_state_vector, dtype=torch.float32)
        }

# ==============================================================================
# 3. TRAINING LOOP
# ==============================================================================

from tqdm import tqdm  # Ensure tqdm is imported
import torch.nn as nn  # If not already imported

def train(model, dataloader, epochs, lr, device, save_path=None):
    """Train ``model`` on masked-letter prediction and save its weights.

    For every batch the logits of letters already guessed incorrectly are
    pushed to a large negative value before the cross-entropy loss is
    computed, so probability mass is only spread over letters that are still
    possible. Positions labelled ``-100`` are ignored by the loss.

    Args:
        model: Module called as ``model(input_ids, game_state_vector)`` and
            returning logits of shape ``(batch, seq_len, vocab)``.
        dataloader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'labels'`` and ``'game_state_vector'``.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate for AdamW.
        device: Device the model and batches are moved to.
        save_path: Where the state dict is written. Defaults to the
            module-level ``MODEL_SAVE_PATH``.

    Returns:
        None. The state dict is written to ``save_path`` after every epoch so
        an interrupted run keeps the weights of the last finished epoch.
    """
    if save_path is None:
        save_path = MODEL_SAVE_PATH

    model.to(device)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    print(f"Starting training on {device} for {epochs} epochs...")

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        print(f"\nEpoch {epoch+1}/{epochs}")

        # Wrap dataloader in tqdm for batch-level progress
        batch_iterator = tqdm(dataloader, desc="Training Batches", leave=False)

        for batch in batch_iterator:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            game_state_vector = batch['game_state_vector'].to(device)

            logits = model(input_ids, game_state_vector)
            vocab_size = logits.size(-1)

            # Column 0 of the game state is the lives feature; the remaining
            # columns flag incorrectly guessed letters. Letter ids start at 2
            # (after [PAD] and [MASK]), hence the `2:` offset.
            incorrect_guesses_mask = game_state_vector[:, 1:]
            logit_mask = torch.zeros(logits.size(0), vocab_size, device=logits.device)
            logit_mask[:, 2:] = incorrect_guesses_mask * -1e9  # large negative value
            # Same mask for every position of a sequence.
            masked_logits = logits + logit_mask.unsqueeze(1)

            loss = criterion(masked_logits.reshape(-1, vocab_size), labels.reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            # Update tqdm description with current batch loss
            batch_iterator.set_postfix(loss=batch_loss)

        # Count batches ourselves: avoids ZeroDivisionError on an empty loader
        # and works for loaders that do not define a length.
        avg_loss = total_loss / max(num_batches, 1)
        print(f"Epoch {epoch+1}/{epochs} | Average Loss: {avg_loss:.4f}")

        # Checkpoint after every epoch so an interruption does not lose everything.
        torch.save(model.state_dict(), save_path)

    if epochs <= 0:
        # No epoch ran, so nothing was checkpointed; still write the weights,
        # as the function always produced a file.
        torch.save(model.state_dict(), save_path)
    print(f"\n--- Training Complete. Model saved to '{save_path}' ---")

# ==============================================================================
# 4. EXECUTION
# ==============================================================================

if __name__ == '__main__':
    # Load the word list: one word per line, blank lines skipped.
    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(WORDLIST_PATH, 'r', encoding='utf-8') as f:
            word_list = [word for word in (line.strip() for line in f) if word]
    except FileNotFoundError:
        print(f"Error: '{WORDLIST_PATH}' not found. Please create it.")
        # `exit()` is a helper injected by the `site` module and would report
        # success; raise SystemExit with a non-zero status instead.
        raise SystemExit(1)
    print(f"Loaded {len(word_list)} words from '{WORDLIST_PATH}'.")

    # Initialize model and data
    model = CharacterBERT(VOCAB_SIZE, MAX_LEN, HIDDEN_SIZE, NHEAD, NUM_LAYERS, GAME_STATE_SIZE)
    dataset = HangmanDataset(word_list, MAX_LEN)
    if len(dataset) == 0:
        # A shuffling DataLoader cannot be built over an empty dataset.
        print(f"Error: no usable words found in '{WORDLIST_PATH}'.")
        raise SystemExit(1)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Train the model
    train(model, dataloader, epochs=EPOCHS, lr=LEARNING_RATE, device=DEVICE)

--- Part 1: Training and Saving the Model ---
Loaded 227300 words from '/content/drive/My Drive/HMan/words.txt'.


KeyboardInterrupt: 

In [None]:
# Cell 2: Model and Dataset Classes

# ==============================================================================
# 1. MODEL DEFINITION
# ==============================================================================
class CharacterBERT(nn.Module):
    """Character-level Transformer encoder that scores a letter for every position.

    Each token embedding is the sum of a character embedding, a learned
    position embedding and a linear projection of the game-state vector
    (the same projection is added to every position). The encoded sequence
    is mapped to per-position vocabulary logits by ``mlm_head``.

    Args:
        vocab_size: Number of token ids (size of the output distribution).
        max_len: Longest sequence supported by the position embedding.
        hidden_size: Model width (``d_model``).
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        game_state_size: Width of the game-state feature vector.
        pad_token_id: Token id treated as padding and hidden from attention.
            Defaults to 0, the id the vocabulary assigns to ``'[PAD]'``.
    """

    def __init__(self, vocab_size, max_len, hidden_size, nhead, num_layers, game_state_size,
                 pad_token_id=0):
        super().__init__()
        self.pad_token_id = pad_token_id
        # Sub-modules are created in the original order and under the original
        # attribute names so seeded initialisation and state-dict keys are unchanged.
        self.char_embedding = nn.Embedding(vocab_size, hidden_size)
        self.position_embedding = nn.Embedding(max_len, hidden_size)
        self.game_state_encoder = nn.Linear(game_state_size, hidden_size)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=nhead, batch_first=True)
        # Nested-tensor conversion is disabled so the output always keeps the
        # full (batch, seq_len, hidden) shape when a padding mask is supplied.
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, enable_nested_tensor=False
        )
        self.mlm_head = nn.Linear(hidden_size, vocab_size)
        self.register_buffer('positions', torch.arange(max_len))

    def forward(self, input_ids, game_state_vector):
        """Return per-position vocabulary logits.

        Args:
            input_ids: Long tensor of token ids, shape ``(batch, seq_len)``;
                ``seq_len`` must not exceed ``max_len``.
            game_state_vector: Float tensor, shape ``(batch, game_state_size)``.

        Returns:
            Float tensor of logits, shape ``(batch, seq_len, vocab_size)``.
            Logits at padding positions carry no meaning.

        Note:
            A row made up entirely of padding has nothing to attend to and
            yields NaNs; every row needs at least one non-padding token.
        """
        seq_len = input_ids.size(1)
        # True where the token is padding; those keys are excluded from
        # attention so predictions do not depend on the amount of padding.
        padding_mask = input_ids == self.pad_token_id
        x = self.char_embedding(input_ids) + self.position_embedding(self.positions[:seq_len])
        # Broadcast the game-state projection over the sequence dimension.
        x = x + self.game_state_encoder(game_state_vector).unsqueeze(1)
        encoded = self.transformer_encoder(x, src_key_padding_mask=padding_mask)
        logits = self.mlm_head(encoded)
        return logits

# ==============================================================================
# 2. DATASET DEFINITION
# ==============================================================================
class HangmanDataset(Dataset):
    """Map-style dataset that synthesises random Hangman positions from a word list.

    Every item is a freshly sampled game state for one word: a random subset
    of the word's letters is revealed, a random set of letters that are not in
    the word is marked as already guessed wrong, and the hidden letters become
    the prediction targets.

    Args:
        word_list: Iterable of candidate words. Words are lower-cased; only
            non-empty words of at most ``max_len`` characters consisting
            solely of ASCII letters a-z are kept.
        max_len: Length every sequence is padded to.
        vocab: Optional token-to-id mapping containing ``'[PAD]'``,
            ``'[MASK]'`` and the 26 lowercase letters. Defaults to the
            module-level ``VOCAB``.
    """

    VARIATIONS_PER_WORD = 20  # Random game states generated per word per epoch.
    MAX_LIVES = 6             # Wrong guesses allowed before the game is lost.
    MAX_INCORRECT = 5         # At most this many wrong guesses are simulated.

    def __init__(self, word_list, max_len, vocab=None):
        self.vocab = VOCAB if vocab is None else vocab
        self.max_len = max_len
        self.alphabet = string.ascii_lowercase
        allowed = set(self.alphabet)
        lowered = (w.lower() for w in word_list)
        # str.isalpha() also accepts non-ASCII letters, which have no id in the
        # vocabulary; restrict to a-z so the token lookup can never fail.
        self.word_list = [
            w for w in lowered
            if w and len(w) <= max_len and allowed.issuperset(w)
        ]

    def __len__(self):
        return len(self.word_list) * self.VARIATIONS_PER_WORD

    def __getitem__(self, idx):
        """Return one random game state for the word selected by ``idx``.

        Returns:
            dict with
            ``'input_ids'`` (long, ``max_len``): revealed letters, ``[MASK]``
            for hidden letters, ``[PAD]`` after the word;
            ``'labels'`` (long, ``max_len``): letter id at hidden positions,
            ``-100`` elsewhere;
            ``'game_state_vector'`` (float32, 27): remaining lives divided by
            6, followed by a 26-way indicator of incorrectly guessed letters.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            # Needed so plain iteration over the dataset terminates.
            raise IndexError(f"index {idx} out of range for dataset of size {total}")

        # Cycle through the list so each word is used exactly
        # VARIATIONS_PER_WORD times per epoch.
        word = self.word_list[idx % len(self.word_list)]
        # Sorted so a seeded `random` gives the same samples on every run
        # (set iteration order varies with string hash randomisation).
        unique_letters = sorted(set(word))

        # Reveal a random subset, always leaving at least one letter hidden.
        num_to_reveal = random.randint(0, len(unique_letters) - 1)
        correct_guesses = set(random.sample(unique_letters, num_to_reveal))

        # Incorrect guesses are drawn only from letters NOT in the word.
        possible_incorrect = [c for c in self.alphabet if c not in word]
        # Never request more wrong letters than actually exist.
        num_incorrect = random.randint(0, min(self.MAX_INCORRECT, len(possible_incorrect)))
        incorrect_guesses = set(random.sample(possible_incorrect, num_incorrect))

        mask_id = self.vocab['[MASK]']
        input_pattern = [self.vocab[c] if c in correct_guesses else mask_id for c in word]
        labels = [self.vocab[c] if c not in correct_guesses else -100 for c in word]

        # Pad both sequences to the fixed length; padding is never a target.
        padding_len = self.max_len - len(word)
        input_ids = input_pattern + [self.vocab['[PAD]']] * padding_len
        labels = labels + [-100] * padding_len

        # The game-state vector encodes only the INCORRECT guesses.
        lives_remaining = self.MAX_LIVES - len(incorrect_guesses)
        incorrect_mask = [1.0 if c in incorrect_guesses else 0.0 for c in self.alphabet]
        game_state_vector = [lives_remaining / float(self.MAX_LIVES)] + incorrect_mask

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
            'game_state_vector': torch.tensor(game_state_vector, dtype=torch.float32)
        }

# Signal that this cell finished executing and the class definitions above it were evaluated.
print("Model and Dataset classes defined.")

Model and Dataset classes defined.
