In [2]:
import os
import sys
import io
import time
import random
import json
import gc
import warnings
from pathlib import Path
from datetime import timedelta
import psutil
import numpy as np
from contextlib import contextmanager, nullcontext
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm  # or tqdm.auto if needed
from typing import List, Union, Optional, Dict, Any
from dataclasses import dataclass
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import tempfile


# Check if running in Google Colab
def is_colab():
    try:
        return 'google.colab' in str(get_ipython())
    except NameError:
        return False

# Mount Google Drive if in Colab
if is_colab():
    from google.colab import drive
    drive.mount('/content/drive')

# Create LLM directory in Drive
base_dir = Path('/content/drive/MyDrive/LLM') if is_colab() else Path('./LLM')
for dir_name in ['checkpoints', 'models', 'logs', 'configs', 'data']:
    (base_dir / dir_name).mkdir(parents=True, exist_ok=True)

def is_package_installed(package_name):
    try:
        __import__(package_name)
        return True
    except ImportError:
        return False

if is_colab():
    # PyTorch packages
    pytorch_packages = ['torch', 'torchvision', 'torchaudio']
    pytorch_install = [pkg for pkg in pytorch_packages if not is_package_installed(pkg)]
    if pytorch_install:
        !pip install {' '.join(pytorch_install)}

    # Additional packages
    other_packages = ['pynvml', 'nvidia_ml_py3', 'gputil', 'fastapi', 'uvicorn', 'pydantic', 'sentencepiece']
    other_install = [pkg for pkg in other_packages if not is_package_installed(pkg)]
    if other_install:
        !pip install {' '.join(other_install)}

# Suppress warnings
warnings.filterwarnings('ignore')

print("Section 1: Initial setup and core components complete")

Mounted at /content/drive
Collecting pynvml
  Downloading pynvml-12.0.0-py3-none-any.whl.metadata (5.4 kB)
Collecting nvidia_ml_py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastapi
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting nvidia-ml-py<13.0.0a0,>=12.0.0 (from pynvml)
  Downloading nvidia_ml_py-12.570.86-py3-none-any.whl.metadata (8.7 kB)
Collecting starlette<0.46.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.45.3-py3-none-any.whl.metadata (6.3 kB)
Downloading pynvml-12.0.0-py3-none-any.whl (26 kB)
Downloading fastapi-0.115.8-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadin

In [3]:
@dataclass
class ModelConfig:
    """Configuration for model training and architecture"""
    # Model Architecture
    vocab_size: int = 0
    block_size: int = 64
    n_embed: int = 64
    n_head: int = 4
    n_layer: int = 4
    ff_dim: int = 256
    head_dim: int = 32

    # Regularization
    resid_pdrop: float = 0.2
    weight_decay: float = 0.1

    # Architecture Features
    bias: bool = True
    flash_attn: bool = True
    use_gradient_checkpointing: bool = False

    # Training Parameters
    batch_size: int = 32
    gradient_accumulation_steps: int = 1
    epochs: int = 10
    max_grad_norm: float = 1.0
    gradient_clip_val: float = 1.0
    log_interval: int = 100  # Added this parameter

    # Enhanced Learning Rate Parameters
    learning_rate: float = 1e-4
    min_learning_rate: float = 1e-5
    warmup_steps: int = 200
    lr_schedule: str = 'cosine_with_warmup'  # Options: 'cosine_with_warmup', 'linear_with_warmup', 'step'
    lr_decay_epochs: int = 8  # For step scheduler
    warmup_ratio: float = 0.001  # Alternative to warmup_steps (ratio of total training steps)

    # Early Stopping
    patience: int = 5
    min_delta: float = 1e-4
    loss_threshold: float = 1.0
    within_epoch_loss_threshold: float = 0.3

    # Evaluation and Checkpointing
    eval_steps: int = 10000
    eval_every: int = 10000
    save_every: int = 1
    keep_last_n_checkpoints: int = 5

    # Precision
    use_amp: bool = True
    amp_dtype: torch.dtype = torch.bfloat16
    dtype: torch.dtype = torch.bfloat16

    # System Parameters
    pin_memory: bool = True
    device: str = "cuda"
    num_workers: int = 8
    prefetch_factor: int = 5

    # Paths
    save_dir: Union[str, Path] = Path("/content/drive/MyDrive/LLM/checkpoints")
    tokenizer_model_path='/content/drive/MyDrive/LLM/configs/sentencepiece.model'
    training_data_path='/content/drive/MyDrive/LLM/data/training.txt'
    validation_data_path='/content/drive/MyDrive/LLM/data/validation.txt'

    # Generation Parameters
    gen_temperature: float = 0.8
    max_gen_tokens: int = 256
    top_k: int = 50
    top_p: float = 0.9

    def __post_init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
        if not isinstance(self.save_dir, Path):
            self.save_dir = Path(self.save_dir)
        self.save_dir.mkdir(parents=True, exist_ok=True)

    def print_params(self) -> None:
        """Print model and training parameters"""
        print("Model Parameters:")
        for key, value in self.__dict__.items():
            print(f"  {key}: {value}")

    def save(self, path: Union[str, Path]) -> None:
        """Save configuration to JSON"""
        path = Path(path)
        config_dict = {
            k: str(v) if isinstance(v, (Path, torch.dtype)) else v
            for k, v in self.__dict__.items()
        }
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open('w') as f:
            json.dump(config_dict, f, indent=4)

    @classmethod
    def load(cls, path: Union[str, Path]) -> 'ModelConfig':
        """Load configuration from JSON"""
        path = Path(path)
        with path.open('r') as f:
            config_dict = json.load(f)

        if 'dtype' in config_dict:
            config_dict['dtype'] = getattr(torch, config_dict['dtype'].split('.')[-1])
        if 'save_dir' in config_dict:
            config_dict['save_dir'] = str(config_dict['save_dir'])

        return cls(**config_dict)

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from typing import Union
import sentencepiece as spm


class TokenDataset(Dataset):
    """Dataset for tokenized text sequences."""
    def __init__(self, tokens: list, block_size: int):
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        return len(self.tokens) - self.block_size

    def __getitem__(self, idx: int):
        x = self.tokens[idx : idx + self.block_size]
        y = self.tokens[idx + 1 : idx + self.block_size + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

class DataManager:
    def __init__(self, config: ModelConfig):
        self.config = config
        self._tokenizer_initialized = False
        self.tokenizer = None
        self.train_loader = None
        self.val_loader = None

    def initialize_tokenizer(self):
        """Initialize SentencePiece tokenizer from training data."""
        try:
            model_dir = Path(self.config.tokenizer_model_path).parent
            model_dir.mkdir(parents=True, exist_ok=True)
            model_prefix = model_dir / 'sentencepiece'

            spm.SentencePieceTrainer.train(
                input=str(self.config.training_data_path),
                model_prefix=str(model_prefix),
                vocab_size=self.config.vocab_size,
                character_coverage=1.0,
                model_type='bpe'
            )

            self.tokenizer = spm.SentencePieceProcessor()
            self.tokenizer.load(str(model_prefix) + '.model')

            self.char_to_idx = {self.tokenizer.id_to_piece(i): i for i in range(self.tokenizer.get_piece_size())}
            self.idx_to_char = {i: self.tokenizer.id_to_piece(i) for i in range(self.tokenizer.get_piece_size())}

            self._tokenizer_initialized = True

        except Exception as e:
            print(f"Error initializing tokenizer: {str(e)}")
            raise

    def _load_and_tokenize(self, file_path: Union[str, Path]) -> list:
        """Tokenize a text file into a list of token IDs."""
        tokens = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                line_tokens = self.tokenizer.encode(line.strip(), out_type=int)
                tokens.extend(line_tokens)
        return tokens

    def load_data(self):
        """Load and tokenize training/validation data into DataLoaders."""
        if not self._tokenizer_initialized:
            self.initialize_tokenizer()

        # Load training data
        train_tokens = self._load_and_tokenize(self.config.training_data_path)
        self.train_dataset = TokenDataset(train_tokens, self.config.block_size)
        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=self.config.num_workers,
            pin_memory=self.config.pin_memory,
            prefetch_factor=self.config.prefetch_factor
        )

        # Load validation data
        val_tokens = self._load_and_tokenize(self.config.validation_data_path)
        self.val_dataset = TokenDataset(val_tokens, self.config.block_size)
        self.val_loader = DataLoader(
            self.val_dataset,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=self.config.num_workers,
            pin_memory=self.config.pin_memory,
            prefetch_factor=self.config.prefetch_factor
        )

    def cleanup(self):
        """Cleanup resources and memory."""
        for attr in ['train_loader', 'val_loader', 'train_dataset', 'val_dataset', 'tokenizer']:
            if hasattr(self, attr):
                delattr(self, attr)
        torch.cuda.empty_cache()

    def encode(self, text: str) -> list:
        """Encode text into token IDs."""
        if not self._tokenizer_initialized:
            raise RuntimeError("Tokenizer not initialized")
        return self.tokenizer.encode(text, out_type=int)

    def decode(self, token_ids: list) -> str:
        """Decode token IDs back into a human-readable string."""
        if not self._tokenizer_initialized:
            raise RuntimeError("Tokenizer not initialized")
        return self.tokenizer.decode(token_ids, out_type=str)

In [5]:
import sentencepiece as spm

# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load('/content/drive/MyDrive/LLM/configs/sentencepiece.model')

# encode: text => id
print(sp.encode_as_pieces('his is a sentence to encode.t'))
print(sp.encode_as_ids('This is a sentence to encode.'))

# decode: id => text
print(sp.decode_pieces(['▁This', '▁is', '▁a', '▁t', 'est']))
#print(sp.decode_ids([326, 54, 5, 7366, 21, 3369, 1719, 7961]))

['▁h', 'is', '▁', 'is', '▁a', '▁s', 'en', 't', 'en', 'c', 'e', '▁to', '▁', 'en', 'c', 'o', 'd', 'e', '.', 't']
[40, 66, 48, 17, 40, 17, 5, 12, 28, 42, 28, 53, 41, 21, 40, 28, 53, 43, 52, 41, 61]
▁This▁is a test


In [6]:
class RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)
        self.max_seq_len_cached = max_position_embeddings

        # Initialize cache
        t = torch.arange(max_position_embeddings, device=inv_freq.device).type_as(inv_freq)
        freqs = torch.einsum('i,j->ij', t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.cos_cached = emb.cos()[None, None, :, :]
        self.sin_cached = emb.sin()[None, None, :, :]

    def forward(self, x, seq_len=None):
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
            freqs = torch.einsum('i,j->ij', t, self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1)
            self.cos_cached = emb.cos()[None, None, :, :]
            self.sin_cached = emb.sin()[None, None, :, :]

        return (
            self.cos_cached[:, :, :seq_len, ...].to(x.device),
            self.sin_cached[:, :, :seq_len, ...].to(x.device)
        )

def rotate_half(x):
    x1, x2 = x[..., :x.shape[-1]//2], x[..., x.shape[-1]//2:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin):
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention layer with RoPE and residual dropout"""
    def __init__(self, config):
        super().__init__()
        assert config.n_embed % config.n_head == 0

        # Store necessary dimensions
        self.n_embed = config.n_embed
        self.n_head = config.n_head
        self.head_dim = config.n_embed // config.n_head

        # Key, query, value projections for all heads
        self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embed, config.n_embed, bias=config.bias)

        # Regularization
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        # Initialize RoPE
        self.rope = RotaryEmbedding(
            self.head_dim,
            max_position_embeddings=config.block_size
        )

        # Flash attention support
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            self.register_buffer(
                "mask",
                torch.tril(torch.ones(config.block_size, config.block_size))
                .view(1, 1, config.block_size, config.block_size)
            )

        # Scaling factor for attention
        self.scale = 1.0 / math.sqrt(self.head_dim)

    def forward(self, x):
        B, T, C = x.size()

        # Calculate query, key, values for all heads
        q, k, v = self.c_attn(x).split(self.n_embed, dim=2)

        # Reshape for multi-head attention
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # Apply RoPE to queries and keys
        cos, sin = self.rope(q, seq_len=T)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        # Causal self-attention with flash attention optimization
        if self.flash:
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=0.0,  # Disabled in favor of residual dropout
                is_causal=True,
                scale=self.scale
            )
        else:
            att = (q @ k.transpose(-2, -1)) * self.scale
            att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            y = att @ v

        # Re-assemble and project
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

class LayerNorm(nn.Module):
    """LayerNorm with optional bias"""
    def __init__(self, ndim, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class GPTBlock(nn.Module):
    """Transformer block with residual dropout"""
    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config.n_embed, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln2 = LayerNorm(config.n_embed, bias=config.bias)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embed, 4 * config.n_embed, bias=config.bias),
            nn.GELU(),
            nn.Linear(4 * config.n_embed, config.n_embed, bias=config.bias),
            nn.Dropout(config.resid_pdrop)
        )

        # Training optimizations
        self.use_checkpointing = config.use_gradient_checkpointing
        self.layer_scale_1 = nn.Parameter(torch.ones(config.n_embed) * 0.1)
        self.layer_scale_2 = nn.Parameter(torch.ones(config.n_embed) * 0.1)

    def forward(self, x):
        # Attention block
        if self.use_checkpointing and x.requires_grad:
            attn_output = torch.utils.checkpoint.checkpoint(self.attn, self.ln1(x))
        else:
            attn_output = self.attn(self.ln1(x))
        x = x + self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attn_output

        # MLP block
        if self.use_checkpointing and x.requires_grad:
            mlp_output = torch.utils.checkpoint.checkpoint(self.mlp, self.ln2(x))
        else:
            mlp_output = self.mlp(self.ln2(x))
        x = x + self.layer_scale_2.unsqueeze(0).unsqueeze(0) * mlp_output

        return x

class GPT(nn.Module):
    """GPT-like transformer with residual dropout"""
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embed),
            'h': nn.ModuleList([GPTBlock(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embed)
        })
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)

        # Init
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))

        print(f"Parameters: {sum(p.numel() for p in self.parameters()) / 1e6:.2f}M")

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, LayerNorm):
            torch.nn.init.ones_(module.weight)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()

        with torch.cuda.amp.autocast(enabled=self.config.use_amp, dtype=torch.bfloat16):
            x = self.transformer.wte(idx)
            for block in self.transformer.h:
                x = block(x)
            x = self.transformer.ln_f(x)

            logits = self.lm_head(x)

            # Fixed loss calculation
            if targets is not None:
                loss = F.cross_entropy(
                    logits.view(-1, logits.size(-1)),
                    targets.view(-1)
                )
            else:
                # For generation mode
                logits = logits[:, -1, :]
                loss = None

        return logits, loss

    @torch.no_grad()
    def generate(self, idx: torch.Tensor, max_new_tokens: int,
                temperature: float = 1.0, top_k: Optional[int] = None,
                top_p: Optional[float] = None) -> torch.Tensor:
        """Generate tokens with shape validation and top-p sampling"""
        if idx.dim() != 2:
            idx = idx.unsqueeze(0)  # Add batch dimension if not present

        for _ in range(max_new_tokens):
            # Crop sequence if needed
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]

            # Forward pass
            logits, _ = self(idx_cond)

            # If logits has 3 dimensions (batch, sequence, vocab), take last token
            if logits.dim() == 3:
                logits = logits[:, -1, :]

            # Apply temperature
            logits = logits / temperature

            # Apply top-k sampling if specified
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = float('-inf')

            # Apply top-p (nucleus) sampling if specified
            if top_p is not None:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Remove tokens with cumulative probability above the threshold
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift the indices to the right to keep also the first token above the threshold
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                logits[indices_to_remove] = float('-inf')

            # Sample next token
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            # Append next token
            idx = torch.cat((idx, next_token), dim=1)

        return idx

In [7]:
class ModelInterface:
    """Interface for model operations with comprehensive error handling and state management"""

    def __init__(self, config: ModelConfig):
        """Initialize the interface with configuration and necessary components"""
        self.config = config
        self.original_batch_size = config.batch_size  # Store original batch size
        self.model = None
        self.data_manager = None
        self.scaler = torch.cuda.amp.GradScaler(enabled=getattr(config, 'use_amp', True))
        self._is_initialized = False
        self.checkpoint_info = None
        self._initialize()

    def _initialize(self):
        """Safe initialization with resource management and state verification"""
        try:
            # Verify and adjust batch size if needed
            if self.config.batch_size != self.original_batch_size:
                print(f"Batch size was modified from {self.original_batch_size} to {self.config.batch_size}")
                self.config.batch_size = self.original_batch_size  # Restore original batch size

            # Initialize data manager first
            self.data_manager = DataManager(self.config)

            # Initialize model with proper device placement
            self.model = GPT(self.config)
            self.model = self.model.to(self.config.device)

            self._is_initialized = True

            # Log initialization details
            print(f"Model initialized with {sum(p.numel() for p in self.model.parameters())/1e6:.2f}M parameters")
            print(f"Using device: {self.config.device}")
            print(f"Batch size: {self.config.batch_size}")

            # Adjust gradient accumulation based on GPU memory if needed
            if self.config.device == "cuda":
                gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
                if gpu_mem < 8:  # Only adjust if very limited memory
                    self.config.gradient_accumulation_steps = 2
                    print(f"Adjusted gradient accumulation steps to {self.config.gradient_accumulation_steps} due to limited GPU memory")

        except Exception as e:
            self.cleanup()
            raise RuntimeError(f"Failed to initialize model interface: {str(e)}")

    def verify_state(self):
        """Verify that the model and vocabulary are in a valid state"""
        if not self._is_initialized:
            raise RuntimeError("Model interface not properly initialized")

        if not self.model:
            raise RuntimeError("Model not loaded")

        if not self.data_manager:
            raise RuntimeError("Data manager not initialized")

        if not self.data_manager.char_to_idx:
            raise RuntimeError("Vocabulary mappings not loaded")

        # Verify model vocabulary size matches config
        if self.model.config.vocab_size != self.config.vocab_size:
            raise ValueError(f"Model vocabulary size mismatch: {self.model.config.vocab_size} != {self.config.vocab_size}")

        # Verify batch size hasn't been modified
        if self.config.batch_size != self.original_batch_size:
            print(f"Batch size mismatch detected. Restoring original batch size: {self.original_batch_size}")
            self.config.batch_size = self.original_batch_size


    def load_model(self, model_path: Union[str, Path]):
        """Load model with strict config validation and Tiktoken tokenizer checks"""
        try:
            # Load checkpoint with device mapping
            checkpoint = torch.load(model_path, map_location=self.config.device)

            # 1. Update config from checkpoint
            self.config.__dict__.update(checkpoint['config'])
            self.data_manager.config.__dict__.update(checkpoint['config'])

            # 2. Initialize Tiktoken tokenizer
            self.data_manager.initialize_tokenizer()

            # 4. Initialize model with updated config
            self.model = GPT(self.config).to(self.config.device)

            # 5. Load weights with architecture validation
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.model.eval()

            # 6. Force device alignment
            self.data_manager.config.device = self.config.device

            # 7. Warmup GPU and verify state
            with torch.cuda.amp.autocast():
                _ = self.model.generate(
                    torch.zeros((1,1), dtype=torch.long, device=self.config.device),
                    max_new_tokens=1
                )

            self._is_initialized = True
            print(f"Model loaded successfully on {self.config.device}")

        except Exception as e:
            self.cleanup()
            raise RuntimeError(f"Model loading failed: {str(e)}")

    def generate_text(self, prompt: str, max_tokens: int = 100,
                      temperature: float = 0.7, top_k: int = 50) -> str:
        """Safe text generation with full state validation"""
        self.verify_state()  # Includes tokenizer check

        with torch.no_grad(), torch.cuda.amp.autocast():
            # Encode the prompt into token IDs and convert to tensor
            input_ids = torch.tensor(
                self.data_manager.encode(prompt),
                dtype=torch.long
            ).unsqueeze(0).to(self.config.device)

            # Generate new tokens
            generated_ids = self.model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k
            )

            # Decode the generated token IDs back into text
            generated_text = self.data_manager.decode(generated_ids[0].tolist())

            return generated_text

    def cleanup(self):
        """Comprehensive cleanup of resources and memory"""
        try:
            if hasattr(self, 'model') and self.model is not None:
                self.model.cpu()
                del self.model
                self.model = None

            if hasattr(self, 'data_manager') and self.data_manager is not None:
                del self.data_manager
                self.data_manager = None

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            gc.collect()

            self._is_initialized = False
            print("Model interface cleaned up successfully")

        except Exception as e:
            print(f"Error during cleanup: {str(e)}")

In [8]:
import inspect
import torch
import gc
def run_interactive_chat(model_path: str):
    """Run interactive chat with the trained model."""
    try:
        print(f"Loading model from {model_path}...")
        checkpoint = torch.load(model_path, map_location='cuda')

        # Get the expected parameters for ModelConfig
        valid_config_params = inspect.signature(ModelConfig.__init__).parameters.keys()

        # Filter the config dictionary to only include valid parameters
        config_dict = {
            k: v for k, v in checkpoint['config'].items()
            if k in valid_config_params
        }

        # Create config with only valid parameters
        config = ModelConfig(**config_dict)
        config.device = 'cuda'  # Force CUDA device

        # Initialize model with filtered config
        model = GPT(config)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.to(config.device)
        model.eval()

        # Initialize SentencePiece tokenizer
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(config.tokenizer_model_path)

        # Set generation parameters
        max_new_tokens = 128  # Default value for generation

        print("\nModel loaded successfully!")
        print(f"Vocabulary size: {tokenizer.get_piece_size()}")
        print("\nEnter your prompt (or 'quit' to exit):")

        while True:
            prompt = input("\nYou: ")
            if prompt.lower() in ['quit', 'exit']:
                break

            if not prompt:
                continue

            try:
                # Convert input to tensor using SentencePiece
                input_ids = torch.tensor([tokenizer.encode_as_ids(prompt)], dtype=torch.long)
                input_ids = input_ids.to(config.device)

                # Generate response
                with torch.no_grad():
                    output_ids = model.generate(
                        input_ids,
                        max_new_tokens=max_new_tokens,
                        temperature=0.8,
                        top_k=40,
                        top_p=0.9
                    )

                # Convert output tokens to text using SentencePiece
                output_text = tokenizer.decode(output_ids[0].tolist())
                print("\nModel:", output_text.strip())

            except Exception as e:
                print(f"Error processing input: {str(e)}")
                continue

    except Exception as e:
        print(f"Error during inference: {str(e)}")
        print("\nCheckpoint keys:", checkpoint.keys())
        raise

    finally:
        # Cleanup
        torch.cuda.empty_cache()
        gc.collect()

In [9]:
'''
from dataclasses import dataclass, field
from pathlib import Path
import sentencepiece as spm
import math
import os
from typing import List, Tuple
import random

@dataclass
class VocabularyOptimizer:
    data_path: Path
    max_candidates: int = field(default=14, metadata={"max_value": 10})
    _dataset_stats: dict = field(init=False)

    def __post_init__(self):
        if not self.data_path.exists():
            raise FileNotFoundError(f"Dataset file not found: {self.data_path}")
        self._dataset_stats = self._analyze_dataset()

    def _analyze_dataset(self) -> dict:
        with open(self.data_path, 'r', encoding='utf-8') as f:
            text = f.read()

        return {
            'size_mb': os.path.getsize(self.data_path) / (1024 ** 2),
            'total_chars': len(text),
            'unique_chars': len(set(text)),
            'avg_word_length': self._calculate_avg_word_length(text)
        }

    def _calculate_avg_word_length(self, text: str) -> float:
        words = text.split()
        return sum(len(word) for word in words) / len(words) if words else 0

    def _generate_vocab_candidates(self) -> List[int]:
        # Generate candidates from 100 to 20,000
        candidates = [100 * (2 ** i) for i in range(14)]  # 100, 200, 400, ..., 1638400
        candidates = [c for c in candidates if 100 <= c <= 20000]  # Filter to keep within the range
        candidates = sorted(set(candidates))  # Remove duplicates and sort

        # Ensure the list does not exceed max_candidates
        return candidates[:self.max_candidates]

    def _evaluate_vocab_size(self, vocab_size: int) -> Tuple[float, float]:
        model_prefix = f"temp_model_{vocab_size}"

        spm.SentencePieceTrainer.train(
            input=str(self.data_path),
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            character_coverage=1.0,
            model_type='bpe',
            num_threads=os.cpu_count()
        )

        sp_model = spm.SentencePieceProcessor()
        sp_model.load(f"{model_prefix}.model")

        encoded = sp_model.encode_as_ids(open(self.data_path).read())
        entropy = self._calculate_entropy(encoded, vocab_size)
        compression_ratio = len(encoded) / self._dataset_stats['total_chars']

        os.remove(f"{model_prefix}.model")
        os.remove(f"{model_prefix}.vocab")

        return entropy, compression_ratio

    def _calculate_entropy(self, token_ids: List[int], vocab_size: int) -> float:
        from collections import Counter
        counts = Counter(token_ids)
        total = len(token_ids)
        return -sum((count/total) * math.log2(count/total)
                    for count in counts.values() if count > 0)

    def optimize_vocab_size(self) -> int:
        candidates = self._generate_vocab_candidates()
        best_size = None
        best_score = float('inf')
        results = []

        for vocab_size in sorted(candidates):
            try:
                entropy, compression = self._evaluate_vocab_size(vocab_size)
                score = 0.7 * entropy + 0.3 * compression
                results.append((vocab_size, score, entropy, compression))

                if score < best_score:
                    best_score = score
                    best_size = vocab_size

            except Exception as e:
                print(f"Failed training with vocab_size {vocab_size}: {str(e)}")
                continue

        print("Vocab Size\tTotal Score\tEntropy\tCompression Ratio")
        for result in results:
            print(f"{result[0]}\t{result[1]:.4f}\t{result[2]:.4f}\t{result[3]:.4f}")

        return best_size if best_size is not None else 8000

# Usage example
if __name__ == "__main__":
    dataset_path = Path('/content/drive/MyDrive/LLM/data/training.txt')
    optimizer = VocabularyOptimizer(dataset_path)
    optimal_vocab_size = optimizer.optimize_vocab_size()
    print(f"Recommended vocabulary size: {optimal_vocab_size}")
    '''

'\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nimport sentencepiece as spm\nimport math\nimport os\nfrom typing import List, Tuple\nimport random\n\n@dataclass\nclass VocabularyOptimizer:\n    data_path: Path\n    max_candidates: int = field(default=14, metadata={"max_value": 10})\n    _dataset_stats: dict = field(init=False)\n\n    def __post_init__(self):\n        if not self.data_path.exists():\n            raise FileNotFoundError(f"Dataset file not found: {self.data_path}")\n        self._dataset_stats = self._analyze_dataset()\n\n    def _analyze_dataset(self) -> dict:\n        with open(self.data_path, \'r\', encoding=\'utf-8\') as f:\n            text = f.read()\n            \n        return {\n            \'size_mb\': os.path.getsize(self.data_path) / (1024 ** 2),\n            \'total_chars\': len(text),\n            \'unique_chars\': len(set(text)),\n            \'avg_word_length\': self._calculate_avg_word_length(text)\n        }\n\n    def _calculate_

In [10]:
'/content/drive/MyDrive/LLM/data/training.txt'

'/content/drive/MyDrive/LLM/data/training.txt'

In [11]:
if __name__ == "__main__":
    # Configure CUDA for optimal performance
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    model_path = '/content/drive/MyDrive/LLM/checkpoints/best_model_epoch_1_loss_1_6223_20250131_201310.pt'
    run_interactive_chat(model_path)

Loading model from /content/drive/MyDrive/LLM/checkpoints/best_model_epoch_1_loss_1_6223_20250131_201310.pt...
Parameters: 0.62M

Model loaded successfully!
Vocabulary size: 100

Enter your prompt (or 'quit' to exit):

You: what if

Model: what if you end up at the time. He said, its a good experience. And he was really good for us. They also go from an actual responsible version of Twitter Management and say, b

You: what is the name

Model: what is the name where I can get out there. We really did they come on anything, Im sure theyre using it with Record. In they go back to the business. I think its going to be anything in revenue strategy and put a billi

You: what kind of

Model: what kind of millions of everything was actually highlight then up and show notes. Its like it working out there. Theyre very much good. Super important. When you can show up anything abou

You: if this is

Model: if this is a good value for your friends. Were going to go up being a real future. Yeah. The

KeyboardInterrupt: Interrupted by user