In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from dataclasses import dataclass
# import math
# import tiktoken

# model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# 
# # -----------------------------------------------------------------------------
# # Causal Self-Attention Module
# # -----------------------------------------------------------------------------

# class CasualSelfAttention(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         # Linear layer to project input embeddings into Queries (Q), Keys (K), and Values (V).
#         # Output size is 3 times the embedding size because we need Q, K, and V.
#         self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed)

#         # Linear layer to project the attention output back to the original embedding size.
#         self.c_proj = nn.Linear(config.n_embed, config.n_embed)

#         # Number of attention heads and embedding size.
#         self.n_head = config.n_head
#         self.n_embed = config.n_embed

#         # Create a lower triangular matrix to mask future tokens (for causal/self-attention).
#         # This ensures the model can't "peek" at future tokens during training.
#         self.register_buffer(
#             "bias",
#             torch.tril(torch.ones(config.block_size, config.block_size))
#             .view(1, 1, config.block_size, config.block_size)
#         )

#     def forward(self, x):
#         B, T, C = x.size()  # Batch size, sequence length, embedding dimension.

#         # Project the input x to get combined queries, keys, and values.
#         qkv = self.c_attn(x)

#         # Split the combined projections into separate Q, K, and V tensors.
#         q, k, v = qkv.split(self.n_embed, dim=2)

#         # Reshape Q, K, V to handle multiple attention heads.
#         # New shape: (Batch, heads, sequence length, head dimension)
#         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
#         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
#         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

#         # Compute scaled dot-product attention scores.
#         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

#         # Apply the causal mask to prevent attention to future tokens.
#         att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))

#         # Apply softmax to convert attention scores to probabilities.
#         att = F.softmax(att, dim=-1)

#         # Apply attention weights to the values (V).
#         y = att @ v

#         # Reshape back to the original input format.
#         y = y.transpose(1, 2).contiguous().view(B, T, C)

#         # Final linear projection to match the input embedding size.
#         y = self.c_proj(y)
#         return y

# # -----------------------------------------------------------------------------
# # Multi-Layer Perceptron (MLP)
# # -----------------------------------------------------------------------------

# class MLP(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         # First linear layer expands the embedding dimension by 4x for richer learning.
#         self.c_fc = nn.Linear(config.n_embed, 4 * config.n_embed)

#         # GELU activation introduces non-linearity to help the model learn complex patterns.
#         self.gelu = nn.GELU(approximate="tanh")
#         # Second linear layer projects the output back to the original embedding size.
#         self.c_proj = nn.Linear(4 * config.n_embed, config.n_embed)
#         # Correcting initialization: applying constant scaling of weights.
#         nn.init.constant_(self.c_proj.weight, 1)

#     def forward(self, x):
#         x = self.c_fc(x)
#         x = self.gelu(x)
#         x = self.c_proj(x)
#         return x

# # -----------------------------------------------------------------------------
# # Transformer Block
# # -----------------------------------------------------------------------------

# class Block(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         # Layer normalization to stabilize training and improve convergence.
#         self.ln_1 = nn.LayerNorm(config.n_embed)
#         # Causal self-attention to capture relationships between tokens.
#         self.attn = CasualSelfAttention(config)
#         # Another layer normalization before feeding into the MLP.
#         self.ln_2 = nn.LayerNorm(config.n_embed)
#         # Feed-forward neural network (MLP) to process representations from the attention output.
#         self.mlp = MLP(config)

#     def forward(self, x):
#         # First sub-layer: apply LayerNorm, then attention, then add a residual connection.
#         x = x + self.attn(self.ln_1(x))
#         # Second sub-layer: apply LayerNorm, then MLP, then add a residual connection.
#         x = x + self.mlp(self.ln_2(x))
#         return x

# # -----------------------------------------------------------------------------
# # Configuration Class for SMOLL Model
# # -----------------------------------------------------------------------------

# @dataclass
# class SMOLConfig:
#     block_size: int = 1024  # Maximum tokens (context length) the model can see at once.
#     vocab_size: int = 50257  # Vocabulary size.
#     n_layer: int = 12       # Number of transformer blocks (layers).
#     n_head: int = 12        # Number of attention heads.
#     n_embed: int = 768      # Dimensionality of token embeddings.

# # -----------------------------------------------------------------------------
# # SMOLL Model Definition
# # -----------------------------------------------------------------------------

# class SMOLL(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         self.config = config

#         # Build transformer components in a ModuleDict.
#         self.transformer = nn.ModuleDict({
#             # Token embedding: maps token indices to embedding vectors.
#             'wte': nn.Embedding(config.vocab_size, config.n_embed),
#             # Positional embedding: provides a unique embedding for each position.
#             'wpe': nn.Embedding(config.block_size, config.n_embed),
#             # A list of transformer blocks.
#             'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
#             # Final layer normalization.
#             'ln_f': nn.LayerNorm(config.n_embed),
#         })

#         # Output layer (language model head) to project transformer output to vocabulary logits.
#         self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)

#     def forward(self, idx, targets=None):
#         B, T = idx.size()
#         # Ensure the sequence length does not exceed the model's block size.
#         assert T <= self.config.block_size, (
#             f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
#         )

#         # Create a tensor of position indices.
#         pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
#         # Obtain positional embeddings for each position.
#         pos_emb = self.transformer.wpe(pos)
#         # Obtain token embeddings for the input tokens.
#         tok_emb = self.transformer.wte(idx)
#         # Combine token and positional embeddings.
#         x = tok_emb + pos_emb

#         # Pass through each transformer block.
#         for block in self.transformer.h:
#             x = block(x)

#         # Apply the final layer normalization.
#         x = self.transformer.ln_f(x)
#         # Compute logits by projecting the transformer output to the vocabulary size.
#         logits = self.lm_head(x)

#         loss = None
#         if targets is not None:
#             # Flatten logits and targets for computing cross-entropy loss.
#             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
#         return logits, loss

#     @classmethod
#     def from_pretrained(cls, model_type):
#         """
#         Create a SMOLL model instance with weights loaded from a pretrained smollvm2 model.

#         Args:
#             model_type (str): Must be 'smollvm2-135'.

#         Returns:
#             model (SMOLL): A SMOLL instance with pretrained weights.
#         """
#         # Validate that the requested model type is supported.
#         assert model_type in {'smollvm2-135'}, "Unsupported model type. Only 'smollvm2-135' is supported."

#         # Import the generic pretrained model class from Hugging Face Transformers.
#         from transformers import AutoModelForCausalLM
#         print("Loading weights from pretrained smollvm2 model: %s" % model_type)

#         # Define model hyperparameters based on the smollvm2-135 variant.
#         # (Adjust these values as necessary to match the actual model's configuration.)
#         config_args = {
#             'smollvm2-135': dict(n_layer=12, n_head=12, n_embed=768),  # Example: 135M parameter variant.
#         }[model_type]
#         # Set constant parameters for the smollvm2 model.
#         config_args['vocab_size'] = 50257
#         config_args['block_size'] = 1024

#         # Create a configuration object and initialize a new SMOLL model.
#         config = SMOLConfig(**config_args)
#         model = cls(config)

#         # Retrieve the state dictionary (all parameters) of the new model.
#         sd = model.state_dict()
#         # Exclude keys that correspond to buffers (e.g., attention bias) that are not actual parameters.
#         sd_keys = [k for k in sd.keys() if not k.endswith('.attn.bias')]

#         # Load the Hugging Face smollvm2 model with pretrained weights.
        
#         model_hf = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=token)
#         # model_hf = AutoModelForCausalLM.from_pretrained(model_type)
#         sd_hf = model_hf.state_dict()

#         # Filter out buffer keys from the Hugging Face state dictionary.
#         sd_keys_hf = [k for k in sd_hf.keys() if not k.endswith('.attn.masked_bias')]
#         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]

#         # List of parameter names that require transposition.
#         # This is necessary because the original smollvm2 weights may use a Conv1D implementation,
#         # whereas our implementation uses a standard Linear layer.
#         transposed = [
#             'attn.c_attn.weight',
#             'attn.c_proj.weight',
#             'mlp.c_fc.weight',
#             'mlp.c_proj.weight'
#         ]

#         # Ensure the number of parameters matches between our model and the Hugging Face model.
#         assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"

#         # Copy the weights from the Hugging Face model into our model's state dictionary.
#         for k in sd_keys_hf:
#             if any(k.endswith(w) for w in transposed):
#                 # For these weights, verify that transposing the Hugging Face weight gives the correct shape.
#                 assert sd_hf[k].shape[::-1] == sd[k].shape, f"Shape mismatch for key {k}"
#                 with torch.no_grad():
#                     sd[k].copy_(sd_hf[k].t())
#             else:
#                 assert sd_hf[k].shape == sd[k].shape, f"Shape mismatch for key {k}"
#                 with torch.no_grad():
#                     sd[k].copy_(sd_hf[k])
#         return model

# import torch
# import torch.nn.functional as F
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # -----------------------------------------------------------------------------
# # Device Setup
# # -----------------------------------------------------------------------------

# device = 'cpu'
# if torch.cuda.is_available():
#     device = 'cuda'
# elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
#     device = "mps"
# print(f"Using device: {device}")

# # -----------------------------------------------------------------------------
# # Data Preparation
# # -----------------------------------------------------------------------------

# # Hyperparameters for training data.
# B, T = 4, 32  # Batch size and sequence length for the training batch.

# # Define the model name (make sure this model exists on Hugging Face).

# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)

# # Load the tokenizer from the model repository.
# # tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Read a sample text file (ensure 'input.txt' exists in your working directory).
# with open('/kaggle/input/input-text/input.txt', 'r', encoding='utf-8') as f:
#     text = f.read()

# # Optionally trim the text (here we use the first 1000 characters).
# text = text[:1000]

# # Tokenize the text.
# # (Set add_special_tokens=False if you want raw token ids without extra tokens.)
# tokens = tokenizer.encode(text, add_special_tokens=False)

# # Ensure there are enough tokens to form a training batch (B * T + 1 tokens).
# if len(tokens) < B * T + 1:
#     raise ValueError("Not enough tokens in input.txt for one batch.")

# # Create a tensor from tokens and move it to the selected device.
# buf = torch.tensor(tokens[:B * T + 1], dtype=torch.long, device=device)
# # x: input tokens, y: target tokens (shifted one position).
# x = buf[:-1].view(B, T)
# y = buf[1:].view(B, T)

# # -----------------------------------------------------------------------------
# # Model Initialization and Training
# # -----------------------------------------------------------------------------

# # Load the pre-trained smollvm2 model as a causal language model.
# model = AutoModelForCausalLM.from_pretrained(model_name,use_auth_token=token)
# model.to(device)

# # Define an optimizer (e.g., Adam).
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# # A simple training loop (for demonstration purposes).
# model.train()
# num_steps = 500  # Adjust the number of training steps as needed.
# for step in range(num_steps):
#     optimizer.zero_grad()
#     # When providing labels, the model returns a loss.
#     outputs = model(input_ids=x, labels=y)
#     loss = outputs.loss
#     logits = outputs.logits  # (not used further in this simple loop)

#     loss.backward()
#     optimizer.step()

#     if step % 10 == 0:
#         print(f"Step {step}, Loss: {loss.item()}")

# # -----------------------------------------------------------------------------
# # Inference: Text Generation
# # -----------------------------------------------------------------------------

# def generate(model, prompt, max_length=30, temperature=1.0):
#     """
#     Generate a sequence of tokens from the model given a prompt.

#     Args:
#         model (AutoModelForCausalLM): The trained smollvm2 model.
#         prompt (str): The text prompt to start generation.
#         max_length (int): The number of new tokens to generate.
#         temperature (float): Temperature parameter for sampling diversity.

#     Returns:
#         List[int]: List of token IDs representing the generated sequence.
#     """
#     model.eval()
#     # Encode the prompt into token IDs and add a batch dimension.
#     input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

#     with torch.no_grad():
#         for _ in range(max_length):
#             outputs = model(input_ids=input_ids)
#             logits = outputs.logits

#             # Focus on the logits of the last token and apply temperature scaling.
#             next_token_logits = logits[:, -1, :] / temperature

#             # Convert logits to probabilities.
#             probs = F.softmax(next_token_logits, dim=-1)

#             # Sample the next token (alternatively, you could use greedy decoding with argmax).
#             next_token = torch.multinomial(probs, num_samples=1)

#             # Append the sampled token to the input_ids.
#             input_ids = torch.cat([input_ids, next_token], dim=1)

#     # Remove the batch dimension and return the list of token IDs.
#     return input_ids[0].tolist()

# # Generate several sequences from a given prompt.
# num_return_sequences = 5
# max_gen_length = 120
# prompt = "Once upon a time"

# generated_sequences = []
# for i in range(num_return_sequences):
#     seq_ids = generate(model, prompt, max_length=max_gen_length, temperature=1.0)
#     # Decode the token IDs back to text.
#     generated_text = tokenizer.decode(seq_ids, skip_special_tokens=True)
#     generated_sequences.append(generated_text)

# print("\nGenerated sequences:")
# for idx, seq in enumerate(generated_sequences, start=1):
#     print(f"\nSequence {idx}:\n{seq}")


In [None]:

# Model identifier for the SmolLM2-135M-Instruct checkpoint. The commented-out
# cells in this notebook pass it to `AutoTokenizer.from_pretrained` and
# `AutoModelForCausalLM.from_pretrained`.
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"


In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader
# from dataclasses import dataclass
# import math
# import tiktoken
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # -----------------------------------------------------------------------------
# # TECHNIQUE: Set matmul precision to 'high'
# # -----------------------------------------------------------------------------
# torch.set_float32_matmul_precision('high')  # TECHNIQUE: Ensuring high-precision matmul operations

# # -----------------------------------------------------------------------------
# # Causal Self-Attention Module with Flash Attention
# # -----------------------------------------------------------------------------
# class CasualSelfAttention(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed)  # TECHNIQUE: Q, K, V projection
#         self.c_proj = nn.Linear(config.n_embed, config.n_embed)       # TECHNIQUE: Projection after attention
#         self.n_head = config.n_head
#         self.n_embed = config.n_embed
#         self.register_buffer(
#             "bias",
#             torch.tril(torch.ones(config.block_size, config.block_size))
#             .view(1, 1, config.block_size, config.block_size)
#         )  # TECHNIQUE: Causal mask buffer
#     def forward(self, x):
#         B, T, C = x.size()
#         qkv = self.c_attn(x)
#         q, k, v = qkv.split(self.n_embed, dim=2)
#         # TECHNIQUE: Reshape for multi-head attention
#         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
#         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
#         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
#         # TECHNIQUE: Use Flash Attention via scaled_dot_product_attention
#         y = torch.nn.functional.scaled_dot_product_attention(
#             q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True
#         )
#         # TECHNIQUE: Recombine attention heads
#         y = y.transpose(1, 2).contiguous().view(B, T, C)
#         y = self.c_proj(y)
#         return y

# # -----------------------------------------------------------------------------
# # Multi-Layer Perceptron (MLP)
# # -----------------------------------------------------------------------------
# class MLP(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         self.c_fc = nn.Linear(config.n_embed, 4 * config.n_embed)  # TECHNIQUE: Expand dimensions 4x
#         self.gelu = nn.GELU(approximate="tanh")                     # TECHNIQUE: GELU activation
#         self.c_proj = nn.Linear(4 * config.n_embed, config.n_embed)  # TECHNIQUE: Project back to embedding size
#         nn.init.constant_(self.c_proj.weight, 1)                    # TECHNIQUE: Constant initialization
#     def forward(self, x):
#         x = self.c_fc(x)
#         x = self.gelu(x)
#         x = self.c_proj(x)
#         return x

# # -----------------------------------------------------------------------------
# # Transformer Block with Residual Scaling
# # -----------------------------------------------------------------------------
# class Block(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         self.ln_1 = nn.LayerNorm(config.n_embed)
#         self.attn = CasualSelfAttention(config)
#         self.ln_2 = nn.LayerNorm(config.n_embed)
#         self.mlp = MLP(config)
#         self.res_scale = 1.0 / math.sqrt(2)  # TECHNIQUE: Scale residual outputs by 1/√2
#     def forward(self, x):
#         # TECHNIQUE: Apply residual scaling to both attention and MLP outputs
#         x = x + self.res_scale * self.attn(self.ln_1(x))
#         x = x + self.res_scale * self.mlp(self.ln_2(x))
#         return x

# # -----------------------------------------------------------------------------
# # Configuration Class for SMOLL Model with Power-of-2 Check
# # -----------------------------------------------------------------------------
# @dataclass
# class SMOLConfig:
#     block_size: int = 1024   # Maximum context length.
#     vocab_size: int = 49152  # Updated TECHNIQUE: Set vocabulary size as per checkpoint.
#     n_layer: int = 12        # Number of transformer layers.
#     n_head: int = 9          # Updated TECHNIQUE: Set number of heads so that head_dim = 576/9 = 64.
#     n_embed: int = 576       # Updated TECHNIQUE: Set embedding dimension as per checkpoint.
#     def __post_init__(self):
#         head_dim = self.n_embed // self.n_head
#         if head_dim & (head_dim - 1) != 0:
#             raise ValueError("Head dimension (n_embed/n_head) is not a power of 2!")  # TECHNIQUE: Enforce per-head dimension as a power of 2

# # -----------------------------------------------------------------------------
# # SMOLL Model Definition with Weight Sharing
# # -----------------------------------------------------------------------------
# class SMOLL(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         self.config = config
#         self.transformer = nn.ModuleDict({
#             'wte': nn.Embedding(config.vocab_size, config.n_embed),  # TECHNIQUE: Token embeddings
#             'wpe': nn.Embedding(config.block_size, config.n_embed),  # TECHNIQUE: Positional embeddings
#             'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
#             'ln_f': nn.LayerNorm(config.n_embed),
#         })
#         self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)  # TECHNIQUE: Output projection layer
#         self.lm_head.weight = self.transformer['wte'].weight  # TECHNIQUE: Tie LM head weights with token embeddings
#     def forward(self, idx, targets=None):
#         B, T = idx.size()
#         assert T <= self.config.block_size, (
#             f"Sequence length {T} exceeds block size {self.config.block_size}"
#         )
#         pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
#         pos_emb = self.transformer['wpe'](pos)
#         tok_emb = self.transformer['wte'](idx)
#         x = tok_emb + pos_emb
#         for block in self.transformer['h']:
#             x = block(x)
#         x = self.transformer['ln_f'](x)
#         logits = self.lm_head(x)
#         loss = None
#         if targets is not None:
#             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
#         return logits, loss
#     @classmethod
#     def from_pretrained(cls, model_type):
#         """
#         Loads a SMOLL model instance using a pretrained HF model.
#         This version loads the state_dict with strict=False.
#         """
#         assert model_type in {'smollvm2-135'}, "Unsupported model type. Only 'smollvm2-135' is supported."
#         from transformers import AutoModelForCausalLM
#         print("Loading weights from pretrained smollvm2 model: %s" % model_type)
#         # Updated configuration to match checkpoint dimensions.
#         config_args = {
#             'smollvm2-135': dict(n_layer=12, n_head=9, n_embed=576),
#         }[model_type]
#         config_args['vocab_size'] = 49152
#         config_args['block_size'] = 1024
#         config = SMOLConfig(**config_args)
#         model = cls(config)
#         hf_model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=token)
#         # TECHNIQUE: Load HF weights with strict=False to allow mismatched/missing keys.
#         model.load_state_dict(hf_model.state_dict(), strict=False)
#         return model

# # -----------------------------------------------------------------------------
# # Device Setup
# # -----------------------------------------------------------------------------
# device = 'cpu'
# if torch.cuda.is_available():
#     device = 'cuda'
# elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
#     device = "mps"
# print(f"Using device: {device}")

# # -----------------------------------------------------------------------------
# # Data Preparation and Dataloader Creation
# # -----------------------------------------------------------------------------
# B, T = 4, 32  # Batch size and sequence length.
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)  # TECHNIQUE: Initialize tokenizer
# with open('/kaggle/input/input-text/input.txt', 'r', encoding='utf-8') as f:
#     text = f.read()
# text = text[:1000]  # TECHNIQUE: Optionally trim the text for demonstration
# tokens = tokenizer.encode(text, add_special_tokens=False)
# if len(tokens) < T + 1:
#     raise ValueError("Not enough tokens in input.txt for one training sample.")

# # -----------------------------------------------------------------------------
# # TECHNIQUE: Full Dataloader and Profiling Setup
# # -----------------------------------------------------------------------------
# class TextDataset(Dataset):
#     def __init__(self, tokens, block_size):
#         self.tokens = tokens
#         self.block_size = block_size
#     def __len__(self):
#         return len(self.tokens) - self.block_size
#     def __getitem__(self, idx):
#         x = torch.tensor(self.tokens[idx:idx+self.block_size], dtype=torch.long)
#         y = torch.tensor(self.tokens[idx+1:idx+self.block_size+1], dtype=torch.long)
#         return x, y

# dataset = TextDataset(tokens, T)
# dataloader = DataLoader(dataset, batch_size=B, shuffle=True)  # TECHNIQUE: Create DataLoader for batch loading

# # -----------------------------------------------------------------------------
# # Model Initialization and Training Loop with Autocast and Profiling
# # -----------------------------------------------------------------------------
# model = SMOLL.from_pretrained("smollvm2-135")
# model.to(device)
# # Note: torch.compile has been removed to avoid backend issues.

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# # TECHNIQUE: Setup PyTorch profiler to capture performance metrics and trace information
# with torch.profiler.profile(
#     schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
#     on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
#     record_shapes=True,
#     profile_memory=True,
#     with_stack=True
# ) as prof:
#     model.train()
#     step = 0
#     num_steps = 8000  # Total training iterations
#     for epoch in range(100):  # Loop over epochs
#         for x_batch, y_batch in dataloader:
#             x_batch, y_batch = x_batch.to(device), y_batch.to(device)
#             optimizer.zero_grad()
#             # TECHNIQUE: Autocast for mixed-precision training (works on CUDA; on CPU it is a no-op)
#             with torch.autocast(device_type='cuda' if device=='cuda' else 'cpu', dtype=torch.float16):
#                 logits, loss = model(x_batch, targets=y_batch)
#             loss.backward()
#             optimizer.step()
#             step += 1
#             prof.step()  # TECHNIQUE: Step the profiler after each iteration
#             if step % 10 == 0:
#                 print(f"Step {step}, Loss: {loss.item()}")
#             if step >= num_steps:
#                 break
#         if step >= num_steps:
#             break

# # -----------------------------------------------------------------------------
# # Inference: Text Generation
# # -----------------------------------------------------------------------------
# def generate(model, prompt, max_length=100, temperature=1.0):
#     model.eval()
#     input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
#     with torch.no_grad():
#         for _ in range(max_length):
#             logits, _ = model(input_ids)
#             next_token_logits = logits[:, -1, :] / temperature
#             probs = F.softmax(next_token_logits, dim=-1)
#             next_token = torch.multinomial(probs, num_samples=1)
#             input_ids = torch.cat([input_ids, next_token], dim=1)
#     return input_ids[0].tolist()

# num_return_sequences = 5
# max_gen_length = 100
# prompt = "Once upon a time"
# generated_sequences = []
# for i in range(num_return_sequences):
#     seq_ids = generate(model, prompt, max_length=max_gen_length, temperature=1.0)
#     generated_text = tokenizer.decode(seq_ids, skip_special_tokens=True)
#     generated_sequences.append(generated_text)

# print("\nGenerated sequences:")
# for idx, seq in enumerate(generated_sequences, start=1):
#     print(f"\nSequence {idx}:\n{seq}")


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
import math
import tiktoken
from transformers import AutoTokenizer, AutoModelForCausalLM

# Make sure to define these globals before running the code:
# model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# token = "your_hf_access_token_here"

# -----------------------------------------------------------------------------
# TECHNIQUE: Set float32 matmul precision to 'high'
# -----------------------------------------------------------------------------
# NOTE: 'high' is *less* precise than PyTorch's default ('highest'). It lets
# float32 matrix multiplications use faster reduced-precision algorithms
# (TensorFloat32 or a bfloat16-based scheme) when the hardware provides them,
# trading some accuracy for speed. Where no such algorithm is available the
# computation is carried out as if the setting were 'highest'.
torch.set_float32_matmul_precision('high')  # TECHNIQUE: Allow faster reduced-precision float32 matmuls

# -----------------------------------------------------------------------------
# Causal Self-Attention Module with Flash Attention
# -----------------------------------------------------------------------------
class CasualSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused Q/K/V projection.

    The input is projected once into queries, keys and values, split into
    ``n_head`` heads, and attended with PyTorch's
    ``scaled_dot_product_attention(..., is_causal=True)`` so that every
    position only attends to itself and to earlier positions.

    Args:
        config: Object exposing ``n_embed`` (model width), ``n_head`` (number
            of attention heads) and ``block_size`` (side length of the
            registered causal-mask buffer).

    Raises:
        ValueError: If ``n_head`` is not positive or ``n_embed`` is not
            divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        # Fail at construction time; otherwise a bad head count only surfaces
        # as an opaque ``view`` shape error during the first forward pass.
        if config.n_head <= 0 or config.n_embed % config.n_head != 0:
            raise ValueError(
                f"n_embed ({config.n_embed}) must be divisible by a positive "
                f"n_head ({config.n_head})"
            )
        self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed)  # TECHNIQUE: Q, K, V projection
        self.c_proj = nn.Linear(config.n_embed, config.n_embed)       # TECHNIQUE: Projection after attention
        self.n_head = config.n_head
        self.n_embed = config.n_embed
        # TECHNIQUE: Causal mask buffer (lower-triangular ones).
        # forward() does not read it -- masking is delegated to
        # ``is_causal=True`` -- but it stays registered so the module's
        # state_dict keys are unchanged.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size))
            .view(1, 1, config.block_size, config.block_size)
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape ``(B, T, C)`` where ``C`` equals ``n_embed``.

        Returns:
            Tensor of shape ``(B, T, C)``.
        """
        B, T, C = x.size()
        head_dim = C // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embed, dim=2)
        # TECHNIQUE: Reshape for multi-head attention -> (B, n_head, T, head_dim)
        q, k, v = (
            t.view(B, T, self.n_head, head_dim).transpose(1, 2)
            for t in (q, k, v)
        )
        # TECHNIQUE: Fused attention via scaled_dot_product_attention
        y = F.scaled_dot_product_attention(
            q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True
        )
        # TECHNIQUE: Recombine attention heads -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)

# -----------------------------------------------------------------------------
# Multi-Layer Perceptron (MLP)
# -----------------------------------------------------------------------------
class MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4x width, tanh-approximated GELU, project back."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embed
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)      # TECHNIQUE: Expand dimensions 4x
        self.gelu = nn.GELU(approximate="tanh")   # TECHNIQUE: GELU activation
        self.c_proj = nn.Linear(hidden, width)    # TECHNIQUE: Project back to embedding size
        # TECHNIQUE: Constant initialization - every output-projection weight starts at 1.
        # NOTE(review): with all-ones weights each output channel starts as the
        # same sum of hidden activations (plus its bias); confirm this is intended.
        nn.init.ones_(self.c_proj.weight)

    def forward(self, x):
        """Apply c_fc -> GELU -> c_proj to x; the last dimension is unchanged."""
        return self.c_proj(self.gelu(self.c_fc(x)))

# -----------------------------------------------------------------------------
# Transformer Block with Residual Scaling
# -----------------------------------------------------------------------------
class Block(nn.Module):
    """Pre-LayerNorm transformer block whose two residual branches are scaled by 1/sqrt(2)."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embed
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CasualSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)
        # TECHNIQUE: Scale residual outputs by 1/sqrt(2)
        self.res_scale = 1.0 / math.sqrt(2)

    def forward(self, x):
        """Add the scaled attention branch, then the scaled MLP branch, to x."""
        attn_out = self.attn(self.ln_1(x))
        x = x + self.res_scale * attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + self.res_scale * mlp_out

# -----------------------------------------------------------------------------
# Configuration Class for SMOLL Model with Power-of-2 Check
# -----------------------------------------------------------------------------
@dataclass
class SMOLConfig:
    """Hyperparameters for the SMOLL model.

    Raises:
        ValueError: if `n_head` or `n_embed` is not positive, if `n_embed` is
            not an exact multiple of `n_head`, or if the per-head dimension
            is not a power of two.
    """
    block_size: int = 1024   # Maximum context length.
    vocab_size: int = 49152  # Updated TECHNIQUE: Set vocabulary size as per checkpoint.
    n_layer: int = 12        # Number of transformer layers.
    n_head: int = 9          # Updated TECHNIQUE: Set number of heads so that head_dim = 576/9 = 64.
    n_embed: int = 576       # Updated TECHNIQUE: Set embedding dimension as per checkpoint.

    def __post_init__(self):
        if self.n_head <= 0 or self.n_embed <= 0:
            raise ValueError("n_head and n_embed must be positive integers.")
        # Floor division alone would hide a remainder (e.g. 577 // 9 == 64) and
        # would let head_dim == 0 slip through the bit trick below.
        head_dim, remainder = divmod(self.n_embed, self.n_head)
        if remainder != 0:
            raise ValueError(
                f"n_embed ({self.n_embed}) must be divisible by n_head ({self.n_head})."
            )
        # TECHNIQUE: Enforce per-head dimension as a power of 2
        if head_dim & (head_dim - 1) != 0:
            raise ValueError("Head dimension (n_embed/n_head) is not a power of 2!")

# -----------------------------------------------------------------------------
# SMOLL Model Definition with Weight Sharing
# -----------------------------------------------------------------------------
class SMOLL(nn.Module):
    """Decoder-only language model with learned positions and a tied LM head.

    Sub-modules live in `self.transformer`: token embeddings (`wte`),
    positional embeddings (`wpe`), a stack of `Block`s (`h`) and a final
    LayerNorm (`ln_f`). `lm_head` shares its weight tensor with `wte`.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embed),  # TECHNIQUE: Token embeddings
            'wpe': nn.Embedding(config.block_size, config.n_embed),  # TECHNIQUE: Positional embeddings
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embed),
        })
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)  # TECHNIQUE: Output projection layer
        self.lm_head.weight = self.transformer['wte'].weight  # TECHNIQUE: Tie LM head weights with token embeddings

    def forward(self, idx, targets=None):
        """Run the model on token ids.

        Args:
            idx: integer tensor of shape (B, T) with T <= config.block_size.
            targets: optional tensor with the same shape as `idx`; when given,
                the cross-entropy loss against the logits is also computed.

        Returns:
            (logits, loss): logits of shape (B, T, vocab_size) and the loss,
            which is None when `targets` is None.
        """
        B, T = idx.size()
        assert T <= self.config.block_size, (
            f"Sequence length {T} exceeds block size {self.config.block_size}"
        )
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer['wpe'](pos)
        tok_emb = self.transformer['wte'](idx)
        x = tok_emb + pos_emb  # pos_emb (T, C) broadcasts over the batch dimension
        for block in self.transformer['h']:
            x = block(x)
        x = self.transformer['ln_f'](x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            # Flatten batch and time so every position is one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Build a SMOLL model and copy in whatever HF checkpoint weights match by name.

        The Hugging Face model is read from the module-level `model_name` and
        `token` globals. Loading uses strict=False, so tensors whose names do
        not match are skipped; the number of skipped keys is printed so a
        partial load is visible instead of silent.

        Args:
            model_type: must be 'smollvm2-135'.

        Returns:
            The constructed SMOLL instance.
        """
        assert model_type in {'smollvm2-135'}, "Unsupported model type. Only 'smollvm2-135' is supported."
        from transformers import AutoModelForCausalLM
        print("Loading weights from pretrained smollvm2 model: %s" % model_type)
        # Updated configuration to match checkpoint dimensions.
        config_args = {
            'smollvm2-135': dict(n_layer=12, n_head=9, n_embed=576),
        }[model_type]
        config_args['vocab_size'] = 49152
        config_args['block_size'] = 1024
        config = SMOLConfig(**config_args)
        model = cls(config)
        hf_model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=token)
        # TECHNIQUE: Load HF weights with strict=False to allow mismatched/missing keys.
        # NOTE(review): the checkpoint's parameter names presumably differ from
        # this module's `transformer.*` names - check the counts reported below.
        load_result = model.load_state_dict(hf_model.state_dict(), strict=False)
        n_missing = len(load_result.missing_keys)
        n_unexpected = len(load_result.unexpected_keys)
        if n_missing or n_unexpected:
            print(
                f"Warning: partial weight load - {n_missing} missing key(s) in the checkpoint, "
                f"{n_unexpected} unexpected key(s) ignored."
            )
        return model

# -----------------------------------------------------------------------------
# Device Setup
# -----------------------------------------------------------------------------
# Prefer CUDA, then Apple MPS (when this torch build exposes it), else CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = 'cpu'
print(f"Using device: {device}")

# -----------------------------------------------------------------------------
# Data Preparation and Dataloader Creation
# -----------------------------------------------------------------------------
B = 8    # Batch size.
T = 256  # Sequence length.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)  # TECHNIQUE: Initialize tokenizer
# TECHNIQUE: Optionally trim the text for demonstration - only the first 1000 characters are kept.
with open('/kaggle/input/input-text/input.txt', encoding='utf-8') as f:
    text = f.read()[:1000]
tokens = tokenizer.encode(text, add_special_tokens=False)
# One sample needs T input tokens plus one extra token for the shifted target.
if not len(tokens) > T:
    raise ValueError("Not enough tokens in input.txt for one training sample.")

# -----------------------------------------------------------------------------
# TECHNIQUE: Full Dataloader and Profiling Setup
# -----------------------------------------------------------------------------
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a flat token sequence.

    Item `i` is the pair (tokens[i : i + block_size],
    tokens[i + 1 : i + block_size + 1]) as torch.long tensors.
    """

    def __init__(self, tokens, block_size):
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        # Each start index must leave room for the one-token-shifted target.
        # Clamp at 0 so a too-short sequence gives an empty dataset rather
        # than a negative length, which len() rejects.
        return max(0, len(self.tokens) - self.block_size)

    def __getitem__(self, idx):
        end = idx + self.block_size
        x = torch.tensor(self.tokens[idx:end], dtype=torch.long)
        y = torch.tensor(self.tokens[idx + 1:end + 1], dtype=torch.long)
        return x, y

dataset = TextDataset(tokens, block_size=T)
# TECHNIQUE: Create DataLoader for batch loading
dataloader = DataLoader(dataset, shuffle=True, batch_size=B)

# -----------------------------------------------------------------------------
# Model Initialization and Training Loop with Autocast and Profiling
# -----------------------------------------------------------------------------
# nn.Module.to() returns the module itself, so construction and device
# placement can be chained.
model = SMOLL.from_pretrained("smollvm2-135").to(device)
# Note: torch.compile has been removed to avoid backend issues.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# ---------------------
# First Training Stage: up to `num_steps` steps, with a text sample every 500 steps
# ---------------------
num_steps = 8400
step = 0
use_amp = (device == 'cuda')
# float16 autocast needs loss scaling so small gradients do not underflow.
# With enabled=False (CPU / MPS) the scaler calls below are plain pass-throughs.
# torch.cuda.amp.GradScaler is used for compatibility with older torch 2.x.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
with torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    model.train()
    # The 50-epoch cap bounds the run; with a small dataset it can end before
    # `num_steps` is reached, so the final message reports the real count.
    for epoch in range(50):
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            # TECHNIQUE: Autocast for mixed-precision training; explicitly disabled off CUDA.
            with torch.autocast(
                device_type='cuda' if use_amp else 'cpu',
                dtype=torch.float16,
                enabled=use_amp,
            ):
                logits, loss = model(x_batch, targets=y_batch)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            step += 1
            prof.step()  # TECHNIQUE: Step the profiler after each iteration
            # Every 500 steps, generate text from a fixed prompt and print it.
            # NOTE(review): generate() is defined further down in this cell; it
            # must already exist (e.g. from an earlier run) when this fires.
            if step % 500 == 0:
                print(f"*** Completed step {step} ***")
                sample_ids = generate(model, "Once upon a time", max_length=200, temperature=1)
                print(f"Step {step}, Sample: {tokenizer.decode(sample_ids, skip_special_tokens=True)}")
                model.train()  # generate() puts the model in eval mode; switch back
            if step >= num_steps:
                break
        if step >= num_steps:
            break

# Save a checkpoint once the first stage has finished.
torch.save(model.state_dict(), "/kaggle/working/smoll_checkpoint.pt")
print(f"Checkpoint saved after {step} training steps.")

# ---------------------
# Second Training Stage: Load Checkpoint and Train for 50 More Steps
# ---------------------
# map_location keeps the reload working when the checkpoint was written from a
# different device than the one in use now.
model.load_state_dict(torch.load("/kaggle/working/smoll_checkpoint.pt", map_location=device))
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # Reinitialize optimizer if desired

num_steps_extra = 50
step_extra = 0
use_amp = (device == 'cuda')
# float16 autocast needs loss scaling so small gradients do not underflow;
# with enabled=False the scaler calls are plain pass-throughs.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
model.train()
for epoch in range(100):  # Upper bound on passes; the loop exits after num_steps_extra steps.
    for x_batch, y_batch in dataloader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        with torch.autocast(
            device_type='cuda' if use_amp else 'cpu',
            dtype=torch.float16,
            enabled=use_amp,
        ):
            logits, loss = model(x_batch, targets=y_batch)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        step_extra += 1
        if step_extra % 10 == 0:
            print(f"Extra Training Step {step_extra}, Loss: {loss.item()}")
        if step_extra >= num_steps_extra:
            break
    if step_extra >= num_steps_extra:
        break

print("Additional 50 training steps completed.")

# -----------------------------------------------------------------------------
# Final Inference: Text Generation
# -----------------------------------------------------------------------------
def generate(model, prompt, max_length=200, temperature=1.0):
    """Sample `max_length` new tokens after `prompt` and return all token ids.

    Uses the module-level `tokenizer` and `device`. The model is put in eval
    mode while sampling and its previous train/eval mode is restored
    afterwards, so calling this mid-training does not leave it in eval mode.

    Args:
        model: callable returning `(logits, loss)` for a (1, T) id tensor.
        prompt: text to condition on.
        max_length: number of tokens to sample.
        temperature: divisor applied to the logits before softmax.

    Returns:
        List of token ids: the encoded prompt followed by the sampled tokens.
    """
    was_training = model.training
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # If the model advertises a context limit, feed only the most recent
    # tokens so long generations do not exceed it.
    max_context = getattr(getattr(model, "config", None), "block_size", None)
    try:
        with torch.no_grad():
            for _ in range(max_length):
                context = input_ids if max_context is None else input_ids[:, -max_context:]
                logits, _ = model(context)
                next_token_logits = logits[:, -1, :] / temperature
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                input_ids = torch.cat([input_ids, next_token], dim=1)
    finally:
        model.train(was_training)
    return input_ids[0].tolist()

# Sample once from the trained model and print the decoded text.
final_sample_ids = generate(model, prompt="Once upon a time", max_length=200, temperature=1.0)
final_text = tokenizer.decode(final_sample_ids, skip_special_tokens=True)
print("Final generated text:", final_text)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
import math
import tiktoken
from transformers import AutoTokenizer, AutoModelForCausalLM
import pytorch_lightning as pl
from pytorch_lightning import Trainer

# ============================================================================
# GLOBAL VARIABLES (define these before running)
# ============================================================================

# ============================================================================
# TECHNIQUE: Set matmul precision to 'high'
# ============================================================================
# Per the PyTorch documentation, 'high' lets float32 matmuls use TF32 (or a
# bfloat16-based scheme) when fast kernels are available, trading some
# precision for speed; the strictest setting is 'highest'.
torch.set_float32_matmul_precision('high')  # TECHNIQUE: Allow faster, reduced-precision float32 matmul kernels

# ============================================================================
# Model Components (same as before)
# ============================================================================

# -----------------------------------------------------------------------------
# Causal Self-Attention Module with Flash Attention
# -----------------------------------------------------------------------------
class CasualSelfAttention(nn.Module):
    """Multi-head causal self-attention built on scaled_dot_product_attention.

    `config` must provide `n_embed`, `n_head` and `block_size`.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embed
        ctx = config.block_size
        # One fused projection yields Q, K and V; a second maps the merged heads back.
        self.c_attn = nn.Linear(width, 3 * width)
        self.c_proj = nn.Linear(width, width)
        self.n_head = config.n_head
        self.n_embed = width
        # Lower-triangular mask kept as a buffer. forward() below does not read
        # it: causality comes from is_causal=True instead.
        causal_mask = torch.tril(torch.ones(ctx, ctx)).view(1, 1, ctx, ctx)
        self.register_buffer("bias", causal_mask)

    def forward(self, x):
        """Map x of shape (B, T, C) to an attended tensor of the same shape."""
        B, T, C = x.size()
        head_dim = C // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embed, dim=2))
        attended = torch.nn.functional.scaled_dot_product_attention(
            q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True
        )
        # (B, n_head, T, head_dim) -> (B, T, C)
        merged = attended.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(merged)

# -----------------------------------------------------------------------------
# Multi-Layer Perceptron (MLP)
# -----------------------------------------------------------------------------
class MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4x width, tanh-approximated GELU, project back."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embed
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)      # TECHNIQUE: Expand dimensions 4x
        self.gelu = nn.GELU(approximate="tanh")   # TECHNIQUE: GELU activation
        self.c_proj = nn.Linear(hidden, width)    # TECHNIQUE: Project back to embedding size
        # TECHNIQUE: Constant initialization - every output-projection weight starts at 1.
        # NOTE(review): with all-ones weights each output channel starts as the
        # same sum of hidden activations (plus its bias); confirm this is intended.
        nn.init.ones_(self.c_proj.weight)

    def forward(self, x):
        """Apply c_fc -> GELU -> c_proj to x; the last dimension is unchanged."""
        return self.c_proj(self.gelu(self.c_fc(x)))

# -----------------------------------------------------------------------------
# Transformer Block with Residual Scaling
# -----------------------------------------------------------------------------
class Block(nn.Module):
    """Pre-LayerNorm transformer block whose two residual branches are scaled by 1/sqrt(2)."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embed
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CasualSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)
        # TECHNIQUE: Scale residual outputs by 1/sqrt(2)
        self.res_scale = 1.0 / math.sqrt(2)

    def forward(self, x):
        """Add the scaled attention branch, then the scaled MLP branch, to x."""
        attn_out = self.attn(self.ln_1(x))
        x = x + self.res_scale * attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + self.res_scale * mlp_out

# -----------------------------------------------------------------------------
# Configuration Class for SMOLL Model with Power-of-2 Check
# -----------------------------------------------------------------------------
@dataclass
class SMOLConfig:
    """Hyperparameters for the SMOLL model.

    Raises:
        ValueError: if `n_head` or `n_embed` is not positive, if `n_embed` is
            not an exact multiple of `n_head`, or if the per-head dimension
            is not a power of two.
    """
    block_size: int = 1024   # Maximum context length.
    vocab_size: int = 49152  # Updated TECHNIQUE: Set vocabulary size as per checkpoint.
    n_layer: int = 12        # Number of transformer layers.
    n_head: int = 9          # Updated TECHNIQUE: Set number of heads so that head_dim = 576/9 = 64.
    n_embed: int = 576       # Updated TECHNIQUE: Set embedding dimension as per checkpoint.

    def __post_init__(self):
        if self.n_head <= 0 or self.n_embed <= 0:
            raise ValueError("n_head and n_embed must be positive integers.")
        # Floor division alone would hide a remainder (e.g. 577 // 9 == 64) and
        # would let head_dim == 0 slip through the bit trick below.
        head_dim, remainder = divmod(self.n_embed, self.n_head)
        if remainder != 0:
            raise ValueError(
                f"n_embed ({self.n_embed}) must be divisible by n_head ({self.n_head})."
            )
        # TECHNIQUE: Enforce per-head dimension as a power of 2
        if head_dim & (head_dim - 1) != 0:
            raise ValueError("Head dimension (n_embed/n_head) is not a power of 2!")

# -----------------------------------------------------------------------------
# SMOLL Model Definition with Weight Sharing
# -----------------------------------------------------------------------------
class SMOLL(nn.Module):
    """Decoder-only language model with learned positions and a tied LM head.

    Sub-modules live in `self.transformer`: token embeddings (`wte`),
    positional embeddings (`wpe`), a stack of `Block`s (`h`) and a final
    LayerNorm (`ln_f`). `lm_head` shares its weight tensor with `wte`.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embed),  # TECHNIQUE: Token embeddings
            'wpe': nn.Embedding(config.block_size, config.n_embed),  # TECHNIQUE: Positional embeddings
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embed),
        })
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)  # TECHNIQUE: Output projection layer
        self.lm_head.weight = self.transformer['wte'].weight  # TECHNIQUE: Tie LM head weights with token embeddings

    def forward(self, idx, targets=None):
        """Run the model on token ids.

        Args:
            idx: integer tensor of shape (B, T) with T <= config.block_size.
            targets: optional tensor with the same shape as `idx`; when given,
                the cross-entropy loss against the logits is also computed.

        Returns:
            (logits, loss): logits of shape (B, T, vocab_size) and the loss,
            which is None when `targets` is None.
        """
        B, T = idx.size()
        assert T <= self.config.block_size, (
            f"Sequence length {T} exceeds block size {self.config.block_size}"
        )
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer['wpe'](pos)
        tok_emb = self.transformer['wte'](idx)
        x = tok_emb + pos_emb  # pos_emb (T, C) broadcasts over the batch dimension
        for block in self.transformer['h']:
            x = block(x)
        x = self.transformer['ln_f'](x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            # Flatten batch and time so every position is one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Build a SMOLL model and copy in whatever HF checkpoint weights match by name.

        The Hugging Face model is read from the module-level `model_name` and
        `token` globals. Loading uses strict=False, so tensors whose names do
        not match are skipped; the number of skipped keys is printed so a
        partial load is visible instead of silent.

        Args:
            model_type: must be 'smollvm2-135'.

        Returns:
            The constructed SMOLL instance.
        """
        assert model_type in {'smollvm2-135'}, "Unsupported model type. Only 'smollvm2-135' is supported."
        from transformers import AutoModelForCausalLM
        print("Loading weights from pretrained smollvm2 model: %s" % model_type)
        # Updated configuration to match checkpoint dimensions.
        config_args = {
            'smollvm2-135': dict(n_layer=12, n_head=9, n_embed=576),
        }[model_type]
        config_args['vocab_size'] = 49152
        config_args['block_size'] = 1024
        config = SMOLConfig(**config_args)
        model = cls(config)
        hf_model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=token)
        # TECHNIQUE: Load HF weights with strict=False to allow mismatched/missing keys.
        # NOTE(review): the checkpoint's parameter names presumably differ from
        # this module's `transformer.*` names - check the counts reported below.
        load_result = model.load_state_dict(hf_model.state_dict(), strict=False)
        n_missing = len(load_result.missing_keys)
        n_unexpected = len(load_result.unexpected_keys)
        if n_missing or n_unexpected:
            print(
                f"Warning: partial weight load - {n_missing} missing key(s) in the checkpoint, "
                f"{n_unexpected} unexpected key(s) ignored."
            )
        return model

# ============================================================================
# PyTorch Lightning Module Definition
# ============================================================================
class SMOLLLightningModule(pl.LightningModule):
    """Lightning wrapper around SMOLL: training step, optimizer and text sampling."""

    def __init__(self, model_type: str = "smollvm2-135"):
        """
        The model_type parameter now has a default value so that load_from_checkpoint works without errors.
        The tokenizer is loaded using the module-level `model_name` and `token` globals.
        """
        super().__init__()
        self.model_type = model_type
        self.model = SMOLL.from_pretrained(model_type)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)

    def forward(self, x, targets=None):
        """Delegate to the wrapped model; returns its (logits, loss) pair."""
        return self.model(x, targets)

    def training_step(self, batch, batch_idx):
        """Compute and log the loss for one (inputs, targets) batch and return it."""
        x, y = batch
        logits, loss = self.model(x, targets=y)
        self.log("train_loss", loss, on_step=True, prog_bar=True)
        # Log sample every 500 steps (the condition also holds when global_step is 0).
        if self.global_step % 500 == 0:
            sample_ids = self.generate("Once upon a time", max_length=100, temperature=1.0)
            sample_text = self.tokenizer.decode(sample_ids, skip_special_tokens=True)
            print(f"\n\nStep {self.global_step}, \nSample: {sample_text}")
        return loss

    def configure_optimizers(self):
        """Adam over the wrapped model's parameters with lr=1e-3."""
        return torch.optim.Adam(self.model.parameters(), lr=1e-3)

    def generate(self, prompt, max_length=100, temperature=1.0):
        """Sample `max_length` new tokens after `prompt` and return all token ids.

        The model is put in eval mode while sampling and its previous
        train/eval mode is restored afterwards, because training_step calls
        this in the middle of training.
        """
        was_training = self.model.training
        self.model.eval()
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        # SMOLL.forward asserts T <= config.block_size, so feed only the most recent tokens.
        max_context = self.model.config.block_size
        try:
            with torch.no_grad():
                for _ in range(max_length):
                    logits, _ = self.model(input_ids[:, -max_context:])
                    next_token_logits = logits[:, -1, :] / temperature
                    probs = F.softmax(next_token_logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                    input_ids = torch.cat([input_ids, next_token], dim=1)
        finally:
            self.model.train(was_training)
        return input_ids[0].tolist()

# ============================================================================
# Data Preparation and DataLoader
# ============================================================================
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a flat token sequence.

    Item `i` is the pair (tokens[i : i + block_size],
    tokens[i + 1 : i + block_size + 1]) as torch.long tensors.
    """

    def __init__(self, tokens, block_size):
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        # Each start index must leave room for the one-token-shifted target.
        # Clamp at 0 so a too-short sequence gives an empty dataset rather
        # than a negative length, which len() rejects.
        return max(0, len(self.tokens) - self.block_size)

    def __getitem__(self, idx):
        end = idx + self.block_size
        x = torch.tensor(self.tokens[idx:end], dtype=torch.long)
        y = torch.tensor(self.tokens[idx + 1:end + 1], dtype=torch.long)
        return x, y

B = 8    # Batch size.
T = 128  # Sequence length.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)  # TECHNIQUE: Initialize tokenizer
# TECHNIQUE: Optionally trim the text for demonstration - only the first 1000 characters are kept.
with open('/kaggle/input/input-text/input.txt', encoding='utf-8') as f:
    text = f.read()[:1000]
tokens = tokenizer.encode(text, add_special_tokens=False)
# One sample needs T input tokens plus one extra token for the shifted target.
if not len(tokens) > T:
    raise ValueError("Not enough tokens in input.txt for one training sample.")
dataset = TextDataset(tokens, block_size=T)
# TECHNIQUE: Create DataLoader for batch loading
dataloader = DataLoader(dataset, shuffle=True, batch_size=B)

# ============================================================================
# Training Stage 1: Train for 5000 Steps with Logging Every 500 Steps
# ============================================================================
# "smollvm2-135" is the constructor's default, spelled out here for clarity.
lit_model = SMOLLLightningModule(model_type="smollvm2-135")
trainer = Trainer(
    accelerator="auto",
    devices=1,
    max_steps=5000,
    log_every_n_steps=50,
)
trainer.fit(lit_model, train_dataloaders=dataloader)
trainer.save_checkpoint("pl_smoll_checkpoint.ckpt")
print("Checkpoint saved after 5000 training steps.")

# ============================================================================
# Training Stage 2: Load Checkpoint and Train for 50 More Steps
# ============================================================================
# Rebuild the module from the stage-1 checkpoint and continue with a fresh Trainer.
lit_model_extra = SMOLLLightningModule.load_from_checkpoint("pl_smoll_checkpoint.ckpt")
trainer_extra = Trainer(
    accelerator="auto",
    devices=1,
    max_steps=50,
    log_every_n_steps=10,
)
trainer_extra.fit(lit_model_extra, train_dataloaders=dataloader)
print("Additional 50 training steps completed.")

# ============================================================================
# Final Inference: Text Generation
# ============================================================================
# Sample once from the fine-tuned module and print the decoded text.
final_sample_ids = lit_model_extra.generate(
    "Once upon a time",
    max_length=100,
    temperature=1.0,
)
final_text = tokenizer.decode(final_sample_ids, skip_special_tokens=True)
print("Final generated text:", final_text)


In [None]:
# IPython shell escape: recursively zip /kaggle/working/ into output.zip
# (the archive is written inside the directory being zipped).
!zip -r /kaggle/working/output.zip /kaggle/working/

In [None]:
from IPython.display import FileLink
# Last expression of the cell, so the notebook renders the returned FileLink
# object for the archive path.
FileLink(r'/kaggle/working/output.zip')