In [1]:
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"



# Saving tokenizer files for hugging face app
import shutil
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

# Paths for saving tokenizer and model
save_dir = "/kaggle/working/smollm_model"

# Ensure the directory exists|
os.makedirs(save_dir, exist_ok=True)

# Load tokenizer and model (replace with your model name or path)

tokenizer_saved = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
model_saved = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=token)

# Save tokenizer and model to working directory
tokenizer_saved.save_pretrained(save_dir)
model_saved.save_pretrained(save_dir)

print(f">>> Model and tokenizer saved to {save_dir}")



tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

>>> Model and tokenizer saved to /kaggle/working/smollm_model


In [4]:
# without ROPE
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
import math
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.quantization import quantize_dynamic

# -----------------------------------------------------------------------------
# Set matmul precision for performance
# -----------------------------------------------------------------------------
torch.set_float32_matmul_precision('high')

# -----------------------------------------------------------------------------
# Configuration Class for SmolLM2 Model
# -----------------------------------------------------------------------------
@dataclass
class SMOLConfig:
    block_size: int = 1024
    vocab_size: int = 49152
    n_layer: int = 30
    n_head: int = 9
    n_embed: int = 576
    rope_theta: float = 10000.0
    rms_norm_eps: float = 1.0e-5
    intermediate_size: int = 1536

    def __post_init__(self):
        head_dim = self.n_embed // self.n_head
        if head_dim & (head_dim - 1) != 0:
            raise ValueError("Head dimension must be a power of 2!")

# -----------------------------------------------------------------------------
# RMSNorm: Root Mean Square Normalization
# -----------------------------------------------------------------------------
class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))
    def forward(self, x):
        norm = x.norm(2, dim=-1, keepdim=True)
        return x * (self.scale / (norm + self.eps))

# -----------------------------------------------------------------------------
# Causal Self-Attention Module (WITHOUT RoPE)
# -----------------------------------------------------------------------------
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_head = config.n_head
        self.n_embed = config.n_embed
        self.head_dim = config.n_embed // config.n_head

        # Linear projections for Q, K, V and output
        self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed)
        self.c_proj = nn.Linear(config.n_embed, config.n_embed)

        # Note: RoPE has been removed to simplify the model and reduce overhead.
        self.norm = RMSNorm(config.n_embed, eps=config.rms_norm_eps)

    def forward(self, x):
        B, T, C = x.size()
        x = self.norm(x)
        qkv = self.c_attn(x).split(self.n_embed, dim=2)
        q, k, v = [qkv[i].view(B, T, self.n_head, self.head_dim).transpose(1, 2)
                   for i in range(3)]
        # Bypassing RoPE – use q and k directly.
        y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)

# -----------------------------------------------------------------------------
# MLP Block with RMSNorm
# -----------------------------------------------------------------------------
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        hidden_size = config.intermediate_size if config.intermediate_size else 4 * config.n_embed
        self.c_fc = nn.Linear(config.n_embed, hidden_size)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(hidden_size, config.n_embed)
        self.rms_norm = RMSNorm(config.n_embed, eps=config.rms_norm_eps)

    def forward(self, x):
        x = self.gelu(self.c_fc(x))
        x = self.c_proj(x)
        return self.rms_norm(x)

# -----------------------------------------------------------------------------
# Transformer Block with Residual Scaling
# -----------------------------------------------------------------------------
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.rms_1 = RMSNorm(config.n_embed, eps=config.rms_norm_eps)
        self.attn = CausalSelfAttention(config)
        self.rms_2 = RMSNorm(config.n_embed, eps=config.rms_norm_eps)
        self.mlp = MLP(config)
        self.res_scale = 1.0 / math.sqrt(2)

    def forward(self, x):
        x = x + self.res_scale * self.attn(self.rms_1(x))
        x = x + self.res_scale * self.mlp(self.rms_2(x))
        return x

# -----------------------------------------------------------------------------
# SmolLM Model Definition with Weight Sharing
# -----------------------------------------------------------------------------
class SMOLL(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embed),
            'wpe': nn.Embedding(config.block_size, config.n_embed),
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embed, eps=config.rms_norm_eps)
        })
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)
        self.lm_head.weight = self.transformer['wte'].weight

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, "Sequence length exceeds block size"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        x = self.transformer['wte'](idx) + self.transformer['wpe'](pos)
        for block in self.transformer['h']:
            x = block(x)
        logits = self.lm_head(self.transformer['ln_f'](x))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) if targets is not None else None
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type, model_name, token=None):
        assert model_type == 'SmolLM2-135M', "Only 'SmolLM2-135M' is supported."
        config = SMOLConfig()
        model = cls(config)
        hf_model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=token)
        model.load_state_dict(hf_model.state_dict(), strict=False)
        return model

# -----------------------------------------------------------------------------
# PyTorch Lightning Module for SmolLM
# -----------------------------------------------------------------------------
class SMOLLLightningModule(pl.LightningModule):
    def __init__(self, model_type="SmolLM2-135M", model_name=None, token=None):
        super().__init__()
        self.model = SMOLL.from_pretrained(model_type, model_name, token)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)

    def forward(self, x, targets=None):
        return self.model(x, targets)


    def training_step(self, batch, batch_idx):
        x, y = batch
        _, loss = self.model(x, targets=y)
    
        # Log training loss
        self.log("train_loss", loss, on_step=True, prog_bar=True)
    
        
        if self.global_step % 200 == 0:
            print(">>> processing step: ", self.global_step)

        # Display generation samples every 500 steps
        if self.global_step % 500 == 0:
            sample = self.generate("Once upon a time", max_length=100, temperature=1.0)
            print(f"\n\n >>> Step {self.global_step}: \n >>> Generated Sample:\n{sample}\n")

    
        return loss


    def generate(self, prompt, max_length=100, temperature=1.0):
        self.model.eval()
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            for _ in range(max_length):
                logits, _ = self.model(input_ids)
                next_token_logits = logits[:, -1, :] / temperature
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                input_ids = torch.cat([input_ids, next_token], dim=1)
        return self.tokenizer.decode(input_ids[0], skip_special_tokens=True)

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=3e-3, betas=(0.9, 0.95), eps=1e-8, weight_decay=0.01)
        def lr_lambda(step):
            warmup_steps = 2000
            decay_start = 1600000
            decay_steps = 400000
            if step < warmup_steps:
                return step / warmup_steps
            elif step < decay_start:
                return 1.0
            return max(0.0, 1 - (step - decay_start) / decay_steps)
        scheduler = LambdaLR(optimizer, lr_lambda)
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "step"}}

# -----------------------------------------------------------------------------
# Custom Dataset for Text Data
# -----------------------------------------------------------------------------
class TextDataset(Dataset):
    def __init__(self, tokens, block_size):
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        return len(self.tokens) - self.block_size

    def __getitem__(self, idx):
        return (
            torch.tensor(self.tokens[idx:idx + self.block_size], dtype=torch.long),
            torch.tensor(self.tokens[idx + 1:idx + self.block_size + 1], dtype=torch.long)
        )

# -----------------------------------------------------------------------------
# Training Setup
# -----------------------------------------------------------------------------
T = 64
batch_size = 8
file_path = "/kaggle/input/shakespeare-text-data/input.txt"

with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Initialize tokenizer and tokenize text
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
text = text[:1000]  # Optionally trim for demonstration
tokens = tokenizer.encode(text, add_special_tokens=False)
if len(tokens) < T + 1:
    raise ValueError("Not enough tokens in input.txt for one training sample.")
tokens = tokenizer.encode(text, add_special_tokens=False)

dataset = TextDataset(tokens, T)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
print(f"Dataset size: {len(dataset)}, Batch size: {batch_size}")

lit_model = SMOLLLightningModule(model_name=model_name, token=token)
trainer = Trainer(max_steps=3500, log_every_n_steps=50, accelerator="auto", devices=1, enable_progress_bar=True)
trainer.fit(lit_model, dataloader)

# -----------------------------------------------------------------------------
# Save the Original (FP32) Checkpoint
# -----------------------------------------------------------------------------
trainer.save_checkpoint("smollm2_135m_lat.ckpt")
print("✅ Original checkpoint saved.")

# -----------------------------------------------------------------------------
# Apply Dynamic Quantization (8-bit) to Reduce Model Size
# -----------------------------------------------------------------------------
lit_model.model = quantize_dynamic(lit_model.model, {nn.Linear}, dtype=torch.qint8)
# Save the quantized model checkpoint (for HF Spaces, this should be < 1GB)
torch.save({"state_dict": lit_model.state_dict()}, "lat_smollm2_135m_quantized.ckpt")
print("✅ Quantized checkpoint saved (reduced model size).")


Dataset size: 222, Batch size: 8


Training: |          | 0/? [00:00<?, ?it/s]

>>> processing step:  0


 >>> Step 0: 
 >>> Generated Sample:
Once upon a time URI ACTION evid strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg enthus comprom strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg necessit necessit authent strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg affili affili affili affili affili strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg strugg interpre strugg strugg strugg dissemin

>>> processing step:  200
>>> processing step:  400


 >>> Step 500: 
 >>> Generated Sample:
Once upon a time:

 be done: away, away, away!

Second Citizen:
We are accounted poor citizens, the patricians good.
All:

In [5]:
# # # # # # # #
# PARAMETERS #
# # # # # # ##

if __name__ == "__main__":
    # Initialize model config
    config = SMOLConfig()
    model = SMOLL(config)

    # Print model architecture
    print(model)


SMOLL(
  (transformer): ModuleDict(
    (wte): Embedding(49152, 576)
    (wpe): Embedding(1024, 576)
    (h): ModuleList(
      (0-29): 30 x Block(
        (rms_1): RMSNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=576, out_features=1728, bias=True)
          (c_proj): Linear(in_features=576, out_features=576, bias=True)
          (norm): RMSNorm()
        )
        (rms_2): RMSNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=576, out_features=1536, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=1536, out_features=576, bias=True)
          (rms_norm): RMSNorm()
        )
      )
    )
    (ln_f): LayerNorm((576,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=576, out_features=49152, bias=False)
)


In [7]:
# ==========================================
# Final Inference Example1: Text Generation
# ==========================================
inference_prompt1 = lit_model.generate("Once upon a time", max_length=100, temperature=1.0)
print("Final generated text:", inference_prompt1)

Final generated text: Once upon a time die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our proceed any:
You cornness good speak good leancius is chief corn at corn at our own price at our own price price. resolvedish their abundance


In [9]:
# # # # # # #  # # # # # # #
# QUANTIZATION & INFERENCE #
# # # # # # #  # # # # # # #

import torch


# Path to your quantized checkpoint
quantized_ckpt_path = "/kaggle/working/lat_smollm2_135m_quantized.ckpt"

# Step 1: Create an instance of your Lightning model
lit_model = SMOLLLightningModule(model_name=model_name, token=token)

# Step 2: Load the quantized state dict using strict=False
checkpoint = torch.load(quantized_ckpt_path, map_location="cpu")
lit_model.load_state_dict(checkpoint["state_dict"], strict=False)
lit_model.eval()  # Set to evaluation mode

print("✅ Quantized model loaded for inference.")

# Step 3: Perform inference
inference_prompt = "Once upon a time"
generated_text = lit_model.generate(inference_prompt, max_length=100, temperature=1.0)

print("Final generated text:", generated_text)


  checkpoint = torch.load(quantized_ckpt_path, map_location="cpu")


✅ Quantized model loaded for inference.
Final generated text: Once upon a time an an an an an an an an an an an anOMrophesnesdaybridsionageitted pillows maize literatures enthus strugg strugg strugg strugg strugg strugg satis enric enric interpre interpre interpre interpre interpre interpre retros interspers strugg strugg strugg affili affili enthusIng propagatingnesday prioritize phthal subprocess dend unem unem includ includ includ evid evid evid comprom policymelyely cites paves offic drierosteringroximatelyrestrialiscopalulesanceanceanceanceanceanceanceanceanceanceanceanceanceanceanceanceanceanceanceanceererererererer


In [11]:
# # # # # # # #
# EXTRA STEPS #
# # # # # # # #

lit_model_extra = SMOLLLightningModule.load_from_checkpoint("smollm2_135m_lat.ckpt",model_name=model_name, token=token)
trainer_extra = Trainer(
    max_steps=50,
    log_every_n_steps=10,
    accelerator="auto",
    devices=1
)
trainer_extra.fit(lit_model_extra, train_dataloaders=dataloader)
print("Additional 50 training steps completed.")

Training: |          | 0/? [00:00<?, ?it/s]

>>> processing step:  0


 >>> Step 0: 
 >>> Generated Sample:
Once upon a time die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we proceed any dear:
You are'll have corn talking on superflu him, you might corn are corn at corn at at resolved rather at our own price price. resolved resolved resolved.



OSError: [Errno 30] Read-only file system: '/tmp/tmpt411p2d3'

In [None]:
import shutil
import os

folder_path = "/kaggle/working/"
output_zip_path = os.path.join(folder_path, "smollM2.zip")

# Create a zip archive of the folder
shutil.make_archive(output_zip_path[:-4], 'zip', folder_path)