In [None]:
# ✅ Adapted version of sprachmodel_final.ipynb for Google Colab

# 🛠️ Install required packages
!pip install torch transformers pandas tqdm wandb ipywidgets datasets --quiet

# 💾 Import libraries
import torch
import torch.nn as nn
import pandas as pd
from transformers import GPT2Tokenizer
import wandb
from tqdm.auto import tqdm
from IPython.display import display
import ipywidgets as widgets
from datasets import load_dataset

# 🔑 Login to Weights & Biases
wandb.login()

# 📊 Configuration for wandb logging
config = {
    "epochs": 5,
    "batch_size": 8,
    "seq_len": 64,
    "lr": 1e-3,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2
}

wandb.init(project="mini-language-model", config=config)

# ✍️ Load and tokenize data
raw_data = load_dataset("tiny_shakespeare")
text = raw_data["train"]["text"]

# Use pretrained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# ✅ Chunk text into smaller samples before tokenizing
chunk_size = config["seq_len"]
chunks = []

for paragraph in text:
    for i in range(0, len(paragraph), chunk_size):
        chunk = paragraph[i:i + chunk_size]
        if len(chunk) >= 10:  # Skip very short fragments
            chunks.append(chunk)

# Tokenize
tokens = tokenizer(chunks, return_tensors="pt", padding="max_length", truncation=True, max_length=config["seq_len"])
inputs = tokens.input_ids[:, :-1]
targets = tokens.input_ids[:, 1:]

# ✂️ Split into train and validation sets
split_idx = int(0.9 * len(inputs))
train_inputs = inputs[:split_idx]
train_targets = targets[:split_idx]
val_inputs = inputs[split_idx:]
val_targets = targets[split_idx:]

# 📦 Dataset class
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

train_data = TextDataset(train_inputs, train_targets)
val_data = TextDataset(val_inputs, val_targets)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=config["batch_size"], shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=config["batch_size"])

# 🧠 Model
class MiniDecoderModel(nn.Module):
    def __init__(self, vocab_size, model_dim, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, model_dim)
        decoder_layer = nn.TransformerDecoderLayer(d_model=model_dim, nhead=num_heads)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(model_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x).permute(1, 0, 2)
        memory = torch.zeros_like(x)
        out = self.decoder(x, memory)
        return self.linear(out.permute(1, 0, 2))

# 🧠 Initialize model
vocab_size = tokenizer.vocab_size
model = MiniDecoderModel(vocab_size, config["model_dim"], config["num_layers"], config["num_heads"])

# ⚙️ Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])

# 🔁 Training loop with validation
for epoch in range(config["epochs"]):
    model.train()
    train_loss = 0
    for x, y in tqdm(train_loader):
        output = model(x)
        loss = criterion(output.view(-1, vocab_size), y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x, y in val_loader:
            output = model(x)
            loss = criterion(output.view(-1, vocab_size), y.view(-1))
            val_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    wandb.log({"train_loss": avg_train_loss, "val_loss": avg_val_loss})
    print(f"Epoch {epoch+1}: train_loss = {avg_train_loss:.4f}, val_loss = {avg_val_loss:.4f}")

# 💾 Save model and tokenizer
model_path = "hf_model"
torch.save(model.state_dict(), f"{model_path}/pytorch_model.bin")
tokenizer.save_pretrained(model_path)
Path(f"{model_path}/config.json").write_text('{"architectures": ["MiniDecoderModel"]}')
Path(f"{model_path}/README.md").write_text("""
# Mini Language Model

A lightweight autoregressive transformer decoder trained on Tiny Shakespeare using PyTorch.

- 2 layers, 2 heads
- ~1M parameters
- GPT2 tokenizer
""")

# 📤 Upload to Hugging Face (manually)
notebook_login()
repo_id = "Pavloria/mini-language-model"
create_repo(repo_id, private=False)
upload_folder(folder_path=model_path, repo_id=repo_id)




  0%|          | 0/1765 [00:00<?, ?it/s]

Epoch 1: train_loss = 1.6671, val_loss = 1.5044


  0%|          | 0/1765 [00:00<?, ?it/s]

Epoch 2: train_loss = 1.2504, val_loss = 1.3736


  0%|          | 0/1765 [00:00<?, ?it/s]

Epoch 3: train_loss = 1.0641, val_loss = 1.3026


  0%|          | 0/1765 [00:00<?, ?it/s]

Epoch 4: train_loss = 0.9293, val_loss = 1.2842


  0%|          | 0/1765 [00:00<?, ?it/s]

Epoch 5: train_loss = 0.8250, val_loss = 1.2744


RuntimeError: Parent directory hf_model does not exist.

In [None]:
# 💾 Save model and tokenizer to directory
model_path = "hf_model"
Path(model_path).mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), f"{model_path}/pytorch_model.bin")
tokenizer.save_pretrained(model_path)
Path(f"{model_path}/config.json").write_text('{"architectures": ["MiniDecoderModel"]}')


39

In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login, HfApi
notebook_login()  # ✅ This opens a link to get your token




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Save and Upload Model to Hugging Face
api = HfApi()
repo_id = "Pavloria/mini-language-model"
api.create_repo(repo_id, exist_ok=True)
api.upload_folder(folder_path=model_path, repo_id=repo_id)

# Optional: Add a model card
with open("README.md", "w") as f:
    f.write("# Mini Language Model\n\nThis is a toy decoder-only model trained on Tiny Shakespeare.")
api.upload_file(path_or_fileobj="README.md", path_in_repo="README.md", repo_id=repo_id)


pytorch_model.bin:   0%|          | 0.00/57.0M [00:00<?, ?B/s]

- empty or missing yaml metadata in repo card


CommitInfo(commit_url='https://huggingface.co/Pavloria/mini-language-model/commit/81ba1de7a496ead2232ccc754857525ae55792d4', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='81ba1de7a496ead2232ccc754857525ae55792d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Pavloria/mini-language-model', endpoint='https://huggingface.co', repo_type='model', repo_id='Pavloria/mini-language-model'), pr_revision=None, pr_num=None)

In [None]:
# Upload the Model Files to the Repo
notebook_login()
api = HfApi()
repo_id = "Pavloria/mini-language-model"
api.create_repo(repo_id, exist_ok=True)
api.upload_folder(folder_path=model_path, repo_id=repo_id)

# Optional: Add a model card
with open("README.md", "w") as f:
    f.write("""---
language: en
license: mit
tags:
  - pytorch
  - language-model
  - transformer
  - tiny-shakespeare
library_name: transformers
model_name: mini-language-model
pipeline_tag: text-generation
---

# Mini Language Model

This is a toy decoder-only model trained on Tiny Shakespeare.
""")




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

No files have been modified since last commit. Skipping to prevent empty commit.


In [None]:
#uploads the README.md file to my model repository on Hugging Face Hub
api.upload_file(path_or_fileobj="README.md", path_in_repo="README.md", repo_id=repo_id)


CommitInfo(commit_url='https://huggingface.co/Pavloria/mini-language-model/commit/7c2b43c7d6ffa04b0118a3909295db55ddfb37e5', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='7c2b43c7d6ffa04b0118a3909295db55ddfb37e5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Pavloria/mini-language-model', endpoint='https://huggingface.co', repo_type='model', repo_id='Pavloria/mini-language-model'), pr_revision=None, pr_num=None)

In [None]:
# ✅ Final version of Mini Language Model notebook
# ✅ Application of the model in practice

# 🧠 Define my custom decoder model
import torch
import torch.nn as nn
import torch.nn.functional as F

class MiniDecoderModel(nn.Module):
    def __init__(self, vocab_size, model_dim, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, model_dim)
        decoder_layer = nn.TransformerDecoderLayer(d_model=model_dim, nhead=num_heads)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(model_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x).permute(1, 0, 2)
        memory = torch.zeros_like(x)
        out = self.decoder(x, memory)
        return self.linear(out.permute(1, 0, 2))

# 📦 Install dependencies
!pip install huggingface_hub transformers --quiet

# 💾 Imports
from transformers import GPT2Tokenizer
from huggingface_hub import hf_hub_download

# 🔤 Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("Pavloria/mini-language-model")
tokenizer.pad_token = tokenizer.eos_token

# 📥 Download model checkpoint from Hugging Face
model_path = hf_hub_download(repo_id="Pavloria/mini-language-model", filename="pytorch_model.bin")

# 🧠 Load model
model = MiniDecoderModel(vocab_size=tokenizer.vocab_size, model_dim=128, num_layers=2, num_heads=2)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

# ✨ Function to generate text with temperature and top-k sampling
def generate_text(prompt, max_length=50, temperature=1.0, top_k=20):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    generated = input_ids.clone()

    for _ in range(max_length):
        with torch.no_grad():
            output = model(generated)
            logits = output[:, -1, :] / temperature

            topk_logits, topk_indices = torch.topk(logits, k=top_k)
            probs = torch.softmax(topk_logits, dim=-1)

            next_token = topk_indices[0, torch.multinomial(probs, num_samples=1)]

            # next_token = next_token.unsqueeze(0).unsqueeze(1)  # 🔧 FIXED SHAPE to [1, 1]
            generated = torch.cat((generated, next_token), dim=1)

            # if next_token.item() == tokenizer.eos_token_id:
            #     break

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# 🚀 Try it out!
output_text = generate_text(
    prompt="In a forgotten village where no one dreams anymore...",
    max_length=50,
    temperature=0.9,
    top_k=30
)

print(output_text)


In a forgotten village where no one dreams anymore... a poison a
In that a poison.
In this poison.
In that removed poison a poison a poison a poison poison poison a poison.
In this poison


In [2]:
# ⬇️ Install Gradio
!pip install gradio --quiet

# 💻 Import Gradio
import gradio as gr

# 🔄 Wrap the generate_text function for Gradio
def gradio_generate(prompt, max_length=50, temperature=1.0, top_k=30):
    return generate_text(prompt, max_length, temperature, top_k)

# 🎨 Build the interface
interface = gr.Interface(
    fn=gradio_generate,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Once upon a time..."),
        gr.Slider(10, 200, value=50, label="Max Length"),
        gr.Slider(0.1, 2.0, value=1.0, label="Temperature"),
        gr.Slider(1, 50, value=30, step=1, label="Top-k Sampling")
    ],
    outputs="text",
    title="Mini Language Model Generator",
    description="Generate text using your custom decoder-based mini language model trained on Tiny Shakespeare.",
)

# 🚀 Launch the app
interface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://965c075414b9c3898c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [5]:
# 1. Install Gradio if not yet installed
!pip install gradio --quiet
!pip install transformers huggingface_hub --quiet

# 2. Imports
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer
from huggingface_hub import hf_hub_download
import gradio as gr

# 3. Define model class
class MiniDecoderModel(nn.Module):
    def __init__(self, vocab_size, model_dim, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, model_dim)
        decoder_layer = nn.TransformerDecoderLayer(d_model=model_dim, nhead=num_heads)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(model_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x).permute(1, 0, 2)
        memory = torch.zeros_like(x)
        out = self.decoder(x, memory)
        return self.linear(out.permute(1, 0, 2))

# 4. Load tokenizer & model
tokenizer = GPT2Tokenizer.from_pretrained("Pavloria/mini-language-model")
tokenizer.pad_token = tokenizer.eos_token

model_path = hf_hub_download(repo_id="Pavloria/mini-language-model", filename="pytorch_model.bin")
model = MiniDecoderModel(vocab_size=tokenizer.vocab_size, model_dim=128, num_layers=2, num_heads=2)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

# 5. Text generation function
def generate_text(prompt, max_length=50, temperature=1.0, top_k=20):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    generated = input_ids.clone()

    for _ in range(max_length):
        with torch.no_grad():
            output = model(generated)
            logits = output[:, -1, :] / temperature
            topk_logits, topk_indices = torch.topk(logits, k=top_k)
            probs = torch.softmax(topk_logits, dim=-1)
            next_token = topk_indices[0, torch.multinomial(probs, num_samples=1)]
            generated = torch.cat((generated, next_token), dim=1)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# 6. Gradio Interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt"),
        gr.Slider(10, 200, value=50, step=1, label="Max Length"),
        gr.Slider(0.1, 2.0, value=1.0, step=0.1, label="Temperature"),
        gr.Slider(1, 50, value=20, step=1, label="Top-k Sampling"),
    ],
    outputs="text",
    title="Mini Language Model Generator",
    description="This app generates text using your custom decoder-based model trained on Tiny Shakespeare."
)

# 7. Launch app
iface.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/57.0M [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e151dcc94dcbe78062.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


