In [2]:
!pip install --upgrade --force-reinstall transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting packaging>=20.0 (from transformers)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [2]:
# ✅ Step 1: Install dependencies
!pip install transformers datasets huggingface_hub wandb gradio -q

# ✅ Step 2: Download tiny Shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O tiny_shakespeare.txt

# ✅ Step 3: Login to Hugging Face
from huggingface_hub import notebook_login
notebook_login()

# ✅ Step 4: Import libraries
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import wandb, time
import torch
import gradio as gr

# ✅ Step 5: Init wandb
wandb.login()
run = wandb.init(project="gpt2-bonus-project", name=f"gpt2-run-{int(time.time())}")

# ✅ Step 6: Load and manually split dataset
with open("tiny_shakespeare.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

split_idx = int(0.9 * len(full_text))
train_text = full_text[:split_idx]
val_text = full_text[split_idx:]

train_dataset = Dataset.from_dict({"text": [train_text]})
val_dataset = Dataset.from_dict({"text": [val_text]})

# ✅ Step 7: Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# ✅ Step 8: Tokenize data
def tokenize(example):
    tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=["text"])
train_dataset.set_format("torch")
val_dataset.set_format("torch")

# ✅ Step 9: Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Step 10: Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir="./logs",
    logging_strategy="epoch",
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="wandb",
    push_to_hub=True,
    hub_model_id="Pavloria/gpt2-shakespeare-mini"
)

# ✅ Step 11: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ✅ Step 12: Train model
trainer.train()

# ✅ Step 13: Save and push model
trainer.push_to_hub()
tokenizer.push_to_hub("Pavloria/gpt2-shakespeare-mini")

# ✅ Step 14: Text generation function
def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            do_sample=True,
            top_k=top_k,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Step 15: Gradio UI
gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt"),
        gr.Slider(10, 200, value=50, step=1, label="Max Length"),
        gr.Slider(0.1, 2.0, value=1.0, step=0.1, label="Temperature"),
        gr.Slider(1, 100, value=50, step=1, label="Top-k Sampling")
    ],
    outputs="text",
    title="Fine-tuned GPT-2 Text Generator",
    description="Try out your custom GPT-2 model trained on Tiny Shakespeare!"
).launch()

# ✅ Step 16: Finish wandb run
wandb.finish()


--2025-05-04 17:47:32--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘tiny_shakespeare.txt’


2025-05-04 17:47:32 (16.6 MB/s) - ‘tiny_shakespeare.txt’ saved [1115394/1115394]



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,4.3148
2,3.6754
3,3.115


README.md:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://66c8822288c6437fa3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


0,1
train/epoch,▁▅██
train/global_step,▁▅██
train/grad_norm,█▇▁
train/learning_rate,█▅▁
train/loss,█▄▁

0,1
total_flos,195969024000.0
train/epoch,3.0
train/global_step,3.0
train/grad_norm,19.11436
train/learning_rate,2e-05
train/loss,3.115
train_loss,3.70173
train_runtime,80.9653
train_samples_per_second,0.037
train_steps_per_second,0.037


In [5]:
# ✅ Step 1: Install dependencies
!pip install transformers datasets huggingface_hub wandb gradio -q

# ✅ Step 2: Download tiny Shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O tiny_shakespeare.txt

# ✅ Step 3: Login to Hugging Face
from huggingface_hub import notebook_login
notebook_login()

# ✅ Step 4: Import libraries
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import wandb, time
import torch
import gradio as gr
from torch.utils.data import DataLoader
from tqdm import tqdm

# ✅ Step 5: Init wandb
wandb.login()
run = wandb.init(project="gpt2-bonus-project", name=f"gpt2-run-{int(time.time())}")

# ✅ Step 6: Load and manually split dataset
with open("tiny_shakespeare.txt", "r", encoding="utf-8") as f:
    full_text = f.read()

split_idx = int(0.9 * len(full_text))
train_text = full_text[:split_idx]
val_text = full_text[split_idx:]

train_dataset = Dataset.from_dict({"text": [train_text]})
val_dataset = Dataset.from_dict({"text": [val_text]})

# ✅ Step 7: Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# ✅ Step 8: Tokenize data
def tokenize(example):
    tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=["text"])
train_dataset.set_format("torch")
val_dataset.set_format("torch")

# ✅ Step 9: Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Step 10: Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir="./logs",
    save_steps=500,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="wandb",
    push_to_hub=True,
    hub_model_id="Pavloria/gpt2-shakespeare-mini"
)

# ✅ Step 11: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ✅ Step 12–13: Train and log validation loss after each epoch
num_epochs = 3
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    trainer.train(resume_from_checkpoint=False)

    # Manually evaluate and log validation loss
    val_loader = DataLoader(val_dataset, batch_size=2, collate_fn=data_collator)
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Evaluating Epoch {epoch+1}"):
            batch = {k: v.to(model.device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
    val_loss /= len(val_loader)
    print(f"Validation loss after epoch {epoch+1}: {val_loss:.4f}")
    wandb.log({"val_loss": val_loss}, step=epoch + 1)

# ✅ Step 14: Save and push model
trainer.push_to_hub()
tokenizer.push_to_hub("Pavloria/gpt2-shakespeare-mini")

# ✅ Step 15: Text generation function
def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            do_sample=True,
            top_k=top_k,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Step 16: Gradio UI
gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt"),
        gr.Slider(10, 200, value=50, step=1, label="Max Length"),
        gr.Slider(0.1, 2.0, value=1.0, step=0.1, label="Temperature"),
        gr.Slider(1, 100, value=50, step=1, label="Top-k Sampling")
    ],
    outputs="text",
    title="Fine-tuned GPT-2 Text Generator",
    description="Try out your custom GPT-2 model trained on Tiny Shakespeare!"
).launch()

# ✅ Step 17: Finish wandb run
wandb.finish()


--2025-05-04 18:30:10--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘tiny_shakespeare.txt’


2025-05-04 18:30:10 (16.1 MB/s) - ‘tiny_shakespeare.txt’ saved [1115394/1115394]



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  trainer = Trainer(



Epoch 1/3


Step,Training Loss


Evaluating Epoch 1: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


Validation loss after epoch 1: 3.3749

Epoch 2/3


Step,Training Loss


Evaluating Epoch 2: 100%|██████████| 1/1 [00:00<00:00,  1.22it/s]


Validation loss after epoch 2: 3.1630

Epoch 3/3


Step,Training Loss


Evaluating Epoch 3: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


Validation loss after epoch 3: 3.1279


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6253d31b54e044657a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


0,1
train/epoch,▁▁▁
train/global_step,▁▁▁▁
val_loss,█▂▁

0,1
total_flos,195969024000.0
train/epoch,3.0
train/global_step,3.0
train_loss,2.27761
train_runtime,74.4199
train_samples_per_second,0.04
train_steps_per_second,0.04
val_loss,3.12789


In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# ✅ Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained("Pavloria/gpt2-shakespeare-mini")
tokenizer = GPT2Tokenizer.from_pretrained("Pavloria/gpt2-shakespeare-mini")
model.eval()

# ✅ Set pad_token to eos_token to avoid padding issues
tokenizer.pad_token = tokenizer.eos_token

# ✅ Generate text function with attention_mask
def generate(prompt, max_length=50):
    # Tokenize with attention mask
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate output
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            do_sample=True,
            top_k=50,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Try it!
print(generate("To be or not to be"))


To be or not to be, we were always a very small family, and we were very poor, but we still had a little of everything. No matter how much we could eat we could not eat, whether we were rich or poor.



In [9]:
# ✅ Install Gradio if not already installed
!pip install gradio --quiet

# ✅ Import required libraries
import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# ✅ Load your fine-tuned model
model = GPT2LMHeadModel.from_pretrained("Pavloria/gpt2-shakespeare-mini")
tokenizer = GPT2Tokenizer.from_pretrained("Pavloria/gpt2-shakespeare-mini")
model.eval()

# ✅ Define the generation function
def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            do_sample=True,
            top_k=top_k,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Build the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt"),
        gr.Slider(10, 200, value=50, step=1, label="Max Length"),
        gr.Slider(0.1, 2.0, value=1.0, step=0.1, label="Temperature"),
        gr.Slider(1, 100, value=50, step=1, label="Top-k Sampling")
    ],
    outputs="text",
    title="Fine-tuned GPT-2 Text Generator",
    description="Try out your custom GPT-2 model trained on Tiny Shakespeare!"
)

# ✅ Launch the app
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a255420f7f69abdb2a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


