In [2]:
# ✅ Step 1: Install required libraries
!pip install transformers datasets wandb

# ✅ Step 2: Download the tiny Shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O tiny_shakespeare.txt

# ✅ Step 3: Login to Hugging Face Hub (optional if pushing the model later)
from huggingface_hub import notebook_login
notebook_login()

# ✅ Step 4: Import required libraries
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import wandb

# ✅ Step 5: Start the Weights & Biases run for this project
wandb.init(project="gpt2-bonus-project")

# ✅ Step 6: Load the pretrained GPT-2 weights and the matching tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# ✅ Step 7: Build the training dataset from the downloaded text file
# (block_size=128 is the per-example length handed to TextDataset)
train_dataset = TextDataset(tokenizer=tokenizer, file_path="tiny_shakespeare.txt", block_size=128)

# ✅ Step 8: Collator that batches examples for language modeling
# (mlm=False turns off masked-LM mode, which is not used for GPT-2)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Step 9: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,                  # write a checkpoint every 500 optimizer steps
    logging_dir="./logs",
    report_to="wandb"                # send training metrics to the W&B run opened in Step 5
)

# ✅ Step 10: Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# ✅ Step 11: Train the model
trainer.train()

# Close the W&B run opened in Step 5 so it is marked as finished instead of
# staying open for as long as the notebook kernel is alive.
wandb.finish()

# ✅ Step 12: Save the model and tokenizer
# Both go to the same folder so later cells can reload them with from_pretrained("gpt2-finetuned").
model.save_pretrained("gpt2-finetuned")
tokenizer.save_pretrained("gpt2-finetuned")

# ✅ Optional: Push to Hugging Face Hub
# from huggingface_hub import HfApi
# api = HfApi()
# api.upload_folder(
#     folder_path="gpt2-finetuned",
#     path_in_repo="",
#     repo_id="your-username/gpt2-bonus-model",
#     repo_type="model"
# )



--2025-04-23 13:59:03--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘tiny_shakespeare.txt’


2025-04-23 13:59:03 (13.8 MB/s) - ‘tiny_shakespeare.txt’ saved [1115394/1115394]



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,3.7487
1000,3.5912
1500,3.4276
2000,3.2552
2500,3.2559
3000,3.1394
3500,3.0873


('gpt2-finetuned/tokenizer_config.json',
 'gpt2-finetuned/special_tokens_map.json',
 'gpt2-finetuned/vocab.json',
 'gpt2-finetuned/merges.txt',
 'gpt2-finetuned/added_tokens.json')

In [4]:
# Testing my fine-tuned model
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Reload the tokenizer saved with the fine-tuned model. The training cell never
# assigns a pad token, so reuse the EOS token for padding here.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-finetuned")
tokenizer.pad_token = tokenizer.eos_token

# Reload the fine-tuned weights and put the model in inference mode
model = GPT2LMHeadModel.from_pretrained("gpt2-finetuned").eval()

# Generate text
def generate(prompt, max_length=50):
    """Sample a continuation of ``prompt`` from the fine-tuned GPT-2 model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Text to continue. An empty prompt is replaced by the
            tokenizer's EOS token so the model still has one token to start from.
        max_length: Forwarded to ``model.generate`` as ``max_length``; in the
            Transformers API this bound counts the prompt tokens as well as
            the newly generated ones.

    Returns:
        The decoded sequence (prompt plus continuation) with special tokens
        stripped.
    """
    # An empty string tokenizes to zero tokens, which leaves generate() nothing to extend.
    encoded = tokenizer(prompt or tokenizer.eos_token, return_tensors="pt")
    # Keep the inputs on the model's device so this also works after model.to("cuda").
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            # Passed explicitly: pad and EOS share one id below, so the mask
            # cannot be reliably inferred from input_ids alone.
            attention_mask=attention_mask,
            max_length=max_length,
            do_sample=True,
            top_k=50,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id  # ✅ Avoids the missing-pad-token warning
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Try it! generate() samples (do_sample=True) and no seed is set in this cell,
# so the printed continuation can differ from run to run.
print(generate("To be or not to be"))


To be or not to be,
Your true meaning is to be
A fool. What, is't you like?

DUKE VINCENTIO:
Nay, I am, sir, as you are.




In [5]:
# ✅ Install Gradio if not already installed
!pip install gradio --quiet

# ✅ Import required libraries
import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# ✅ Reload the fine-tuned tokenizer and weights, with the model in inference mode
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-finetuned")
model = GPT2LMHeadModel.from_pretrained("gpt2-finetuned").eval()

# ✅ Define the generation function
def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
    """Sample a continuation of ``prompt`` from the fine-tuned GPT-2 model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects. Written to be
    called from a UI, so the numeric arguments are normalized first.

    Args:
        prompt: Text to continue. An empty (or missing) prompt is replaced by
            the tokenizer's EOS token so the model has one token to start from.
        max_length: Forwarded to ``model.generate`` as ``max_length`` after
            conversion to ``int``; in the Transformers API this bound counts
            the prompt tokens as well as the generated ones.
        temperature: Sampling temperature, converted to ``float``.
        top_k: Size of the top-k sampling pool, converted to ``int``.

    Returns:
        The decoded sequence (prompt plus continuation) with special tokens
        stripped.
    """
    # UI sliders may hand over whole numbers as floats (e.g. 50.0), while
    # generate() needs real ints for max_length and top_k.
    max_length = int(max_length)
    top_k = int(top_k)
    temperature = float(temperature)

    # An empty string tokenizes to zero tokens, which leaves generate() nothing to extend.
    encoded = tokenizer(prompt or tokenizer.eos_token, return_tensors="pt")
    # Keep the inputs on the model's device so this also works after model.to("cuda").
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            # Passed explicitly: pad and EOS share one id below, so the mask
            # cannot be reliably inferred from input_ids alone.
            attention_mask=attention_mask,
            max_length=max_length,
            do_sample=True,
            top_k=top_k,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Wire generate_text into a Gradio form: one text box plus three sliders,
# listed in the same order as the function's parameters
iface = gr.Interface(
    title="Fine-tuned GPT-2 Text Generator",
    description="Try out your custom GPT-2 model trained on Tiny Shakespeare!",
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", lines=2),
        gr.Slider(minimum=10, maximum=200, step=1, value=50, label="Max Length"),
        gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
        gr.Slider(minimum=1, maximum=100, step=1, value=50, label="Top-k Sampling"),
    ],
    outputs="text",
)

# ✅ Start the app
iface.launch()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.6/322.6 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hIt looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in cola



In [7]:
#Saving model to Hugging Face Hub
#Install libraries
!pip install huggingface_hub transformers
#Login to Hugging Face
from huggingface_hub import notebook_login
notebook_login()
# Write the model and tokenizer to the local "gpt2-finetuned" folder
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Uses the `model` and `tokenizer` objects left in the kernel by the earlier cells;
# swap in your own objects if yours live under different names
tokenizer.save_pretrained("gpt2-finetuned")
model.save_pretrained("gpt2-finetuned")

# Publish that folder to the Hugging Face Hub
from huggingface_hub import HfApi

repo_id = "Pavloria/gpt2-shakespeare-mini"  # Replace with your own username/repo name
api = HfApi()

# exist_ok=True keeps this call from failing when the repo already exists
api.create_repo(
    repo_id=repo_id,
    exist_ok=True,
)
api.upload_folder(
    repo_id=repo_id,
    folder_path="gpt2-finetuned",
)





VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Pavloria/gpt2-shakespeare-mini/commit/2ea0b42bd617008612b875587bb909ed2afac564', commit_message='Upload folder using huggingface_hub', commit_description='', oid='2ea0b42bd617008612b875587bb909ed2afac564', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Pavloria/gpt2-shakespeare-mini', endpoint='https://huggingface.co', repo_type='model', repo_id='Pavloria/gpt2-shakespeare-mini'), pr_revision=None, pr_num=None)