In [1]:
import accelerate 
print(accelerate.__version__)

1.8.1


In [2]:
import torch
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

In [3]:
# ✅ Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# ✅ Load tokenizer and dataset
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT2 has no pad_token

dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])

In [5]:
# ✅ Group text for training
def group_texts(examples):
    joined = sum(examples["input_ids"], [])
    chunks = [joined[i:i+128] for i in range(0, len(joined), 128)]
    return {"input_ids": chunks, "labels": chunks}

lm_dataset = tokenized.map(group_texts, batched=True)


In [6]:
# ✅ Load model
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
# ✅ Training config
training_args = TrainingArguments(
    output_dir="./gpt2-wikitext2",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    fp16=True,  # Enable automatic mixed precision (faster training on GPU)
)



In [8]:
# ✅ Trainer setup
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [9]:
# ✅ Start training
trainer.train()


  0%|          | 0/18359 [00:00<?, ?it/s]

{'loss': 4.1846, 'grad_norm': 21.94734001159668, 'learning_rate': 4.974671823084046e-05, 'epoch': 0.01}
{'loss': 3.6294, 'grad_norm': 12.61223030090332, 'learning_rate': 4.947437224249687e-05, 'epoch': 0.01}
{'loss': 3.5844, 'grad_norm': 24.586185455322266, 'learning_rate': 4.920202625415328e-05, 'epoch': 0.02}
{'loss': 3.6875, 'grad_norm': 7.776573181152344, 'learning_rate': 4.8929680265809685e-05, 'epoch': 0.02}
{'loss': 3.565, 'grad_norm': 0.0, 'learning_rate': 4.865733427746609e-05, 'epoch': 0.03}
{'loss': 3.58, 'grad_norm': 19.933956146240234, 'learning_rate': 4.83849882891225e-05, 'epoch': 0.03}
{'loss': 3.4654, 'grad_norm': 44.46742248535156, 'learning_rate': 4.811264230077891e-05, 'epoch': 0.04}
{'loss': 3.6715, 'grad_norm': 22.65001678466797, 'learning_rate': 4.784029631243532e-05, 'epoch': 0.04}
{'loss': 3.4772, 'grad_norm': 7.614764213562012, 'learning_rate': 4.7567950324091725e-05, 'epoch': 0.05}
{'loss': 3.6705, 'grad_norm': 6.97003698348999, 'learning_rate': 4.72956043357

  0%|          | 0/1880 [00:00<?, ?it/s]

{'eval_loss': nan, 'eval_runtime': 47.6033, 'eval_samples_per_second': 78.986, 'eval_steps_per_second': 39.493, 'epoch': 1.0}
{'train_runtime': 2674.4426, 'train_samples_per_second': 13.729, 'train_steps_per_second': 6.865, 'train_loss': 3.2525149183898225, 'epoch': 1.0}


TrainOutput(global_step=18359, training_loss=3.2525149183898225, metrics={'train_runtime': 2674.4426, 'train_samples_per_second': 13.729, 'train_steps_per_second': 6.865, 'total_flos': 2398530207744000.0, 'train_loss': 3.2525149183898225, 'epoch': 1.0})

In [11]:
import math

eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"📊 Perplexity: {perplexity:.2f}")


  0%|          | 0/1880 [00:00<?, ?it/s]

📊 Perplexity: nan


In [10]:
# ✅ Save final model
trainer.save_model("./gpt2-wikitext2-final")
tokenizer.save_pretrained("./gpt2-wikitext2-final")
print("Training complete. Model saved to ./gpt2-wikitext2-final")

Training complete. Model saved to ./gpt2-wikitext2-final


In [16]:
compute_top_k_accuracy(model, tokenizer, lm_dataset["validation"], k=5, sample_size=100)


🎯 Top-5 Accuracy: 20.00% on 100 samples


In [14]:
from torch.nn.functional import softmax
import numpy as np

def compute_top_k_accuracy(model, tokenizer, dataset, k=5, sample_size=100):
    model.eval()
    model.to("cuda" if torch.cuda.is_available() else "cpu")

    correct = 0
    total = 0

    for i, example in enumerate(dataset.select(range(sample_size))):
        input_ids = torch.tensor(example['input_ids']).unsqueeze(0).to(model.device)
        with torch.no_grad():
            outputs = model(input_ids)
        logits = outputs.logits

        # Get the last token's logits
        next_token_logits = logits[0, -1, :]
        probs = softmax(next_token_logits, dim=0)
        top_k = torch.topk(probs, k).indices.cpu().numpy()

        # Check if the true next token is in the top-k predictions
        if i + 1 < len(dataset):
            target_token_id = dataset[i + 1]['input_ids'][0]  # crude next-token approximation
            if target_token_id in top_k:
                correct += 1
            total += 1

    top_k_acc = correct / total if total > 0 else 0
    print(f"🎯 Top-{k} Accuracy: {top_k_acc*100:.2f}% on {total} samples")


In [15]:
import gradio as gr

def predict_next_words(prompt, max_new_tokens=20, top_k=5):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=top_k,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

interface = gr.Interface(
    fn=predict_next_words,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter a sentence like 'The future of AI is'"),
        gr.Slider(1, 50, value=20, step=1, label="Max New Tokens"),
        gr.Slider(1, 50, value=5, step=1, label="Top-k Sampling")
    ],
    outputs="text",
    title="🧠 GPT-2 Next Word Predictor"
)

interface.launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\nlp\lib\site-packages\gradio\queueing.py", line 626, in process_events
    response = await route_utils.call_process_api(
  File "c:\ProgramData\anaconda3\envs\nlp\lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "c:\ProgramData\anaconda3\envs\nlp\lib\site-packages\gradio\blocks.py", line 2220, in process_api
    result = await self.call_function(
  File "c:\ProgramData\anaconda3\envs\nlp\lib\site-packages\gradio\blocks.py", line 1731, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "c:\ProgramData\anaconda3\envs\nlp\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "c:\ProgramData\anaconda3\envs\nlp\lib\site-packages\anyio\_backends\_asyncio.py", line 2470, in run_sync_in_worker_thread
    return await future
  File