In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, get_scheduler
from datasets import load_dataset
import gradio as gr
from tqdm import tqdm
import os

In [None]:
# 1. Dataset class for the tokenized WikiText corpus
class WikiTextDataset(Dataset):
    """Fixed-length tokenized examples for causal language-model training.

    Every non-blank entry of ``dataset_split['text']`` is tokenized on its own,
    truncated / padded to ``block_size`` tokens and stored as one example.

    Args:
        dataset_split: Object whose ``'text'`` entry iterates over strings.
        tokenizer: Callable invoked with ``truncation``, ``max_length``,
            ``padding`` and ``return_tensors='pt'``; it is expected to return
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch dimension of size one.
        block_size: Number of tokens per example after truncation / padding.
    """

    # Label value skipped by the cross-entropy loss (PyTorch's default
    # ``ignore_index``); used to keep padding out of the training signal.
    IGNORE_INDEX = -100

    def __init__(self, dataset_split, tokenizer, block_size=64):
        self.examples = []
        for item in dataset_split['text']:
            # Blank / whitespace-only lines carry no training signal.
            if not item.strip():
                continue
            tokens = tokenizer(item, truncation=True, max_length=block_size, padding='max_length', return_tensors='pt')
            self.examples.append({
                # squeeze(0) drops only the batch axis, so block_size == 1
                # still yields a 1-D tensor instead of a 0-D scalar.
                'input_ids': tokens['input_ids'].squeeze(0),
                'attention_mask': tokens['attention_mask'].squeeze(0)
            })

    def __len__(self):
        """Return the number of stored (non-blank) examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and loss ``labels`` for one example.

        ``labels`` is a copy of ``input_ids`` in which every position with
        ``attention_mask == 0`` is replaced by ``IGNORE_INDEX``. Without this,
        padded positions would be scored by the loss like real tokens.
        """
        item = self.examples[idx]
        # Clone so masking the labels never mutates the stored input_ids.
        labels = item['input_ids'].clone()
        labels[item['attention_mask'] == 0] = self.IGNORE_INDEX
        return {
            'input_ids': item['input_ids'],
            'attention_mask': item['attention_mask'],
            'labels': labels
        }

In [None]:
# 2. Loading and preparation of the dataset
def load_and_prepare_data(block_size=64):
    """Load WikiText-2 (raw) and wrap its train / validation splits as datasets.

    Args:
        block_size: Token length every example is truncated / padded to. The
            default matches ``WikiTextDataset``'s own default, so calling this
            function without arguments behaves as before.

    Returns:
        A ``(train_dataset, eval_dataset, tokenizer)`` tuple, where both
        datasets are ``WikiTextDataset`` instances built with the returned
        GPT-2 tokenizer.
    """
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # The dataset pads with padding='max_length', which needs a pad token;
    # reuse the end-of-sequence token for that purpose.
    tokenizer.pad_token = tokenizer.eos_token
    train_dataset = WikiTextDataset(dataset['train'], tokenizer, block_size=block_size)
    eval_dataset = WikiTextDataset(dataset['validation'], tokenizer, block_size=block_size)
    return train_dataset, eval_dataset, tokenizer

In [None]:
# 3. Function to fine-tune the GPT-2 model on the given dataset

def _evaluate_loss(model, eval_dataset, batch_size, device):
    """Return the mean of the per-batch losses of ``model`` on ``eval_dataset``.

    Runs under ``torch.no_grad()`` in eval mode and restores the model's
    previous train/eval mode before returning. The caller must pass a
    non-empty dataset.
    """
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    was_training = model.training
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch in eval_loader:
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device),
            )
            running_loss += outputs.loss.item()
    model.train(was_training)
    return running_loss / len(eval_loader)


def train_model(train_dataset, eval_dataset, tokenizer, batch_size=8, num_epochs=1, lr=5e-5):
    """Fine-tune the pretrained ``'gpt2'`` checkpoint and return the model.

    Args:
        train_dataset: Dataset yielding dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        eval_dataset: Dataset with the same item layout. When it is not
            ``None`` and not empty, its loss is reported after every epoch;
            it never influences the weight updates.
        tokenizer: Only its length is used, to size the token embeddings.
        batch_size: Batch size for both the training and evaluation loaders.
        num_epochs: Number of passes over ``train_dataset``.
        lr: Initial AdamW learning rate, decayed linearly with no warmup.

    Returns:
        The fine-tuned ``GPT2LMHeadModel``, on CUDA when available, else CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
    model.resize_token_embeddings(len(tokenizer))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0,
                              num_training_steps=num_epochs * len(train_loader))

    # eval_dataset used to be accepted but silently ignored; evaluate on it
    # whenever there is something to evaluate.
    run_eval = eval_dataset is not None and len(eval_dataset) > 0

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for step, batch in enumerate(tqdm(train_loader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The schedule is defined per optimizer step, not per epoch.
            scheduler.step()

            total_loss += loss.item()
            if (step + 1) % 100 == 0:
                print(f"Step {step+1}, Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

        if run_eval:
            eval_loss = _evaluate_loss(model, eval_dataset, batch_size, device)
            print(f"Epoch {epoch+1} validation loss: {eval_loss:.4f}")

    return model

In [None]:
# 4. Function to predict the next words

def predict_next_word(model, tokenizer, text, top_k=5):
    """Return the ``top_k`` most probable next tokens for ``text``.

    Args:
        model: Causal language model whose output exposes ``.logits``; it is
            moved to CUDA when available (else CPU) and put in eval mode.
        tokenizer: Tokenizer used to encode ``text`` and decode candidate ids.
        text: Prompt to continue.
        top_k: Number of candidates wanted; clamped to ``[0, vocab_size]``.

    Returns:
        A list of ``(token_text, probability)`` tuples in the order produced
        by ``torch.topk``. The list is empty when ``text`` encodes to zero
        tokens, because there is no position to predict from.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    inputs = tokenizer(text, return_tensors='pt').to(device)

    seq_len = inputs['input_ids'].shape[-1]
    if seq_len == 0:
        # An empty prompt yields a zero-length sequence; running the model on
        # it fails, so report "no prediction" instead.
        return []

    # Prompts longer than the model's position limit would index past its
    # position embeddings. Keep the most recent tokens, since they are the
    # context that matters for the next-token distribution.
    max_positions = getattr(getattr(model, 'config', None), 'n_positions', None)
    if max_positions and seq_len > max_positions:
        inputs = {name: tensor[:, -max_positions:] for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Only the distribution after the final prompt token is needed.
        logits = outputs.logits[:, -1, :]
        probabilities = torch.softmax(logits, dim=-1)
        # torch.topk raises if k exceeds the vocabulary size or is negative.
        k = max(0, min(int(top_k), probabilities.size(-1)))
        top_k_probs, top_k_indices = torch.topk(probabilities, k, dim=-1)
        top_k_tokens = [tokenizer.decode(idx.item()).strip() for idx in top_k_indices[0]]
        return [(token, prob.item()) for token, prob in zip(top_k_tokens, top_k_probs[0])]

In [None]:
# 5. Function to launch the Gradio interface
def launch_gradio(model, tokenizer):
    """Start a Gradio text-to-text app that shows next-word predictions.

    The app feeds the submitted text to ``predict_next_word`` with the given
    ``model`` and ``tokenizer`` and displays one ``"<word>: <probability>"``
    row per returned candidate.
    """

    def gradio_predict(text):
        # Rows keep the order in which predict_next_word returns candidates.
        rows = []
        for word, prob in predict_next_word(model, tokenizer, text):
            rows.append(f"{word}: {prob:.4f}")
        return "\n".join(rows)

    demo = gr.Interface(
        fn=gradio_predict,
        inputs="text",
        outputs="text",
        title="GPT-2 Next Word Prediction",
    )
    demo.launch()

In [None]:
# 6. Main execution of the model
if __name__ == '__main__':
    # Directory that receives both the fine-tuned weights and the tokenizer files.
    output_dir = "./fine_tuned_gpt2"

    print("Loading data and tokenizer...")
    train_dataset, eval_dataset, tokenizer = load_and_prepare_data()

    print("Training model...")
    model = train_model(
        train_dataset,
        eval_dataset,
        tokenizer,
        batch_size=8,
        num_epochs=5,
    )

    # Save model and tokenizer into the same directory.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Launching Gradio interface...")
    launch_gradio(model, tokenizer)


In [None]:
pip install gradio transformers torch datasets

In [None]:
pip install -U datasets huggingface_hub fsspec
