In [None]:
pip install transformers datasets torch



In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

In [None]:
# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')




In [None]:
import pandas as pd


In [None]:
df = pd.read_csv('/content/drive/MyDrive/task/bbc_data.csv')

In [None]:
df['labels'].unique()

array(['entertainment', 'business', 'sport', 'politics', 'tech'],
      dtype=object)

In [None]:
def preprocess_data(row):
    return f"Category: {row['labels']} Headline: {row['data']}"

In [None]:
# Apply preprocessing to the dataset
df['input_text'] = df.apply(preprocess_data, axis=1)

In [None]:
df['input_text']

Unnamed: 0,input_text
0,Category: entertainment Headline: Musicians to...
1,Category: entertainment Headline: U2s desire t...
2,Category: entertainment Headline: Rocker Doher...
3,Category: entertainment Headline: Snicket tops...
4,Category: entertainment Headline: Oceans Twelv...
...,...
2220,Category: tech Headline: Warning over Windows ...
2221,Category: tech Headline: Fast lifts rise into ...
2222,Category: tech Headline: Nintendo adds media p...
2223,Category: tech Headline: Fast moving phone vir...


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

In [None]:
# Convert the dataframe to Hugging Face Dataset format
dataset = Dataset.from_pandas(df[['input_text']])

In [None]:

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # To avoid padding token issues



In [None]:
def tokenize_function(examples):
    inputs = tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=512)

    # The labels are the same as the input_ids for auto-regressive language modeling
    inputs['labels'] = inputs['input_ids'].copy()

    # Replace padding token IDs in the labels with -100 so they are ignored in the loss calculation
    inputs['labels'] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels]
        for labels in inputs['labels']
    ]

    return inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2225 [00:00<?, ? examples/s]

In [None]:
# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
)



In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Same dataset for evaluation, but ideally, separate one
)

In [None]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.2441,3.009139
2,3.1383,2.950948
3,3.0735,2.934202


TrainOutput(global_step=837, training_loss=3.1632894062511667, metrics={'train_runtime': 1364.8235, 'train_samples_per_second': 4.891, 'train_steps_per_second': 0.613, 'total_flos': 1744124313600000.0, 'train_loss': 3.1632894062511667, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/task/fine_tuned_headline_model")
tokenizer.save_pretrained("/content/drive/MyDrive/task/fine_tuned_headline_model")

('/content/drive/MyDrive/task/fine_tuned_headline_model/tokenizer_config.json',
 '/content/drive/MyDrive/task/fine_tuned_headline_model/special_tokens_map.json',
 '/content/drive/MyDrive/task/fine_tuned_headline_model/vocab.json',
 '/content/drive/MyDrive/task/fine_tuned_headline_model/merges.txt',
 '/content/drive/MyDrive/task/fine_tuned_headline_model/added_tokens.json')

In [None]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned PyTorch GPT-2 model
model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/task/fine_tuned_headline_model')

# Convert the PyTorch model to TensorFlow
model_tf = TFGPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/task/fine_tuned_headline_model', from_pt=True)

# Save the TensorFlow model in SavedModel format
model_tf.save_pretrained('/content/drive/MyDrive/task/fine_tuned_headline_model_tf')


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
import numpy as np
import tensorflow as tf

def calculate_perplexity(model, tokenizer, texts):
    encodings = tokenizer(texts, return_tensors='tf', padding=True, truncation=True)
    inputs = encodings['input_ids']
    labels = encodings['input_ids']

    # Compute loss
    loss = model(inputs, labels=labels).loss
    perplexity = np.exp(loss.numpy())
    return perplexity


In [None]:
# Example evaluation dataset
eval_texts = [
    "Category: sports Headline: Team wins championship",
    "Category: politics Headline: Election results announced"
]

# Calculate perplexity
perplexity = calculate_perplexity(model_tf, tokenizer, eval_texts)
print(f"Perplexity: {perplexity}")


Perplexity: [18.194859]


In [None]:
import math

In [None]:
# After training
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity}")

Perplexity: 18.806488735557807


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('/content/drive/MyDrive/task/fine_tuned_headline_model')
model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/task/fine_tuned_headline_model')

# Set pad_token_id to eos_token_id to avoid warnings
tokenizer.pad_token = tokenizer.eos_token

# Categories for classification
categories = ['entertainment', 'business', 'sport', 'politics', 'tech']

# Function to generate predictions
def predict_category(text):
    # Preprocess the input text
    input_text = f"Headline: {text} Category:"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)

    # Forward pass to get model logits
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Get the token corresponding to the category label space
    predicted_token_ids = torch.argmax(logits, dim=-1)

    # Decode the predicted token ids
    predicted_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

    # Check if any category appears in the predicted text
    for category in categories:
        if category in predicted_text.lower():
            return category

    return "Category not found"

# Example usage for predicting the category
sample_text = "Tesla stocks are soaring after the latest financial results were released."
predicted_category = predict_category(sample_text)
print(f"Predicted Category: {predicted_category}")


Predicted Category: tech


In [None]:
sample_text = "The government announced new policies to boost renewable energy production and reduce carbon emissions."
predicted_category = predict_category(sample_text)
print(f"Predicted Category: {predicted_category}")


Predicted Category: business


In [None]:
sample_text = "Apple unveils the latest iPhone with cutting-edge technology and improved battery life."
predicted_category = predict_category(sample_text)
print(f"Predicted Category: {predicted_category}")


Predicted Category: tech


In [None]:
sample_text = "The latest blockbuster movie has broken box office records, with fans praising the incredible special effects."
predicted_category = predict_category(sample_text)
print(f"Predicted Category: {predicted_category}")


Predicted Category: entertainment


In [None]:
sample_text = "The president held a press conference to discuss foreign policy changes and economic reforms."
predicted_category = predict_category(sample_text)
print(f"Predicted Category: {predicted_category}")


Predicted Category: politics
