In [3]:
import pandas as pd

# Read the cleaned training split from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_train.csv')

In [4]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, Trainer, TrainingArguments

# Instantiate the pretrained "gpt2" checkpoint twice: once as the fast
# tokenizer, once as the language-model-head network.
tokenizer = GPT2TokenizerFast.from_pretrained(
    "gpt2",
    use_fast=True,
)
model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
)

In [5]:
from datasets import Dataset 

# Wrap the pandas DataFrame in a Hugging Face `Dataset` so it can be mapped
# and handed to the Trainer later on.
processed_dataset = Dataset.from_pandas(df=df)

In [6]:
# Reuse the tokenizer's end-of-sequence token as its padding token, so that
# `tokenizer.pad_token_id` is defined for the manual padding done during
# tokenization.
tokenizer.pad_token = tokenizer.eos_token


In [7]:
# Define the sliding window tokenization function
def tokenize_function_sliding_window(examples, max_length=512, stride=128):
    """Tokenize a batch of articles into fixed-length, overlapping windows.

    Each text in ``examples['article']`` is split into windows of at most
    ``max_length`` tokens. Consecutive windows of the same text overlap by
    ``stride`` tokens. The last window of a text is right-padded to
    ``max_length``.

    Args:
        examples: Batch mapping passed by ``Dataset.map(batched=True)``;
            only the ``'article'`` entry is read.
        max_length: Number of tokens in every returned window.
        stride: Number of tokens shared between consecutive windows.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
        each a list of ``max_length``-long integer lists (one per window).
        The number of windows may exceed the number of input texts.
        Padded positions carry ``0`` in the attention mask and ``-100`` in
        the labels so that they are excluded from the language-model loss.
    """
    tokens = tokenizer(
        examples['article'],
        # Truncation must be on: without it the tokenizer ignores
        # max_length/stride and never produces overflow windows.
        truncation=True,
        padding=False,
        return_overflowing_tokens=True,
        max_length=max_length,
        stride=stride
    )
    input_ids = []
    attention_mask = []
    labels = []
    for window_ids, window_mask in zip(tokens['input_ids'], tokens['attention_mask']):
        window_ids = list(window_ids[:max_length])
        window_mask = list(window_mask[:max_length])

        # Padding manually (a non-positive length yields an empty pad list).
        padding_length = max_length - len(window_ids)
        input_ids.append(window_ids + [tokenizer.pad_token_id] * padding_length)
        attention_mask.append(window_mask + [0] * padding_length)

        # The pad token shares its id with EOS, so padding cannot be told
        # apart by id; mask the padded tail explicitly with the ignore index.
        labels.append(window_ids + [-100] * padding_length)

    return {'input_ids': input_ids, 'attention_mask': attention_mask,  'labels': labels}


# Applying the sliding window tokenization function to the dataset.
# One article can yield several windows, so the original columns are dropped:
# their row count would no longer match the tokenized output.
tokenized_dataset = processed_dataset.map(
    tokenize_function_sliding_window,
    batched=True,
    remove_columns=processed_dataset.column_names,
)

# Set the format for PyTorch tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/115777 [00:00<?, ? examples/s]

In [8]:
from torch.utils.data import DataLoader

# Shuffled mini-batch loader over the tokenized dataset. The Trainer set up
# later in this notebook is not handed this loader.
train_dataloader = DataLoader(
    dataset=tokenized_dataset,
    shuffle=True,
    batch_size=8,
)

In [9]:
from transformers import TrainingArguments, GPT2LMHeadModel, Trainer

# Hold out a small slice for evaluation: eval_strategy="epoch" below needs an
# eval_dataset, and without one no validation loss can be computed.
# NOTE(review): the split is made over dataset rows; if rows are overlapping
# windows of the same article, neighbouring windows may land on both sides.
# Split by article before tokenizing if strict separation matters.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.05, seed=42)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,  # Enable mixed precision training
    gradient_accumulation_steps=2  # Gradient accumulation
)

# Reload the pretrained weights so that re-running this cell starts from the
# original checkpoint rather than from a partially trained model.
model = GPT2LMHeadModel.from_pretrained("gpt2")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    tokenizer=tokenizer
)

In [None]:
# Start fine-tuning with the trainer and training arguments configured above.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Persist the model first, then the tokenizer, into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./results")

In [None]:
# Push the model and tokenizer to the Hugging Face Hub
from huggingface_hub import HfApi, HfFolder

# Make sure you are logged in (e.g. via `huggingface-cli login`) before
# running this cell.
api = HfApi()
# Ask the Hub for the logged-in account and use its username as the repo
# namespace. The access token itself must never be placed in the repo id:
# it is a secret and is not a valid namespace.
user = api.whoami()["name"]
repo_id = f"{user}/finetuned-gpt2-smart-summary"

model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)