In [None]:
import pandas as pd

# Read the countries dataset from its CSV file into a DataFrame
data_path = './dataset/countries_in_natural_language.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Show the first five rows as a quick sanity check
df.head(n=5)

In [None]:
from transformers import GPT2Tokenizer

# Instantiate the pre-trained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id


def _encode_description(text):
    """Tokenize one description, truncated/padded to exactly 128 tokens."""
    return tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
    )


# Store the encoding of every 'Description' entry in a new 'tokens' column
df['tokens'] = df['Description'].apply(_encode_description)

# Show the first few encodings
df['tokens'].head()

In [None]:
import torch
from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    """Map-style dataset that tokenizes one description per item for causal-LM training.

    Args:
        tokenizer: Callable tokenizer. It is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors="pt"`` and must
            return a mapping of tensors that carry a leading batch axis of size 1.
        df: DataFrame holding the raw texts in its ``'Description'`` column.
        max_length: Number of tokens every example is truncated/padded to.
    """

    def __init__(self, tokenizer, df, max_length):
        self.tokenizer = tokenizer
        self.texts = df['Description'].tolist()
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and build its training example.

        Returns:
            dict: The tokenizer's tensors with the size-1 batch axis removed
            (each of shape ``(max_length,)``), plus a ``"labels"`` tensor that
            copies ``"input_ids"`` with padded positions set to ``-100``.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" yields tensors shaped (1, max_length). Drop the
        # leading axis so that stacking examples produces (batch, max_length)
        # rather than (batch, 1, max_length).
        example = {name: tensor.squeeze(0) for name, tensor in encoded.items()}

        # Clone so labels do not share storage with input_ids; otherwise the
        # in-place masking below would also overwrite the model inputs.
        labels = example["input_ids"].clone()

        # Exclude padding from the loss. Masking by attention_mask (not by
        # token id) keeps genuine EOS tokens as targets even when the pad
        # token is the EOS token. -100 is the default ignore_index of
        # torch.nn.CrossEntropyLoss.
        attention_mask = example.get("attention_mask")
        if attention_mask is not None:
            labels[attention_mask == 0] = -100

        example["labels"] = labels
        return example

# Build the training dataset: every description is truncated/padded to 128 tokens
dataset = CustomTextDataset(
    tokenizer=tokenizer,
    df=df,
    max_length=128,
)

In [None]:
# Optional dependency install: uncomment and run if `accelerate` is missing from the environment.
# %pip install "accelerate>=0.26.0"

In [None]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Start from the pre-trained GPT-2 language-model weights
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Keep the model's padding id in sync with the tokenizer's
model.config.pad_token_id = tokenizer.pad_token_id

# Hyperparameters and output locations for the fine-tuning run
training_args = TrainingArguments(
    num_train_epochs=3,                   # passes over the training data
    per_device_train_batch_size=2,        # examples per device per step
    logging_steps=10,                     # emit a log entry every 10 steps
    logging_dir='./logs',                 # where logs are written
    output_dir='./results',               # where run outputs are written
)

# Wire the model, arguments and custom dataset into a Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Run fine-tuning
trainer.train()