In [1]:
#!pip install --upgrade torch transformers
#!pip install datasets transformers accelerate



In [2]:
#load the dataset
from datasets import load_dataset

# Load your CSV dataset
data = load_dataset("csv", data_files="dataset/countries_in_natural_language.csv")
data

DatasetDict({
    train: Dataset({
        features: ['Country', 'Description'],
        num_rows: 195
    })
})

In [4]:
from transformers import GPT2Tokenizer
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['Description'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = data.map(tokenize_function, batched=True)



In [5]:
#Prepare the Dataset for Training
from datasets import DatasetDict

# Split into training and validation sets (80% train, 20% validation)
train_size = int(0.8 * len(tokenized_datasets['train']))
train_dataset = tokenized_datasets['train'].select(range(train_size))
eval_dataset = tokenized_datasets['train'].select(range(train_size, len(tokenized_datasets['train'])))

datasets = DatasetDict({"train": train_dataset, "validation": eval_dataset})

In [6]:
#Define the Data Collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # GPT-2 is not a masked LM

In [7]:
#load the gpt2 model
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')

In [8]:
#Training Arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=500,
    save_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=250,
)

#Train the Model
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=data_collator,
)

# Start training
trainer.train()

Step,Training Loss


TrainOutput(global_step=117, training_loss=2.278110601963141, metrics={'train_runtime': 389.7474, 'train_samples_per_second': 1.201, 'train_steps_per_second': 0.3, 'total_flos': 122284670976000.0, 'train_loss': 2.278110601963141, 'epoch': 3.0})

In [9]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel


device = torch.device("cpu")
# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Example question based on your dataset
country_name = "Spain"  # Replace with the desired country
input_text = f"What can you tell me about {country_name}?"

# Tokenize the input text and move the inputs to the correct device
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Move the model to the same device
model = model.to(device)

# Set pad_token_id and attention_mask, and generate the response
outputs = model.generate(
    inputs.input_ids, 
    attention_mask=inputs.attention_mask,  # Pass the attention mask
    max_length=100, 
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id  # Set pad_token_id to the EOS token id
)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)



What can you tell me about Spain?

Spain is a country that has been in the forefront of the development of the world's most advanced technology. It is a country that has been the most successful in the world for the past 50 years. It is a country that has been the most successful in the world for the past 50 years. It is a country that has been the most successful in the world for the past 50 years. It is a country that has been the most successful in the
