In [10]:
!pip install --upgrade torch transformers



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
!pip install datasets transformers accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
#load the dataset
from datasets import load_dataset

# Load your CSV dataset
data = load_dataset("csv", data_files="dataset/countries_in_natural_language.csv")
data


DatasetDict({
    train: Dataset({
        features: ['Country', 'Description'],
        num_rows: 195
    })
})

In [14]:
#load gpt2 tokenizer
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 uses the end of sequence token as padding


In [15]:
def tokenize_function(examples):
    return tokenizer(examples['Description'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = data.map(tokenize_function, batched=True)


Map:   0%|          | 0/195 [00:00<?, ? examples/s]

In [16]:
#Prepare the Dataset for Training
from datasets import DatasetDict

# Split into training and validation sets (80% train, 20% validation)
train_size = int(0.8 * len(tokenized_datasets['train']))
train_dataset = tokenized_datasets['train'].select(range(train_size))
eval_dataset = tokenized_datasets['train'].select(range(train_size, len(tokenized_datasets['train'])))

datasets = DatasetDict({"train": train_dataset, "validation": eval_dataset})


In [17]:
#Define the Data Collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # GPT-2 is not a masked LM


In [19]:
#load the gpt2 model
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [20]:
#Training Arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=500,
    save_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=250,
)


In [21]:
#Train the Model
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=data_collator,
)

# Start training
trainer.train()


Step,Training Loss


TrainOutput(global_step=117, training_loss=2.2558181632278314, metrics={'train_runtime': 1421.9285, 'train_samples_per_second': 0.329, 'train_steps_per_second': 0.082, 'total_flos': 122284670976000.0, 'train_loss': 2.2558181632278314, 'epoch': 3.0})

In [23]:
# Example question based on your dataset
country_name = "Sweden"  # Replace with the desired country
input_text = f"What can you tell me about {country_name}?"

# Tokenize and generate response
inputs = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(inputs, max_length=100, num_return_sequences=1)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Sweden is a small, but well-populated country, and its capital city is Västerland. The capital city is Väster, and its capital city is Väster. The capital city capital city is Väster. The capital city is Väster. The capital city is Väster. The capital city is Väster. The capital city is a city of Väster, and its capital city is Väster. The capital city is Vä
