In [1]:
!pip install --upgrade torch transformers
!pip install datasets transformers accelerate



In [3]:
#load the dataset
from datasets import load_dataset

# Load your CSV dataset
level_data = load_dataset("csv", data_files="dataset/easy_advanced_cities.csv")
level_data
cities_data = load_dataset("csv", data_files="dataset/worldcities.csv")
cities_data

DatasetDict({
    train: Dataset({
        features: ['city', 'city_ascii', 'lat', 'lng', 'country', 'iso2', 'iso3', 'admin_name', 'capital', 'population', 'id'],
        num_rows: 47868
    })
})

In [6]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function to process both Hard and Easy clues
def tokenize_function(examples):
    # Concatenate City, Hard, and Easy into a single string for tokenization
    input_texts = [
        f"City: {examples['City'][i]} | Hard: {examples['Hard'][i]} | Easy: {examples['Easy'][i]}"
        for i in range(len(examples['City']))
    ]
    
    # Tokenize the combined input texts
    return tokenizer(input_texts, truncation=True, padding='max_length', max_length=512)

# Apply tokenization to the dataset
tokenized_datasets = level_data.map(tokenize_function, batched=True)

In [7]:
#Prepare the Dataset for Training
from datasets import DatasetDict

# Split into training and validation sets (80% train, 20% validation)
train_size = int(0.8 * len(tokenized_datasets['train']))
train_dataset = tokenized_datasets['train'].select(range(train_size))
eval_dataset = tokenized_datasets['train'].select(range(train_size, len(tokenized_datasets['train'])))

datasets = DatasetDict({"train": train_dataset, "validation": eval_dataset})

In [8]:
#Define the Data Collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # GPT-2 is not a masked LM

In [9]:
#load the gpt2 model
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')

In [13]:
#Training Arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=500,
    save_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=250,
)

#Train the Model
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=data_collator,
)

# Start training
trainer.train()

Step,Training Loss


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [44]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import random

# Load GPT-2 tokenizer and model
device = torch.device("cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to(device)

# Function to check if a city has a clue in the clues dataset
def check_city_in_clue_dataset(city_name, clues_data):
    return city_name in clues_data['train']['City']

# Function to randomly pick a city (from the clues dataset or world cities dataset)
def get_random_city(level_data, cities_data):
    if random.random() < 0.5:  # 50% chance to pick a city from the clues dataset
        random_row = level_data['train'].shuffle(seed=random.randint(0, 1000)).select([0])[0]
        return random_row['City'], random_row['Easy'], random_row['Hard'], True
    else:  # 50% chance to pick a city from the world cities dataset
        random_row = cities_data['train'].shuffle(seed=random.randint(0, 1000)).select([0])[0]
        return random_row['city'], None, None, False


# Get user input for difficulty (Easy or Hard)
level = input("Please type 'Easy' or 'Hard' to generate a clue: ").capitalize()

# Randomly select a city from both datasets
random_city, easy_clue, hard_clue, has_clue = get_random_city(level_data, cities_data)


# If the city has predefined clues, use them
if has_clue:
    if level == "Easy":
        clue = easy_clue
    elif level == "Hard":
        clue = hard_clue
    else:
        print("Invalid difficulty level. Please enter 'Easy' or 'Hard'.")
        exit()

# If the city doesn't have predefined clues, generate a clue using the model
else:

    internal_input_text = f"Describe a {level} clue based on the city's cultural, geographic, or historical significance. Do not mention the city name."


    # Tokenize the input and move it to the correct device
    inputs = tokenizer(internal_input_text, return_tensors="pt").to(device)

    # Generate the clue
    outputs = model.generate(
        inputs.input_ids, 
        attention_mask=inputs.attention_mask,  # Pass the attention mask
        max_length=50,  # Adjust this if you want longer/shorter clues
        num_return_sequences=1,
        temperature=0.6,  # Lower temperature for more focused results
        top_p=0.9,  # Use nucleus sampling to reduce repetition
        pad_token_id=tokenizer.eos_token_id  # Set pad_token_id to the EOS token id
    )

    # Decode and print the generated clue (without mentioning the city)
    generated_clue = tokenizer.decode(outputs[0], skip_special_tokens=True)
    clue = generated_clue

# Print the generated clue without mentioning the city
print(f"Generated {level} clue: {clue}")


# Now, ask the user to guess the city
user_guess = input("Which city is this clue referring to? ")

# Check if the user's guess matches the hidden city
if user_guess.lower() == random_city.lower():
    print("Success!")
else:
    print(f"Try again! The correct city was {random_city}.")

Please type 'Easy' or 'Hard' to generate a clue:  Easy


Generated Easy clue: They didn’t divide Mandela here, but he lived in our South African “burg” just south of Pretoria.


Which city is this clue referring to?  


Try again! The correct city was Johannesburg.
