# Generate fake entity names with GPT-4

In [1]:
# Print the notebook's working directory; the relative data paths defined below are resolved against it.
!pwd

/cluster/work/cotterell/kdu/measureLM/preprocessing/YagoECQ


In [2]:
from collections import defaultdict
import os
import re
import openai
import pandas as pd
from dotenv import load_dotenv

# Load variables from a .env file into the environment so that
# os.getenv("OPENAI_API_KEY") below can find the API key.
load_dotenv()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


True

In [3]:
# Directory holding the YagoECQ data files, relative to the notebook's working directory.
DATA_ROOT = "../../data/YagoECQ/"
# CSV file in which the GPT-generated fake entity names are stored.
YAGO_GPT_FAKE_ENTITIES_PATH = os.path.join(DATA_ROOT, "chatgpt_fake_entities_all.csv") 
# Path to yago_qec.json; NOTE(review): not referenced in the visible cells of this notebook.
YAGO_QEC_PATH = os.path.join(DATA_ROOT, "yago_qec.json")

In [7]:
# Read the OpenAI API key from the environment; os.getenv returns None if the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")


In [5]:
# Client object used by generate_fake_entity_names to call the chat completions endpoint.
client = openai.OpenAI(api_key=openai.api_key)

In [16]:
# Maps each entity class name to the prompt template used to request fake names of that class.
# Every template contains a single `{}` placeholder, which generate_fake_entity_names_df fills
# with the number of names to request via `prompt.format(num_rows)`.
entity_class_to_prompt = {
    "Organization": "Give me a list of {} names of organizations that are not real, but could sound plausibly real.",
    "Place": "Give me a list of {} names of places that are not real, but could sound plausibly real.",
    "Event": "Give me a list of {} names of events that are not real, but could sound plausibly real.",
    "Person": "Give me a list of {} names of people who could exist but don't. Try to make them different from each other, e.g. avoid repeating surnames or names from the same category. Give me some from different cultures and time periods. Avoid any names of real people, and especially of famous people. Avoid any names that sound overly fantastical.",
    "Product": "Give me a list of {} names of products that are not real, but could sound plausibly real.",
    "CreativeWork": "Give me a list of {} total names of 20% books/poems, 20% songs, 20% art pieces, 20% theater shows, and 20% music pieces that are not real, but could sound plausibly real. Try to make them different from each other. Give me some from different cultures, time periods, genres, and mediums. Avoid any names of real works, and especially of famous works. Avoid overly fantastical sounding names.",
    "Taxon": "Give me a list of {} biological taxon names that are not real, but could sound plausibly real. Give some with a fake but plausible-sounding genus and some with just the species name.",
}

In [23]:
def generate_fake_entity_names(prompt, model="gpt-4-1106-preview"):
    """Ask a chat model for a numbered list of fake names and parse the reply.

    Uses the module-level ``client`` to send a fixed system message plus
    ``prompt`` as the user message.

    Args:
        prompt: User message sent to the model; it should ask for a list of names.
        model: Name of the chat completions model to call.

    Returns:
        A list of the names found on reply lines of the form ``<number>. <name>``,
        in the order they appear, with surrounding whitespace removed. The list
        is empty when the reply has no text or no line matches that format.
    """
    print(f"Calling {model} on the following prompt:\n{prompt}")
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a creative assistant. Your purpose is to help the user come up with fictional names fitting the user's criteria. Give the user the requested number of names, no matter how long the response gets or how many names the user asks for. If the user asks for 1000 names, give 1000 names and do not split the list into multiple responses and give the full list of 1000 names. Do not repeat names."},
            {"role": "user", "content": prompt}
        ]
    )
    # The message content can be None (no text returned); treat that as an
    # empty reply instead of letting re.findall raise a TypeError.
    content = response.choices[0].message.content or ""
    # Each name is expected on its own line, prefixed by "<number>. ".
    matches = re.findall(r'^\d+\.\s(.+)$', content, re.MULTILINE)
    # Strip stray whitespace (e.g. a trailing "\r" or spaces) so the same name
    # is not later counted as two distinct entities; drop blank captures.
    return [name.strip() for name in matches if name.strip()]

def generate_fake_entity_names_df(entity_class_to_prompt, num_rows=1000, max_stalled_calls=5):
    """Build a DataFrame with one column of unique fake names per entity class.

    For every entity class the model is queried repeatedly (through
    ``generate_fake_entity_names``) until ``num_rows`` distinct names have been
    collected; the first ``num_rows`` distinct names, in the order they were
    generated, are kept.

    Args:
        entity_class_to_prompt: Mapping from entity class name to a prompt
            template with one ``{}`` placeholder, filled with ``num_rows``.
        num_rows: Number of distinct names to collect per entity class.
        max_stalled_calls: Maximum number of consecutive model calls that may
            add no new name before giving up on a class. Pass ``None`` to retry
            without limit.

    Returns:
        A ``pandas.DataFrame`` with one column per entity class and ``num_rows``
        rows.

    Raises:
        RuntimeError: If ``max_stalled_calls`` consecutive calls yield no new
            names for an entity class.
    """
    entity_class_to_entities = {}
    for entity_class, prompt in entity_class_to_prompt.items():
        print("Generating {} entities...".format(entity_class))
        # Dict keys give de-duplication while preserving generation order, so
        # the retained names do not depend on set iteration order.
        unique_names = {}
        stalled_calls = 0
        while len(unique_names) < num_rows:
            count_before = len(unique_names)
            ents = generate_fake_entity_names(prompt.format(num_rows))
            unique_names.update(dict.fromkeys(ents))
            print("\tGenerated {} / {} {}s".format(len(unique_names), num_rows, entity_class))

            # Guard against looping (and calling the API) forever when the
            # model stops producing names that were not already collected.
            if len(unique_names) > count_before:
                stalled_calls = 0
            else:
                stalled_calls += 1
                if max_stalled_calls is not None and stalled_calls >= max_stalled_calls:
                    raise RuntimeError(
                        "No new {} names after {} consecutive calls ({} / {} collected).".format(
                            entity_class, stalled_calls, len(unique_names), num_rows
                        )
                    )

        # The last call may overshoot; keep exactly num_rows names per class so
        # all columns have equal length.
        entity_class_to_entities[entity_class] = list(unique_names)[:num_rows]

    return pd.DataFrame(entity_class_to_entities)

In [24]:
%%time
# Generate num_rows fake names for every entity class in entity_class_to_prompt
# (this calls the OpenAI API repeatedly), then display the resulting DataFrame.
gpt_fake_entities_df = generate_fake_entity_names_df(entity_class_to_prompt, num_rows=1000)
gpt_fake_entities_df

Generating Place entities...
Calling gpt-4-1106-preview on the following prompt:
Give me a list of 1000 names of places that are not real, but could sound plausibly real. Type of places should include countries, regions, towns, provinces/states, and natural regions. Avoid names that sound overly fictional or like they're from a video game.
	Generated 280 / 1000 Places
Calling gpt-4-1106-preview on the following prompt:
Give me a list of 1000 names of places that are not real, but could sound plausibly real. Type of places should include countries, regions, towns, provinces/states, and natural regions. Avoid names that sound overly fictional or like they're from a video game.
	Generated 981 / 1000 Places
Calling gpt-4-1106-preview on the following prompt:
Give me a list of 1000 names of places that are not real, but could sound plausibly real. Type of places should include countries, regions, towns, provinces/states, and natural regions. Avoid names that sound overly fictional or like t

Unnamed: 0,Place
0,Padley
1,Northcliff
2,Gorsestone Heath
3,Bramblefort
4,Graythwaite
...,...
995,Relvia
996,Lakeview Heights
997,Brightwater
998,Rowanfield


In [25]:
# Persist the generated names to YAGO_GPT_FAKE_ENTITIES_PATH. If that file
# already exists, the newly generated columns are placed alongside the existing
# ones (axis=1); otherwise the new DataFrame is written on its own.
if os.path.isfile(YAGO_GPT_FAKE_ENTITIES_PATH):
    existing_entities_df = pd.read_csv(YAGO_GPT_FAKE_ENTITIES_PATH)
    entities_to_save_df = pd.concat([existing_entities_df, gpt_fake_entities_df], axis=1)
else:
    entities_to_save_df = gpt_fake_entities_df
# Write to the same path that was checked above, so a later run finds the file
# and extends it instead of starting from scratch at a different location.
entities_to_save_df.to_csv(YAGO_GPT_FAKE_ENTITIES_PATH, index=False)