In [43]:
import datasets
import requests
from datasets import load_dataset
import numpy as np
import openai
from tqdm import tqdm

In [12]:
api_key = open("/data/katie_kang/openai_key_file.txt", "r").read()
openai.api_key = api_key.strip()

In [4]:
def get_wikidata_property_label(property_id):
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": property_id,
        "props": "labels",
        "languages": "en",
        "format": "json",
    }
    response = requests.get(url, params=params)
    data = response.json()
    label = data['entities'][property_id]['labels']['en']['value']
    return label


In [45]:
dataset2 = load_dataset('relbert/t_rex')

In [46]:
trex_train_relations = list(set(dataset2["train"]['relation']))

In [47]:
base_prompt = """Write a question using the provided head, tail, and relation, such that the question includes the head and the relation, and the question is the tail. Write the question such that the head is replaced with '[X]'. Do NOT include information from the head or tail in the question. Answer with only the question.

Examples
Relation: developer
Head: Neverwinter Nights 2
Tail: Obsidian Entertainment
Question: Who is the developer of [X]?

Relation: member of sports team
Head: Roger Nilsen
Tail: Molde
Question: What sports team is [X] is member of?

Relation: part of
Head: Tungusic
Tail: Altaic family
Question: What is [X] part of?

Relation: surface played on
Head: Jakarta Open
Tail: outdoor hard courts
Question: What kind of surface is [X] played on?

Query
Relation: [relation]
Head: [head]
Tail: [tail]
Question: """

def get_prompt(relation, head, tail):
    prompt = base_prompt
    prompt = prompt.replace("[relation]", relation)
    prompt = prompt.replace("[head]", head)
    prompt = prompt.replace("[tail]", tail)
    return prompt


In [50]:
relations_train_dict = {}
relations_train_dict_2 = {}
for i in tqdm(range(len(trex_train_relations))):
    trex_train_relation = trex_train_relations[i]
    if trex_train_relation.startswith("P"):
        relation = (get_wikidata_property_label(trex_train_relation))
        relation_idxs = np.where(np.array(dataset2["train"]['relation']) == trex_train_relation)[0]
        if len(relation_idxs) >= 2:
            prompt1 = get_prompt(relation, dataset2["train"][int(relation_idxs[0])]['head'], dataset2["train"][int(relation_idxs[0])]['tail'])
            response1 = openai.Completion.create(model="text-davinci-003",
                                                prompt=prompt1,
                                                max_tokens=256,
                                                temperature=1,
                                                logprobs=0,
                                                echo=False)
            question1 = response1["choices"][0]["text"].lstrip().rstrip()
            prompt2 = get_prompt(relation, dataset2["train"][int(relation_idxs[1])]['head'], dataset2["train"][int(relation_idxs[1])]['tail'])
            response2 = openai.Completion.create(model="text-davinci-003",
                                        prompt=prompt2,
                                        max_tokens=256,
                                        temperature=1,
                                        logprobs=0,
                                        echo=False)
            question2 = response2["choices"][0]["text"].lstrip().rstrip()
            if question1 == question2:
                relations_train_dict[relation] = question1
            else:
                relations_train_dict_2[relation] = [relation, question1, question2, int(relation_idxs[0]), int(relation_idxs[1])]

  3%|▎         | 24/759 [00:40<20:52,  1.70s/it]


KeyboardInterrupt: 

In [51]:
relations_train_dict

{'antiparticle': 'What is the antiparticle of [X]?'}

In [52]:
relations_train_dict_2

{'surface played on': ['surface played on',
  'What kind of surface is the Jakarta Open played on?',
  'What kind of surface is [X] played on?',
  11103,
  16363],
 'after a work by': ['after a work by',
  'What work by Tyler Perry was Diary of a Mad Black Woman based on?',
  'What work by Herman Melville did Pola X come after?',
  7796,
  8406],
 'physically interacts with': ['physically interacts with',
  'What does nisoxetine physically interact with?',
  'What does aripiprazole physically interact with?',
  8484,
  10712],
 'designed by': ['designed by',
  'Who designed the Cherry Street Strauss Trunnion Bascule Bridge?',
  'Who designed [X]?',
  7223,
  8995],
 'inception': ['inception',
  'When was the [X] founded?',
  'When was [X] founded?',
  493,
  1367],
 'capital of': ['capital of',
  'What is the capital of [X]?',
  'What is the capital of the Belorussian SSR?',
  645,
  944],
 'record held': ['record held',
  'What record does [X] hold in?',
  'Who holds the record in [X]

In [32]:
 dataset2["train"][int(relation_idxs[1])]

{'relation': 'P765',
 'head': 'Internationaux de Strasbourg',
 'tail': 'outdoor clay courts',
 'title': '1995 Internationaux de Strasbourg',
 'text': "The 1995 Internationaux de Strasbourg was a women's tennis tournament played on outdoor clay courts in Strasbourg in France that was part of Tier III of the 1995 WTA Tour. It was the ninth edition of the tournament and was held from May 22 through May 28, 1995."}

In [68]:
dataset2["train"][5]['head']

{'relation': 'P144',
 'head': 'The Garden of Eden with the Fall of Man',
 'tail': 'Genesis',
 'title': 'The Garden of Eden with the Fall of Man',
 'text': "The Garden of Eden with the Fall of Man or The Earthly Paradise with the Fall of Adam and Eve is a 1617 painting by Peter Paul Rubens (figures) and Jan Brueghel the Elder (flora and fauna). It is housed in the Mauritshuis, Netherlands. The painting depicts the moment just before the consumption of forbidden fruit and the fall of man. Adam and Eve are depicted beneath the tree of the knowledge of good and evil, where various fruits grow. On the opposite side the tree of life is depicted, also laden with fruits. The scene is a reference to Genesis 2:8–14. A monkey biting an apple to the left symbolizes sin. The sanguine monkey next to Adam is the hotspur who cannot resist temptation, while the choleric cat near Eve's heels represents cruel cunning. In Christian symbolism, several grapes in the foliage behind Adam and Eve represent Chr

In [60]:
dataset2["train"][37]

{'relation': 'P22',
 'head': 'Lord Ganesh',
 'tail': 'Lord Shiva',
 'title': 'Pandalam Mahadeva Temple',
 'text': "Pandalam Mahadeva Temple is situated in between Thottakkonam and Mulampuzha villages of Pandalam in Kerala, India. Yearly Kettukazhcha festival is one of the attractions for tourists. 10 days major festival is celebrated for Lord Sivain 'Dhanu masa' November–December every year. the festival start by hosting traditional flag names 'kodiettu' and ends by 'aarattu'. the administration of this temple held by Mahadeva Seva Samithi participated by 12 villages '12 karakal'in Pandalam. It is believed that Lord Parasuram has installed the main idol of the Garbhagriha in this temple. Pandalam Mahadeva Temple is one of the oldest temples out of the 108 Shiva temples consecrated by the great 'Sanayasin Khara Muni'. The sacred and world famous temple is located on the left banks of the Achenkovil river, one side of this temple is in the banks of Achankovil river so this temple also kn

(array([     37,      76,     797, ..., 1272967, 1273398, 1273508]),)

In [50]:
get_wikidata_property_label("P22")

'father'

In [25]:
trex_test_relations = (set(dataset2["test"]['relation']))

In [26]:
len(trex_test_relations)

34

In [27]:
len(list(trex_test_relations & trex_train_relations))

0

In [33]:
import numpy as np
np.mean([relation[0]=="P" for relation in list(trex_train_relations)])

0.7259552042160737

In [37]:
get_wikidata_property_label("P1264")

'valid in period'

In [32]:
list(trex_train_relations)

['P1411',
 'P3461',
 '[Holiday] is a national holiday of [Country]',
 'P1304',
 '[Play] is first performed on [Date]',
 'P575',
 'P2743',
 'P1191',
 'P282',
 'P611',
 'P730',
 'P3701',
 'P527',
 'P176',
 'P620',
 '[Person] is the prime minister of [Country]',
 '[Software] is used for [Purpose]',
 'P669',
 '[System] is a system in [Artifact]',
 'P624',
 'P1366',
 'P1830',
 'P1552',
 '[Movie] is a spinoff of [Movie]',
 'P1264',
 '[Group] is founded by [Person]',
 'P3403',
 'P912',
 'P3490',
 'P1429',
 'P3828',
 '[Person] studies at [School]',
 'P1302',
 'P162',
 '[Person] is born in [Location]',
 'P2578',
 'P3712',
 'P237',
 'P1057',
 'P551',
 '[Currency] is used in [Country]',
 "[Sport Team]'s home field is [Location]",
 'P410',
 '[Museum] is memorial to [Event]',
 'P123',
 'P1923',
 'P524',
 '[Disease] has [Symptoms]',
 '[River] drains [Location]',
 'P16',
 'P3438',
 '[Artifact] is name of [Artifact]',
 'P800',
 '[Artifact] is used by [Person]',
 'P2813',
 'P1576',
 'P1462',
 'P1435',


In [None]:
get_wikidata_property_label