In [19]:
%run wikidata_functions.ipynb

[('Q146', 'house cat', 'domesticated feline'), ('Q4167836', 'Wikimedia category', "use with 'instance of' (P31) for Wikimedia category"), ('Q387278', 'Kellas cat', 'cat'), ('Q300918', 'cat', 'Unix utility that concatenates and lists files'), ('Q10813994', 'Tsim Tung Brother Cream', 'cat')]
[('Q76', 'Barack Obama', 'president of the United States from 2009 to 2017'), ('Q61909968', 'Barack Obama', 'Wikimedia disambiguation page')]
3


In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.matcher import Matcher 
import spacy_universal_sentence_encoder
import claucy   
from bs4 import BeautifulSoup
import re
import random

2023-12-08 19:24:48.066795: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
from chatgpt_api import query_gpt, query_gpt_conversation

In [208]:
import pickle
import time
import functools

In [254]:
from collections import Counter

In [184]:
def get_gpt_sentences(pid, G, n=50, per=50, delay=22):
    """Ask GPT for sentences verbalising randomly sampled edges of one predicate.

    Args:
        pid: predicate id; only edges whose 'pred' attribute equals it are used.
        G: graph whose ``G.edges.data('pred')`` yields ``(Q1, Q2, pred)`` triples.
        n: total number of sentences wanted; ``n // per`` edges are sampled.
        per: number of sentences requested per edge (one API call per edge).
        delay: seconds to wait between consecutive API calls
            (presumably to stay under the API rate limit -- confirm).

    Returns:
        dict mapping ``(Q1, Q2, P)`` to the raw response text for that edge.

    Raises:
        ValueError: if ``per`` is not positive, or if the graph has fewer
            matching edges than ``n // per``.
    """
    if per <= 0:
        raise ValueError(f"per must be positive, got {per}")

    edges = [e for e in G.edges.data('pred') if e[2] == pid]
    num_edges = n // per
    if num_edges > len(edges):
        # random.sample would raise a generic "sample larger than population";
        # say which predicate is short and by how much instead.
        raise ValueError(
            f"need {num_edges} edges with predicate {pid!r} but only {len(edges)} exist"
        )

    edges_subset = random.sample(edges, num_edges)

    contents = {}
    for i, (Q1, Q2, P) in enumerate(edges_subset):
        if i:
            # Pause only BETWEEN requests; sleeping after the last one is wasted time.
            print("sleeping.. ", time.time())
            time.sleep(delay)

        q1 = get_label_from_qid(Q1)
        q2 = get_label_from_qid(Q2)
        p = get_property_from_pid(P)

        prompt = f"""Could you provide me {per} sentences that correspond to the true RDF triple: {q1}, {p}, {q2}?
                     Each sentence must only contain two entities. Please respond only with the sentences, each separated with a new line."""

        resp = query_gpt(prompt)
        contents[(Q1, Q2, P)] = resp.choices[0].message.content

    return contents

In [185]:
# Request 100 sentences for predicate P530 (n=100 at 50 per edge -> two sampled edges).
# NOTE(review): G2 is not defined in the visible cells -- confirm it is loaded before this runs.
contents = get_gpt_sentences('P530', G2, n=100)

sleeping..  1702092930.716746
sleeping..  1702092975.875725


In [198]:
# Takes the output of get_gpt_sentences and returns half true sentences, half false sentences.
# input: contents (dict of raw responses), reps (candidate false replacements)
def process_gpt_contents(contents, reps, expected=42):
    """Split GPT responses into untouched ("true") and corrupted ("false") sentence lists.

    The first half of the entries (in dict order) is kept verbatim. In each of
    the remaining entries, the label of the triple's second entity (``key[1]``)
    is replaced by a randomly chosen element of ``reps``.

    Args:
        contents: dict mapping ``(Q1, Q2, P)`` to newline-separated sentences.
        reps: non-empty sequence of replacement strings for the false half.
        expected: required number of entries in ``contents`` (default 42, the
            original hard-coded value); pass ``None`` to accept any size.

    Returns:
        ``(tsents, fsents)``: two lists of sentence lists. With an odd number
        of entries the false list receives the extra one.

    Raises:
        AssertionError: if ``expected`` is given and ``len(contents)`` differs.
    """
    if expected is not None:
        assert len(contents) == expected
    half = len(contents) // 2

    tsents, fsents = [], []
    for i, (key, content) in enumerate(contents.items()):
        if i < half:
            tsents.append(content.split("\n"))
            continue

        rep = random.choice(reps)
        obj_label = get_label_from_qid(key[1])
        # Literal replacement: the label is plain text, not a regex. Labels such as
        # "Congo (Kinshasa)" or "U.S." would otherwise be mis-matched by re.sub.
        fsents.append(content.replace(obj_label, rep).split("\n"))

    return tsents, fsents

def get_gpt_sentences_false(pid, G, used=None, n=5):
    """Interactively build false sentences for randomly sampled edges of one predicate.

    For each sampled edge, GPT is asked for sentences about the triple with a
    literal "OBJECT" placeholder, and the user is prompted (via ``input``) for
    the text that replaces the placeholder in the response.

    Args:
        pid: predicate id; only edges whose 'pred' attribute equals it are used.
        G: graph whose ``G.edges.data('pred')`` yields ``(Q1, Q2, pred)`` triples.
        used: currently unused; kept for call compatibility.
        n: total number of sentences wanted; ``n // 5`` edges are sampled.

    Returns:
        dict mapping ``(Q1, Q2, P)`` to ``((q1, q2, p, rep), sentences)`` where
        ``sentences`` is the response split on newlines after substitution.

    Raises:
        ValueError: from ``random.sample`` if fewer matching edges exist than
            ``n // 5``.
    """
    per = 5  # sentences requested per edge

    edges = [e for e in G.edges.data('pred') if e[2] == pid]
    num_edges = n // per
    edges_subset = random.sample(edges, num_edges)

    sents = {}
    for Q1, Q2, P in edges_subset:
        q1 = get_label_from_qid(Q1)
        q2 = get_label_from_qid(Q2)
        p = get_property_from_pid(P)

        prompt = f"""Could you provide me {per} sentences that correspond to the RDF triple: ({q1}, {p}, "OBJECT")?
                     Each sentence must only contain two entities. Please respond only with the sentences, each separated with a new line."""
        rep = input(f"Replace OBJECT in the RDF triple {q1}, {p}, OBJECT with: ")
        resp = query_gpt(prompt)
        content = resp.choices[0].message.content
        # Literal substitution: `rep` is free-form user text, and re.sub would treat
        # backslashes in it as template escapes (raising or mangling the output).
        content = content.replace('"OBJECT"', rep)
        sentences = content.split("\n")
        sents[(Q1, Q2, P)] = ((q1, q2, p, rep), sentences)

    return sents

In [58]:
def helper(sent):
    """Split a 'SUBJECT, OBJECT: SENTENCE' line into its three stripped parts.

    Raises ValueError unless the line has exactly one ':' and exactly one ','
    before it.
    """
    pair, sentence = sent.split(":")
    subject, obj = pair.split(",")
    return tuple(part.strip() for part in (subject, obj, sentence))

In [161]:
def get_gpt_sentences_false0(pid, G, used=None, n=50):
    """Ask GPT to invent obviously false sentences for one predicate.

    Args:
        pid: predicate id; its label and aliases are inserted into the prompt.
        G: unused; kept for call compatibility.
        used: optional iterable of nouns the model is told not to reuse as
            SUBJECT or OBJECT (appended to the prompt as rule 8).
        n: number of examples to request (the prompt previously hard-coded 50).

    Returns:
        The raw response text. The prompt requests one
        'SUBJECT, OBJECT: SENTENCE' example per entry.
    """
    p = get_property_from_pid(pid)
    aliases = get_pid_aliases(pid)

    prompt = f"""Can you make up {n} examples of FALSE SIMPLE sentences corresponding to the RDF triple (SUBJECT, {p}, OBJECT), 
    where you can make up the SUBJECT and OBJECT?
    For some of the sentences, use any of the following aliases for {p}: {str(aliases)}. 
    Here are the rules for these sentences. Follow all the rules. 
    1. Every example should contain a SENTENCE that corresponds to the RDF triple (subject, {p}, object), where you choose the SUBJECT and OBJECT.
    2. Be creative when choosing the subject and object so that the sentences are OBVIOUSLY FALSE.
    3. If the sentence does not seem false, use a different SUBJECT and OBJECT.
    4. Respond ONLY with the sentences, as well as the SUBJECT and OBJECT that you have chosen in that sentence.
    5. Each example should separated with an empty line.
    6. The examples should be unnumbered.
    7. Each examples should follow the following format: SUBJECT, OBJECT: SENTENCE"""

    if used:
        prompt += "\n8. Do not use any of the following nouns as your SUBJECT or OBJECT: "
        prompt += ", ".join(used)

    resp = query_gpt(prompt, temp=0.8)
    return resp.choices[0].message.content

In [80]:
def false_helper(sent):
    """Parse one 'SUBJECT, OBJECT: SENTENCE' line into (subject, object, sentence).

    A leading enumeration marker such as "12. " or "3) " is dropped, because
    the model sometimes numbers its examples even when asked not to. Only the
    first ':' separates the entity pair from the sentence, so the sentence
    itself may contain colons.

    Raises:
        ValueError: if there is no ':' or the part before it does not contain
            exactly one ','.
    """
    left, right = sent.split(":", 1)
    # Require whitespace after the marker so a subject like "3.5 inch disk" survives.
    left = re.sub(r"^\s*\d+[.)]\s+", "", left)
    one, two = left.split(",")
    return one.strip(), two.strip(), right.strip()

In [81]:
# get_gpt_sentences_false0 returns the raw response string that
# process_gpt_sentences_false expects; get_gpt_sentences_false returns a dict
# and prompts for input(), so it cannot feed the parsing cells.
content = get_gpt_sentences_false0('P530', G2)

In [95]:
def process_gpt_sentences_false(content):
    """Parse a raw GPT response into one entry per line.

    Args:
        content: response text with one 'SUBJECT, OBJECT: SENTENCE' example
            per line.

    Returns:
        A list with one element per line of ``content``: the tuple returned by
        ``false_helper`` for well-formed lines, or the placeholder ``0`` for
        lines that are blank or malformed (so indices stay aligned with lines).
    """
    parsed = []
    for line in content.split("\n"):
        if line.count(":") != 1:
            parsed.append(0)
            continue
        try:
            parsed.append(false_helper(line))
        except ValueError:
            # Exactly one ':' but not exactly one ',' before it: treat the line
            # like any other malformed line instead of aborting the whole parse.
            parsed.append(0)
    return parsed

In [96]:
# Parse the raw response into per-line entries (tuples, or 0 for lines that could not be parsed).
sents = process_gpt_sentences_false(content)

In [92]:
def process_gpt_sentence_false_split(sents):
    """Separate parsed examples into the entities used and the sentences.

    Args:
        sents: list of ``(subject, object, sentence)`` entries; the ``0``
            placeholders produced by ``process_gpt_sentences_false`` for
            unparseable lines are skipped.

    Returns:
        ``(flattened, sentences)``: ``flattened`` is
        ``[subj0, obj0, subj1, obj1, ...]`` and ``sentences`` holds the third
        element of each parsed entry, in the same order.
    """
    # Indexing the int placeholder would raise TypeError, so keep real entries only.
    parsed = [entry for entry in sents if isinstance(entry, (tuple, list))]

    flattened = [name for entry in parsed for name in (entry[0], entry[1])]
    sentences = [entry[2] for entry in parsed]

    return flattened, sentences

In [93]:
# `used` collects every subject/object already chosen (fed back to the next request); `sents_` holds the sentences.
used, sents_ = process_gpt_sentence_false_split(sents)

In [94]:
# Inspect the extracted false sentences.
sents_

['The United States and Russia have ambassadorial relations.',
 'John and Mars have diplomatic relations.',
 'Paris and Unicorn have foreign relations.',
 'Apple and Banana have political relations.',
 'Cat and Dog have diplomatic relations.',
 'China and Moon have ambassadorial relations.',
 'Tom and Jerry have foreign relations.',
 'London and Paris have political relations.',
 'Sun and Earth have diplomatic relations.',
 'Pizza and Hamburger have ambassadorial relations.',
 'Alice and Wonderland have foreign relations.',
 'Ocean and Mountain have diplomatic relations.',
 'Italy and Spain have ambassadorial relations.',
 'Book and Pen have foreign relations.',
 'Tree and Flower have political relations.',
 'Coffee and Tea have diplomatic relations.',
 'Car and Bicycle have ambassadorial relations.',
 'John and Mary have foreign relations.',
 'London and New York have political relations.',
 'Sun and Moon have diplomatic relations.',
 'Pizza and Pasta have ambassadorial relations.',
 

In [103]:
# Second batch, telling the model to avoid the entities already used.
# get_gpt_sentences_false0 is the function that accepts `used` in its prompt and
# returns the raw string; get_gpt_sentences_false ignores `used` and needs input().
content2 = get_gpt_sentences_false0('P530', G2, used)

Can you make up 50 examples of FALSE SIMPLE sentences corresponding to the RDF triple (SUBJECT, diplomatic relation, OBJECT), 
    where you can make up the SUBJECT and OBJECT?
    For some of the sentences, use any of the following aliases for diplomatic relation: ['diplomatic relations', 'ambassadorial relations', 'foreign relations', 'foreign relation', 'diplomatic relation', 'ambassadorial relation', 'political relations', 'political relation']. 
    Here are the rules for these sentences. Follow all the rules. 
    1. Every example should contain a SENTENCE that corresponds to the RDF triple (subject, diplomatic relation, object), where you choose the SUBJECT and OBJECT.
    2. Be creative when choosing the subject and object so that the sentences are OBVIOUSLY FALSE.
    3. If the sentence does not seem false, use a different SUBJECT and OBJECT.
    4. Respond ONLY with the sentences, as well as the SUBJECT and OBJECT that you have chosen in that sentence.
    5. Each example sho

In [104]:
# Inspect the raw second response (the captured output shows numbered examples).
content2

'1. Lions, Elephants: Lions have ambassadorial relations with elephants.\n2. Flowers, Mountains: Flowers have political relations with mountains.\n3. Birds, Fish: Birds have diplomatic relations with fish.\n4. Clouds, Stars: Clouds have foreign relations with stars.\n5. Butterflies, Bees: Butterflies have diplomatic relations with bees.\n6. Rainbows, Raindrops: Rainbows have ambassadorial relations with raindrops.\n7. Butterflies, Flowers: Butterflies have political relations with flowers.\n8. Trees, Rocks: Trees have foreign relations with rocks.\n9. Dolphins, Whales: Dolphins have diplomatic relations with whales.\n10. Rain, Sun: Rain has ambassadorial relations with the sun.\n\n11. Butterflies, Clouds: Butterflies have political relations with clouds.\n12. Trees, Birds: Trees have foreign relations with birds.\n13. Rainbows, Waterfalls: Rainbows have diplomatic relations with waterfalls.\n14. Butterflies, Grasshoppers: Butterflies have ambassadorial relations with grasshoppers.\n15.

# GET SENTENCES

In [255]:
# Load the graph and its core from disk; `with` closes each file handle deterministically.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles you created.
with open('graphs/obama_3.pickle', 'rb') as fh:
    G3 = pickle.load(fh)
with open('graphs/obama_3_core.pickle', 'rb') as fh:
    core3 = pickle.load(fh)

# Tally the 'pred' attribute of every edge and show the five most frequent predicates.
pred = [edge[2] for edge in G3.edges.data('pred')]
Counter(pred).most_common(5)

[('P530', 5668), ('P463', 2489), ('P150', 2134), ('P2936', 2035), ('P47', 407)]

## Train predicates P530, P463, P150, P2936

In [264]:
# Request 2100 sentences for predicate P2936 from graph G3.
# NOTE(review): get_gpt_sentences_true is not defined in the visible cells -- presumably a
# renamed get_gpt_sentences (the "sleeping.." log matches); confirm before a fresh-kernel run.
contents = get_gpt_sentences_true("P2936", G3, 2100)

sleeping..  1702107504.107349
sleeping..  1702107546.091829
sleeping..  1702107587.764792
sleeping..  1702107624.654748
sleeping..  1702107662.8474612
sleeping..  1702107703.739306
sleeping..  1702107741.0346072
sleeping..  1702107779.317056
sleeping..  1702107819.7777689
sleeping..  1702107862.368403
sleeping..  1702107898.605012
sleeping..  1702107937.962452
sleeping..  1702107987.315205
sleeping..  1702108024.379942
sleeping..  1702108059.509148
sleeping..  1702108095.942375
sleeping..  1702108134.29455
sleeping..  1702108167.128236
sleeping..  1702108203.283801
sleeping..  1702108253.650568
sleeping..  1702108295.4551392
sleeping..  1702108419.458415
sleeping..  1702108459.2951481
sleeping..  1702108494.3360882
sleeping..  1702108535.247091
sleeping..  1702108685.779615
sleeping..  1702108723.4522429
sleeping..  1702108764.320172
sleeping..  1702108797.80279
sleeping..  1702108835.2530859
sleeping..  1702108875.463275
sleeping..  1702108924.59084
sleeping..  1702108963.756511
sleep

In [265]:
# Inspect the raw responses, keyed by (Q1, Q2, P).
contents

{('Q668',
  'Q56666',
  'P2936'): "India uses the language Yakkha.\nYakkha is the language used in India.\nThe language used in India is Yakkha.\nIndia's language is Yakkha.\nYakkha is spoken in India.\nIn India, Yakkha is the language used.\nThe language spoken in India is Yakkha.\nYakkha is the language of India.\nIndia communicates in Yakkha.\nYakkha is the language of communication in India.\nIn India, the language of communication is Yakkha.\nThe official language of India is Yakkha.\nYakkha is the official language of India.\nIndia's official language is Yakkha.\nYakkha is the language predominantly used in India.\nIn India, Yakkha is predominantly spoken.\nThe predominant language used in India is Yakkha.\nYakkha is the predominant language in India.\nIndia primarily uses the language Yakkha.\nYakkha is primarily used in India.\nThe primary language used in India is Yakkha.\nYakkha is the primary language in India.\nIndia mainly uses the language Yakkha.\nYakkha is mainly used i

In [266]:
# Persist the responses; `with` guarantees the file is flushed and closed.
with open('gpt_responses/P2936.pickle', 'wb') as fh:
    pickle.dump(contents, fh)

In [271]:
# Reload previously saved responses for P150; `with` closes the file handle.
with open('gpt_responses/P150.pickle', 'rb') as fh:
    contents = pickle.load(fh)

In [272]:
# Inspect the reloaded P150 responses, keyed by (Q1, Q2, P).
contents

{('Q750',
  'Q238079',
  'P150'): 'Bolivia contains the administrative territorial entity Potosí Department.\nBolivia is the country that contains the administrative territorial entity Potosí Department.\nThe administrative territorial entity Potosí Department is contained in Bolivia.\nThe administrative territorial entity Potosí Department is located in Bolivia.\nPotosí Department is a part of Bolivia.\nBolivia encompasses the administrative territorial entity Potosí Department.\nPotosí Department falls within the borders of Bolivia.\nBolivia is where the administrative territorial entity Potosí Department is situated.\nThe administrative territorial entity Potosí Department is situated in Bolivia.\nPotosí Department is within the boundaries of Bolivia.\nBolivia is the nation that includes the administrative territorial entity Potosí Department.\nThe administrative territorial entity Potosí Department is included in Bolivia.\nPotosí Department is positioned in Bolivia.\nBolivia is the