In [1]:
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import re
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import requests
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model_name = "bartowski/Mistral-7B-Instruct-v0.3-GGUF"
model_file = "Mistral-7B-Instruct-v0.3-Q4_K_M.gguf"
model_path = hf_hub_download(model_name, filename=model_file)

In [None]:
model = Llama(
    model_path = model_path,
    n_ctx=32000,
    verbose=False
)

In [12]:
# Approach: agent that can go multiple rounds: at each step, it can either: output semantic triple, or answer the user

system_prompt = """
You will first be given a fact by the user, and your goal is to determine whether or not that fact is true or false.

You have only 2 jobs:
1) Search wikidata by outputting a semantic triple
2) Answer the user

Your response must always include a THOUGHT and a RESPONSE keyword, and then you must follow it with the PAUSE keyword.

If you decide that you need more information do determine whether a fact is true, then you can perform a search on wikidata. To do this, you need to output a semantic triple, which follows the format: ENTITY_1 RELATIONSHIP ENTITY_2.

Specifically, I want you to output the JSON: 
RESPONSE: {"entity_1": "entity1", "relationship": "relationship", "entity_2": "entity2"}

If you decide that you do not need more information, and can say whether a fact is true or false, then simply output the JSON:
RESPONSE: {"message": "Whether the fact is true/false, and your reasoning."}

If you decide to call wikidata, you will be provided the result of the query ENTITY_1 ", which will be the true value for ENTITY_2. Then, you will compare the true value for ENTITY_2 to your predicted value for ENTITY_2.

###EXAMPLE 1:
User: Bach was a composer.

THOUGHT: I assume the user is talking about the musician Johann Bach. I will perform a search on wikidata.
RESPONSE: {"entity_1": "Johann Bach", "relationship": "occupation", "entity_2": "composer"}
PAUSE

RESULT: ["composer", "musician", "organist", "violinist", ...]

THOUGHT: I have enough information now.
RESPONSE: {"message": "True. According to wikidata, one of the many occupations Bach had was a composer."}
PAUSE
###END EXAMPLE 1

###EXAMPLE 2:
User: Lebron James was born in the United States.

THOUGHT: I will look up where Lebron James was born.
RESPONSE: {"entity_1": "Lebron James", "relationship": "place of birth", "entity_2": "United States"}
PAUSE

RESULT: "Akron"

THOUGHT: Since Akron is a city, I should find what country Akron is located in.
RESPONSE: {"entity_1": "Akron", "relationship": "country", "entity_2": "United States"}
PAUSE

RESULT: "United States"

THOUGHT: I have enough information to answer the user now.
RESPONSE: {"message": "True. Lebron James was born in Akron, which is a city in the United States."}
PAUSE
###END EXAMPLE 2

###EXAMPLE 3:
User: Lebron James has 4 children.

THOUGHT: Number of children is most likely not a property in wikidata, but children is. I will first find all of Lebron Jame"s children, then count the number of children he has. I can leave the 2nd entity empty since I don"t care about it.
RESPONSE: {"entity_1": "Lebron James", "relationship": "children", "entity_2": ""}
PAUSE

RESULT: ["Bryce James", "Bronny James", "Zhuri James"]

THOUGHT: By counting the result, I can tell that Lebron has 3 children, not 4
RESPONSE: {"message": "False. Lebron James has 3 children: Bryce, Bronny, and Zhuri."}
PAUSE
###END EXAMPLE 3
""".strip()

In [None]:
user_prompt = "Lebron James is nicknamed 'King James'"
prompt = f"""
[SYSTEM]
{system_prompt}

[USER]
{user_prompt}

[ASSISTANT]
THOUGHT:
""".strip()

In [None]:
output = model(
    prompt=prompt,
    stop=["PAUSE"],
    max_tokens=512
)["choices"][0]["text"]
print(output)

In [None]:
prompt = f"""
[SYSTEM]
{system_prompt}

[USER]
{user_prompt}

[ASSISTANT]
THOUGHT: I will look up if Lebron James has a nickname. I will also check the nickname to make sure it is indeed "King James".
RESPONSE: {{"entity_1": "Lebron James", "relationship": "nickname", "entity_2": "King James"}}

RESULT: No results returned

THOUGHT:
""".strip()

output = model(
    prompt=prompt,
    stop=["PAUSE"],
    max_tokens=512
)["choices"][0]["text"]
print(output)

In [None]:
# a flaw with wikidata is that there a lot of facts i don"t think it can answer
# like "world"s smallest mammal"
# it can only really answer things where there is a clear relationship
prompt = f"""
[SYSTEM]
{system_prompt}

[USER]
Fact check this statement: The smallest mammal is the Etruscan Shrew.

[ASSISTANT]
THOUGHT: 
""".strip()

output = model(
    prompt=prompt,
    stop=["PAUSE"],
    max_tokens=512
)["choices"][0]["text"]
print(output)

In [None]:
prompt = f"""
[SYSTEM]
{system_prompt}

[USER]
Fact check this statement: The smallest mammal is the Etruscan Shrew.

[ASSISTANT]
THOUGHT: I will look up what the smallest mammal is.
RESPONSE: {{"entity_1": "Etruscan Shrew", "relationship": "size", "entity_2": ""}}
PAUSE

RESULT: "size" is not a valid relationship in wikidata. Try another relationship.

THOUGHT: I will try looking up the length of an Etruscan Shrew instead.
RESPONSE: {{"entity_1": "Etruscan Shrew", "relationship": "length", "entity_2": ""}}
PAUSE

RESULT: "length" is not a valid relationship in wikidata. Try another relationship.
""".strip()

output = model(
    prompt=prompt,
    stop=["PAUSE"],
    max_tokens=512
)["choices"][0]["text"]
print(output)

# it tries to look up size, length, weight, etc...
# but, wikidata doesn"t have these properties for the etruscan shrew

In [None]:
# i will try to have a property dictionary that maps property name to property id
# basically, fetch all property ids for entity, then fetch the names from it either from prestored dictionary or api request. 
# then map those 2 together; then we do like a vector search for the property name probably.

# Implementing Actual Wikidata API

In [15]:
user_prompt = "Lebron James is nicknamed 'King James'"
prompt = f"""
[SYSTEM]
{system_prompt}

[USER]
{user_prompt}

[ASSISTANT]
THOUGHT:
""".strip()
output = model(
    prompt=prompt,
    stop=["PAUSE"],
    max_tokens=512
)["choices"][0]["text"]
print(output)

 I know that the user is asking about a nickname of Lebron James. I will perform a search on wikidata to find out if he has this nickname.
RESPONSE: {"entity_1": "Lebron James", "relationship": "nickname", "entity_2": "King James"}



In [16]:
# EXTRACT ENTITY NAME
response_regex = "RESPONSE: (.*)"
foundResponse = re.search(response_regex, output)
response_json = json.loads(foundResponse.group(1))
entity_1 = response_json["entity_1"]
print(entity_1)

Lebron James


In [2]:
# API CALL
endpoint = f"https://www.wikidata.org/w/api.php"
entity_1 = "Lebron James"
parameters = {
    "action":"wbsearchentities",
    "search":entity_1,
    "language":"en",
    "type":"item",
    "format":"json"
}
response = requests.get(endpoint, params=parameters).json()


In [5]:
top_result = response["search"][0]
entity_id = top_result["id"]
# another problem is figuring out which is the correct entity
# i can find out the correct property by looking at all the properties of the entity, but I can't guarantee i find the right entity
# for now, I'll just choose the top result
# later on, i can possibly use LLM to decide which one to select
print(entity_id)
response["search"]

Q36159


[{'id': 'Q36159',
  'title': 'Q36159',
  'pageid': 38812,
  'concepturi': 'http://www.wikidata.org/entity/Q36159',
  'repository': 'wikidata',
  'url': '//www.wikidata.org/wiki/Q36159',
  'display': {'label': {'value': 'LeBron James', 'language': 'en'},
   'description': {'value': 'American basketball player (born 1984)',
    'language': 'en'}},
  'label': 'LeBron James',
  'description': 'American basketball player (born 1984)',
  'match': {'type': 'label', 'language': 'en', 'text': 'LeBron James'}},
 {'id': 'Q108870659',
  'title': 'Q108870659',
  'pageid': 104098936,
  'concepturi': 'http://www.wikidata.org/entity/Q108870659',
  'repository': 'wikidata',
  'url': '//www.wikidata.org/wiki/Q108870659',
  'display': {'label': {'value': 'LeBron James', 'language': 'en'},
   'description': {'value': 'main protagonist in the 2021 film Space Jam: A New Legacy',
    'language': 'en'}},
  'label': 'LeBron James',
  'description': 'main protagonist in the 2021 film Space Jam: A New Legacy',
 

In [8]:
# get list of all property ids
# params = {
#     "action":"wbsearchentities",
#     "ids":entity_id,
#     "languages":'en',
#     "format":"json",
#     "props":"claims"
# }
# response = requests.get(endpoint, params).json()
# response

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

In [9]:
# FETCHES ALL PROPERTIES
sparql.setQuery("""
SELECT ?property ?propertyLabel WHERE {
  wd:"""+entity_id+""" ?directClaim ?object .
  ?property wikibase:directClaim ?directClaim .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
""")
results = sparql.query().convert()
properties_unformatted = results["results"]["bindings"]

In [10]:
properties_unformatted

[{'property': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P268'},
  'propertyLabel': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'Bibliothèque nationale de France ID'}},
 {'property': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P269'},
  'propertyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'IdRef ID'}},
 {'property': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P345'},
  'propertyLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'IMDb ID'}},
 {'property': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P373'},
  'propertyLabel': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'Commons category'}},
 {'property': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P413'},
  'propertyLabel': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'position played on team / speciality'}},
 {'property': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P413'},
  'propertyLabel': {'xml:lang': 'en',
   'type

In [7]:
# turns all the properties into a map
properties = {}
ids = []
names = []
for prop in properties_unformatted:
    property_id = prop["property"]["value"][prop["property"]["value"].index("P"):]
    property_name = prop["propertyLabel"]["value"]
    if property_name not in properties:
        ids.append(property_id)
        names.append(property_name)
    properties[property_name] = property_id
properties

NameError: name 'properties_unformatted' is not defined

In [54]:
db = chromadb.Client()
collection = db.get_or_create_collection(entity_id)
collection.add(documents=names, ids=ids)

In [56]:
docs=  collection.query(query_texts="nickname", n_results=1)
print(docs)

{'ids': [['P1449']], 'distances': [[0.0]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['nickname']], 'uris': None, 'data': None}


In [59]:
# fetch value
sparql.setQuery("""
SELECT ?value WHERE {
  wd:"""+entity_id+""" wdt:"""+docs["ids"][0][0]+""" ?value .
}
""")
results = sparql.query().convert()
values = results["results"]["bindings"]
values

[{'value': {'xml:lang': 'en', 'type': 'literal', 'value': 'King James'}}]

In [1]:
from fact_checker_agent import FactCheckerAgent

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
agent = FactCheckerAgent()

In [4]:
res = agent.query("Fact Check with Wikidata: Lebron James is nicknamed King James", verbose=True)

________________________ITERATION: 1________________________
[SYSTEM]
You will first be given a fact by the user, and your goal is to determine whether or not that fact is true or false.

You have only 2 jobs:
1) Search wikidata by outputting a semantic triple
2) Answer the user

Your response must always include a THOUGHT and a RESPONSE keyword, and then you must follow it with the PAUSE keyword.

If you decide that you need more information do determine whether a fact is true, then you can perform a search on wikidata. To do this, you need to output a semantic triple, which follows the format: ENTITY_1 RELATIONSHIP ENTITY_2.

Specifically, I want you to output the JSON: 
RESPONSE: {"entity_1": "entity1", "relationship": "relationship", "entity_2": "entity2"}

If you decide that you do not need more information, and can say whether a fact is true or false, then simply output the JSON:
RESPONSE: {"message": "Whether the fact is true/false, and your reasoning."}

If you decide to call w

In [5]:
res

'True. Lebron James is commonly known as King James.'