In [1]:
from utils import read_jsonl, save_jsonl
import pandas as pd
from pydantic import BaseModel, model_validator, field_validator, Field, ValidationInfo
from typing import List, Dict, Union, Any, Optional
import instructor
from openai import OpenAI
import os
import json
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents.base import Document
from langchain_openai import OpenAIEmbeddings

from data_classes import KnowledgeGraph, ValidatedProperty

In [2]:
client = instructor.patch(OpenAI(api_key=os.environ['OPENAI_API_KEY']))
MODEL = "gpt-3.5-turbo-0125"

# 🧠 Load data

In [3]:
pred_kbs = read_jsonl('../../data/prediction.jsonl')
ref_kbs = read_jsonl('../../data/wikidata_entities.jsonl')
print(f"Number of predicted KBs: {len(pred_kbs)}")

Number of predicted KBs: 4


In [4]:
ref_kbs[0].keys()

dict_keys(['entity_label', 'properties', 'chunked_content', 'QID'])

In [5]:
pred_kbs[0].keys()

dict_keys(['entity_label', 'properties'])

In [6]:
# only take the predictions and references of entities that exist in both
union_entities = set([p['entity_label'] for p in pred_kbs]).intersection(set([r['entity_label'] for r in ref_kbs]))

pred_kbs = [kb for kb in pred_kbs if kb['entity_label'] in union_entities]
pred_kbs = sorted(pred_kbs, key=lambda x: x['entity_label'])

ref_kbs = [kb for kb in ref_kbs if kb['entity_label'] in union_entities]
ref_kbs = sorted(ref_kbs, key=lambda x: x['entity_label'])

print(f"Number of predicted KBs: {len(pred_kbs)}")
print(f"Number of reference KBs: {len(ref_kbs)}")
print("ref: ", [kb['entity_label'] for kb in pred_kbs])
print("pred: ", [kb['entity_label'] for kb in ref_kbs])

Number of predicted KBs: 4
Number of reference KBs: 4
ref:  ['Barack Obama', 'Douglas Adams', 'George Washington', 'Tim Berners-Lee']
pred:  ['Barack Obama', 'Douglas Adams', 'George Washington', 'Tim Berners-Lee']


# 🪬 Define Evaluation Model

In [49]:

class ReferenceKGValidator(KnowledgeGraph):

    reference_wikidata_knowledge_graph: KnowledgeGraph = None
    validated_properties: List[ValidatedProperty] = []


    @staticmethod
    def create_parent_document_retriever(docs: List[Document]):
        # https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever

        # This text splitter is used to create the child documents
        child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
        # The vectorstore to use to index the child chunks
        vectorstore = Chroma(
            collection_name="full_documents", embedding_function=OpenAIEmbeddings()
        )
        # The storage layer for the parent documents
        store = InMemoryStore()
        retriever = ParentDocumentRetriever(
            vectorstore=vectorstore,
            docstore=store,
            child_splitter=child_splitter,
            # parent_splitter=parent_splitter,
        )
        retriever.add_documents(docs, ids=None) # add entity doc(s)

        # list(store.yield_keys())   # see how many chunks it's created

        return retriever, store, vectorstore


    @staticmethod
    def retrieve_relevant_property(entity_name, property_name, vectorstore, retriever):
        '''Fetch the most similar chunk to predicted property name'''

        query = f"{property_name}"

        sub_docs = vectorstore.similarity_search(query)

        # if there is a parent doc chunk, use that, 
        #otherwise use the sub doc chunk
        parent_chunk = retriever.get_relevant_documents(query)
        if len(parent_chunk) > 0:
            relevant_property = parent_chunk[0].page_content
        else:
            relevant_property = sub_docs[0].page_content
        return relevant_property



    @model_validator(mode='before')
    def validate(self, context) -> "ReferenceKGValidator":

        ref_property_names = [Document(p) for p in self['reference_knowledge_graph']['properties'].keys()]   # 🚨 embedding the reference properties
        retriever, store, vectorstore = ReferenceKGValidator.create_parent_document_retriever(ref_property_names)

        self['validated_properties'] = []
        existing_pred_properties = [name for name in self['properties'].keys()]

        for predicted_property_name, predicted_property_value in self['properties'].items():

            relevant_property_name = ReferenceKGValidator.retrieve_relevant_property(
                entity_name=self['entity_label'],
                property_name=predicted_property_name, 
                vectorstore=vectorstore,
                retriever=retriever
            )
            # get reference property-value pairs
            relevant_property = {
                relevant_property_name: self['reference_knowledge_graph']['properties'][relevant_property_name]
            }

            # EVALUATE ONE PROPERTY
            resp: ValidatedProperty = client.chat.completions.create(
                response_model=ValidatedProperty,
                messages=[
                    {
                        "role": "user",
                        "content": f"Using your knowledge of the world " +
                        "and the given candidate property from the reference knowledge graph, " +
                        "is the following predicted property valid for the given entity? " +
                        f"\nEntity Label: {self['entity_label']}" +
                        f"\nPredicted Property Name: {predicted_property_name}" +
                        f"\nPredicted Property Value: {predicted_property_value}" +
                        f"\n\nReference Properties {relevant_property}"
                    }
                ],
                validation_context={
                    "existing_pred_properties": existing_pred_properties,
                },
                max_retries=2,
                model=MODEL,
            )

            self['validated_properties'].append(resp)
        return self


    @model_validator(mode='after')
    def assert_all_properties_validated(self, info: ValidationInfo):
        if len(self.validated_properties) != len(self.properties):
            raise ValueError(
                "Number of properties validated does not match number of properties in the prediction knowledge base. " +
                f"Number of properties validated: {len(self.validated_properties)}, " +
                f"Number of properties in the text: {len(self.properties)}"
                )
        return self



# 🐕 Relevant Chunk Retrieval

In [44]:
idx = 0
results = []
assert pred_kbs[idx]['entity_label'] == ref_kbs[idx]['entity_label']

pred_kbs[idx]['reference_knowledge_graph'] = ref_kbs[idx]

ref_property_names = [Document(p) for p in ref_kbs[idx]['properties'].keys()]   # 🚨 embedding the reference properties
retriever, store, vectorstore = ReferenceKGValidator.create_parent_document_retriever(ref_property_names)

retrieved_properties = []
for predicted_property_name, predicted_property_value in pred_kbs[idx]['properties'].items():

    relevant_property_name = ReferenceKGValidator.retrieve_relevant_property(
        entity_name=pred_kbs[idx]['entity_label'],
        property_name=predicted_property_name, 
        vectorstore=vectorstore,
        retriever=retriever
    )

    # # get reference property-value
    relevant_property = {
        relevant_property_name: pred_kbs[idx]['reference_knowledge_graph']['properties'][relevant_property_name]
    }
    retrieved_properties.append((predicted_property_name, predicted_property_value, relevant_property))
    print((predicted_property_name, predicted_property_value, relevant_property))


('Birth Place', 'Honolulu, Hawaii', {'place of birth': ['Kapiolani Medical Center for Women and Children', 'Honolulu']})
('Birthday', 'August 4, 1961', {'date of birth': ['+1961-08-04T00:00:00Z']})
('Party', 'Democratic Party', {'occupation': ['politician', 'lawyer', 'political writer', 'community organizer', 'statesperson', 'jurist', 'podcaster', 'academic', 'memoirist', 'international forum participant']})
('Education', ['Columbia University', 'Harvard Law School', 'Occidental College'], {'educated at': ['State Elementary School Menteng 01', 'Punahou School', 'Occidental College', 'Columbia University', 'Harvard Law School', 'Noelani Elementary School', 'Centaurus High School', 'University of Chicago Law School', 'Harvard University', 'Nelson High School', 'King College Prep High School']})
('Net Worth in 2007', '$1.3 million (equivalent to $1.8 million in 2022)', {'NLP ID (old)': ['a0000002122644']})
('House Purchase in 2005', '$1.6 million house in Kenwood, Chicago (equivalent to $

In [45]:
relevant_property = ReferenceKGValidator.retrieve_relevant_property(
    entity_name="Barack Obama",
    property_name='Job', 
    vectorstore=vectorstore,
    retriever=retriever
)
relevant_property

'occupation'

In [46]:
query = f"Barack Obama date of birth"
sub_docs = vectorstore.similarity_search(query)
sub_docs

[Document(page_content='date of birth', metadata={'doc_id': '31a1b64e-c332-4ca3-a2ca-3672c69904ae'}),
 Document(page_content='date of birth', metadata={'doc_id': 'cd14cb0b-ec12-44c8-9058-d8724ef2fb51'}),
 Document(page_content='date of birth', metadata={'doc_id': 'b6d49bed-b6b8-4101-8a80-d7278b4d9823'}),
 Document(page_content='date of birth', metadata={'doc_id': '65572c63-ac5e-4dfd-bee4-8c3f46eadedd'})]

In [47]:
for predicted_property_name, predicted_property_value, relevant_property in retrieved_properties:
    print(f"\n----------\n❓Pred Property name: {predicted_property_name}\n🙋Predicted value: {predicted_property_value}\n⭐️ Relevant property -->  {relevant_property}\n")


----------
❓Pred Property name: Birth Place
🙋Predicted value: Honolulu, Hawaii
⭐️ Relevant property -->  {'place of birth': ['Kapiolani Medical Center for Women and Children', 'Honolulu']}


----------
❓Pred Property name: Birthday
🙋Predicted value: August 4, 1961
⭐️ Relevant property -->  {'date of birth': ['+1961-08-04T00:00:00Z']}


----------
❓Pred Property name: Party
🙋Predicted value: Democratic Party
⭐️ Relevant property -->  {'occupation': ['politician', 'lawyer', 'political writer', 'community organizer', 'statesperson', 'jurist', 'podcaster', 'academic', 'memoirist', 'international forum participant']}


----------
❓Pred Property name: Education
🙋Predicted value: ['Columbia University', 'Harvard Law School', 'Occidental College']
⭐️ Relevant property -->  {'educated at': ['State Elementary School Menteng 01', 'Punahou School', 'Occidental College', 'Columbia University', 'Harvard Law School', 'Noelani Elementary School', 'Centaurus High School', 'University of Chicago Law Sc

# Evaluate!

In [None]:
'''
KG -> a wikidata Entity
case: find the actual wikidata entity then eval. If you can't find a page -> print error
search wikidata for string

'''

In [50]:
idx = 0
results = []
assert pred_kbs[idx]['entity_label'] == ref_kbs[idx]['entity_label']

pred_kbs[idx]['reference_knowledge_graph'] = ref_kbs[idx]

# Add some (presumably) wrong properties
pred_kbs[idx]['properties']['Bought Stocks in'] = ['Tesla', 'Nvidia', 'Hertz']
pred_kbs[idx]['properties']['Favourite Fast Food Chain'] = 'McDonalds'

results.append(ReferenceKGValidator(**pred_kbs[idx]))

In [51]:
results[0].model_dump()['validated_properties']

[{'property_name': 'Birth Place',
  'property_value': 'Honolulu, Hawaii',
  'property_is_valid': False,
  'is_valid_reason': None,
  'error_message': "The predicted birth place 'Honolulu, Hawaii' does not match any of the known birth places for Barack Obama, which are 'Kapiolani Medical Center for Women and Children' and 'Honolulu'."},
 {'property_name': 'Birthday',
  'property_value': 'August 4, 1961',
  'property_is_valid': True,
  'is_valid_reason': "The predicted property 'Birthday' matches the reference property 'date of birth' which is '+1961-08-04T00:00:00Z' for Barack Obama.",
  'error_message': None},
 {'property_name': 'Party',
  'property_value': 'Democratic Party',
  'property_is_valid': True,
  'is_valid_reason': 'Barack Obama is a member of the Democratic Party.',
  'error_message': None},
 {'property_name': 'Education',
  'property_value': ['Columbia University',
   'Harvard Law School',
   'Occidental College'],
  'property_is_valid': True,
  'is_valid_reason': 'The pre

In [None]:
results_json = [r.model_dump() for r in results]
save_jsonl(results_json, '../../data/ref_kg_context_evaluation_results.jsonl')

Saved to f'../../data/text_context_evaluation_results.jsonl


# Look at our Evaluations

In [None]:
results = read_jsonl('../../data/ref_kg_context_evaluation_results.jsonl')
len(results)

1