In [1]:
from utils import read_jsonl, save_jsonl
import pandas as pd
from pydantic import BaseModel, model_validator, field_validator, Field, ValidationInfo
from typing import List, Dict, Union, Any, Optional
import instructor
from openai import OpenAI
import os
import json
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents.base import Document
from langchain_openai import OpenAIEmbeddings

from data_classes import KnowledgeGraph, ValidatedProperty

In [2]:
client = instructor.patch(OpenAI(api_key=os.environ['OPENAI_API_KEY']))
MODEL = "gpt-3.5-turbo-0125"

# 🧠 Load data

In [3]:
pred_kbs = read_jsonl('../../data/prediction.jsonl')
ref_kbs = read_jsonl('../../data/wikidata_entities.jsonl')
print(f"Number of predicted KBs: {len(pred_kbs)}")

Number of predicted KBs: 4


In [4]:
ref_kbs[0].keys()

dict_keys(['entity_label', 'properties', 'chunked_content', 'QID'])

In [5]:
pred_kbs[0].keys()

dict_keys(['entity_label', 'properties'])

In [6]:
# only take the predictions and references of entities that exist in both
union_entities = set([p['entity_label'] for p in pred_kbs]).intersection(set([r['entity_label'] for r in ref_kbs]))

pred_kbs = [kb for kb in pred_kbs if kb['entity_label'] in union_entities]
pred_kbs = sorted(pred_kbs, key=lambda x: x['entity_label'])

ref_kbs = [kb for kb in ref_kbs if kb['entity_label'] in union_entities]
ref_kbs = sorted(ref_kbs, key=lambda x: x['entity_label'])

print(f"Number of predicted KBs: {len(pred_kbs)}")
print(f"Number of reference KBs: {len(ref_kbs)}")
print("ref: ", [kb['entity_label'] for kb in pred_kbs])
print("pred: ", [kb['entity_label'] for kb in ref_kbs])

Number of predicted KBs: 4
Number of reference KBs: 4
ref:  ['Barack Obama', 'Douglas Adams', 'George Washington', 'Tim Berners-Lee']
pred:  ['Barack Obama', 'Douglas Adams', 'George Washington', 'Tim Berners-Lee']


# 🪬 Define Evaluation Model

In [13]:

class TextContextKGValidator(KnowledgeGraph):

    documents: List[str]
    validated_properties: List[ValidatedProperty] = []

    @staticmethod
    def create_parent_document_retriever(docs: List[Document]):
        # https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever

        # This text splitter is used to create the parent documents
        parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
        # This text splitter is used to create the child documents
        # It should create documents smaller than the parent
        child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
        # The vectorstore to use to index the child chunks
        vectorstore = Chroma(
            collection_name="full_documents", embedding_function=OpenAIEmbeddings()
        )
        # The storage layer for the parent documents
        store = InMemoryStore()
        retriever = ParentDocumentRetriever(
            vectorstore=vectorstore,
            docstore=store,
            child_splitter=child_splitter,
            parent_splitter=parent_splitter,
        )
        retriever.add_documents(docs, ids=None) # add entity doc(s)

        # list(store.yield_keys())   # see how many chunks it's created

        return retriever, store, vectorstore


    @staticmethod
    def retrieve_relevant_chunk(entity_name, property_name, vectorstore, retriever):
        '''Fetch the most similar chunk to entity label + predicted property name'''

        query = f"{entity_name} {property_name}"

        sub_docs = vectorstore.similarity_search(query)

        # if there is a parent doc chunk, use that, 
        #otherwise use the sub doc chunk
        parent_chunk = retriever.get_relevant_documents(query)
        if len(parent_chunk) > 0:
            relevant_chunk = parent_chunk[0].page_content
        else:
            relevant_chunk = sub_docs[0].page_content
        return relevant_chunk



    @model_validator(mode='before')
    def validate(self, context) -> "TextContextKGValidator":

        docs = [Document(d) for d in self['documents']]
        retriever, store, vectorstore = TextContextKGValidator.create_parent_document_retriever(docs)

        self['validated_properties'] = []
        existing_pred_properties = [name for name in self['properties'].keys()]

        for predicted_property_name, predicted_property_value in self['properties'].items():

            relevant_chunk = TextContextKGValidator.retrieve_relevant_chunk(
                entity_name=self['entity_label'],
                property_name=predicted_property_name, 
                vectorstore=vectorstore,
                retriever=retriever
            )

            # EVALUATE ONE PROPERTY
            resp: ValidatedProperty = client.chat.completions.create(
                response_model=ValidatedProperty,
                messages=[
                    {
                        "role": "user",
                        "content": f"Using your knowledge of the world " +
                        "and the given chunk of text, " +
                        "is the following property valid for the given entity? " +
                        f"\nEntity Label: {self['entity_label']}" +
                        f"\nPredicted Property Name: {predicted_property_name}" +
                        f"\nPredicted Property Value: {predicted_property_value}" +
                        f"\n\n{relevant_chunk}"
                    }
                ],
                validation_context={
                    "existing_pred_properties": existing_pred_properties,
                },
                max_retries=2,
                model=MODEL,
            )

            self['validated_properties'].append(resp)
        return self


    @model_validator(mode='after')
    def assert_all_properties_validated(self, info: ValidationInfo):
        if len(self.validated_properties) != len(self.properties):
            raise ValueError(
                "Number of properties validated does not match number of properties in the prediction knowledge base. " +
                f"Number of properties validated: {len(self.validated_properties)}, " +
                f"Number of properties in the text: {len(self.properties)}"
                )
        return self


    

# 🐕 Relevant Chunk Retrieval

In [14]:
idx = 0
results = []
assert pred_kbs[idx]['entity_label'] == ref_kbs[idx]['entity_label']
# joining the chunked content for each entity back together
docs = [" ".join([chunk for chunk in ref_kbs[idx]['chunked_content']])]  # 🚨 getting content from ref KG

pred_kbs[idx]['documents'] = docs

docs = [Document(d) for d in pred_kbs[idx]['documents']]
retriever, store, vectorstore = TextContextKGValidator.create_parent_document_retriever(docs)

retrieved_chunks = []
for predicted_property_name, predicted_property_value in pred_kbs[idx]['properties'].items():

    relevant_chunk = TextContextKGValidator.retrieve_relevant_chunk(
        entity_name=pred_kbs[idx]['entity_label'],
        property_name=predicted_property_name, 
        vectorstore=vectorstore,
        retriever=retriever
    )
    retrieved_chunks.append((predicted_property_name, predicted_property_value, relevant_chunk))


In [15]:
query = f"Barack Obama date of birth"
sub_docs = vectorstore.similarity_search(query)
sub_docs

[Document(page_content='Obama was born on August 4, 1961, at Kapiolani Medical Center for Women and Children in Honolulu, Hawaii. He is the only president born outside the contiguous 48 states. He was born to an American mother and a Kenyan father. His mother, Ann Dunham (1942–1995), was born in Wichita, Kansas and was of English, Welsh, German, Swiss, and Irish descent. In 2007 it was discovered her', metadata={'doc_id': '4e0001cc-59a2-46de-a8f8-0799232bbbc1'}),
 Document(page_content='Obama was born on August 4, 1961, at Kapiolani Medical Center for Women and Children in Honolulu, Hawaii. He is the only president born outside the contiguous 48 states. He was born to an American mother and a Kenyan father. His mother, Ann Dunham (1942–1995), was born in Wichita, Kansas and was of English, Welsh, German, Swiss, and Irish descent. In 2007 it was discovered her', metadata={'doc_id': '788aebf8-5e23-436a-8b4e-b9ead3e04ae4'}),
 Document(page_content='Obama was born on August 4, 1961, at Kap

In [16]:
for predicted_property_name, predicted_property_value, relevant_chunk in retrieved_chunks:
    print(f"\n----------\n❓Property name: {predicted_property_name}\n Relevant chunk -->  {relevant_chunk}\n🙋Predicted value: {predicted_property_value}")


----------
❓Property name: Birth Place
 Relevant chunk -->  Obama was born on August 4, 1961, at Kapiolani Medical Center for Women and Children in Honolulu, Hawaii. He is the only president born outside the contiguous 48 states. He was born to an American mother and a Kenyan father. His mother, Ann Dunham (1942–1995), was born in Wichita, Kansas and was of English, Welsh, German, Swiss, and Irish descent. In 2007 it was discovered her great-great-grandfather Falmouth Kearney emigrated from the village of Moneygall, Ireland to the US in 1850. In July 2012, Ancestry.com found a strong likelihood that Dunham was descended from John Punch, an enslaved African man who lived in the Colony of Virginia during the seventeenth century. Obama's father, Barack Obama Sr. (1934–1982), was a married Luo Kenyan from Nyang'oma Kogelo. His last name, Obama, was derived from his Luo descent. Obama's parents met in 1960 in a Russian language class at the University of Hawaiʻi at Mānoa, where his father 

# Evaluate!

In [17]:
idx = 0
results = []
assert pred_kbs[idx]['entity_label'] == ref_kbs[idx]['entity_label']
# joining the chunked content for each entity back together
docs = [" ".join([chunk for chunk in ref_kbs[idx]['chunked_content']])]  # 🚨 getting content from ref KG

pred_kbs[idx]['documents'] = docs

# Add some (presumably) wrong properties
pred_kbs[idx]['properties']['Bought Stocks in'] = ['Tesla', 'Nvidia', 'Hertz']
pred_kbs[idx]['properties']['Favourite Fast Food Chain'] = 'McDonalds'

results.append(TextContextKGValidator(**pred_kbs[idx]))

In [18]:
results[0].model_dump()['validated_properties']

[{'property_name': 'Birth Place',
  'property_value': 'Honolulu, Hawaii',
  'property_is_valid': True,
  'is_valid_reason': 'Barack Obama was indeed born in Honolulu, Hawaii at the Kapiolani Medical Center for Women and Children.',
  'error_message': None},
 {'property_name': 'Birthday',
  'property_value': 'August 4, 1961',
  'property_is_valid': True,
  'is_valid_reason': 'Barack Obama was born on August 4, 1961, as mentioned in the text.',
  'error_message': None},
 {'property_name': 'Party',
  'property_value': 'Democratic Party',
  'property_is_valid': True,
  'is_valid_reason': 'Barack Obama is indeed a member of the Democratic Party, which is supported by the given information about his political affiliations and role as the 44th president of the United States.',
  'error_message': None},
 {'property_name': 'Education',
  'property_value': ['Columbia University',
   'Harvard Law School',
   'Occidental College'],
  'property_is_valid': True,
  'is_valid_reason': 'Barack Obama gr

In [19]:
results_json = [r.model_dump() for r in results]
save_jsonl(results_json, '../../data/text_context_evaluation_results.jsonl')

Saved to f'../../data/text_context_evaluation_results.jsonl


# Look at our Evaluations

In [20]:
results = read_jsonl('../../data/text_context_evaluation_results.jsonl')
len(results)

1