# 🌎 Build a KB against Wikipedia data

TODO: De-duplicate the KB

TODO: Investigate why some KBs are so small. Enforce a minimum list length?

In [1]:
from typing import List, Optional, Any, Dict, Union
from pydantic import BaseModel, Field, field_validator
import instructor
from openai import OpenAI
import os
import json
from tqdm import tqdm
from utils import save_jsonl, read_jsonl

In [2]:
# Patch the OpenAI client with instructor; the patched client is the one that
# receives the `response_model` / `max_retries` arguments in `generate_kb`.
openai_client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
client = instructor.patch(openai_client)

# Chat model used for every extraction call.
MODEL = "gpt-3.5-turbo-0125"

In [3]:
# Load the entity records and keep only the first five for this run.
wikidata_entities = read_jsonl('../../data/wikidata_entities.jsonl')[:5]


# 👔 Define Model

In [4]:

class Property(BaseModel):
    # A single property-name / property-value pair extracted for an entity.
    #
    # NOTE: documented with `#` comments instead of a class docstring so the
    # JSON schema generated for this model (it is nested in the `Entity`
    # response model) stays exactly as it was.
    property_name: str = Field(..., description="The property name.")
    property_value: Union[List[str], str, int, Dict[str, str]] = Field(..., description="The property value.")

    @field_validator('property_value')
    @classmethod
    def check_for_redundant_property(
        cls,
        property_value: Union[List[str], str, int, Dict[str, str]],
        info,
    ):
        """Reject placeholder string values that carry no information.

        Only plain strings are checked; lists, ints and dicts pass through
        unchanged. The comparison ignores case and surrounding whitespace, so
        " Unknown " is rejected just like "unknown".

        Raises:
            ValueError: if the value is one of 'yes', 'no' or 'unknown'.
        """
        # TODO: something smarter for this ?
        if isinstance(property_value, str):
            if property_value.strip().lower() in ('yes', 'no', 'unknown'):
                raise ValueError(
                    f"The property value {property_value} is redundant. Please remove it."
                    )
        return property_value


class Entity(BaseModel):
    # The properties extracted so far for one entity.
    #
    # NOTE: documented with `#` comments instead of a class docstring so the
    # JSON schema generated for this model (it is passed as `response_model`)
    # stays exactly as it was.

    properties: Optional[List[Property]] = Field(
        ..., description="Extract any properties that are relevant in building a knowledge base for the entity."
    )

    def update(self, other: "Entity") -> "Entity":
        """Merge the properties of ``other`` into this entity and return ``self``.

        ``properties`` is declared Optional, so either side may hold ``None``;
        a missing list is treated as empty instead of raising ``TypeError``.
        Incoming properties that are exactly equal (same name and same value)
        to one already present are skipped. Existing entries keep their order
        and new ones are appended after them.
        """
        # TODO: also merge near-duplicates (same property_name, different wording).
        merged = list(self.properties or [])
        for prop in other.properties or []:
            # Model equality is used because values may be unhashable lists/dicts.
            if prop not in merged:
                merged.append(prop)
        self.properties = merged
        return self


class KnowledgeBase(BaseModel):
    # Maps an entity label to the Entity built for it; starts out empty.
    entities: Dict[str, Entity] = Field(default_factory=dict)


def generate_kb(entity, texts) -> Entity:
    """Build the knowledge-base entry for one entity, one text chunk at a time.

    For every chunk in ``texts`` the chat model is asked (with ``Entity`` as
    the response model) for new property-value pairs, given the chunk and the
    JSON dump of the state accumulated so far. The first response becomes the
    initial state; later responses are merged in with ``Entity.update``.

    Args:
        entity: Label of the entity; interpolated into the system prompt.
        texts: Sized sequence of text chunks, processed in order.

    Returns:
        The accumulated ``Entity``. An ``Entity`` with an empty property list
        is returned when ``texts`` is empty, so callers never receive ``None``.

    Raises:
        Any exception raised by the completion call propagates to the caller
        (the build loop in this notebook catches and counts them).
    """

    cur_state = None
    num_iterations = len(texts)
    # Number the parts from 1 so the prompt reads "Part 1/N" ... "Part N/N".
    for part_number, inp in enumerate(texts, start=1):
        new_updates = client.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "system",
                    "content": f"""You are an iterative knowledge base builder based on Wikidata.
                    You are given the current state of the knowledge base for entity {entity}, 
                    and you must add Wikidata-like property-value pairs
                    to it. Do not provide any duplicates.""",
                },
                {
                    "role": "user",
                    "content": f"""Extract any new information from the following:
                    # Part {part_number}/{num_iterations} of the input:

                    {inp}""",
                },
                {
                    "role": "user",
                    "content": f"""Here is the current state of the graph:
                    {cur_state.model_dump_json(indent=2) if cur_state is not None else 'empty'}""",
                },  
            ],
            max_retries=5,
            response_model=Entity,
        )  # type: ignore

        if cur_state is None:
            cur_state = new_updates
        else:
            # Update the current state
            print(f"Merging new update: {new_updates.model_dump_json(indent=2)}")
            cur_state = cur_state.update(new_updates)

    if cur_state is None:
        # No chunks were supplied: honour the return annotation with an empty entry.
        return Entity(properties=[])
    return cur_state


# Build!

In [14]:
# Build one KB entry per entity; a failure on one entity is reported and
# counted, and the loop moves on to the next entity.
kb = KnowledgeBase()
error_count = 0

for ent in tqdm(wikidata_entities):
    try:
        entity_kb = generate_kb(ent['entity_label'], ent['chunked_content'])
        kb.entities[ent['entity_label']] = entity_kb
    except Exception as exc:
        print(f"Failed to build KB for entity {ent['entity_label']} because of {exc}")
        error_count = error_count + 1

print(f"Errors count: {error_count}")

  0%|          | 0/5 [00:00<?, ?it/s]

Merging new update: {
  "properties": [
    {
      "property_name": "Place of Birth",
      "property_value": "Westmoreland County, Virginia"
    },
    {
      "property_name": "Place of Death",
      "property_value": "Mount Vernon, Virginia"
    },
    {
      "property_name": "Spouse",
      "property_value": "Martha Washington"
    },
    {
      "property_name": "Children",
      "property_value": [
        "none",
        "adoptive children: Martha Parke Custis, John Parke Custis"
      ]
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Role in American Revolution",
      "property_value": "Commander-in-Chief of the Continental Army during the American Revolution"
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Deficit in Estate",
      "property_value": "Eleventh year running deficit in 1787 due to poor crop yields and pestilence"
    },
    {
      "property_name": "Agricultural Innovation",
      "property_va

 20%|██        | 1/5 [00:27<01:48, 27.22s/it]

Merging new update: {
  "properties": [
    {
      "property_name": "Abolitionist Society",
      "property_value": "Declined a suggestion from Jacques Brissot to establish an abolitionist society in Virginia in 1788"
    },
    {
      "property_name": "Emancipation of Slaves",
      "property_value": "Instructed his secretary to find buyers for his land in western Virginia in 1794 to liberate his slaves, emancipated 123 slaves in his will in 1799, and Martha Washington signed an order to free his slaves in 1801"
    },
    {
      "property_name": "Legacy and Influence",
      "property_value": "Endures as one of the most influential figures in American history, served as commander-in-chief of the Continental Army, a hero of the Revolution, and the first president of the United States; set many precedents for the national government and the presidency, known as the 'Father of His Country' and among the highest-ranked U.S. Presidents"
    }
  ]
}


 40%|████      | 2/5 [00:28<00:35, 11.91s/it]

Merging new update: {
  "properties": [
    {
      "property_name": "Net Worth in 2007",
      "property_value": "$1.3 million (equivalent to $1.8 million in 2022)"
    },
    {
      "property_name": "House Purchase in 2005",
      "property_value": "$1.6 million house in Kenwood, Chicago (equivalent to $2.4 million in 2022)"
    },
    {
      "property_name": "Income in 2009",
      "property_value": "$5.5 million"
    },
    {
      "property_name": "Charitable Donation in 2010",
      "property_value": "$131,000 to Fisher House Foundation"
    },
    {
      "property_name": "Religious Views",
      "property_value": "Protestant Christian, described his religious beliefs and journey in The Audacity of Hope"
    },
    {
      "property_name": "Church Affiliations",
      "property_value": [
        "Trinity United Church of Christ (1992-2008)",
        "Shiloh Baptist Church",
        "St. John's Episcopal Church",
        "Evergreen Chapel at Camp David"
      ]
    },
    {
   

 60%|██████    | 3/5 [00:57<00:39, 19.83s/it]

Merging new update: {
  "properties": [
    {
      "property_name": "Presidential Library",
      "property_value": "The Barack Obama Presidential Center will be hosted by the University of Chicago and located in Jackson Park on the South Side of Chicago."
    },
    {
      "property_name": "Awards and Honors",
      "property_value": [
        "Nobel Peace Prize in 2009",
        "Ambassador of Humanity Award in 2014",
        "Profile in Courage Award in 2017",
        "Ripple of Hope Award in 2018",
        "Time Person of the Year in 2008 and 2012",
        "Two Grammy Awards for Best Spoken Word Album",
        "Two Primetime Emmy Awards for Outstanding Narrator",
        "Two Children's and Family Emmy Awards"
      ]
    }
  ]
}


100%|██████████| 5/5 [01:40<00:00, 20.04s/it]

Failed to build KB for entity Abraham Lincoln because of 1 validation error for Entity
properties.1.property_value
  Field required [type=missing, input_value={'property_name': 'Ran fo...rancis McIntosh murder'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/missing
Errors count: 1





In [15]:
# print("chunk1: ", wikidata_entities[1]['chunked_content'][0])
# print("\n\n\nchunk2:\n\n ", wikidata_entities[1]['chunked_content'][1])

In [16]:
# Show the entry of the first entity that was stored in the KB.
first_entity_label = list(kb.entities)[0]
print(kb.entities[first_entity_label].model_dump_json(indent=2))

{
  "properties": [
    {
      "property_name": "Name",
      "property_value": "George Washington"
    },
    {
      "property_name": "Birth date",
      "property_value": "February 22, 1732"
    },
    {
      "property_name": "Death date",
      "property_value": "December 14, 1799"
    },
    {
      "property_name": "Occupation",
      "property_value": [
        "Founding Father",
        "Military Officer",
        "Politician",
        "First President of the United States"
      ]
    },
    {
      "property_name": "Place of Birth",
      "property_value": "Westmoreland County, Virginia"
    },
    {
      "property_name": "Place of Death",
      "property_value": "Mount Vernon, Virginia"
    },
    {
      "property_name": "Spouse",
      "property_value": "Martha Washington"
    },
    {
      "property_name": "Children",
      "property_value": [
        "none",
        "adoptive children: Martha Parke Custis, John Parke Custis"
      ]
    },
    {
      "property_name"



# 🎬 Reformat to Match Wikidata schema

The current format is not easy to use for evaluation.

Reformat to:
```
"property_name" : "property_value"
e.g "date_of_birth" : "1980-01-01"
```

In [28]:
def format_kb(kb: "KnowledgeBase") -> List[Dict[str, Any]]:
    """Flatten a knowledge base into a list of per-entity dicts.

    Each result has the shape
    ``{'entity_label': <label>, 'properties': {<property_name>: <property_value>}}``.
    If an entity has several properties with the same name, the last one wins.

    Args:
        kb: Object whose ``model_dump()`` returns
            ``{'entities': {label: {'properties': [...] or None}}}``.

    Returns:
        One dict per entity, in the iteration order of the dumped entities.
    """
    pred_kb = kb.model_dump()

    formatted_kb = []
    for ent, ent_dump in pred_kb['entities'].items():
        # `properties` is Optional on the model, so the dump may contain None.
        ent_properties = ent_dump['properties'] or []
        formatted_kb.append({
            'entity_label': ent,
            'properties': {
                prop['property_name']: prop['property_value']
                for prop in ent_properties
            },
        })

    return formatted_kb

# Flatten the whole KB into the evaluation-friendly list-of-dicts format.
formatted_kbs = format_kb(kb)

# Notebook display: show the last formatted entity as a quick spot check.
formatted_kbs[-1]

{'entity_label': 'Tim Berners-Lee',
 'properties': {'Full Name': 'Sir Timothy John Berners-Lee',
  'Date of Birth': '8 June 1955',
  'Nationality': 'English',
  'Occupation': ['Computer Scientist', 'Professor'],
  'Known For': 'Inventing the World Wide Web, HTML, URL system, HTTP'}}

# 💾 Save

In [29]:
# Persist the formatted KB with the `save_jsonl` helper imported from `utils`.
save_jsonl(formatted_kbs, '../../data/prediction.jsonl')

Saved to f'../../data/prediction.jsonl


Make sure it looks OK.

In [30]:
# Read the saved predictions back and pretty-print the first record as a sanity check.
inp = read_jsonl('../../data/prediction.jsonl')
print(json.dumps(inp[0], indent=2))

{
  "entity_label": "George Washington",
  "properties": {
    "Name": "George Washington",
    "Birth date": "February 22, 1732",
    "Death date": "December 14, 1799",
    "Occupation": [
      "Founding Father",
      "Military Officer",
      "Politician",
      "First President of the United States"
    ],
    "Place of Birth": "Westmoreland County, Virginia",
    "Place of Death": "Mount Vernon, Virginia",
    "Spouse": "Martha Washington",
    "Children": [
      "none",
      "adoptive children: Martha Parke Custis, John Parke Custis"
    ],
    "Role in American Revolution": "Commander-in-Chief of the Continental Army during the American Revolution",
    "Deficit in Estate": "Eleventh year running deficit in 1787 due to poor crop yields and pestilence",
    "Agricultural Innovation": "Undertook a new landscaping plan to cultivate fast-growing trees and native shrubs",
    "Mule Breeding": "Began breeding mules after being gifted a Spanish jack by King Charles III of Spain in 1