# 🌎 Build a KB against Wikipedia data

In [29]:
from typing import List, Optional, Any, Dict, Union
from pydantic import BaseModel, Field, field_validator
import instructor
from openai import OpenAI
import os
import json

from utils import save_jsonl, read_jsonl

In [30]:
# OpenAI client wrapped by instructor; the API key is read from the
# OPENAI_API_KEY environment variable (KeyError if it is not set).
# generate_kb() calls this client with `response_model=` and `max_retries=`.
client = instructor.patch(OpenAI(api_key=os.environ['OPENAI_API_KEY']))

In [31]:

class Property(BaseModel):
    # One property-name / property-value pair attached to an entity.
    # NOTE: documented with `#` comments instead of a class docstring on
    # purpose: pydantic copies a model docstring into the generated JSON
    # schema, and this model is reachable from the `response_model` schema.
    property_name: str = Field(..., description="The property name.")
    property_value: Union[List[str], str, int, Dict[str, str]] = Field(..., description="The property value.")

    @field_validator('property_value')
    @classmethod
    def check_for_redundant_property(
        cls,
        property_value: Union[List[str], str, int, Dict[str, str]],
        info,
    ) -> Union[List[str], str, int, Dict[str, str]]:
        """Reject string values that carry no information.

        A string value equal (case-insensitively) to ``yes``, ``no`` or
        ``unknown`` is treated as redundant. Non-string values (lists, ints,
        dicts) are passed through unchanged.

        Args:
            property_value: The value of the ``property_value`` field.
            info: Validation info supplied by pydantic (unused).

        Returns:
            ``property_value`` unchanged when it is accepted.

        Raises:
            ValueError: If the value is one of the redundant strings.
        """
        # TODO: something smarter for this ?
        # isinstance (not `type(...) == str`) so str subclasses are covered too.
        if isinstance(property_value, str) and property_value.lower() in ('yes', 'no', 'unknown'):
            raise ValueError(
                f"The property value {property_value} is redundant. Please remove it."
            )
        return property_value


class Entity(BaseModel):
    # The set of properties extracted for a single entity.
    # NOTE: documented with `#` comments instead of a class docstring on
    # purpose: pydantic copies a model docstring into the generated JSON
    # schema, and this model is passed as `response_model` in generate_kb().

    # Required field (no default), but an explicit null is accepted.
    properties: Optional[List[Property]] = Field(
        ..., description="Extract any properties that are relevant in building a knowledge base for the entity."
    )

    def update(self, other: "Entity") -> "Entity":
        """Append the properties of ``other`` to this entity, in place.

        ``properties`` is declared ``Optional``, so either side may be
        ``None``; a ``None`` (or empty) list on ``other`` leaves this entity
        untouched, and a ``None`` list on ``self`` is replaced by a copy of
        the other entity's properties.

        Args:
            other: Entity whose properties are merged into this one.

        Returns:
            ``self``, so calls can be chained or re-assigned.
        """
        # TODO: deduplicating
        if other.properties:
            if self.properties is None:
                # Copy so the two entities never share one list object.
                self.properties = list(other.properties)
            else:
                self.properties.extend(other.properties)
        return self


class KnowledgeBase(BaseModel):
    # Top-level container mapping a string key to the Entity built for it;
    # the notebook keys each entry by the entity's label.
    # NOTE(review): the `{}` default relies on pydantic copying field defaults
    # per instance rather than sharing one dict -- confirm for the pydantic
    # version in use.
    entities: Dict[str, Entity] = {}


def generate_kb(entity, texts) -> Entity:
    """Build an ``Entity`` for ``entity`` by extracting properties chunk by chunk.

    Each element of ``texts`` is sent to the chat model in its own request
    with ``response_model=Entity``. The first result becomes the running
    state; every later result is printed and merged into it with
    ``Entity.update``.

    Args:
        entity: Name of the entity; interpolated into the system prompt.
        texts: Sized sequence of text chunks about the entity.

    Returns:
        The merged ``Entity``. When ``texts`` is empty, an ``Entity`` with an
        empty property list is returned so the declared return type holds.
    """
    cur_state = None
    num_iterations = len(texts)
    # Number the parts from 1 so the prompt reads "Part 1/N" ... "Part N/N".
    for i, inp in enumerate(texts, start=1):
        new_updates = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {
                    "role": "system",
                    "content": f"""You are an iterative knowledge base builder based on Wikidata.
                    You are given the current state of the knowledge base for entity {entity}, 
                    and you must add Wikidata-like property-value pairs
                    to it. Do not provide any duplicates.""",
                },
                {
                    "role": "user",
                    "content": f"""Extract any new information from the following:
                    # Part {i}/{num_iterations} of the input:

                    {inp}""",
                },
                # NOTE: the system prompt says the current state is given, but
                # the message that would carry it is disabled, so the model
                # never sees what earlier chunks produced.
                # {
                #     "role": "user",
                #     "content": f"""Here is the current state of the graph:
                #     {cur_state.model_dump_json(indent=2) if cur_state is not None else 'empty'}""",
                # },  
            ],
            max_retries=2,
            response_model=Entity,
        )  # type: ignore

        if cur_state is None:
            # First chunk: its extraction is the initial state.
            cur_state = new_updates
        else:
            # Update the current state
            print(f"Merging new update: {new_updates.model_dump_json(indent=2)}")
            cur_state = cur_state.update(new_updates)

    if cur_state is None:
        # No chunks were supplied; return an empty entity instead of None.
        return Entity(properties=[])
    return cur_state


In [32]:
# Read the entity records from disk and build a knowledge-base entry for the
# first record, keyed by its label.
wikidata_entities = read_jsonl('../../data/wikidata_entities.jsonl')
kb = KnowledgeBase()
entity1 = wikidata_entities[0]
kb.entities[entity1['entity_label']] = generate_kb(
    entity1['entity_label'],
    entity1['chunked_content'],
)

Merging new update: {
  "properties": [
    {
      "property_name": "Occupation",
      "property_value": "Politician and Activist"
    },
    {
      "property_name": "Campaign Fundraising",
      "property_value": "$3.09 million from individuals throughout Russia"
    },
    {
      "property_name": "Campaign Volunteers",
      "property_value": 20000
    },
    {
      "property_name": "Campaign Coverage",
      "property_value": "Received very little television coverage and did not utilize billboards"
    },
    {
      "property_name": "Result of Election",
      "property_value": "Received 27% of the vote in the mayoral election"
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Projects",
      "property_value": [
        "RosYama",
        "Anti-Corruption Foundation",
        "He Is Not Dimon to You"
      ]
    },
    {
      "property_name": "Scandalous Real Estate Deal",
      "property_value": "Hungary sold a former embassy building in Mosc

In [33]:
# Pretty-print the whole knowledge base as indented JSON.
print(kb.model_dump_json(indent=2))

{
  "entities": {
    "Alexei Navalny": {
      "properties": [
        {
          "property_name": "Full Name",
          "property_value": "Alexei Anatolyevich Navalny"
        },
        {
          "property_name": "Nationality",
          "property_value": "Russian"
        },
        {
          "property_name": "Date of Birth",
          "property_value": "4 June 1976"
        },
        {
          "property_name": "Date of Death",
          "property_value": "16 February 2024"
        },
        {
          "property_name": "Occupation",
          "property_value": [
            "Opposition Leader",
            "Lawyer",
            "Anti-corruption Activist",
            "Political Prisoner"
          ]
        },
        {
          "property_name": "Occupation",
          "property_value": "Politician and Activist"
        },
        {
          "property_name": "Campaign Fundraising",
          "property_value": "$3.09 million from individuals throughout Russia"
        }

In [34]:
# Write the knowledge base dump to disk as a single-record list.
out = [kb.model_dump()]
save_jsonl(out, '../../data/prediction.jsonl')

Saved to f'../../data/prediction.jsonl


In [35]:
# Show the knowledge base as a plain Python dict (notebook cell output).
kb.model_dump()

{'entities': {'Alexei Navalny': {'properties': [{'property_name': 'Full Name',
     'property_value': 'Alexei Anatolyevich Navalny'},
    {'property_name': 'Nationality', 'property_value': 'Russian'},
    {'property_name': 'Date of Birth', 'property_value': '4 June 1976'},
    {'property_name': 'Date of Death', 'property_value': '16 February 2024'},
    {'property_name': 'Occupation',
     'property_value': ['Opposition Leader',
      'Lawyer',
      'Anti-corruption Activist',
      'Political Prisoner']},
    {'property_name': 'Occupation',
     'property_value': 'Politician and Activist'},
    {'property_name': 'Campaign Fundraising',
     'property_value': '$3.09 million from individuals throughout Russia'},
    {'property_name': 'Campaign Volunteers', 'property_value': 20000},
    {'property_name': 'Campaign Coverage',
     'property_value': 'Received very little television coverage and did not utilize billboards'},
    {'property_name': 'Result of Election',
     'property_value'

In [36]:
# inp = read_jsonl('../../data/prediction.jsonl')
# print(json.dumps(inp[0], indent=2))