# 🌎 Build a KB against Wikipedia data

In [32]:
import json
import os
from typing import List, Optional, Any, Dict, Union

import instructor
from openai import OpenAI
from pydantic import BaseModel, Field, field_validator
from tqdm import tqdm

from utils import save_jsonl, read_jsonl

In [33]:
# Model name passed to every chat-completion request in this notebook.
MODEL = "gpt-3.5-turbo-0125"

# OpenAI client wrapped by instructor; the API key is read from the environment.
client = instructor.patch(
    OpenAI(api_key=os.environ['OPENAI_API_KEY'])
)

# 👔 Define Model

In [34]:

# One property-name / property-value pair extracted for an entity.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose —
# pydantic is understood to copy a model's docstring into its JSON schema, and this model is
# nested in Entity, which is passed as `response_model` to the LLM call. Confirm before
# converting these comments to a docstring.
class Property(BaseModel):
    property_name: str = Field(..., description="The property name.")
    property_value: Union[List[str], str, int, Dict[str, str]] = Field(..., description="The property value.")

    @field_validator('property_value')
    @classmethod
    def check_for_redundant_property(
        cls,
        property_value: Union[List[str], str, int, Dict[str, str]],
        info,
    ):
        """Reject placeholder string values that carry no information.

        Only plain strings are inspected; lists, ints and dicts are returned untouched.

        Raises:
            ValueError: if the value is 'yes', 'no' or 'unknown' (case-insensitive,
                ignoring surrounding whitespace).
        """
        # TODO: something smarter for this ?
        # isinstance() instead of `type(...) == str`; strip() so padded values such as
        # "Yes " are caught as well.
        if isinstance(property_value, str) and property_value.strip().lower() in ('yes', 'no', 'unknown'):
            raise ValueError(
                f"The property value {property_value} is redundant. Please remove it."
                )
        return property_value


# The set of properties extracted so far for a single entity.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose —
# pydantic is understood to copy a model's docstring into its JSON schema, and this model is
# passed as `response_model` to the LLM call. Confirm before converting to a docstring.
class Entity(BaseModel):

    properties: Optional[List[Property]] = Field(
        ..., description="Extract any properties that are relevant in building a knowledge base for the entity."
    )

    def update(self, other: "Entity") -> "Entity":
        """Merge the properties of ``other`` into this entity and return ``self``.

        ``properties`` is Optional, so either side may be None; None is treated as an
        empty list instead of raising TypeError. A property of ``other`` is skipped when
        an equal property (same name and same value) is already present.
        """
        merged = list(self.properties or [])
        for prop in other.properties or []:
            # List membership relies on model equality, so unhashable values (lists, dicts) are fine.
            if prop not in merged:
                merged.append(prop)
        self.properties = merged
        return self


class KnowledgeBase(BaseModel):
    """Container for the extracted entities, keyed by entity label."""
    # Maps an entity label to the Entity built for it; starts out empty.
    entities: Dict[str, Entity] = {}


def generate_kb(entity, texts) -> Entity:
    """Build an Entity for ``entity`` by sending the model one text chunk at a time.

    Each request carries the current chunk plus the JSON dump of the state accumulated so
    far. The first response becomes the initial state; every later response is merged
    into it with ``Entity.update``.

    Args:
        entity: label of the entity, interpolated into the system prompt.
        texts: sized sequence of text chunks to process in order.

    Returns:
        The accumulated Entity, or an Entity with no properties when ``texts`` is empty.
    """
    cur_state = None
    num_iterations = len(texts)
    # Number chunks from 1 so the prompt reads "Part 1/N" ... "Part N/N" rather than "Part 0/N".
    for i, inp in enumerate(texts, start=1):
        new_updates = client.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "system",
                    "content": f"""You are an iterative knowledge base builder based on Wikidata.
                    You are given the current state of the knowledge base for entity {entity}, 
                    and you must add Wikidata-like property-value pairs
                    to it. Do not provide any duplicates.""",
                },
                {
                    "role": "user",
                    "content": f"""Extract any new information from the following:
                    # Part {i}/{num_iterations} of the input:

                    {inp}""",
                },
                {
                    "role": "user",
                    "content": f"""Here is the current state of the graph:
                    {cur_state.model_dump_json(indent=2) if cur_state is not None else 'empty'}""",
                },  
            ],
            max_retries=2,
            response_model=Entity,
        )  # type: ignore

        if cur_state is None:
            cur_state = new_updates
        else:
            # Update the current state
            print(f"Merging new update: {new_updates.model_dump_json(indent=2)}")
            cur_state = cur_state.update(new_updates)

    if cur_state is None:
        # No chunks were supplied: honour the declared return type instead of returning None.
        return Entity(properties=[])
    return cur_state


In [35]:
# Load the entity records; each one is read below through its 'entity_label' and
# 'chunked_content' keys.
wikidata_entities = read_jsonl('../../data/wikidata_entities.jsonl')
kb = KnowledgeBase()

# How many entities from the start of the file are sent to the model in this run.
NUM_ENTITIES = 2

for ent in tqdm(wikidata_entities[:NUM_ENTITIES]):
    kb.entities[ent['entity_label']] = generate_kb(ent['entity_label'], ent['chunked_content'])

  0%|          | 0/2 [00:00<?, ?it/s]

Merging new update: {
  "properties": [
    {
      "property_name": "Political Party Affiliation",
      "property_value": "Progress Party"
    },
    {
      "property_name": "Position Held",
      "property_value": [
        "Co-Chairman of RPR-PARNAS",
        "Fourth Co-Chairman of RPR-PARNAS"
      ]
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Founder of",
      "property_value": "Anti-Corruption Foundation"
    },
    {
      "property_name": "Founded",
      "property_value": "RosYama (Russian Hole)"
    },
    {
      "property_name": "Founded",
      "property_value": "Anti-Corruption Foundation"
    },
    {
      "property_name": "Accusations Against",
      "property_value": [
        "Ramzan Kadyrov",
        "Igor Shuvalov",
        "Viktor Zolotov",
        "Dmitry Medvedev",
        "Vladimir Putin"
      ]
    },
    {
      "property_name": "Accusation Date",
      "property_value": [
        "May 2011",
        "August 2011",
  

 50%|█████     | 1/2 [00:22<00:22, 22.86s/it]

Merging new update: {
  "properties": [
    {
      "property_name": "Marital Status",
      "property_value": "Married to Yulia Abrosimova"
    },
    {
      "property_name": "Children",
      "property_value": [
        "Daughter Darya (Dasha) Navalnaya",
        "Son Zakhar"
      ]
    },
    {
      "property_name": "Daughter's Education",
      "property_value": "Undergraduate studies at Stanford University"
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Religious Views",
      "property_value": "Protestant Christian"
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Electoral Votes in 2012",
      "property_value": 332
    },
    {
      "property_name": "Popular Vote Percentage in 2012",
      "property_value": "51.1"
    },
    {
      "property_name": "First Democratic President since FDR to win majority of popular vote twice",
      "property_value": "True"
    },
    {
      "property_name": "Quote After Re

100%|██████████| 2/2 [00:46<00:00, 23.37s/it]

Merging new update: {
  "properties": [
    {
      "property_name": "Presidential Library",
      "property_value": "Barack Obama Presidential Center hosted by the University of Chicago located in Jackson Park on the South Side of Chicago"
    },
    {
      "property_name": "Awards and Honors",
      "property_value": [
        "Nobel Peace Prize in 2009",
        "Ambassador of Humanity Award in 2014",
        "John F. Kennedy Profile in Courage Award in 2017",
        "Ripple of Hope Award in 2018",
        "Two Grammy Awards for Best Spoken Word Album",
        "Two Primetime Emmy Awards for Outstanding Narrator",
        "Two Children's and Family Emmy Awards"
      ]
    }
  ]
}





In [36]:
# print("chunk1: ", wikidata_entities[1]['chunked_content'][0])
# print("\n\n\nchunk2:\n\n ", wikidata_entities[1]['chunked_content'][1])

In [37]:
# Inspect the properties accumulated for one entity; the label must be a key of kb.entities.
print(kb.entities['Alexei Navalny'].model_dump_json(indent=2))

{
  "properties": [
    {
      "property_name": "Name",
      "property_value": "Alexei Navalny"
    },
    {
      "property_name": "Citizenship",
      "property_value": "Russian"
    },
    {
      "property_name": "Date of Birth",
      "property_value": "4 June 1976"
    },
    {
      "property_name": "Date of Death",
      "property_value": "16 February 2024"
    },
    {
      "property_name": "Occupation",
      "property_value": [
        "Opposition Leader",
        "Lawyer",
        "Anti-Corruption Activist",
        "Political Prisoner"
      ]
    },
    {
      "property_name": "Political Party Affiliation",
      "property_value": "Progress Party"
    },
    {
      "property_name": "Position Held",
      "property_value": [
        "Co-Chairman of RPR-PARNAS",
        "Fourth Co-Chairman of RPR-PARNAS"
      ]
    },
    {
      "property_name": "Founder of",
      "property_value": "Anti-Corruption Foundation"
    },
    {
      "property_name": "Founded",
      "pr

In [38]:
# Inspect the properties accumulated for one entity; the label must be a key of kb.entities.
print(kb.entities['Barack Obama'].model_dump_json(indent=2))

{
  "properties": [
    {
      "property_name": "Full Name",
      "property_value": "Barack Hussein Obama II"
    },
    {
      "property_name": "Date of Birth",
      "property_value": "August 4, 1961"
    },
    {
      "property_name": "Occupation",
      "property_value": "Politician, Lawyer, Lecturer, Author"
    },
    {
      "property_name": "Party Affiliation",
      "property_value": "Democratic Party"
    },
    {
      "property_name": "Term",
      "property_value": [
        "2009-2017"
      ]
    },
    {
      "property_name": "Religious Views",
      "property_value": "Protestant Christian"
    },
    {
      "property_name": "Electoral Votes in 2012",
      "property_value": 332
    },
    {
      "property_name": "Popular Vote Percentage in 2012",
      "property_value": "51.1"
    },
    {
      "property_name": "First Democratic President since FDR to win majority of popular vote twice",
      "property_value": "True"
    },
    {
      "property_name": "Quote 



# 🎬 Reformat to Match Wikidata schema

The current format is not easy to use for evaluation.

Reformat it to:
```
"property_name" : "property_value"
e.g. "date_of_birth" : "1980-01-01"
```

In [21]:
def _merge_property_values(existing, new):
    """Combine two values recorded under one property name, keeping each distinct value once."""
    if existing == new:
        return existing
    merged = list(existing) if isinstance(existing, list) else [existing]
    for value in (new if isinstance(new, list) else [new]):
        if value not in merged:
            merged.append(value)
    return merged


def format_kb(pred_kb: "KnowledgeBase") -> List:
    """Flatten a knowledge base into one ``{'entity_label', 'properties'}`` dict per entity.

    ``properties`` maps each property name to its value. When a name occurs more than
    once for an entity, the values are collected into a single list rather than letting
    the last occurrence overwrite the earlier ones.

    Args:
        pred_kb: object whose ``model_dump()`` returns ``{'entities': {label: {'properties': [...]}}}``.

    Returns:
        A list with one dict per entity, in the order of the dumped entities; empty when
        the knowledge base has no entities.
    """
    dumped = pred_kb.model_dump()

    formatted_kb = []
    for ent, ent_data in dumped['entities'].items():
        props = {}
        # `properties` is Optional on Entity, so it can be dumped as None.
        for prop in ent_data['properties'] or []:
            name = prop['property_name']
            value = prop['property_value']
            if name in props:
                props[name] = _merge_property_values(props[name], value)
            else:
                props[name] = value

        formatted_kb.append({'entity_label': ent, 'properties': props})

    return formatted_kb

# NOTE: rebinds `kb` from the KnowledgeBase to the list returned by format_kb, so running
# this cell a second time passes a list to format_kb. The save cell further down passes
# this same `kb` to save_jsonl.
kb = format_kb(kb)

# Display the second formatted entity.
kb[1]

{'entity_label': 'Barack Obama',
 'properties': {'Full Name': 'Barack Hussein Obama II',
  'Date of Birth': 'August 4, 1961',
  'Birth Place': 'Honolulu, Hawaii',
  'Nationality': 'American',
  'Political Party': 'Democratic Party',
  'Education': ['Columbia University',
   'Harvard Law School',
   'Occidental College'],
  'Occupation': 'Politician, Lawyer, Lecturer, Author',
  'Religion': 'Protestant Christian',
  'Electoral Votes': 332,
  'Percentage of Popular Vote': '51.1',
  'First Democratic President to Win Majority of Popular Vote Twice Since': 'Franklin D. Roosevelt',
  'Environmental policy': 'Obama emphasized the conservation of federal lands during his term in office. He used his power under the Antiquities Act to create 25 new national monuments during his presidency and expand four others, protecting a total of 553,000,000 acres (224,000,000 ha) of federal lands and waters, more than any other U.S. president.',
  'Supports Two-State Solution': 'True',
  'Views on Israeli 

# 💾 Save

In [41]:
# In document order `kb` has already been rebound to the list returned by format_kb in the
# reformat cell above, and a list has no model_dump(). Dump only while `kb` is still the
# pydantic model, so this cell runs in either execution order.
kb_dict = kb.model_dump() if isinstance(kb, KnowledgeBase) else kb
kb_dict

{'entities': {'Alexei Navalny': {'properties': [{'property_name': 'Name',
     'property_value': 'Alexei Navalny'},
    {'property_name': 'Citizenship', 'property_value': 'Russian'},
    {'property_name': 'Date of Birth', 'property_value': '4 June 1976'},
    {'property_name': 'Date of Death', 'property_value': '16 February 2024'},
    {'property_name': 'Occupation',
     'property_value': ['Opposition Leader',
      'Lawyer',
      'Anti-Corruption Activist',
      'Political Prisoner']},
    {'property_name': 'Political Party Affiliation',
     'property_value': 'Progress Party'},
    {'property_name': 'Position Held',
     'property_value': ['Co-Chairman of RPR-PARNAS',
      'Fourth Co-Chairman of RPR-PARNAS']},
    {'property_name': 'Founder of',
     'property_value': 'Anti-Corruption Foundation'},
    {'property_name': 'Founded', 'property_value': 'RosYama (Russian Hole)'},
    {'property_name': 'Founded',
     'property_value': 'Anti-Corruption Foundation'},
    {'property_name

In [22]:
# Write the predictions to disk; expects `kb` to be the list produced by format_kb
# (the reformat cell must have run first).
save_jsonl(kb, '../../data/prediction.jsonl')

Saved to f'../../data/prediction.jsonl


In [23]:
# Read the saved predictions back and pretty-print the first record as a sanity check.
inp = read_jsonl('../../data/prediction.jsonl')
print(json.dumps(inp[0], indent=2))

{
  "entity_label": "Alexei Navalny",
  "properties": {
    "Name": "Alexei Navalny",
    "Full Name": "Alexei Anatolyevich Navalny",
    "Date of Birth": "4 June 1976",
    "Date of Death": "16 February 2024",
    "Citizenship": "Russian",
    "Occupation": "Politician and Anti-corruption activist",
    "Founded Project": [
      "RosYama"
    ],
    "Launched Project": [
      "Anti-Corruption Foundation (FBK)"
    ],
    "Title": "Investigation and Legal Cases",
    "Description": "Details about investigations and legal cases involving Alexei Navalny.",
    "Date of death": "16 February 2024",
    "Place of death": "Yamalo-Nenets in Western Siberia",
    "stated by": "Alexei Kudrin, Boris Akunin, Mikhail Khodorkovsky, Vladimir Zhirinovsky, Marie Harf, Catherine Ashton, Andreas Schockenhoff, The New York Times, Alexei Venediktov, Levada Center, Leonid Volkov, Alexander Verkhovskiy",
    "criticised by": [
      "Vladimir Zhirinovsky",
      "United States Department of State Deputy S