# 🌎 Build a KB against Wikipedia data

In [1]:
from typing import List, Optional, Any, Dict, Union
from pydantic import BaseModel, Field, field_validator
import instructor
from openai import OpenAI
import os
import json

from utils import save_jsonl, read_jsonl

In [31]:
# Chat model requested for every extraction call in this notebook.
MODEL = "gpt-3.5-turbo-0125"

# OpenAI client wrapped by instructor. The API key is read from the
# OPENAI_API_KEY environment variable; a missing variable raises KeyError.
client = instructor.patch(OpenAI(api_key=os.environ["OPENAI_API_KEY"]))

# 👔 Define Model

In [3]:

# One property/value pair extracted for an entity.
#
# NOTE(review): documented with comments rather than a class docstring because
# pydantic copies class docstrings into the generated JSON schema, and this
# model is nested inside `Entity`, which is passed as an LLM `response_model`.
# Confirm the prompt impact before converting this to a docstring.
class Property(BaseModel):
    property_name: str = Field(..., description="The property name.")
    # A value may be a string, an int, a list of strings or a str -> str mapping.
    property_value: Union[List[str], str, int, Dict[str, str]] = Field(
        ..., description="The property value."
    )

    @field_validator('property_value')
    @classmethod
    def check_for_redundant_property(
        cls,
        property_value: Union[List[str], str, int, Dict[str, str]],
        info,
    ) -> Union[List[str], str, int, Dict[str, str]]:
        """Reject placeholder string values that carry no information.

        Only string values are inspected; lists, ints and dicts pass through
        unchanged.

        Args:
            property_value: The value being validated.
            info: Validation context supplied by pydantic (unused).

        Returns:
            The value, unchanged, when it is acceptable.

        Raises:
            ValueError: If the value is the string 'yes', 'no' or 'unknown'
                (compared case-insensitively).
        """
        # TODO: something smarter for this ?
        if isinstance(property_value, str):
            if property_value.lower() in ('yes', 'no', 'unknown'):
                raise ValueError(
                    f"The property value {property_value} is redundant. Please remove it."
                )
        return property_value


# The set of properties collected so far for a single entity.
#
# NOTE(review): documented with comments rather than a class docstring because
# pydantic copies class docstrings into the generated JSON schema, and this
# model is passed as an LLM `response_model`. Confirm the prompt impact before
# converting this to a docstring.
class Entity(BaseModel):

    # Required field, but an explicit null is accepted (Optional).
    properties: Optional[List[Property]] = Field(
        ..., description="Extract any properties that are relevant in building a knowledge base for the entity."
    )

    def update(self, other: "Entity") -> "Entity":
        """Append the properties of ``other`` to this entity, in place.

        A ``properties`` value of ``None`` on either side is treated as an
        empty list, so merging never fails on a null field. No deduplication
        is performed.

        Args:
            other: The entity whose properties are appended.

        Returns:
            This entity (``self``), to allow ``state = state.update(new)``.
        """
        # TODO: deduplicating
        incoming = other.properties or []
        if self.properties is None:
            # Copy so that later appends do not mutate ``other``'s list.
            self.properties = list(incoming)
        else:
            self.properties.extend(incoming)
        return self


# Container for the whole knowledge base built in this notebook.
class KnowledgeBase(BaseModel):
    # Maps a key (the entity label, in this notebook) to that entity's properties.
    entities: Dict[str, Entity] = {}


def generate_kb(entity: str, texts: List[str]) -> Entity:
    """Build the knowledge-base entry for one entity from a sequence of text chunks.

    Each chunk is sent to the chat model together with the state accumulated
    so far; the structured response is merged into that state with
    ``Entity.update``.

    Args:
        entity: Label of the entity, interpolated into the system prompt.
        texts: Text chunks to process, in order.

    Returns:
        The accumulated ``Entity``. When ``texts`` is empty no request is made
        and an ``Entity`` with an empty property list is returned.
    """
    cur_state = None
    num_iterations = len(texts)
    # Number the parts from 1 so the prompt reads "Part 1/N" ... "Part N/N".
    for part_number, inp in enumerate(texts, start=1):
        new_updates = client.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "system",
                    "content": f"""You are an iterative knowledge base builder based on Wikidata.
                    You are given the current state of the knowledge base for entity {entity}, 
                    and you must add Wikidata-like property-value pairs
                    to it. Do not provide any duplicates.""",
                },
                {
                    "role": "user",
                    "content": f"""Extract any new information from the following:
                    # Part {part_number}/{num_iterations} of the input:

                    {inp}""",
                },
                {
                    "role": "user",
                    "content": f"""Here is the current state of the graph:
                    {cur_state.model_dump_json(indent=2) if cur_state is not None else 'empty'}""",
                },
            ],
            max_retries=2,
            response_model=Entity,
        )  # type: ignore

        if cur_state is None:
            # The first response becomes the initial state.
            cur_state = new_updates
        else:
            # Update the current state
            print(f"Merging new update: {new_updates.model_dump_json(indent=2)}")
            cur_state = cur_state.update(new_updates)

    if cur_state is None:
        # No chunks were supplied: honour the return type instead of returning None.
        return Entity(properties=[])
    return cur_state


In [4]:
from tqdm import tqdm

wikidata_entities = read_jsonl('../../data/wikidata_entities.jsonl')
kb = KnowledgeBase()

# Only the first two records are processed; each one triggers one model call
# per text chunk inside generate_kb.
for record in tqdm(wikidata_entities[:2]):
    label = record['entity_label']
    kb.entities[label] = generate_kb(label, record['chunked_content'])

  0%|          | 0/2 [00:00<?, ?it/s]

Merging new update: {
  "properties": [
    {
      "property_name": "Occupation",
      "property_value": "Politician and Anti-corruption activist"
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Founded Project",
      "property_value": [
        "RosYama"
      ]
    },
    {
      "property_name": "Launched Project",
      "property_value": [
        "Anti-Corruption Foundation (FBK)"
      ]
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Title",
      "property_value": "Investigation and Legal Cases"
    },
    {
      "property_name": "Description",
      "property_value": "Details about investigations and legal cases involving Alexei Navalny."
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Date of death",
      "property_value": "16 February 2024"
    },
    {
      "property_name": "Place of death",
      "property_value": "Yamalo-Nenets in Western Siberia"
    }
  ]
}
Mergin

 50%|█████     | 1/2 [01:18<01:18, 78.88s/it]

Merging new update: {
  "properties": [
    {
      "property_name": "Family",
      "property_value": [
        "Married to Yulia Abrosimova",
        "Had two children: daughter Darya (Dasha) Navalnaya and son Zakhar",
        "Daughter began undergraduate studies at Stanford University in September 2019"
      ]
    },
    {
      "property_name": "Residence",
      "property_value": "Lived primarily in a three-room apartment in Maryino District in southeast Moscow since 1998"
    },
    {
      "property_name": "Religion",
      "property_value": "Became a member of the Russian Orthodox Church"
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Religion",
      "property_value": "Protestant Christian"
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "property_name": "Electoral Votes",
      "property_value": 332
    },
    {
      "property_name": "Percentage of Popular Vote",
      "property_value": "51.1"
    },
    {
      "property_

100%|██████████| 2/2 [01:32<00:00, 46.11s/it]

Merging new update: {
  "properties": [
    {
      "property_name": "presidential_library_name",
      "property_value": "Barack Obama Presidential Center"
    },
    {
      "property_name": "presidential_library_hosted_by",
      "property_value": "University of Chicago"
    },
    {
      "property_name": "presidential_library_location",
      "property_value": "Jackson Park, South Side of Chicago"
    }
  ]
}





In [30]:
# print("chunk1: ", wikidata_entities[1]['chunked_content'][0])
# print("\n\n\nchunk2:\n\n ", wikidata_entities[1]['chunked_content'][1])

In [11]:
# Show the properties collected for 'Alexei Navalny' as indented JSON.
print(kb.entities['Alexei Navalny'].model_dump_json(indent=2))

{
  "properties": [
    {
      "property_name": "Name",
      "property_value": "Alexei Navalny"
    },
    {
      "property_name": "Full Name",
      "property_value": "Alexei Anatolyevich Navalny"
    },
    {
      "property_name": "Date of Birth",
      "property_value": "4 June 1976"
    },
    {
      "property_name": "Date of Death",
      "property_value": "16 February 2024"
    },
    {
      "property_name": "Citizenship",
      "property_value": "Russian"
    },
    {
      "property_name": "Occupation",
      "property_value": "Politician and Anti-corruption activist"
    },
    {
      "property_name": "Founded Project",
      "property_value": [
        "RosYama"
      ]
    },
    {
      "property_name": "Launched Project",
      "property_value": [
        "Anti-Corruption Foundation (FBK)"
      ]
    },
    {
      "property_name": "Title",
      "property_value": "Investigation and Legal Cases"
    },
    {
      "property_name": "Description",
      "property_val

In [12]:
# Show the properties collected for 'Barack Obama' as indented JSON.
print(kb.entities['Barack Obama'].model_dump_json(indent=2))

{
  "properties": [
    {
      "property_name": "Full Name",
      "property_value": "Barack Hussein Obama II"
    },
    {
      "property_name": "Date of Birth",
      "property_value": "August 4, 1961"
    },
    {
      "property_name": "Birth Place",
      "property_value": "Honolulu, Hawaii"
    },
    {
      "property_name": "Nationality",
      "property_value": "American"
    },
    {
      "property_name": "Political Party",
      "property_value": "Democratic Party"
    },
    {
      "property_name": "Education",
      "property_value": [
        "Columbia University",
        "Harvard Law School",
        "Occidental College"
      ]
    },
    {
      "property_name": "Occupation",
      "property_value": "Politician, Lawyer, Lecturer, Author"
    },
    {
      "property_name": "Religion",
      "property_value": "Protestant Christian"
    },
    {
      "property_name": "Electoral Votes",
      "property_value": 332
    },
    {
      "property_name": "Percentage of P



# 🎬 Reformat to Match Wikidata schema

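The current format (a list of `property_name` / `property_value` records) is not easy to use for evaluation.

Reformat each entity's properties into a flat mapping:
```
"property_name" : "property_value"
e.g. "date_of_birth" : "1980-01-01"
```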

In [21]:
def format_kb(pred_kb: "KnowledgeBase") -> List[Dict[str, Any]]:
    """Flatten a knowledge base into one record per entity.

    Args:
        pred_kb: Knowledge base to convert; only its ``model_dump()`` output
            is used.

    Returns:
        A list with one dict per entity, in the knowledge base's iteration
        order. Each dict has the keys ``'entity_label'`` and ``'properties'``,
        where ``'properties'`` maps property name to property value. When an
        entity lists the same property name more than once, the last value
        wins. An empty knowledge base yields an empty list.
    """
    dumped = pred_kb.model_dump()

    formatted_kb = []
    for entity_label, entity in dumped['entities'].items():
        # An entity's property list may be None; treat that as "no properties".
        entity_properties = entity['properties'] or []
        formatted_kb.append({
            'entity_label': entity_label,
            'properties': {
                prop['property_name']: prop['property_value']
                for prop in entity_properties
            },
        })

    return formatted_kb

# Flatten the knowledge base into per-entity {property name: value} records.
formatted_kbs = format_kb(kb)

# Notebook display of the second formatted entity (requires at least two entities).
formatted_kbs[1]

{'entity_label': 'Barack Obama',
 'properties': {'Full Name': 'Barack Hussein Obama II',
  'Date of Birth': 'August 4, 1961',
  'Birth Place': 'Honolulu, Hawaii',
  'Nationality': 'American',
  'Political Party': 'Democratic Party',
  'Education': ['Columbia University',
   'Harvard Law School',
   'Occidental College'],
  'Occupation': 'Politician, Lawyer, Lecturer, Author',
  'Religion': 'Protestant Christian',
  'Electoral Votes': 332,
  'Percentage of Popular Vote': '51.1',
  'First Democratic President to Win Majority of Popular Vote Twice Since': 'Franklin D. Roosevelt',
  'Environmental policy': 'Obama emphasized the conservation of federal lands during his term in office. He used his power under the Antiquities Act to create 25 new national monuments during his presidency and expand four others, protecting a total of 553,000,000 acres (224,000,000 ha) of federal lands and waters, more than any other U.S. president.',
  'Supports Two-State Solution': 'True',
  'Views on Israeli 

# 💾 Save

In [22]:
# Write the formatted predictions to disk via the project's save_jsonl helper.
save_jsonl(formatted_kbs, '../../data/prediction.jsonl')

Saved to f'../../data/prediction.jsonl


In [23]:
# Read the saved predictions back and pretty-print the first record as a round-trip check.
inp = read_jsonl('../../data/prediction.jsonl')
print(json.dumps(inp[0], indent=2))

{
  "entity_label": "Alexei Navalny",
  "properties": {
    "Name": "Alexei Navalny",
    "Full Name": "Alexei Anatolyevich Navalny",
    "Date of Birth": "4 June 1976",
    "Date of Death": "16 February 2024",
    "Citizenship": "Russian",
    "Occupation": "Politician and Anti-corruption activist",
    "Founded Project": [
      "RosYama"
    ],
    "Launched Project": [
      "Anti-Corruption Foundation (FBK)"
    ],
    "Title": "Investigation and Legal Cases",
    "Description": "Details about investigations and legal cases involving Alexei Navalny.",
    "Date of death": "16 February 2024",
    "Place of death": "Yamalo-Nenets in Western Siberia",
    "stated by": "Alexei Kudrin, Boris Akunin, Mikhail Khodorkovsky, Vladimir Zhirinovsky, Marie Harf, Catherine Ashton, Andreas Schockenhoff, The New York Times, Alexei Venediktov, Levada Center, Leonid Volkov, Alexander Verkhovskiy",
    "criticised by": [
      "Vladimir Zhirinovsky",
      "United States Department of State Deputy S