In [29]:
from typing import List, Optional
from pydantic import BaseModel, Field
import instructor
from openai import OpenAI

import os

# Read the OpenAI key from the environment instead of embedding it in the
# notebook, so the secret is never saved or committed with the file.
# Raises KeyError if OPENAI_API_KEY is not set.
client = instructor.patch(OpenAI(api_key=os.environ["OPENAI_API_KEY"]))

In [30]:
import requests

def fetch_labels(entity_ids, language='en', batch_size=50, timeout=30):
    """
    Fetch labels for a given list of Wikidata entity IDs.

    Parameters:
    - entity_ids: Iterable of Wikidata entity IDs. Duplicates are requested only once.
    - language: Language code of the labels to fetch. Default is 'en'.
    - batch_size: Number of IDs sent per API request. Default is 50; the Wikidata API
      may limit the number of IDs per request, so adjust if necessary.
    - timeout: Seconds to wait for each HTTP response. Default is 30.

    Returns:
    - A dictionary mapping each Wikidata entity ID (as keyed in the API response) to
      its label. An entity without a label in the requested language maps to its own ID.

    Raises:
    - ValueError: If batch_size is smaller than 1.
    - RuntimeError: If a response does not contain an 'entities' object.
    - Whatever response.raise_for_status() raises for an HTTP error status.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # De-duplicate while keeping the caller's order, so no ID is fetched twice.
    unique_ids = list(dict.fromkeys(entity_ids))

    labels = {}
    for start in range(0, len(unique_ids), batch_size):
        batch_ids = unique_ids[start:start + batch_size]
        params = {
            'action': 'wbgetentities',
            'ids': '|'.join(batch_ids),
            'props': 'labels',
            'languages': language,
            'format': 'json'
        }
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(
            "https://www.wikidata.org/w/api.php", params=params, timeout=timeout
        )
        response.raise_for_status()
        payload = response.json()

        if 'entities' not in payload:
            # Surface the API's own error description instead of a bare KeyError.
            raise RuntimeError(
                f"Wikidata API returned no entities: {payload.get('error', payload)}"
            )

        for entity_id, entity in payload['entities'].items():
            label = entity.get('labels', {}).get(language, {}).get('value', entity_id)
            labels[entity_id] = label
    return labels

def format_value(value, labels):
    """
    Convert a Wikidata value to a string representation, using labels for entity IDs.

    Parameters:
    - value: The 'value' part of a claim's datavalue. Either a dictionary or a plain value.
    - labels: Dictionary mapping entity IDs to labels, used for entity-valued claims.

    Returns:
    - Always a string, so callers can safely join the results:
      * dict with 'id': the label from `labels`, or the ID itself when unknown
      * dict with 'time': the time string
      * dict with 'amount': the amount
      * dict with 'text' (e.g. monolingual text): the text
      * dict with 'latitude' and 'longitude': "latitude, longitude"
      * anything else: str(value)
    """
    if not isinstance(value, dict):
        return str(value)

    # For entity IDs, replace with label if available
    if 'id' in value:
        return str(labels.get(value['id'], value['id']))
    if 'time' in value:
        return str(value['time'])
    if 'amount' in value:
        return str(value['amount'])
    # Show the text itself rather than the repr of the surrounding dict.
    if 'text' in value:
        return str(value['text'])
    if 'latitude' in value and 'longitude' in value:
        return f"{value['latitude']}, {value['longitude']}"
    return str(value)

def get_all_properties_with_labels(wikidata_id, timeout=30):
    """
    Fetch all properties and their values for a given Wikidata ID,
    including labels for properties and their entity values.

    Parameters:
    - wikidata_id: The Wikidata ID of the entity.
    - timeout: Seconds to wait for the HTTP response. Default is 30.

    Returns:
    - A dictionary with property labels as keys and lists of their labeled values as values.
      Claims whose main snak carries no 'datavalue' contribute no value. An entity
      without claims yields an empty dictionary.

    Raises:
    - RuntimeError: If the response does not contain any entity.
    - Whatever response.raise_for_status() raises for an HTTP error status.
    """
    # Initial API call to get all claims/properties for the entity
    params = {
        'action': 'wbgetentities',
        'ids': wikidata_id,
        'props': 'claims',
        'format': 'json'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(
        "https://www.wikidata.org/w/api.php", params=params, timeout=timeout
    )
    response.raise_for_status()
    payload = response.json()

    entities = payload.get('entities')
    if not entities:
        raise RuntimeError(
            f"Wikidata API returned no entities for {wikidata_id!r}: "
            f"{payload.get('error', payload)}"
        )

    # The entity may be keyed under a different ID than the one requested
    # (e.g. when the requested ID redirects); only one ID was asked for, so
    # fall back to the single entity that came back.
    if wikidata_id in entities:
        entity = entities[wikidata_id]
    else:
        entity = next(iter(entities.values()))
    claims = entity.get('claims', {})

    # Collect the raw main-snak values once per property.
    values_by_prop = {}
    for prop_id, prop_claims in claims.items():
        values_by_prop[prop_id] = [
            claim['mainsnak']['datavalue'].get('value')
            for claim in prop_claims
            if 'datavalue' in claim['mainsnak']
        ]

    # Entity-valued claims need their own labels as well.
    value_entity_ids = {
        raw_value['id']
        for raw_values in values_by_prop.values()
        for raw_value in raw_values
        if isinstance(raw_value, dict) and 'id' in raw_value
    }

    # Fetch labels for all property IDs and value entity IDs
    all_labels = fetch_labels(list(set(values_by_prop) | value_entity_ids))

    # Construct the result dictionary with labels
    result = {}
    for prop_id, raw_values in values_by_prop.items():
        prop_label = all_labels.get(prop_id, prop_id)
        # setdefault: if two properties resolve to the same label, keep both sets of values.
        formatted_values = result.setdefault(prop_label, [])
        formatted_values.extend(
            format_value(raw_value, all_labels) for raw_value in raw_values
        )

    return result

# Demo: list every labelled property of a single Wikidata entity.
wikidata_id = 'Q155979'
properties_with_labels = get_all_properties_with_labels(wikidata_id)

# One output line per property, with its values comma-separated.
for prop in properties_with_labels:
    values = properties_with_labels[prop]
    print(f"{prop}: {', '.join(values)}")

GND ID: 1024799212
member of: Russian Opposition Coordination Council, Yale World Fellows
sex or gender: male
Library of Congress authority ID: n2012030899
VIAF ID: 254613339, 304693850
educated at: Finance University under the Government of the Russian Federation, Yale University, Peoples' Friendship University of Russia, Yale World Fellows
image: Alexey Navalny (cropped) 1.jpg
member of political party: Yabloko, Progress Party, Russia of the Future
Commons category: Alexey Navalny
Freebase ID: /m/0dlllc6
employer: Anti-Corruption Foundation, Aeroflot
date of birth: +1976-06-04T00:00:00Z
religion or worldview: Eastern Orthodoxy
native language: Russian
country of citizenship: Soviet Union, Russia
field of work: politics, jurisprudence
place of birth: Butyn
instance of: human
IdRef ID: 161623972
described by source: Lentapedia, Navalny
official website URL: https://navalny.com
given name: Alexey
significant event: Yves Rocher case, poisoning of Alexei Navalny, Kirovles trial, incarcera

In [31]:
import requests

def fetch_wikipedia_page_title(wikidata_id, timeout=30):
    """
    Fetch the Wikipedia page title for a given Wikidata ID.

    Parameters:
    - wikidata_id: The Wikidata ID of the entity.
    - timeout: Seconds to wait for the HTTP response. Default is 30.

    Returns:
    - The English Wikipedia ('enwiki') page title as a string, or an empty string
      when the entity has no such sitelink.

    Raises:
    - RuntimeError: If the response does not contain any entity.
    - Whatever response.raise_for_status() raises for an HTTP error status.
    """
    params = {
        'action': 'wbgetentities',
        'ids': wikidata_id,
        'props': 'sitelinks',
        'sitefilter': 'enwiki',
        'format': 'json'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(
        "https://www.wikidata.org/w/api.php", params=params, timeout=timeout
    )
    response.raise_for_status()
    payload = response.json()

    entities = payload.get('entities')
    if not entities:
        raise RuntimeError(
            f"Wikidata API returned no entities for {wikidata_id!r}: "
            f"{payload.get('error', payload)}"
        )

    # The entity may be keyed under a different ID than the one requested
    # (e.g. when the requested ID redirects); only one ID was asked for, so
    # fall back to the single entity that came back.
    if wikidata_id in entities:
        entity = entities[wikidata_id]
    else:
        entity = next(iter(entities.values()))

    return entity.get('sitelinks', {}).get('enwiki', {}).get('title', '')

def fetch_wikipedia_content(page_title, timeout=30):
    """
    Fetch the content of a Wikipedia page given its title.

    Parameters:
    - page_title: The title of the Wikipedia page.
    - timeout: Seconds to wait for the HTTP response. Default is 30.

    Returns:
    - The plain-text content of the English Wikipedia page as a string, or an empty
      string when the title is empty or the response carries no extract.

    Raises:
    - Whatever response.raise_for_status() raises for an HTTP error status.
    """
    # fetch_wikipedia_page_title returns '' when there is no English page;
    # there is nothing to look up in that case.
    if not page_title:
        return ''

    params = {
        'action': 'query',
        'format': 'json',
        'titles': page_title,
        'prop': 'extracts',
        # 'exintro': True,  # Remove or comment out this line to get the full content
        'explaintext': True,  # Return plain text content
        'redirects': 1,  # Follow redirects so a redirect title still yields the article
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(
        "https://en.wikipedia.org/w/api.php", params=params, timeout=timeout
    )
    response.raise_for_status()

    # 'query' / 'pages' can be absent from the response; treat that as no content.
    pages = response.json().get('query', {}).get('pages', {})
    page = next(iter(pages.values()), {})
    return page.get('extract', '')

# Example usage
wikidata_id = 'Q155979'
entity_name = fetch_wikipedia_page_title(wikidata_id)
# Fetch the article for the title resolved on the previous line, so this cell
# does not depend on any variable left over from an earlier kernel session.
wikipedia_content = fetch_wikipedia_content(entity_name)

print(f"Wikipedia Content for '{entity_name}':\n{wikipedia_content}")


Wikipedia Content for 'Alexei Navalny':
Alexei Anatolyevich Navalny (Russian: Алексей Анатольевич Навальный, IPA: [ɐlʲɪkˈsʲej ɐnɐˈtolʲjɪvʲɪtɕ nɐˈvalʲnɨj]; 4 June 1976 – 16 February 2024) was a Russian opposition leader, lawyer, anti-corruption activist, and political prisoner. He organised anti-government demonstrations and ran for office to advocate reforms against corruption in Russia and against President Vladimir Putin and his government. Navalny was a Russian Opposition Coordination Council member, the leader of the Russia of the Future party and founder of the Anti-Corruption Foundation (FBK). He was recognised by Amnesty International as a prisoner of conscience, and was awarded the Sakharov Prize for his work on human rights.
Through his social media channels, Navalny and his team published material about corruption in Russia, organised political demonstrations and promoted his campaigns. In a 2011 radio interview, he described Russia's ruling party, United Russia, as a "party 

In [32]:
def chunk_text_recursively(text, max_chunk_size=15000, chunks=None):
    """
    Splits the text into chunks of a maximum size, without splitting words.

    The split is made at the last space inside each window of `max_chunk_size`
    characters; that space is dropped. A run of characters with no space in the
    window is cut at exactly `max_chunk_size` characters, and no character is lost
    in that case. The work is done in a loop, so the number of chunks is not
    bounded by Python's recursion limit (the name is kept for existing callers).

    Parameters:
    - text (str): The text to be chunked.
    - max_chunk_size (int): The maximum size of each chunk. Default is 15000.
    - chunks (List[str]): Optional list to append the chunks to. Normally left unset.

    Returns:
    - List[str]: A list of text chunks. Text that already fits (including the empty
      string) is returned as a single chunk.

    Raises:
    - ValueError: If max_chunk_size is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")

    # Initialize the chunks list when the caller did not supply one
    if chunks is None:
        chunks = []

    start = 0
    total = len(text)

    # Keep cutting while the remaining text does not fit into a single chunk
    while total - start > max_chunk_size:
        # Find the last space within the current window
        split_point = text.rfind(' ', start, start + max_chunk_size)

        if split_point == -1:
            # Very long word: hard split, and continue right after the cut so the
            # character at the boundary is kept.
            end = start + max_chunk_size
            chunks.append(text[start:end])
            start = end
        elif split_point == start:
            # The only space is the first character of the window; skip it rather
            # than emitting an empty chunk.
            start += 1
        else:
            chunks.append(text[start:split_point])
            # Resume after the space that served as the separator
            start = split_point + 1

    # Whatever is left fits into one chunk
    chunks.append(text[start:])
    return chunks

# Demo: split the fetched article into chunks.
# `text` is placeholder sample input; the call below chunks `wikipedia_content`.
text = "Your very long text goes here. Repeat this sentence to make the text longer." * 500  # Example text
chunks = chunk_text_recursively(text=wikipedia_content)

# Quick check: number of chunks produced and a preview of the first one.
print(f"Total chunks: {len(chunks)}")
print(f"First chunk (showing first 100 characters): {chunks[0][:100]}")


Total chunks: 7
First chunk (showing first 100 characters): Alexei Anatolyevich Navalny (Russian: Алексей Анатольевич Навальный, IPA: [ɐlʲɪkˈsʲej ɐnɐˈtolʲjɪvʲɪt


In [33]:
num_tokens = len(wikipedia_content) / 4  # 4 chars ~= 1 token
# 16385 max length for gpt-3.5

if num_tokens > 15_000:
    # NOTE(review): this threshold is in estimated tokens, while the default
    # max_chunk_size of chunk_text_recursively (15000) counts characters —
    # confirm the intended chunk size.
    chunked_content = chunk_text_recursively(wikipedia_content)
else:
    # Short articles fit into a single request. Still expose them as a
    # one-element list so later cells can always iterate over chunked_content.
    chunked_content = [wikipedia_content]

# Build a KB against this data

In [40]:
# Registry of generated knowledge bases, keyed by entity name.
knowledge_base = dict()

# One extracted fact, stored as a pair of strings.
# Used as the element type of Entity.properties.
class Property(BaseModel):
    # Name of the fact, e.g. "Date of Birth".
    key: str
    # The fact itself as free text, e.g. "4 June 1976".
    value: str


class Entity(BaseModel):
    # Knowledge-base record for one entity, used as the structured response model
    # for the LLM calls. Fixed typed fields (age, name, date_of_birth,
    # place_of_birth, sex) are currently disabled; every fact is captured as a
    # free-form key/value Property instead.
    properties: Optional[List[Property]] = Field(
        ..., description="Extract any other properties that might be relevant."
    )

    def update(self, other: "Entity") -> "Entity":
        """Merge the properties of `other` into this knowledge base.

        A `properties` value of None on either side is treated as an empty list.
        Incoming properties whose (key, value) pair is already present are skipped;
        properties that share a key but differ in value are all kept.

        Parameters:
        - other: The knowledge base whose properties should be added.

        Returns:
        - This instance, after the merge.
        """
        merged = list(self.properties or [])
        seen = {(prop.key, prop.value) for prop in merged}

        for prop in other.properties or []:
            fingerprint = (prop.key, prop.value)
            if fingerprint in seen:
                continue  # exact duplicate of something already stored
            seen.add(fingerprint)
            merged.append(prop)

        self.properties = merged
        return self


def generate_kb(entity, texts) -> Entity:
    """
    Build a knowledge base for `entity` by extracting properties from each text chunk.

    One chat completion is requested per chunk with `Entity` as the response model.
    The first result becomes the knowledge base; every later result is printed and
    merged into it with `Entity.update`.

    Parameters:
    - entity: Name of the entity, interpolated into the system prompt.
    - texts: The text chunks to process. A single string is treated as one chunk.

    Returns:
    - The accumulated `Entity`. With no chunks, an `Entity` with an empty
      property list is returned.
    """
    # A bare string would otherwise be iterated character by character.
    texts = [texts] if isinstance(texts, str) else list(texts)

    cur_state = None
    num_iterations = len(texts)
    # Count from 1 so the prompt reads "Part 1/N" ... "Part N/N".
    for i, inp in enumerate(texts, start=1):
        new_updates = client.chat.completions.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {
                    "role": "system",
                    "content": f"""You are an iterative knowledge base builder.
                    You are given the current state of the knowledge base for entity {entity}, and you must add properties
                    to it. Do not provide any duplicates and try to re-use nodes as much as possible.""",
                },
                {
                    "role": "user",
                    "content": f"""Extract any new information from the following:
                    # Part {i}/{num_iterations} of the input:

                    {inp}""",
                },
                # Disabled: sending the accumulated state back to the model.
                # {
                #     "role": "user",
                #     "content": f"""Here is the current state of the graph:
                #     {cur_state.model_dump_json(indent=2) if cur_state is not None else 'empty'}""",
                # },
            ],
            response_model=Entity,
        )  # type: ignore

        if cur_state is None:
            cur_state = new_updates
        else:
            # Update the current state
            print(f"Merging new update: {new_updates.model_dump_json(indent=2)}")
            cur_state = cur_state.update(new_updates)

    # Nothing was processed: return an empty knowledge base rather than None.
    if cur_state is None:
        return Entity(properties=[])
    return cur_state


# Build the knowledge base for the fetched article and register it under the entity's name.
knowledge_base.update({entity_name: generate_kb(entity_name, chunked_content)})

Merging new update: {
  "properties": [
    {
      "key": "Mayoral Candidate",
      "value": "Yes"
    },
    {
      "key": "Campaign Fundraising",
      "value": "$3.09 million total, $2.91 million transferred by individuals in Russia"
    },
    {
      "key": "Campaign Volunteers",
      "value": "Around 20,000 volunteers involved in campaign activities such as distributing leaflets and hanging banners"
    },
    {
      "key": "Campaign Success",
      "value": "Received 27% of the vote in the election, fared better in areas with higher income and education levels"
    },
    {
      "key": "2018 Presidential Candidate",
      "value": "Announced entry into presidential race on 13 December 2016"
    }
  ]
}
Merging new update: {
  "properties": [
    {
      "key": "Projects",
      "value": "RosPil, RosYama, real estate scandal, corruption allegations against Ramzan Kadyrov, Igor Shuvalov, Viktor Zolotov, Dmitry Medvedev, Vladimir Putin"
    },
    {
      "key": "Legal Cases"

In [43]:
# Look the result up with the same key it was stored under, so this cell keeps
# working when a different entity is processed.
print(knowledge_base[entity_name].model_dump_json(indent=2))

{
  "properties": [
    {
      "key": "Name",
      "value": "Alexei Navalny"
    },
    {
      "key": "Nationality",
      "value": "Russian"
    },
    {
      "key": "Date of Birth",
      "value": "4 June 1976"
    },
    {
      "key": "Date of Death",
      "value": "16 February 2024"
    },
    {
      "key": "Occupation",
      "value": "Opposition Leader, Lawyer, Anti-corruption Activist, Political Prisoner"
    },
    {
      "key": "Notable Organizations",
      "value": "Russia of the Future party, Anti-Corruption Foundation (FBK)"
    },
    {
      "key": "Recognition",
      "value": "Prisoner of Conscience (Amnesty International), Sakharov Prize"
    },
    {
      "key": "Mayoral Candidate",
      "value": "Yes"
    },
    {
      "key": "Campaign Fundraising",
      "value": "$3.09 million total, $2.91 million transferred by individuals in Russia"
    },
    {
      "key": "Campaign Volunteers",
      "value": "Around 20,000 volunteers involved in campaign activitie