In [13]:
from typing import List, Optional, Any
from pydantic import BaseModel, Field
import instructor
from openai import OpenAI
import os
import json
from utils import save_jsonl, read_jsonl

# Get Wikidata KB

In [2]:
import requests

def fetch_labels(entity_ids):
    """
    Fetch labels for a given list of Wikidata entity IDs.

    Parameters:
    - entity_ids: List of Wikidata entity IDs.

    Returns:
    - A dictionary mapping each Wikidata entity ID to its label.
    """
    labels = {}
    # Wikidata API may have limits on the number of IDs per request; adjust if necessary.
    for i in range(0, len(entity_ids), 50):
        batch_ids = entity_ids[i:i+50]
        params = {
            'action': 'wbgetentities',
            'ids': '|'.join(batch_ids),
            'props': 'labels',
            'languages': 'en',
            'format': 'json'
        }
        response = requests.get("https://www.wikidata.org/w/api.php", params=params)
        response_json = response.json()

        for entity_id, entity in response_json['entities'].items():
            label = entity.get('labels', {}).get('en', {}).get('value', entity_id)
            labels[entity_id] = label
    return labels

def format_value(value, labels):
    """
    Convert a Wikidata value to a string representation, using labels for entity IDs.
    """
    if isinstance(value, dict):
        # For entity IDs, replace with label if available
        if 'id' in value:
            return labels.get(value['id'], value['id'])
        elif 'time' in value:
            return value['time']
        elif 'amount' in value:
            return value['amount']
        else:
            return str(value)
    else:
        return str(value)

def get_all_properties_with_labels(wikidata_id):
    """
    Fetch all properties and their values for a given Wikidata ID, 
    including labels for properties and their entity values, encapsulated by the entity name.

    Parameters:
    - wikidata_id: The Wikidata ID of the entity.

    Returns:
    - A dictionary with the entity label as the key and the properties dictionary as the value.
    """
    # Initial API call to get all claims/properties for the entity
    params = {
        'action': 'wbgetentities',
        'ids': wikidata_id,
        'props': 'claims|labels',
        'languages': 'en',
        'format': 'json'
    }
    response = requests.get("https://www.wikidata.org/w/api.php", params=params)
    entity_data = response.json()['entities'][wikidata_id]

    # Fetch the label for the entity
    entity_label = entity_data.get('labels', {}).get('en', {}).get('value', wikidata_id)

    claims = entity_data['claims']

    # Collect property IDs and value entity IDs for label fetching
    prop_ids = list(claims.keys())
    value_entity_ids = set()
    for prop_id in prop_ids:
        for claim in claims[prop_id]:
            if 'datavalue' in claim['mainsnak']:
                data_value = claim['mainsnak']['datavalue'].get('value')
                if isinstance(data_value, dict) and 'id' in data_value:
                    value_entity_ids.add(data_value['id'])

    # Fetch labels for all property IDs and value entity IDs
    all_labels = fetch_labels(list(set(prop_ids) | value_entity_ids))

    # Construct the result dictionary with labels
    properties_result = {}
    for prop_id in prop_ids:
        prop_label = all_labels.get(prop_id, prop_id)
        properties_result[prop_label] = []
        for claim in claims[prop_id]:
            if 'datavalue' in claim['mainsnak']:
                data_value = claim['mainsnak']['datavalue'].get('value')
                formatted_value = format_value(data_value, all_labels)
                properties_result[prop_label].append(formatted_value)

    # Encapsulate the result using the entity label
    encapsulated_result = {
        "entity_label": entity_label,
        "properties" : properties_result
        }

    return encapsulated_result

# Example usage
wikidata_id = 'Q155979'
props_with_labels = get_all_properties_with_labels(wikidata_id)

# for prop, values in properties_with_labels.items():
#     print(f"{prop}: {', '.join(values)}")


print(json.dumps(props_with_labels, indent=4, ensure_ascii=False))

{
    "entity_label": "Alexei Navalny",
    "properties": {
        "GND ID": [
            "1024799212"
        ],
        "member of": [
            "Russian Opposition Coordination Council",
            "Yale World Fellows"
        ],
        "sex or gender": [
            "male"
        ],
        "Library of Congress authority ID": [
            "n2012030899"
        ],
        "VIAF ID": [
            "254613339",
            "304693850"
        ],
        "educated at": [
            "Finance University under the Government of the Russian Federation",
            "Yale University",
            "Peoples' Friendship University of Russia",
            "Yale World Fellows"
        ],
        "image": [
            "Alexey Navalny (cropped) 1.jpg"
        ],
        "member of political party": [
            "Yabloko",
            "Progress Party",
            "Russia of the Future"
        ],
        "Commons category": [
            "Alexey Navalny"
        ],
        "Freebase ID"

# Get unstructured Text (wikipedia page)

In [3]:
import requests

def fetch_wikipedia_page_content(wikidata_id):
    """
    Fetch the entire content of a Wikipedia page for a given Wikidata ID.

    Parameters:
    - wikidata_id: The Wikidata ID of the entity.

    Returns:
    - The content of the Wikipedia page as a string.
    """
    # Fetch the Wikipedia page title for the Wikidata ID
    params_wikidata = {
        'action': 'wbgetentities',
        'ids': wikidata_id,
        'props': 'sitelinks',
        'sitefilter': 'enwiki',
        'format': 'json'
    }
    response_wikidata = requests.get("https://www.wikidata.org/w/api.php", params=params_wikidata)
    response_json_wikidata = response_wikidata.json()
    sitelinks = response_json_wikidata['entities'][wikidata_id].get('sitelinks', {})
    enwiki_title = sitelinks.get('enwiki', {}).get('title', '')
    
    if not enwiki_title:
        return "Wikipedia page title not found for the given Wikidata ID."
    
    # Fetch the content of the Wikipedia page using the title
    params_wikipedia = {
        'action': 'query',
        'format': 'json',
        'titles': enwiki_title,
        'prop': 'extracts',
        'explaintext': True,  # Return plain text content for the entire page
    }
    response_wikipedia = requests.get("https://en.wikipedia.org/w/api.php", params=params_wikipedia)
    response_json_wikipedia = response_wikipedia.json()
    page = next(iter(response_json_wikipedia['query']['pages'].values()))
    content = page.get('extract', '')
    
    return content

# Example usage
wikidata_id = 'Q155979' 
wikipedia_content = fetch_wikipedia_page_content(wikidata_id)
print(wikipedia_content)


Alexei Anatolyevich Navalny (Russian: Алексей Анатольевич Навальный, IPA: [ɐlʲɪkˈsʲej ɐnɐˈtolʲjɪvʲɪtɕ nɐˈvalʲnɨj]; 4 June 1976 – 16 February 2024) was a Russian opposition leader, lawyer, anti-corruption activist, and political prisoner. He organised anti-government demonstrations and ran for office to advocate reforms against corruption in Russia and against President Vladimir Putin and his government. Navalny was founder of the Anti-Corruption Foundation (FBK). He was recognised by Amnesty International as a prisoner of conscience, and was awarded the Sakharov Prize for his work on human rights.
Through his social media channels, Navalny and his team published material about corruption in Russia, organised political demonstrations and promoted his campaigns. In a 2011 radio interview, he described Russia's ruling party, United Russia, as a "party of crooks and thieves", which became a popular epithet. Navalny and the FBK have published investigations detailing alleged corruption by h

In [4]:
def chunk_text_recursively(text, max_chunk_size=15000, chunks=None):
    """
    Recursively splits the text into chunks of a maximum size, without splitting words.

    # Example usage
    text = "Your very long text goes here. Repeat this sentence to make the text longer." * 500  # Example text
    chunks = chunk_text_recursively(text)
    # Optionally, print the number of chunks and the first chunk to verify
    print(f"Total chunks: {len(chunks)}")
    print(f"First chunk (showing first 100 characters): {chunks[0][:100]}")
    #

    Parameters:
    - text (str): The text to be chunked.
    - max_chunk_size (int): The maximum size of each chunk. Default is 15000.
    - chunks (List[str]): Used during recursion to accumulate chunks. Should not be set when calling externally.

    Returns:
    - List[str]: A list of text chunks.
    """
    # Initialize the chunks list during the first call
    if chunks is None:
        chunks = []
    
    # Base case: if the text is shorter than the max chunk size, just add it to the chunks list
    if len(text) <= max_chunk_size:
        chunks.append(text)
        return chunks

    # Recursive case: find the last space within the max chunk size and split the text there
    split_point = text.rfind(' ', 0, max_chunk_size)
    
    # If we couldn't find a space, we have a very long word, so just split at the max_chunk_size
    if split_point == -1:
        split_point = max_chunk_size
    
    # Add the left part of the text to the chunks list
    chunks.append(text[:split_point])
    
    # Recursively process the remaining text
    return chunk_text_recursively(text[split_point+1:], max_chunk_size, chunks)



num_tokens = len(wikipedia_content) / 4  # 4 chars ~= 1 token
# 16385 max length for gpt-3.5
num_tokens

if num_tokens > 15_000:
    chunked_content = chunk_text_recursively(wikipedia_content)


In [6]:
# Run batch and save

# Wikidata entities for people, organizations, and geopolitical entities (GPEs)
wikidata_ids = {
    'Q155979': 'Alexei Navalny',
    'Q76': 'Barack Obama',         
    'Q42': 'Douglas Adams',        
    'Q937': 'Albert Einstein',       
    'Q9682': 'Queen Elizabeth II',        
    'Q668': 'India',                
    'Q30': 'United States of America',
    'Q145': 'United Kingdom',        
    'Q148': 'China',                
    'Q215': 'Google',      
    'Q95': 'Microsoft',              
    'Q208': 'Apple Inc.',             #
    'Q717': 'World Health Organization', 
    'Q1065': 'United Nations',        
    'Q43402': 'Greenpeace',           
    'Q312': 'NATO',             
    'Q1067': 'European Union',        
}

wikidata_entities_out = []
from tqdm import tqdm
for wikidata_id in tqdm(wikidata_ids.keys()):
    entity = get_all_properties_with_labels(wikidata_id)
    wikipedia_content = fetch_wikipedia_page_content(wikidata_id)

    num_tokens = len(wikipedia_content) / 4  # 4 chars ~= 1 token
    # 16385 max length for gpt-3.5

    if num_tokens > 15_000:
        chunked_content = chunk_text_recursively(wikipedia_content)
    else:
        chunked_content = [wikipedia_content]
    
    entity['chunked_content'] = chunked_content

    entity['QID'] = wikidata_id

    wikidata_entities_out.append(entity)

save_jsonl(wikidata_entities_out, '../../data/wikidata_entities.jsonl')

  0%|          | 0/17 [00:00<?, ?it/s]

100%|██████████| 17/17 [02:29<00:00,  8.80s/it]

Saved to f'../../data/wikidata_entities.jsonl





In [19]:
entities = read_jsonl('../../data/wikidata_entities.jsonl')
entities[0].keys()

dict_keys(['entity_label', 'properties', 'chunked_content'])

In [20]:
ent = entities[0]
import wptools
wiki_content = wptools.page('Alexei Navalny', wikibase='Q155979').get_parse()
infobox = wiki_content.data['infobox']
print(infobox)
ent['infobox'] = infobox

en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}


In [21]:
for ent in entities:
    import wptools
    wiki_content = wptools.page('Alexei Navalny', wikibase='Q155979').get_parse()
    infobox = wiki_content.data['infobox']
    print(infobox)
    ent['infobox'] = infobox

en.wikipedia.org (parse) Alexei Navalny


en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}
en.wikipedia.org (parse) Alexei Navalny


{'name': 'Alexei Navalny', 'native_name': '{{nobold|Алексей Навальный}}', 'native_name_lang': 'ru', 'image': 'Alexey Navalny 2 (cropped) 1.jpg', 'caption': 'Navalny in 2011', 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}", 'term_start': '28 March 2019', 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}', 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]', 'predecessor': '[[Ivan Zhdanov]]', 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)', 'term_start1': '17 November 2013', 'term_end1': '19 May 2018', 'predecessor1': 'Office established', 'successor1': '[[Ivan Zhdanov]]', 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]', 'term_start2': '27 October', 'term_end2': '24 November 2012', 'predecessor2': 'Office established', 'successor2': '[[Garry Kasparov]]', 'office3': 'Member of the [[Russian Opposition Coo

en.wikipedia.org (imageinfo) File:Alexey Navalny 2 (cropped) 1.jpg
Alexei Navalny (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Alexey Na...
  infobox: <dict(46)> name, native_name, native_name_lang, image, ...
  iwlinks: <list(12)> https://commons.wikimedia.org/wiki/Category:...
  pageid: 29174356
  parsetree: <str(381294)> <root><template><title>Short descriptio...
  requests: <list(2)> parse, imageinfo
  title: Alexei Navalny
  wikibase: Q155979
  wikidata_url: https://www.wikidata.org/wiki/Q155979
  wikitext: <str(324411)> {{Short description|Russian opposition l...
}


In [22]:
print(entities[0]['infobox'].keys())
entities[0]['infobox']

dict_keys(['name', 'native_name', 'native_name_lang', 'image', 'caption', 'office', 'term_start', 'term_end', 'deputy', 'predecessor', 'successor', 'term_start1', 'term_end1', 'predecessor1', 'successor1', 'office2', 'term_start2', 'term_end2', 'predecessor2', 'successor2', 'office3', 'term_start3', 'term_end3', 'office4', 'governor4', 'term_start4', 'term_end4', 'office5', 'term_start5', 'term_end5', 'birth_date', 'birth_place', 'death_date', 'death_place', 'party', 'otherparty', 'spouse', 'children', 'residence', 'education', 'occupation', 'known_for', 'signature', 'website', 'module', 'module2'])


{'name': 'Alexei Navalny',
 'native_name': '{{nobold|Алексей Навальный}}',
 'native_name_lang': 'ru',
 'image': 'Alexey Navalny 2 (cropped) 1.jpg',
 'caption': 'Navalny in 2011',
 'office': "Leader of [[Russia of the Future]] {{efn|Previously known as the People's Alliance (2012–2014) and the Progress Party (2014–2018)}}",
 'term_start': '28 March 2019',
 'term_end': '17 January 2021 {{efn|Arrested and subsequently imprisoned}}',
 'deputy': '[[Leonid Volkov (politician)|Leonid Volkov]]',
 'predecessor': '[[Ivan Zhdanov]]',
 'successor': '[[Leonid Volkov (politician)|Leonid Volkov]] (acting)',
 'term_start1': '17 November 2013',
 'term_end1': '19 May 2018',
 'predecessor1': 'Office established',
 'successor1': '[[Ivan Zhdanov]]',
 'office2': 'Chairman of the Session of the [[Russian Opposition Coordination Council]]',
 'term_start2': '27 October',
 'term_end2': '24 November 2012',
 'predecessor2': 'Office established',
 'successor2': '[[Garry Kasparov]]',
 'office3': 'Member of the [[Ru

In [23]:
entities[0]['properties']

{'GND ID': ['1024799212'],
 'member of': ['Russian Opposition Coordination Council',
  'Yale World Fellows'],
 'sex or gender': ['male'],
 'Library of Congress authority ID': ['n2012030899'],
 'VIAF ID': ['254613339', '304693850'],
 'educated at': ['Finance University under the Government of the Russian Federation',
  'Yale University',
  "Peoples' Friendship University of Russia",
  'Yale World Fellows'],
 'image': ['Alexey Navalny (cropped) 1.jpg'],
 'member of political party': ['Yabloko',
  'Progress Party',
  'Russia of the Future'],
 'Commons category': ['Alexey Navalny'],
 'Freebase ID': ['/m/0dlllc6'],
 'employer': ['Anti-Corruption Foundation', 'Aeroflot'],
 'date of birth': ['+1976-06-04T00:00:00Z'],
 'religion or worldview': ['Eastern Orthodoxy'],
 'native language': ['Russian'],
 'country of citizenship': ['Soviet Union', 'Russia'],
 'field of work': ['politics', 'jurisprudence'],
 'place of birth': ['Butyn'],
 'instance of': ['human'],
 'IdRef ID': ['161623972'],
 'describ

# ݐ Get Property counts

In [15]:
import pandas as pd

df = pd.read_html('https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all')[0]
print(len(df))

11641


In [16]:
# filter " ID " rows
# df = df[~df['label'].str.endswith('ID')]
df = df[~df['label'].str.contains(r'\bID\b')]
print(len(df))
df

3896


Unnamed: 0,ID,label,description,Data type[1],Counts[2]
0,P6,head of government,"head of the executive power of this town, city...",WI,"42,788 M 74 N"
1,P10,video,"relevant video. For images, use the property P...",CM,"13,760 M 23 Q 2 N"
2,P14,traffic sign,"graphic symbol describing the item, used at th...",CM,"27,793 M 358 N"
3,P15,route map,image of route map at Wikimedia Commons,CM,"27,551 M 208 N"
4,P16,transport network,network the infrastructure is a part of,WI,"123,555 M 522 Q 32 N"
...,...,...,...,...,...
11627,P12439,civil rank,non-military rank of a civil office holder in ...,WI,3 M
11629,P12441,Anglo-Norman Dictionary entry,entry for a lexeme in the online Anglo-Norman ...,EI,3 M
11631,P12443,has cabinet,value is the advisory committee to the item's ...,WI,22 M
11634,P12446,ISCED field,mapping of a particular course or curriculum t...,WI,0


In [17]:
import pandas as pd
import re

# Function to find the largest number in the string
def find_largest_number(s):
    # Find all number occurrences and remove commas
    numbers = re.findall(r'\d+', s.replace(',', ''))
    # Convert all found numbers to integers
    numbers = [int(num) for num in numbers]
    # Return the max number or NaN if there are no numbers
    return max(numbers) if numbers else float('nan')

# Apply the function to each entry in 'Counts[2]'
df['largest_number'] = df['Counts[2]'].apply(find_largest_number)
df['largest_number'] = pd.to_numeric(df['largest_number'], errors='coerce')
df_sorted = df.sort_values(by='largest_number', ascending=False)

df_sorted.head(3)


Unnamed: 0,ID,label,description,Data type[1],Counts[2],largest_number
2376,P2860,cites work,citation from one creative or scholarly work t...,WI,"292,583,247 M 390 N",292583247
1184,P1545,series ordinal,position of an item in its parent series (most...,S,"175,830,141 Q 2,298 N",175830141
1663,P2093,author name string,stores unspecified author or editor name for p...,S,"138,055,438 M 589,477 Q 29,173 R 213 N",138055438


In [18]:
df_sorted.to_csv('../../data/wikidata-properties-counts.csv', index=False, encoding='utf-8')

In [19]:
entities = read_jsonl('../../data/wikidata_entities.jsonl')
filtered_dict = filter_properties_by_top_n_wikiproperties(entities[0]['properties'].items(), df_sorted, 200)
print(len(filtered_dict))
filtered_dict

36


{'member of': ['Russian Opposition Coordination Council',
  'Yale World Fellows'],
 'sex or gender': ['male'],
 'educated at': ['Finance University under the Government of the Russian Federation',
  'Yale University',
  "Peoples' Friendship University of Russia",
  'Yale World Fellows'],
 'image': ['Alexey Navalny (cropped) 1.jpg'],
 'member of political party': ['Yabloko',
  'Progress Party',
  'Russia of the Future'],
 'Commons category': ['Alexey Navalny'],
 'employer': ['Anti-Corruption Foundation', 'Aeroflot'],
 'date of birth': ['+1976-06-04T00:00:00Z'],
 'religion or worldview': ['Eastern Orthodoxy'],
 'country of citizenship': ['Soviet Union', 'Russia'],
 'field of work': ['politics', 'jurisprudence'],
 'place of birth': ['Butyn'],
 'instance of': ['human'],
 'described by source': ['Lentapedia', 'Navalny'],
 'official website URL': ['https://navalny.com'],
 'given name': ['Alexey'],
 'significant event': ['Yves Rocher case',
  'poisoning of Alexei Navalny',
  'Kirovles trial',