In [54]:
from utils import read_jsonl, save_jsonl
import pandas as pd
from pydantic import BaseModel, model_validator, field_validator, Field, ValidationInfo
from typing import List, Dict, Union, Any, Optional
import instructor
from openai import OpenAI
import os
import json

In [55]:
# OpenAI client wrapped by `instructor.patch`. The API key is read from the
# OPENAI_API_KEY environment variable; a KeyError is raised if it is not set.
# NOTE(review): presumably the patch enables pydantic-typed responses — confirm
# against the installed instructor version.
client = instructor.patch(OpenAI(api_key=os.environ['OPENAI_API_KEY']))
# Model identifier constant.
# NOTE(review): presumably passed to completion calls later in the notebook — confirm.
MODEL = "gpt-3.5-turbo-0125"

# 🧠 Load data

- reference KBs
- predicted KBs
- Wikidata Properties by Usage Count

In [31]:
# Predicted and reference KBs are read from JSONL files via the project helper.
# NOTE(review): presumably each is a list of dicts, one per entity — the cells
# below index element 0 and call .keys() on it.
pred_kbs = read_jsonl('../../data/prediction.jsonl')
ref_kbs = read_jsonl('../../data/wikidata_entities.jsonl')
# Table of Wikidata properties with their usage counts.
popular_properties = pd.read_csv('../../data/wikidata-properties-counts.csv')
# Show the number of property rows, then preview the first rows of the table.
print(len(popular_properties))
popular_properties.head()

3896


Unnamed: 0,ID,label,description,Data type[1],Counts[2],largest_number
0,P2860,cites work,citation from one creative or scholarly work t...,WI,"292,583,247 M 390 N",292583247
1,P1545,series ordinal,position of an item in its parent series (most...,S,"175,830,141 Q 2,298 N",175830141
2,P2093,author name string,stores unspecified author or editor name for p...,S,"138,055,438 M 589,477 Q 29,173 R 213 N",138055438
3,P31,instance of,that class of which this subject is a particul...,WI,"114,614,926 M 20 N",114614926
4,P248,stated in,to be used in the references field to refer to...,WI,"98,211,619 R 122 Q 19 N",98211619


In [32]:
# Inspect which top-level fields a reference KB record carries.
ref_kbs[0].keys()

dict_keys(['entity_label', 'properties', 'chunked_content'])

In [33]:
# Inspect which top-level fields a predicted KB record carries.
pred_kbs[0].keys()

dict_keys(['entity_label', 'properties'])

# 🔎 Filter the Reference KB by the most popular Properties

We constrain the target labels (the properties the model is expected to predict) to the most popular Wikidata properties, which makes the task easier.

A property's popularity is determined by its usage count across all of Wikidata. The counts come from the [Wikidata list of all properties](https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all).

In [34]:
# Peek at a slice of the first reference KB's properties (up to 15 items near the end).
# Many entries are uninteresting external-ID properties (e.g. Rotten Tomatoes ID),
# but they are mixed in with useful ones such as 'date of death'.
list(ref_kbs[0]['properties'].items())[-30:-15] 

[('Geni.com profile ID', ['6000000055367165869']),
 ('Reddit topic ID', ['alexei_navalny']),
 ('Radio France person ID', ['alexei-navalny']),
 ('WorldCat Entities ID', ['E39PBJhxkxBrf8HXdMBvGCfXh3']),
 ('Süddeutsche Zeitung topic ID', ['Alexej_Nawalny']),
 ('Der Spiegel topic ID', ['alexej_nawalny']),
 ('Douban movie celebrity ID', ['1392443']),
 ('Kinobox person ID', ['1613953']),
 ('Great Encyclopedia of Cyril and Methodius entry ID',
  ['Навальный Алексей Анатольевич']),
 ('patronym or matronym for this person', ['Anatolyevich']),
 ('WikiKids ID', ['Aleksej_Navalny']),
 ('date of death', ['+2024-02-16T00:00:00Z']),
 ('place of death', ['Corrective colony No. 3, YaNAO']),
 ('Gran Enciclopèdia Catalana ID (former scheme)', ['21139531']),
 ('Canadiana Name Authority ID', ['ncf13696330'])]

In [36]:
def filter_ref_kb_by_top_n_wikiproperties(
    ref_kbs: List[Dict[str, Any]],
    wikiproperties: pd.DataFrame,
    N: int,
) -> List[Dict[str, Any]]:
    """Keep only the properties whose label is among the top-N Wikidata properties.

    Args:
        ref_kbs: KB records; each must be a dict with a 'properties' mapping of
            property label -> values.
        wikiproperties: DataFrame with at least the columns 'label' and
            'largest_number'; rows are ranked by 'largest_number', descending.
        N: Number of top-ranked rows whose labels are allowed.

    Returns:
        A new list with one shallow copy per input KB, in the same order. Each
        copy's 'properties' holds only the entries whose key is an allowed
        label; all other fields are carried over unchanged.

    Raises:
        KeyError: If a KB has no 'properties' key or the DataFrame lacks the
            'label' / 'largest_number' columns.
    """
    # The allowed labels do not depend on the KB, so rank the table once
    # instead of once per KB; a set gives O(1) membership checks.
    allowed_labels = set(
        wikiproperties
        .sort_values(by='largest_number', ascending=False)
        .head(N)['label']
        .tolist()
    )

    filtered_ref_kbs = []
    for kb in ref_kbs:
        kept_properties = {
            key: value
            for key, value in kb['properties'].items()
            if key in allowed_labels
        }
        # Build a new record rather than overwriting kb['properties']: the
        # caller's ref_kbs stays intact, so this can be re-run with another N.
        filtered_ref_kbs.append({**kb, 'properties': kept_properties})

    return filtered_ref_kbs


# Keep only properties whose label is among the 200 top-ranked rows of
# popular_properties (ranked by its 'largest_number' column).
filtered_ref_kbs = filter_ref_kb_by_top_n_wikiproperties(ref_kbs, popular_properties, 200)
# Show how many properties survive for the first KB, then display them.
print(len(filtered_ref_kbs[0]['properties']))
filtered_ref_kbs[0]['properties']


36


{'member of': ['Russian Opposition Coordination Council',
  'Yale World Fellows'],
 'sex or gender': ['male'],
 'educated at': ['Finance University under the Government of the Russian Federation',
  'Yale University',
  "Peoples' Friendship University of Russia",
  'Yale World Fellows'],
 'image': ['Alexey Navalny (cropped) 1.jpg'],
 'member of political party': ['Yabloko',
  'Progress Party',
  'Russia of the Future'],
 'Commons category': ['Alexey Navalny'],
 'employer': ['Anti-Corruption Foundation', 'Aeroflot'],
 'date of birth': ['+1976-06-04T00:00:00Z'],
 'religion or worldview': ['Eastern Orthodoxy'],
 'country of citizenship': ['Soviet Union', 'Russia'],
 'field of work': ['politics', 'jurisprudence'],
 'place of birth': ['Butyn'],
 'instance of': ['human'],
 'described by source': ['Lentapedia', 'Navalny'],
 'official website URL': ['https://navalny.com'],
 'given name': ['Alexey'],
 'significant event': ['Yves Rocher case',
  'poisoning of Alexei Navalny',
  'Kirovles trial',

In [37]:
import string
import re
from typing import List
from thefuzz import fuzz, process

# Fuzzy 🧸

def find_fuzzy_properties(property_name:str, reference_properties: List[str], limit: int = 5):
    '''
    Fuzzy-match a property name against a collection of reference property labels.

    Punctuation in `property_name` is replaced by spaces before matching, so a
    name such as "incarceration_status" is compared as "incarceration status".
    The matches are also printed, one per line.

    Args:
        property_name: The property name to look up.
        reference_properties: Candidate property labels to match against.
        limit: Maximum number of matches to return (default 5).

    Returns:
        The result of `thefuzz.process.extract`: the best matches, each starting
        with the matched label followed by its similarity score.

    usage:
        matches = find_fuzzy_properties("incarceration_status", [p['propertyLabel'] for p in props])

    '''
    # Replace every run of punctuation characters with a single space.
    punctuation_run = f"[{re.escape(string.punctuation)}]+"
    property_name_to_match = re.sub(punctuation_run, " ", property_name)

    # Using TheFuzz to find the top `limit` matches
    top_matches = process.extract(
        property_name_to_match,
        reference_properties,
        scorer=fuzz.partial_token_sort_ratio,
        limit=limit)

    # Printing the top matches (reported under the original, uncleaned name)
    print(f"Top matches for {property_name}")
    for match in top_matches:
        print(f"Match: {match[0]}, Similarity: {match[1]}%")

    return top_matches


In [38]:
# Fuzzy-match the first predicted property name of the first predicted KB
# against the filtered reference property labels of the first reference KB.
top_matches = find_fuzzy_properties(list(pred_kbs[0]['properties'].keys())[0], filtered_ref_kbs[0]['properties'].keys())
top_matches

Top matches for Name
Match: given name, Similarity: 100%
Match: name in native language, Similarity: 100%
Match: family name, Similarity: 100%
Match: member of, Similarity: 67%
Match: member of political party, Similarity: 67%


[('given name', 100),
 ('name in native language', 100),
 ('family name', 100),
 ('member of', 67),
 ('member of political party', 67)]