### 1. Get All Domains in Wikidata

In [1]:
import requests
import os
import json
from tqdm import tqdm
from json import JSONDecodeError
from time import sleep
import re
from urllib.parse import unquote, urlparse
import openai

url = 'https://query.wikidata.org/sparql'

In [2]:
def get_domain_entites(qid):
    
    query = f'''
        SELECT ?entity ?entityLabel
        WHERE {{
            ?entity wdt:P31 wd:{qid}.
            ?entity rdfs:label ?entityLabel filter (lang(?entityLabel) = "en")
        }}
        LIMIT 10000
    '''
    r = requests.get(url, params = {'format': 'json', 'query': query})
    try:
        data = r.json()
    except JSONDecodeError:
        sleep(5)
        r = requests.get(url, params = {'format': 'json', 'query': query})
        data = r.json()
    
    entities = dict()
    
    for e in data['results']['bindings']:
        entity_id = e["entity"]['value'].split('/')[-1]
        entity_label = e["entityLabel"]['value']
        entities[entity_id] = entity_label
        
    return entities


def get_entity_attributes(qid):
    
    query = f'''
        SELECT ?property ?propertyLabel ?value ?valueLabel
        WHERE {{
            wd:{qid} ?p ?value .
            ?property wikibase:directClaim ?p .
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
    '''
    r = requests.get(url, params = {'format': 'json', 'query': query})
    try:
        data = r.json()
    except JSONDecodeError:
        sleep(5)
        r = requests.get(url, params = {'format': 'json', 'query': query})
        data = r.json()
    
    properties = dict()
    
    for p in data['results']['bindings']:
        property_id = p["property"]['value'].split('/')[-1]
        property_label = p["propertyLabel"]['value']
        value_label = p["valueLabel"]['value']
        
        if property_id.startswith('P'):
            properties[property_id] = (property_label, value_label)
                
    return properties

In [3]:
def get_wikipedia_content(url):
    # Extracting the title from the URL
    parsed_url = urlparse(url)
    title = unquote(parsed_url.path.split('/')[-1])
    
    # API request setup
    api_url = f'https://{parsed_url.netloc}/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'extracts',
        'explaintext': True,
    }

    # Making the request
    response = requests.get(api_url, params=params)
    data = response.json()
    
    # Extracting the page content
    page = next(iter(data['query']['pages'].values()))
    if 'extract' in page:
        return page['extract']
    else:
        return 'Article content not found.'
    
def get_wikipedia_sentences(content, entity_name, word_upper_limit=20, word_lower_limit=5):
    # Splitting the content into sentences by ". ", ".\n" or ".\t"
    sentences = [s.strip() for s in re.split(r'\.|\;|\,|\n|\!|\?', content)]
    
    # Filtering the sentences that mention the entity
    entity_sentences = [s for s in sentences if entity_name in s]
    
    # Filtering the sentences by word count
    entity_sentences = [s for s in entity_sentences if len(s.split()) <= word_upper_limit and len(s.split()) >= word_lower_limit]
    
    return entity_sentences

def get_wikipedia_url(wikidata_id, language='en'):
    # Constructing the URL to call the API
    url = f'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbgetentities',
        'ids': wikidata_id,
        'format': 'json',
        'props': 'sitelinks'
    }
    
    response = requests.get(url, params=params)
    data = response.json()
    
    # Accessing the sitelink for the specified language
    try:
        wikipedia_title = data['entities'][wikidata_id]['sitelinks'][f'{language}wiki']['title']
        wikipedia_url = f"https://{language}.wikipedia.org/wiki/{wikipedia_title.replace(' ', '_')}"
        return wikipedia_url
    except KeyError:
        return "No Wikipedia article found for this language."

In [48]:
# import openai
from openai import OpenAI

def filter_attribute(attribute):
    
    client = OpenAI(
    # This is the default and can be omitted
        api_key = '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'
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": "Classify the following attribute into 'common knowledge' or 'specialized'.\nExamples of common knowledge attribute: Cause of death, Nationality, Country of origin\nExamples of specialized attribute: Canadiana Name Authority ID, Diamond Catalog ID for persons and organisations\n\n\n" + "Attribute" + attribute + "\n\nClass: "
            }
        ],
        model="gpt-3.5-turbo",
    )
    # OpenAI API Key setup
    content = chat_completion.choices[0].message.content
    return content

def filter_attributes (attributes):
    filtered_attributes = dict()
    for attribute in attributes:
        filtered_attributes[attribute] = filter_attribute(attributes[attribute][0])
    return filtered_attributes

### 3. Get Domain Entities

In [49]:
all_entities = get_domain_entites('Q11424')

In [None]:
attributes = get_entity_attributes("Q91")
filtered_attributes = filter_attributes(attributes)

In [None]:
filtered_attributes

In [40]:
qqq = """
    SELECT ?article
    WHERE {
    wd:Q91 schema:about ?article .
    ?article schema:isPartOf <https://en.wikipedia.org/> .
    }
"""

r = requests.get(url, params = {'format': 'json', 'query': qqq})
r.json()

{'head': {'vars': ['article']}, 'results': {'bindings': []}}

In [55]:
def get_wikipedia_content(url):
    # Extracting the title from the URL
    parsed_url = urlparse(url)
    title = unquote(parsed_url.path.split('/')[-1])
    
    # API request setup
    api_url = f'https://{parsed_url.netloc}/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'extracts',
        'explaintext': True,
    }

    # Making the request
    response = requests.get(api_url, params=params)
    data = response.json()
    
    # Extracting the page content
    page = next(iter(data['query']['pages'].values()))
    if 'extract' in page:
        return page['extract']
    else:
        return 'Article content not found.'
    
def get_wikipedia_sentences(content, entity_name, word_upper_limit=20, word_lower_limit=5):
    # Splitting the content into sentences by ". ", ".\n" or ".\t"
    sentences = [s.strip() for s in re.split(r'\.|\;|\,|\n|\!|\?', content)]
    
    # Filtering the sentences that mention the entity
    entity_sentences = [s for s in sentences if entity_name in s]
    
    # Filtering the sentences by word count
    entity_sentences = [s for s in entity_sentences if len(s.split()) <= word_upper_limit and len(s.split()) >= word_lower_limit]
    
    return entity_sentences

def get_wikipedia_url(wikidata_id, language='en'):
    # Constructing the URL to call the API
    url = f'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbgetentities',
        'ids': wikidata_id,
        'format': 'json',
        'props': 'sitelinks'
    }
    
    response = requests.get(url, params=params)
    data = response.json()
    
    # Accessing the sitelink for the specified language
    try:
        wikipedia_title = data['entities'][wikidata_id]['sitelinks'][f'{language}wiki']['title']
        wikipedia_url = f"https://{language}.wikipedia.org/wiki/{wikipedia_title.replace(' ', '_')}"
        return wikipedia_url
    except KeyError:
        return "No Wikipedia article found for this language."

In [56]:

# Example usage
wikidata_id = 'Q91'  # Abraham Lincoln
uri = get_wikipedia_url(wikidata_id)
content = get_wikipedia_content(uri)
sentences = get_wikipedia_sentences(content, 'Abraham Lincoln')


In [57]:
print(sentences)

['Abraham Lincoln ( LINK-ən', 'Abraham Lincoln was born on February 12', 'Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County', 'Abraham Lincoln', 'Zann Gill describes how these two murders set off a chain reaction that ultimately prompted Abraham Lincoln to run for President', 'Schwartz argues that in the 1930s and 1940s the memory of Abraham Lincoln was practically sacred and provided the nation with "a moral symbol inspiring and guiding American life', 'The United States Navy Nimitz-class aircraft carrier USS Abraham Lincoln (CVN-72) is named after Lincoln', 'include the Abraham Lincoln Presidential Library and Museum', 'Congress officially dedicated room H-226 in the United States Capitol to Abraham Lincoln', 'The room is located off National Statuary Hall and served as the post office of the House while then-Representative Abraham Lincoln served in Congress from 1847 to 1849', 'Outline of Abraham Lincoln', 'Abraham Lincoln Pr

In [35]:
q = """
    SELECT ?property ?propertyLabel ?value ?valueLabel
    WHERE {
        wd:Q42 ?p ?value .
        ?property wikibase:directClaim ?p .
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
"""

r = requests.get(url, params = {'format': 'json', 'query': q})

for p in r.json()['results']['bindings']:
    property_id = p["property"]['value'].split('/')[-1]
    property_label = p["propertyLabel"]['value']
    value_label = p["valueLabel"]['value']
    print(property_id, property_label, value_label)
    break

P268 Bibliothèque nationale de France ID 11888092r


In [None]:
entities = dict()

for class_id in domains.keys():
    query = f'''
        SELECT ?entity
        WHERE {{
        ?entity wdt:P31 wd:Q6542625.
        ?entity rdfs:label ?entityLabel filter (lang(?entityLabel) = "en")
        }}
    '''
    

In [34]:
query = f'''
    SELECT ?subclass ?classLabel 
    WHERE {{
        ?subclass wdt:P279 wd:Q6542625.  
        ?subclass rdfs:label ?classLabel filter (lang(?classLabel) = "en")
    }}
'''
r = requests.get(url, params = {'format': 'json', 'query': query})
data = r.json()

In [35]:
data

{'head': {'vars': ['subclass', 'classLabel']}, 'results': {'bindings': []}}

In [None]:
query = f'''
    SELECT ?subclass  
    WHERE {{
        ?subclass wdt:P31 wd:Q16889133.  
        ?subclass rdfs:label ?classLabel filter (lang(?classLabel) = "en")
    }}
'''
r = requests.get(url, params = {'format': 'json', 'query': query})
r.json()