In [48]:
import requests
import json
import time

In [2]:
# https://pypi.org/project/wikipedia/
import wikipedia
wikipedia.set_lang('en')

In [135]:
from bs4 import BeautifulSoup

Reference (extracting RDF triples from Wikidata): https://stackoverflow.com/questions/69136904/extracting-rdf-triples-from-wikidata

# Exploratory run: send a DESCRIBE query for wd:Q20145 to the Wikidata SPARQL
# endpoint and decode the JSON response.
endpoint_url = "https://query.wikidata.org/sparql"
headers = { 'User-Agent': 'MyBot' }
payload = {
    'query': 'DESCRIBE wd:Q20145',
    'format': 'json'
}
r = requests.get(endpoint_url, params=payload, headers=headers)
results = r.json()

# Each triple keeps the whole entry stored under "subject"/"predicate"/"object",
# not a cleaned identifier string.
triples = []
for result in results["results"]["bindings"]:   
    triples.append((result["subject"], result["predicate"], result["object"]))

# Last expression of the cell: display the bindings.
results["results"]["bindings"]

In [3]:
def query_wikidata(query):
    """Send a SPARQL query to the Wikidata Query Service and return the decoded JSON.

    Parameters
    ----------
    query : str
        SPARQL text; sent as the ``query`` URL parameter together with ``format=json``.

    Returns
    -------
    dict
        The decoded response body. When the body cannot be decoded as JSON, a
        message is printed and ``{"results": {"bindings": []}}`` is returned so
        that callers can iterate over the bindings without special-casing.
    """
    endpoint_url = "https://query.wikidata.org/sparql"
    headers = { 'User-Agent': 'MyBot' }
    payload = {
        'query': query,
        'format': 'json'
    }
    r = requests.get(endpoint_url, params=payload, headers=headers)
    try:
        return r.json()
    except ValueError:
        # Response.json() reports an undecodable body with a ValueError subclass.
        # Catching only that keeps the best-effort fallback while letting
        # KeyboardInterrupt / SystemExit and genuine programming errors propagate.
        print("query failed: ", query)
        return {"results": {"bindings": []}}

In [77]:
def process_describe(results, clean = 1, cutoff = float("inf")):
    """Turn the bindings of a DESCRIBE response into (subject, predicate, object) triples.

    Parameters
    ----------
    results : dict
        Decoded response; ``results["results"]["bindings"]`` is iterated and each
        entry is read through its ``"subject"``, ``"predicate"`` and ``"object"`` keys.
    clean : int
        * ``0`` - strip the entity / direct-property URL prefixes and drop only
          rows whose subject or object contains ``'statement/'``.
        * ``1`` - additionally keep only rows of the form ``Q... P... Q...`` whose
          three URLs contain the expected prefixes and whose numeric Q-ids do not
          exceed ``cutoff``.
        * anything else - return the unprocessed entries as tuples.
    cutoff : number
        Largest numeric Q-id accepted for subject and object when ``clean == 1``.

    Returns
    -------
    list of tuple
        The triples, in the order of the bindings.

    Notes
    -----
    With ``clean == 1`` rows whose identifier is empty or has a non-numeric tail
    are skipped instead of raising ``IndexError`` / ``ValueError``.
    """
    entity_prefix = 'http://www.wikidata.org/entity/'
    direct_prefix = 'http://www.wikidata.org/prop/direct/'
    bindings = results["results"]["bindings"]

    if clean != 0 and clean != 1:
        return [(result["subject"], result["predicate"], result["object"]) for result in bindings]

    triples = []
    for result in bindings:
        if clean == 1:
            # Only entity -> direct property -> entity rows are of interest.
            if entity_prefix not in result["subject"]["value"]:
                continue
            if direct_prefix not in result["predicate"]["value"]:
                continue
            if entity_prefix not in result["object"]["value"]:
                continue

        subject = result["subject"]["value"].replace(entity_prefix, '')
        predicate = result["predicate"]["value"].replace(direct_prefix, '')
        obj = result["object"]["value"].replace(entity_prefix, '')
        if 'statement/' in subject or 'statement/' in obj:
            continue

        if clean == 1:
            # startswith() is False for an empty string, where indexing [0] would raise.
            if not (subject.startswith('Q') and obj.startswith('Q') and predicate.startswith('P')):
                continue
            try:
                ## condition to keep graph "relevant"
                beyond_cutoff = int(subject[1:]) > cutoff or int(obj[1:]) > cutoff
            except ValueError:
                # Identifier tail is not a number; treat the row as malformed.
                continue
            if beyond_cutoff:
                continue

        triples.append((subject, predicate, obj))

    return triples

In [118]:
def make_construct_query(qid):
    """Build a CONSTRUCT query whose pattern binds ``?s`` to ``wd:<qid>`` and matches ``?s ?p ?o``."""
    pieces = ['CONSTRUCT {?s ?p ?o} WHERE {BIND(wd:', qid, ' AS ?s) ?s ?p ?o}']
    return ''.join(pieces)

In [115]:
def process_construct(results, clean = 1, cutoff = float("inf")):
    """Turn the bindings of a CONSTRUCT response into cleaned (subject, predicate, object) triples.

    Parameters
    ----------
    results : dict
        Decoded response; ``results["results"]["bindings"]`` is iterated and each
        entry is read through ``["subject"]["value"]``, ``["predicate"]["value"]``
        and ``["object"]["value"]``. Only accessed when ``clean == 1``.
    clean : int
        Only ``1`` is implemented; any other value yields an empty list.
    cutoff : number
        Largest numeric Q-id accepted for subject and object.

    Returns
    -------
    list of tuple
        ``(Q-id, P-id, Q-id)`` string triples with the URL prefixes removed.

    Notes
    -----
    Rows whose identifier is empty or has a non-numeric tail are skipped instead
    of raising ``IndexError`` / ``ValueError``.
    """
    triples = []
    if clean != 1:
        return triples

    entity_prefix = 'http://www.wikidata.org/entity/'
    direct_prefix = 'http://www.wikidata.org/prop/direct/'

    for result in results["results"]["bindings"]:
        # Only entity -> direct property -> entity rows are of interest.
        if entity_prefix not in result["subject"]["value"]:
            continue
        if direct_prefix not in result["predicate"]["value"]:
            continue
        if entity_prefix not in result["object"]["value"]:
            continue

        subject = result["subject"]["value"].replace(entity_prefix, '')
        predicate = result["predicate"]["value"].replace(direct_prefix, '')
        obj = result["object"]["value"].replace(entity_prefix, '')
        if 'statement/' in subject or 'statement/' in obj:
            continue
        # startswith() is False for an empty string, where indexing [0] would raise.
        if not (subject.startswith('Q') and obj.startswith('Q') and predicate.startswith('P')):
            continue
        try:
            ## condition to keep graph "relevant"
            beyond_cutoff = int(subject[1:]) > cutoff or int(obj[1:]) > cutoff
        except ValueError:
            # Identifier tail is not a number; treat the row as malformed.
            continue
        if beyond_cutoff:
            continue
        triples.append((subject, predicate, obj))
    return triples

In [122]:
# Fetch the DESCRIBE result for wd:Q146 and reduce it to triples with
# process_describe's default arguments.
cat_query = 'DESCRIBE wd:Q146'
results = query_wikidata(cat_query)
cat_triples = process_describe(results)

In [123]:
# Number of triples kept for wd:Q146 by the previous cell.
len(cat_triples)

1482

In [128]:
# Same entity (Q146) through the CONSTRUCT route: build the query, run it, and
# keep only the cleaned Q/P/Q triples (clean=1).
construct_query = make_construct_query('Q146')
res = query_wikidata(construct_query)
trips = process_construct(res, 1)

In [152]:
def make_query_for_qid(name):
    """Build the SPARQL query that looks up items carrying ``name`` as an English literal.

    The query selects ``?item``, ``?itemLabel`` and ``?itemDescription`` for
    items that have any predicate pointing at the literal ``"<name>"@en`` and
    that are the subject of an English Wikipedia article.

    Parameters
    ----------
    name : str
        Text to search for. Backslashes, double quotes and line breaks are
        escaped so that the text stays inside the SPARQL string literal.

    Returns
    -------
    str
        The query text.
    """
    query = """ SELECT distinct ?item ?itemLabel ?itemDescription WHERE{  
      ?item ?label "_item_"@en.  
      ?article schema:about ?item .
      ?article schema:inLanguage "en" .
      ?article schema:isPartOf <https://en.wikipedia.org/>.	
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }    
    }"""

    # Backslash first, otherwise the backslashes added for the other
    # characters would themselves be doubled.
    escaped = (
        name.replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r')
    )

    return query.replace("_item_", escaped)

In [153]:
def process_query_for_qid(results, clean=True):
    """Extract (item, label, description) rows from a QID lookup response.

    Parameters
    ----------
    results : dict
        Decoded response; ``results["results"]["bindings"]`` is iterated.
    clean : bool
        When true, return plain strings: the item URL with the
        ``http://www.wikidata.org/entity/`` prefix removed, the label value and
        the description value. When false, return the unprocessed entries.

    Returns
    -------
    list of tuple
        One tuple per binding, in response order. A binding without an
        ``"itemDescription"`` entry yields ``""`` (clean) or ``None`` (raw) in
        the third position instead of raising ``KeyError``.
    """
    data = []
    for result in results["results"]["bindings"]:
        if clean:
            item = result["item"]["value"].replace('http://www.wikidata.org/entity/', '')
            label = result["itemLabel"]["value"]
            # The description binding can be absent from a row.
            desc = result.get("itemDescription", {}).get("value", "")
            data.append((item, label, desc))
        else:
            data.append((result["item"], result["itemLabel"], result.get("itemDescription")))

    return data

In [154]:
def get_qid_from_data(data):
    """Return the first field of the first row in ``data``.

    Raises ``IndexError`` when ``data`` is empty.
    """
    first_row = data[0]
    qid = first_row[0]
    return qid

In [155]:
def get_qid(name):
    """Look up ``name`` on Wikidata and return the QID of the first matching row.

    Chains the helpers: build the lookup query, run it, convert the response to
    rows, and take the first row's identifier.
    """
    sparql = make_query_for_qid(name)
    rows = process_query_for_qid(query_wikidata(sparql))
    return get_qid_from_data(rows)

In [156]:
# Example lookup by the lowercase string "cat".
get_qid("cat")

'Q146'

In [159]:
# Example lookup by the capitalised string "Chicken"; get_qid returns only the
# first row of the response.
# NOTE(review): confirm the recorded QID is the intended item.
get_qid("Chicken")

'Q48778903'

In [65]:
def get_label_from_qid(qid):
    """Scrape the title label of a Wikidata item from its HTML page.

    Parameters
    ----------
    qid : str
        Item identifier, inserted into ``https://www.wikidata.org/wiki/{qid}``.

    Returns
    -------
    str
        Text of the ``<span class="wikibase-title-label">`` element, with HTML
        character references (e.g. ``&amp;``) decoded.

    Raises
    ------
    Exception
        If the response status is not 200, or if the label element cannot be
        located in the page (previously an arbitrary slice of the page was
        returned in that case).
    """
    from html import unescape

    endpoint_url = f"https://www.wikidata.org/wiki/{qid}"
    headers = { 'User-Agent': 'MyBot' }
    r = requests.get(endpoint_url, headers=headers)

    if r.status_code != 200:
        raise Exception(f"Request for {qid} failed! Status code: " + str(r.status_code))

    page = r.text
    marker = '<span class="wikibase-title-label">'
    marker_pos = page.find(marker)
    if marker_pos == -1:
        raise Exception(f"Could not find the title label for {qid} in the page")

    start = marker_pos + len(marker)
    end = page.find('<', start)
    if end == -1:
        raise Exception(f"Could not find the end of the title label for {qid} in the page")

    return unescape(page[start:end])

In [76]:
# Faster to use API
# https://opendata.stackexchange.com/questions/5248/how-to-get-the-name-of-a-wikidata-item
def get_label_from_qid2(qid):
    """Return the English label of a Wikidata item via the ``wbgetentities`` API.

    Parameters
    ----------
    qid : str
        Item identifier, appended to the request URL as ``ids``.

    Returns
    -------
    str
        ``entities[qid]['labels']['en']['value']`` from the JSON response.

    Raises
    ------
    KeyError
        If the response lacks any of the keys on that path.
    """
    endpoint_url = "https://www.wikidata.org/w/api.php?action=wbgetentities&props=labels&languages=en&format=json&ids=" + qid
    # Identify the client the same way as the other requests in this notebook.
    headers = { 'User-Agent': 'MyBot' }
    r = requests.get(endpoint_url, headers=headers)

    label = r.json()['entities'][qid]['labels']['en']['value']
    return label

In [75]:
# Example: English label of Q146 through the API-based helper.
get_label_from_qid2("Q146")

'house cat'

In [69]:
def get_property_from_pid(pid):
    """Scrape the title label of a Wikidata property from its HTML page.

    Parameters
    ----------
    pid : str
        Property identifier, inserted into
        ``https://www.wikidata.org/wiki/Property:{pid}``.

    Returns
    -------
    str
        Text of the ``<span class="wikibase-title-label">`` element, with HTML
        character references (e.g. ``&amp;``) decoded.

    Raises
    ------
    Exception
        If the response status is not 200, or if the label element cannot be
        located in the page (previously an arbitrary slice of the page was
        returned in that case).
    """
    from html import unescape

    endpoint_url = f"https://www.wikidata.org/wiki/Property:{pid}"
    headers = { 'User-Agent': 'MyBot' }
    r = requests.get(endpoint_url, headers=headers)

    if r.status_code != 200:
        raise Exception(f"Request for {pid} failed! Status code: " + str(r.status_code))

    page = r.text
    marker = '<span class="wikibase-title-label">'
    marker_pos = page.find(marker)
    if marker_pos == -1:
        raise Exception(f"Could not find the title label for {pid} in the page")

    start = marker_pos + len(marker)
    end = page.find('<', start)
    if end == -1:
        raise Exception(f"Could not find the end of the title label for {pid} in the page")

    return unescape(page[start:end])

In [73]:
# Faster to use API
def get_property_from_pid2(pid):
    """Return the English label of a Wikidata property via the ``wbgetentities`` API.

    Parameters
    ----------
    pid : str
        Property identifier, appended to the request URL as ``ids``.

    Returns
    -------
    str
        ``entities[pid]['labels']['en']['value']`` from the JSON response.

    Raises
    ------
    KeyError
        If the response lacks any of the keys on that path.
    """
    endpoint_url = "https://www.wikidata.org/w/api.php?action=wbgetentities&props=labels&languages=en&format=json&ids=" + pid
    # Identify the client the same way as the other requests in this notebook.
    headers = { 'User-Agent': 'MyBot' }
    r = requests.get(endpoint_url, headers=headers)

    label = r.json()['entities'][pid]['labels']['en']['value']
    return label

In [74]:
# Example: English label of property P180 through the API-based helper.
get_property_from_pid2('P180')

'depicts'

In [100]:
# Fetch the entity record for Q146 from the wbgetentities API and read its English label.
# The parameter is spelled `languages`; the previous `langauges` is not a
# parameter the API knows, so the language filter never took effect.
endpoint_url = "https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&languages=en&ids=Q146"
r = requests.get(endpoint_url)

label = r.json()['entities']['Q146']['labels']['en']['value']

In [146]:
def get_pid_aliases(pid):
    """Scrape the aliases listed on a Wikidata property page.

    Parameters
    ----------
    pid : str
        Property identifier, inserted into
        ``https://www.wikidata.org/wiki/Property:{pid}``.

    Returns
    -------
    list of str
        For every ``<li class="wikibase-aliasesview-list-item">`` element, the
        text between the end of its opening tag and the next ``<``.

    Raises
    ------
    Exception
        If the response status is not 200.
    """
    endpoint_url = f"https://www.wikidata.org/wiki/Property:{pid}"
    headers = { 'User-Agent': 'MyBot' }
    r = requests.get(endpoint_url, headers=headers)

    if r.status_code != 200:
        raise Exception(f"Request for {pid} failed! Status code: " + str(r.status_code))

    soup = BeautifulSoup(r.text, "html.parser")
    lis = soup.find_all('li', class_="wikibase-aliasesview-list-item")

    aliases = []
    for li in lis:
        markup = str(li)
        # Drop everything up to and including the first '>' (the opening tag) ...
        remainder = markup[markup.find(">") + 1:]
        # ... and keep what precedes the next tag.
        aliases.append(remainder[:remainder.find("<")])

    return aliases

In [148]:
# get_pid_aliases('P41')

# Displays the 'claims' entry of Q146 from the response held in the global `r`.
# NOTE(review): depends on `r` having been set by an earlier cell (hidden kernel
# state) - confirm it still holds the wbgetentities response for Q146.
r.json()['entities']['Q146']['claims']

In [18]:
from googleapiclient.discovery import build

In [19]:
import os

# The API key is read from the environment so that no secret is stored in the
# notebook. The key that used to be written here must be treated as leaked:
# revoke it and export a new one as GOOGLE_API_KEY before running the search cells.
my_api_key = os.environ.get("GOOGLE_API_KEY", "") #The API_KEY you acquired
# The search-engine ID keeps its previous value as the default; GOOGLE_CSE_ID overrides it.
my_cse_id = os.environ.get("GOOGLE_CSE_ID", "46750a965e81741e8") #The search-engine-ID you created

In [20]:
def google_search(search_term, api_key, cse_id, **kwargs):
    """Run a query through the ``customsearch`` v1 service and return its ``'items'``.

    Parameters
    ----------
    search_term : str
        Passed to the list call as ``q``.
    api_key : str
        Passed to ``build`` as ``developerKey``.
    cse_id : str
        Passed to the list call as ``cx``.
    **kwargs
        Forwarded unchanged to the list call.

    Raises
    ------
    KeyError
        If the response has no ``'items'`` entry.
    """
    client = build("customsearch", "v1", developerKey=api_key)
    request = client.cse().list(q=search_term, cx=cse_id, **kwargs)
    response = request.execute()
    return response['items']

# Example search: at most two results (num=2), restricted to the site given in
# siteSearch (siteSearchFilter="i"); each result is printed followed by blank lines.
# NOTE(review): siteSearch is "wikipedia.com" - confirm that ".com" rather than ".org" is intended.
results = google_search('abc wikipedia', my_api_key, my_cse_id, num=2, siteSearch="wikipedia.com", siteSearchFilter="i", fileType="")
for result in results:
    print(result)
    print("\n\n\n")

In [49]:
# throws keyerror if no results
# throws HTTPError if google quota reached (100)
def get_pid_from_str(s):
    """Search for ``s`` and return the part of the top hit's link that follows ``":P"``'s colon.

    The search is issued through ``google_search`` with ``num=1`` and the
    property-page site filter. If the link contains no ``":P"`` the whole link
    is returned.
    """
    hits = google_search(s, my_api_key, my_cse_id, num=1, siteSearch="https://wikidata.org/wiki/property:", siteSearchFilter="i", fileType="")
    top_link = hits[0]['link']
    colon_at = top_link.find(":P")
    # find() gives -1 when absent, so the slice below then starts at 0.
    return top_link[colon_at + 1:]

In [22]:
# get_pid_from_str("father")

'P22'

In [23]:
# Example: label of property P22 through the HTML-scraping helper.
get_property_from_pid('P22')

'father'

In [24]:
def check_triple(Q1, P, Q2):
    """Report whether the query service returns any binding for ``wd:Q1 wdt:P wd:Q2``.

    Parameters
    ----------
    Q1, P, Q2 : str
        Identifiers substituted for the ``_Q1_``, ``_P_`` and ``_Q2_``
        placeholders of the query template, in that order.

    Returns
    -------
    bool
        ``True`` when the response's bindings differ from the empty list.
    """
    query = """SELECT *
    WHERE 
    {
        wd:_Q1_ wdt:_P_ wd:_Q2_.
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
    }
    """

    for placeholder, value in (("_Q1_", Q1), ("_P_", P), ("_Q2_", Q2)):
        query = query.replace(placeholder, value)

    bindings = query_wikidata(query)["results"]["bindings"]
    return bindings != []

In [25]:
# Example: asks whether Q15840011 has the direct statement P31 -> Q146.
check_triple('Q15840011', 'P31', 'Q146')

True

In [26]:
# Same subject and object with property P32 instead of P31.
check_triple('Q15840011', 'P32', 'Q146')

False

In [27]:
def check_sp(Q1, P):
    """Report whether the query service returns any binding for ``wd:Q1 wdt:P ?any``.

    Parameters
    ----------
    Q1, P : str
        Identifiers substituted for the ``_Q1_`` and ``_P_`` placeholders of the
        query template, in that order.

    Returns
    -------
    bool
        ``True`` when the response's bindings differ from the empty list.
    """
    query = """SELECT *
    WHERE 
    {
        wd:_Q1_ wdt:_P_ ?any.
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
    }
    """

    for placeholder, value in (("_Q1_", Q1), ("_P_", P)):
        query = query.replace(placeholder, value)

    bindings = query_wikidata(query)["results"]["bindings"]
    return bindings != []

In [28]:
# Example: asks whether Q15840011 has any direct statement with property P31.
check_sp('Q15840011', 'P31')

True

In [29]:
# Same subject with property P32 instead of P31.
check_sp('Q15840011', 'P32')

False

In [30]:
# can throw exceptions
def query_wikipedia(s):
    """Return whatever ``wikipedia.page(s)`` produces for the title ``s``.

    Exceptions raised by the library call are not caught here.
    """
    return wikipedia.page(s)
    

In [31]:
def search_wikipedia(s):
    """Return the result of ``wikipedia.search(s)`` unchanged."""
    matches = wikipedia.search(s)
    return matches

In [32]:
# Example: search results for the string "barack".
search_wikipedia("barack")

['Barack Obama',
 'Barack Obama Sr.',
 'Presidency of Barack Obama',
 'Barack (brandy)',
 'Family of Barack Obama',
 'Barack Obama religion conspiracy theories',
 'Barack (name)',
 'Barack (disambiguation)',
 'Zach Barack',
 'Barack Obama "Hope" poster']

In [33]:
# Example: load a page by exact title and show its `content` attribute.
# NOTE(review): this dumps the full text into the cell output; consider slicing it.
query_wikipedia('Barack Obama "Hope" poster').content

'The Barack Obama "Hope" poster is an image of US president Barack Obama designed by American artist Shepard Fairey. The image  was widely described as iconic and came to represent Obama\'s 2008 presidential campaign. It is a stylized stencil portrait of Obama in solid red, beige and (light and dark) blue, with the word "progress", "hope", or "change" below (and other words in some versions).\nFairey based the design on a photo taken by former Associated Press (AP) freelance photographer Mannie Garcia. He created the design in a day and printed it first as a street poster. It was then widely distributed—both as a digital image and other paraphernalia—during the 2008 election season, with approval from the Obama campaign.  By July 2008, Sticker Robot had printed over 200,000 vinyl "Hope" stickers, 75% of which had been given away to support Obama\'s campaign. The image became one of the most widely recognized symbols of Obama\'s campaign, spawning many variations and imitations, includi

In [149]:
# Scratch cell that uses nothing else from the notebook.
# NOTE(review): looks like leftover debugging - consider deleting.
print(3)

3
