### Scrape Wikidata Relations Counts

In [7]:
# https://medium.com/analytics-vidhya/web-scraping-a-wikipedia-table-into-a-dataframe-c52617e1f451
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd
import requests

In [8]:
# Download the HTML of the Wikidata "List of properties" report.
wikiurl = "https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all"
# NOTE(review): appears unused in the cells visible here; kept in case other cells read it.
table_class = "wikitable sortable jquery-tablesorter"
# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(wikiurl, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of parsing an error page in later cells.
response.raise_for_status()
print(response.status_code)

200


In [9]:
# Parse the downloaded HTML and pick the first <table> carrying the "wikitable" class.
soup = BeautifulSoup(response.text, 'html.parser')
wikitable = soup.find('table', class_='wikitable')
# find() returns None when nothing matches; stop with a clear message rather than
# letting None flow into later cells.
if wikitable is None:
    raise ValueError("No table with class 'wikitable' found in the response")

In [10]:
# read_html returns a list with one DataFrame per <table> found in the markup.
# The markup is wrapped in StringIO because passing literal HTML strings to
# read_html is deprecated in recent pandas releases.
tables = pd.read_html(StringIO(str(wikitable)))

# Take the first parsed frame; it is already a DataFrame, so no re-wrapping is needed.
df = tables[0]
print(df.head())

    ID               label                                        description  \
0   P6  head of government  head of the executive power of this town, city...   
1  P10               video  relevant video. For images, use the property P...   
2  P14        traffic sign  graphic symbol describing the item, used at th...   
3  P15           route map            image of route map at Wikimedia Commons   
4  P16   transport network            network the infrastructure is a part of   

                                             aliases     Data type  Count  
0  president, chancellor, mayor, prime minister, ...  WikibaseItem  37180  
1           animation, media, gif, trailer (Commons)  CommonsMedia   6395  
2  road sign, highway shield, shield, highway mar...  CommonsMedia  20323  
3  schema, railroad map, railway map, highway map...  CommonsMedia  23966  
4  highway system, network, transport network, pa...  WikibaseItem  52704  


In [11]:
# Discard the descriptive columns; ID, label and Count are what remain.
data = df.drop(columns=["description", "aliases", "Data type"])

print(data.head())

    ID               label  Count
0   P6  head of government  37180
1  P10               video   6395
2  P14        traffic sign  20323
3  P15           route map  23966
4  P16   transport network  52704


In [12]:
# Show the trimmed table (ID, label, Count).
data

Unnamed: 0,ID,label,Count
0,P6,head of government,37180
1,P10,video,6395
2,P14,traffic sign,20323
3,P15,route map,23966
4,P16,transport network,52704
...,...,...,...
9970,P10709,North Carolina Extension Gardener Plant Toolbo...,11
9971,P10710,Galaxy Store app ID,4
9972,P10711,Invasive.org species ID,0
9973,P10712,EIA utility ID,0


### Add counts

In [18]:
# Read the relations table (columns: id, en, zh) from its CSV file.
path = "../data/knowledge/en_zh_relations.csv"
relations = pd.read_csv(filepath_or_buffer=path)

In [19]:
# Show the loaded relations table.
relations

Unnamed: 0,id,en,zh
0,P1034,main food source,主要食物來源
1,P457,foundational text,成立文书
2,P532,port of registry,船籍港
3,P740,location of formation,成立地點
4,P1817,addressee,收件人
...,...,...,...
534,P31,instance of,隶属于
535,P1441,present in work,登场作品
536,P828,has cause,起因
537,P1542,has effect,導致


In [20]:
# Add Counts
# Build an ID -> Count lookup from the scraped table. If an ID occurs more than
# once, the first occurrence wins.
count_by_id = data.drop_duplicates(subset="ID").set_index("ID")["Count"]

# Vectorised lookup replaces the row-by-row scan of `data`. Ids that are absent
# from the scraped table become missing values. Assigning the whole column at once
# works for any row index on `relations`, whereas positional `.at[ind, ...]` access
# assumes a default 0..n-1 index.
# The nullable Int64 dtype keeps counts integral while still allowing <NA>.
relations['count'] = relations['id'].map(count_by_id).astype('Int64')

In [21]:
# Show the relations table again, now including the count column.
relations

Unnamed: 0,id,en,zh,count
0,P1034,main food source,主要食物來源,326
1,P457,foundational text,成立文书,50541
2,P532,port of registry,船籍港,25629
3,P740,location of formation,成立地點,44482
4,P1817,addressee,收件人,49021
...,...,...,...,...
534,P31,instance of,隶属于,103114325
535,P1441,present in work,登场作品,126749
536,P828,has cause,起因,9257
537,P1542,has effect,導致,5715


In [22]:
# Write the table back to `path` (the CSV it was read from, which gets overwritten),
# leaving out the row index.
relations.to_csv(path_or_buf=path, index=False)