# Reduce number of wikidata keywords
Reduce the number of wikidata entries per keyword by searching wikidata for each keyword and keeping only the top N hits.
This should remove irrelevant entries, such as ships or horses that merely share their name with a keyword.

Expects a csv file `keyword_wikidata.csv` with the columns keyword, wikidata and parent0.



In [1]:
# configuration

# directory in which the csv files are stored
data_path = '../../../../data'

# number of top search hits to use
top_n = 5

# load the search result from the file search_result.csv
load_search = True

In [2]:
import os
import pandas as pd
import copy
import sys
import requests
from tqdm import tqdm

In [3]:
# read the keyword/wikidata table, indexed by the wikidata column
keyword_wiki_path = os.path.join(data_path, 'keyword_wikidata.csv')
df_keywords_wiki = pd.read_csv(
    filepath_or_buffer=keyword_wiki_path,
    index_col='wikidata',
)

In [4]:
# group keywords by their number of entries
# size() counts every row of a keyword; agg(['count']) only counted the
# non-null parent0 values, so keywords whose entries have no parent were
# under-counted.
most_multiples = (
    df_keywords_wiki.groupby('keyword')
    .size()
    .to_frame('count')
    .sort_values('count', ascending=False)
)
most_multiples[:5]

Unnamed: 0_level_0,count
keyword,Unnamed: 1_level_1
erratum,9807
editorial,7647
correction,6857
book reviews,5925
reviews,5423


In [5]:
# keep only the keywords whose count exceeds top_n

multiples = most_multiples.loc[most_multiples['count'].gt(top_n)]
keyword_list = list(multiples.index)

In [6]:
# endpoint of the wikidata web API
api = "https://www.wikidata.org/w/api.php"

# query template for the entity search; 'search' holds a placeholder term
# that search_wikidata replaces with its keyword
params = {
    'action': 'wbsearchentities',
    'format': 'json',
    'language': 'en',
    'type': 'item',
    'continue': 0,
    'limit': top_n,
    'search': 'erratum',
}


def search_wikidata(keyword):
    """Search wikidata for ``keyword`` and return the hits as tuples.

    Sends a request to ``api`` using the module-level ``params`` template
    with its ``search`` entry replaced by ``keyword``.

    Args:
        keyword: Search term to look up.

    Returns:
        A list of ``(id, label, description)`` tuples in the order the API
        returned them. ``label`` is lower-cased; ``label`` and
        ``description`` are ``''`` when a hit does not provide them.

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an HTTP error status.
        KeyError: If the response contains no ``search`` entry.
    """
    # Build a per-call copy so the shared ``params`` template is not mutated.
    query = dict(params, search=keyword)
    # Without a timeout a stalled connection would block the whole crawl.
    response = requests.get(url=api, params=query, timeout=30)
    response.raise_for_status()
    data = response.json()
    return [
        (hit['id'], hit.get('label', '').lower(), hit.get('description', ''))
        for hit in data['search']
    ]

In [7]:
# example search: look up a single keyword and show its (id, label, description) hits

search_wikidata('erratum')

[('Q1348305', 'erratum', 'published correction to a previous publication'),
 ('Q28948209',
  'erratum',
  'scientific article (publication date: 30 August 2016)'),
 ('Q58375287', 'erratum', 'scientific article published on 01 July 1992'),
 ('Q56084196', 'erratum', ''),
 ('Q58937358', 'erratum', '')]

In [8]:
# search wikidata for all keywords

def get_keywords_from_wikidata():
    """Search wikidata for every keyword in ``keyword_list``.

    Returns:
        A DataFrame with the columns ``keyword``, ``id``, ``label``,
        ``description`` and ``hit`` (position of the hit within its search
        result, starting at 0). If a search fails or the run is interrupted,
        the rows collected up to that point are returned.
    """
    columns = ['keyword', 'id', 'label', 'description', 'hit']
    # Collect plain tuples and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []

    count = 0
    try:
        for k in tqdm(keyword_list):
            hits = search_wikidata(k)
            rows.extend(
                (k, entity_id, label, description, rank)
                for rank, (entity_id, label, description) in enumerate(hits)
            )
            count += 1
    except (Exception, KeyboardInterrupt) as err:
        # Best effort: keep the partial result, but report why the loop stopped.
        print('search stopped early:', repr(err))

    print(count)
    return pd.DataFrame(rows, columns=columns)

In [9]:
# obtain the search results, either from the csv file or from wikidata

search_result_path = os.path.join(data_path, 'search_result.csv')

if not load_search:
    # run the searches and store the hits in the csv file
    searchedIds = get_keywords_from_wikidata()
    searchedIds.to_csv(search_result_path)
else:
    # read the stored hits; the first csv column is the row index
    searchedIds = pd.read_csv(search_result_path, index_col=0)


In [10]:
# check results: display the table of search hits
searchedIds

Unnamed: 0,keyword,id,label,description,hit
0,erratum,Q1348305,erratum,published correction to a previous publication,0
1,erratum,Q28948209,erratum,scientific article (publication date: 30 Augus...,1
2,erratum,Q58375287,erratum,scientific article published on 01 July 1992,2
3,erratum,Q56084196,erratum,,3
4,erratum,Q58937358,erratum,,4
...,...,...,...,...,...
31275,mediator,Q14886885,mediator complex,A protein complex that interacts with the carb...,0
31276,mediator,Q1666223,intermediary,third party that offers intermediation service...,1
31277,mediator,Q4859473,mediator,profession,2
31278,mediator,Q421695,benfluorex,chemical compound used as diabetes drug,3


In [11]:
# the label of a hit does not always equal the keyword that was searched
label_differs = searchedIds['keyword'] != searchedIds['label']
searchedIds[label_differs]

Unnamed: 0,keyword,id,label,description,hit
10,correction,Q1348305,erratum,published correction to a previous publication,0
11,correction,Q45203135,retraction,"act of withdrawing, refuting, or reversing an ...",1
12,correction,Q3633148,hajipur lok sabha constituency,Lok Sabha Constituency in Bihar,2
14,correction,Q40357,prison,place in which people legally are physically c...,4
32,introduction,Q28764881,introduction. taxonomy for the twenty-first ce...,scientific article,2
...,...,...,...,...,...
31257,medical sciences,Q26332849,danish council for independent research | medi...,,2
31275,mediator,Q14886885,mediator complex,A protein complex that interacts with the carb...,0
31276,mediator,Q1666223,intermediary,third party that offers intermediation service...,1
31278,mediator,Q421695,benfluorex,chemical compound used as diabetes drug,3


In [12]:
# filter entries: keep a row if its keyword is not in multiples,
# or if its wikidata id is one of the search hits
keyword_not_in_multiples = ~df_keywords_wiki['keyword'].isin(multiples.index)
id_in_search_hits = df_keywords_wiki.index.isin(searchedIds['id'])
keywords_wiki_small = df_keywords_wiki[keyword_not_in_multiples | id_in_search_hits]
keywords_wiki_small

Unnamed: 0_level_0,keyword,parent0
wikidata,Unnamed: 1_level_1,Unnamed: 2_level_1
P5483,hardness,"[""Q22963600"", ""Q22981316""]"
Q28578,appetite,
P5520,toughness,"[""Q22963600"", ""Q22981316""]"
Q2878286,squamous cell carcinoma,
Q2880099,femur,
...,...,...
Q9195957,network dynamics,
Q9251921,magnetic shielding,
Q9282312,marine fungi,
Q28469711,first,


In [13]:
# write the filtered entries to keyword_wikidata_small.csv
keyword_wiki_small_path = os.path.join(data_path, 'keyword_wikidata_small.csv')
keywords_wiki_small.to_csv(path_or_buf=keyword_wiki_small_path)

In [14]:
# original number of entries (non-null values per column)
df_keywords_wiki.count()

keyword    231885
parent0    223991
dtype: int64

In [15]:
# filtered number of entries (non-null values per column)
keywords_wiki_small.count()

keyword    61221
parent0    56360
dtype: int64