# Wikidata

Build a `TrainingCorpus`-compliant dataset from a Wikidata dataset

---
## Define filepaths

In [1]:
# Repository root (relative to this notebook) and the corpus data folder inside it
root_dir, data_dir = '../..', 'data/corpus'

In [None]:
# Version tag interpolated into the names of the corpus and chunks JSON files written below
version = 'v1'

---
## Load the dataset

In [2]:
import os
import pandas as pd

In [3]:
# Name and full location of the source spreadsheet
filename = 'wikisample.xlsx'
corpus_dir = os.path.join(root_dir, data_dir)
filepath = os.path.join(corpus_dir, filename)

In [4]:
# Read the spreadsheet, using its first column as the row index, and preview it
df = pd.read_excel(io=filepath, index_col=0)
df.head(n=5)

Unnamed: 0,entity,name,class,text,target
0,Q64,Berlin,city-state,capital and largest city of Germany,Berlin (capital and largest city of Germany)
1,Q64,Berlin,city with millions of inhabitants,Its economy is based on high-tech firms and th...,Berlin (capital and largest city of Germany)
2,Q64,Berlin,federal capital,"Significant industries also include IT, pharma...",Berlin (capital and largest city of Germany)
3,Q64,Berlin,city with millions of inhabitants,capital and largest city of Germany,Berlin (capital and largest city of Germany)
4,Q64,Berlin,urban municipality of Germany,East Berlin was declared capital of East Germa...,Berlin (capital and largest city of Germany)


In [5]:
# (number of rows, number of columns)
(len(df.index), len(df.columns))

(120, 5)

In [6]:
# One group per distinct value of the `target` column
grouped_df = df.groupby(by='target')

In [7]:
# Print a two-line header, then display the number of rows in each target group
print('Listing all clusters:', '---------------------', sep='\n')
grouped_df.size()

Listing all clusters:
---------------------


target
Berlin (borough in Camden County, New Jersey, United States)                   20
Berlin (capital and largest city of Germany)                                   20
Berlin (city in Coos County, New Hampshire, USA)                               20
Berlin (city in Green Lake and Waushara counties, Wisconsin, United States)    20
Berlin (town in Connecticut)                                                   20
Berlin (town in Maryland, United States)                                       20
dtype: int64

In [8]:
# The per-group size Series has one entry per cluster
cluster_sizes = grouped_df.size()
print(f'There are {cluster_sizes.shape[0]} clusters')

There are 6 clusters


Add a `description` column by concatenating `name`, `class` and `text`

In [9]:
# Build the free-text description as "<name> <class> <text>".
# Missing cells are replaced by empty strings (and every part is cast to str)
# because string concatenation propagates NaN: a single missing value would
# turn the whole description into NaN, which then breaks the `.lower()` call
# applied to each description when the tokens are computed.
description_parts = df[['name', 'class', 'text']].fillna('').astype(str)
df['description'] = (
    description_parts['name']
    + ' ' + description_parts['class']
    + ' ' + description_parts['text']
)
df['description'].head()

0    Berlin city-state capital and largest city of ...
1    Berlin city with millions of inhabitants Its e...
2    Berlin federal capital Significant industries ...
3    Berlin city with millions of inhabitants capit...
4    Berlin urban municipality of Germany East Berl...
Name: description, dtype: object

---
## Convert the data to the `TrainingCorpus` format

In [10]:
# Container for the corpus fields (docs, texts, tokens, labels, target) filled in below
data_dict = dict()

### Define the `docs` field

In [11]:
# Document ids are the row index values of the dataframe
data_dict['docs'] = list(df.index)

### Define the `texts` field

In [12]:
# One raw text per document, in row order
data_dict['texts'] = list(df['description'])

### Define the `tokens` field

In [13]:
# Directory (relative to root_dir) that is appended to sys.path before importing `training`
src_dir = 'src'

In [14]:
import sys

# Make the project's source directory importable. The membership check keeps
# the cell idempotent: re-running it does not append the same entry to
# sys.path again.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

from training import TrainingCorpus

In [15]:
# Lower-case every description and split it with the corpus tokenizer
data_dict['tokens'] = [
    TrainingCorpus.tokenize(description.lower())
    for description in df['description']
]

### Define the `labels` field

In [16]:
# Distinct entity ids, in sorted order
sorted(df['entity'].unique())

['Q1086827', 'Q1569850', 'Q614184', 'Q64', 'Q821199', 'Q821244']

In [17]:
# The label set is the sorted list of distinct entity ids
data_dict['labels'] = sorted(df['entity'].unique())

### Define the `target` field

In [18]:
# Map each document id to a single-element list holding its entity id
data_dict['target'] = {
    doc_id: [entity_id]
    for doc_id, entity_id in df['entity'].items()
}

---
## Save to JSON file

In [19]:
# Versioned name and full location of the corpus JSON file
filename = f'wikidata_corpus_{version}.json'
corpus_dir = os.path.join(root_dir, data_dir)
filepath = os.path.join(corpus_dir, filename)

In [20]:
import json

In [21]:
# Serialize the assembled corpus dictionary and write it to the JSON file
with open(filepath, mode='w') as corpus_file:
    corpus_file.write(json.dumps(data_dict))

---
## Open the dataset as an instance of the `TrainingCorpus` class

In [22]:
# Create a TrainingCorpus and load the file at `filepath` into it
# (when the cells are run in order, `filepath` is the corpus JSON written above)
wikidata_corpus = TrainingCorpus()
wikidata_corpus.load(filepath)

In [23]:
# Sanity check: show the text stored for document id 0
wikidata_corpus.get_text(0)

'Berlin city-state capital and largest city of Germany'

In [24]:
# Sanity check: show the token list stored for document id 0
wikidata_corpus.get_tokens(0)

['berlin', 'city-state', 'capital', 'largest', 'city', 'germany']

---
## Compute noun chunks

In [25]:
# Versioned name and full location of the noun-chunks JSON file
chunks_filename = f'wikidata_chunks_{version}.json'
corpus_dir = os.path.join(root_dir, data_dir)
chunks_filepath = os.path.join(corpus_dir, chunks_filename)

In [26]:
# Display the resolved path of the chunks file
chunks_filepath

'../../data/corpus/wikidata_chunks_v1.json'

In [27]:
# Run noun-chunk detection on the loaded corpus
# (implemented in TrainingCorpus, not visible here; the recorded run showed a progress bar)
wikidata_corpus.detect_chunks()

100%|██████████| 120/120 [00:02<00:00, 56.95it/s]


In [28]:
# Save the chunks to `chunks_filepath` (presumably those detected by detect_chunks() — confirm in TrainingCorpus)
wikidata_corpus.save_chunks(chunks_filepath)

---
## Load chunks

In [29]:
# Load the chunks from `chunks_filepath` into the corpus object
wikidata_corpus.load_chunks(chunks_filepath)

In [30]:
# Preview the first three (chunk, value) pairs held in `noun_chunks`
chunk_items = list(wikidata_corpus.noun_chunks.items())
chunk_items[:3]

[('Berlin_city-state_capital', 1), ('largest_city', 7), ('Germany', 10)]

In [31]:
# Chunk-based view of document id 0
# NOTE(review): `threshold=0` presumably filters chunks by a minimum count — confirm in TrainingCorpus
wikidata_corpus.get_chunk_document(0, threshold=0)

['berlin', 'city-state', 'capital', 'largest_city', 'germany']

In [32]:
# Text of document id 0, for comparison with the chunk document shown above
wikidata_corpus.get_text(0)

'Berlin city-state capital and largest city of Germany'

In [33]:
# Token list of document id 0, for comparison with the chunk document shown above
wikidata_corpus.get_tokens(0)

['berlin', 'city-state', 'capital', 'largest', 'city', 'germany']

---