# Write the Wikidata dataset to Excel

## Load the data

In [1]:
# Path fragments used throughout the notebook. `src_dir` and `data_dir`
# are joined onto `root_dir`, which is itself a relative path.
data_dir = 'data/corpus'
src_dir = 'src'
root_dir = '../..'

In [2]:
# Version tag interpolated into the corpus, chunks and Excel file names.
version = 'v2'

In [3]:
import os
import sys

In [4]:
# Make the project's source directory importable from this notebook.
# The membership check keeps the cell idempotent: without it, every
# re-execution would append another duplicate entry to sys.path.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [5]:
from training import TrainingCorpus

In [6]:
# Names of the two input files for the selected corpus version.
dataset_filename = f'wikidata_corpus_{version}.json'
chunks_filename = f'wikidata_chunks_{version}.json'

In [7]:
# Resolve the data directory once, then load the documents followed by
# their chunks into a fresh TrainingCorpus.
corpus_dir = os.path.join(root_dir, data_dir)
corpus = TrainingCorpus()
corpus.load(os.path.join(corpus_dir, dataset_filename))
corpus.load_chunks(os.path.join(corpus_dir, chunks_filename))

In [8]:
# Sanity check: display the text stored for document 0.
corpus.get_text(0)

'Stockholm city capital of Sweden'

In [9]:
# Display the tokens of document 0 (the captured output is lowercased and omits 'of').
corpus.get_tokens(0)

['stockholm', 'city', 'capital', 'sweden']

In [10]:
# Display the chunk document of document 0 with threshold=0.
# NOTE(review): the meaning of `threshold` is defined by TrainingCorpus — confirm there.
corpus.get_chunk_document(0, threshold=0)

['stockholm', 'city', 'capital', 'sweden']

---

## Build a DataFrame

In [11]:
import pandas as pd

In [12]:
# One record per document: its id, the chunk tokens joined into a single
# space-separated string, and the first entry of the document's target.
records = [
    {
        'doc_id': doc_id,
        'chunk_doc': ' '.join(corpus.get_chunk_document(doc_id, threshold=0)),
        'label': corpus.target[doc_id][0],
    }
    for doc_id in corpus.docs
]
df = pd.DataFrame(records)

In [13]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,stockholm city capital sweden,Q1754
1,1,stockholm capital capital sweden,Q1754
2,2,sthlm city city stretches across fourteen_isla...,Q1754
3,3,sthlm capital city stretches across fourteen_i...,Q1754
4,4,stockholm big city hosts annual nobel prize ce...,Q1754


---

## Write to Excel

In [14]:
# Name of the output workbook, tagged with the corpus version.
filename = f'wikidata_corpus_{version}.xlsx'

In [15]:
# Write the DataFrame into the corpus data directory as an Excel workbook.
# NOTE(review): to_excel is called with its defaults, so the row index is
# written as an extra unnamed column next to `doc_id`; pass index=False if
# that column is not wanted — confirm with whoever reads the file.
excel_path = os.path.join(root_dir, data_dir, filename)
df.to_excel(excel_path)

---