# Write dataset to Excel

## Load the data

In [1]:
# Path configuration. `root_dir` is the base that `src_dir` and `data_dir`
# are joined onto when importing project code and reading/writing data files.
root_dir = '../..'
src_dir = 'src'
data_dir = 'data/corpus'

In [2]:
import os
import sys

In [3]:
# Add <root_dir>/<src_dir> to the import search path so that modules located
# there can be imported by the following cells.
sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
from training import TrainingCorpus

In [5]:
# Dataset prefix; the input and output file names are derived from it.
dataset_name = 'nyt'

In [6]:
# Input file names derived from the dataset prefix.
dataset_filename = f'{dataset_name}_corpus.json'  # passed to corpus.load
chunks_filename = f'{dataset_name}_chunks.json'  # passed to corpus.load_chunks

In [7]:
# Resolve the data directory once, then load the documents and their chunks
# into a fresh TrainingCorpus.
corpus_dir = os.path.join(root_dir, data_dir)
corpus = TrainingCorpus()
corpus.load(os.path.join(corpus_dir, dataset_filename))
corpus.load_chunks(os.path.join(corpus_dir, chunks_filename))

In [8]:
# Sanity check: text of document 0.
corpus.get_text(0)

"panel of new york state judges hear arguments about whether former sen guy j velella of bronx should be returned to prison, from which he was released by mayoral panel after serving three months of one-year sentence for conspiring to accept bribes; photo (m).appeals court weighs velella's return to jail"

In [9]:
# Token list for document 0.
corpus.get_tokens(0)

['panel',
 'new',
 'york',
 'state',
 'judges',
 'hear',
 'arguments',
 'whether',
 'former',
 'sen',
 'guy',
 'j',
 'velella',
 'bronx',
 'returned',
 'prison',
 'released',
 'mayoral',
 'panel',
 'serving',
 'three',
 'months',
 'one-year',
 'sentence',
 'conspiring',
 'accept',
 'bribes',
 'photo',
 'appeals',
 'court',
 'weighs',
 'velella',
 'return',
 'jail']

In [10]:
# Chunked view of document 0: in the output below, multi-word chunks appear as
# single underscore-joined tokens.
# NOTE(review): the meaning of `threshold` is defined by TrainingCorpus — confirm there.
corpus.get_chunk_document(0, threshold=0)

['panel',
 'new_york_state_judges',
 'hear',
 'arguments',
 'whether',
 'former_sen_guy_j_velella',
 'bronx',
 'returned',
 'prison',
 'released',
 'mayoral_panel',
 'serving',
 'three_months',
 'one-year_sentence',
 'conspiring',
 'accept',
 'bribes',
 'photo',
 'appeals_court',
 'weighs',
 'velella_return',
 'jail']

---

## Build a DataFrame

In [11]:
import pandas as pd

In [12]:
# One record per document: its id, the chunked document flattened to a single
# space-separated string, and the first entry of its target list as the label.
records = []
for doc_id in corpus.docs:
    chunk_tokens = corpus.get_chunk_document(doc_id, threshold=0)
    records.append({
        'doc_id': doc_id,
        'chunk_doc': ' '.join(chunk_tokens),
        'label': corpus.target[doc_id][0],
    })

# Build the frame in one go from the collected records.
df = pd.DataFrame(records)

In [13]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,panel new_york_state_judges hear arguments whe...,Q11201
1,1,prof_stephen_gillers op-ed_article says florid...,Q11201
2,2,seabury start sup ct action bring hastings com...,Q11201
3,3,3-judge_fed_panel_rules nys election_law would...,Q11201
4,4,federal_appeals_court_judges ruled california_...,Q11201


---

## Write to Excel

In [14]:
# Output workbook name, derived from the dataset prefix.
filename = f'{dataset_name}_corpus.xlsx'

In [16]:
# Write the frame into the same directory the input corpus files were read from.
# NOTE(review): with pandas' default `index=True`, the row index is written as
# an extra leading column in the sheet — confirm that downstream readers expect it.
output_path = os.path.join(root_dir, data_dir, filename)
df.to_excel(output_path)

---