# Build the dataset

---

## Define filepaths

In [1]:
# root_dir is resolved relative to the current working directory;
# data_dir is joined onto root_dir when the output file paths are built.
root_dir, data_dir = '../..', 'data/corpus'

---
## Load the data

In [2]:
import nltk
# Fetch the Reuters corpus into the local NLTK data directory
# (the recorded run reports the package as already up to date).
nltk.download('reuters')
# Corpus reader used below for file ids, categories and raw text.
from nltk.corpus import reuters

[nltk_data] Downloading package reuters to /home/nvidia/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [3]:
# Headline numbers for the corpus: how many files and how many categories.
num_instances = len(reuters.fileids())
num_categories = len(reuters.categories())
print(f'The Reuters dataset contains {num_instances} instances classified into {num_categories} topics')

The Reuters dataset contains 10788 instances classified into 90 topics


In [4]:
from pprint import pprint

In [5]:
# Number of documents filed under each Reuters category.
category_to_size = dict(
    (category, len(reuters.fileids(category)))
    for category in reuters.categories()
)
# (category, size) pairs ordered from the most to the least populated category.
sorted_categories = sorted(
    category_to_size.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [6]:
# Show every (category, document count) pair, largest category first.
pprint(sorted_categories)

[('earn', 3964),
 ('acq', 2369),
 ('money-fx', 717),
 ('grain', 582),
 ('crude', 578),
 ('trade', 485),
 ('interest', 478),
 ('ship', 286),
 ('wheat', 283),
 ('corn', 237),
 ('dlr', 175),
 ('money-supply', 174),
 ('oilseed', 171),
 ('sugar', 162),
 ('coffee', 139),
 ('gnp', 136),
 ('gold', 124),
 ('veg-oil', 124),
 ('soybean', 111),
 ('bop', 105),
 ('nat-gas', 105),
 ('livestock', 99),
 ('cpi', 97),
 ('cocoa', 73),
 ('reserves', 73),
 ('carcass', 68),
 ('jobs', 67),
 ('copper', 65),
 ('cotton', 59),
 ('rice', 59),
 ('yen', 59),
 ('alum', 58),
 ('gas', 54),
 ('iron-steel', 54),
 ('ipi', 53),
 ('barley', 51),
 ('meal-feed', 49),
 ('rubber', 49),
 ('palm-oil', 40),
 ('sorghum', 34),
 ('zinc', 34),
 ('pet-chem', 32),
 ('tin', 30),
 ('lead', 29),
 ('silver', 29),
 ('wpi', 29),
 ('orange', 27),
 ('rapeseed', 27),
 ('strategic-metal', 27),
 ('soy-meal', 26),
 ('retail', 25),
 ('soy-oil', 25),
 ('fuel', 23),
 ('hog', 22),
 ('housing', 20),
 ('heat', 19),
 ('income', 16),
 ('lumber', 16),
 ('su

## Filter documents by topics

We keep only the 20 most common topics (ranked by number of documents) and the documents assigned to at least one of them.

In [7]:
# How many of the most populated categories to keep.
num_topics = 20

In [8]:
# Names of the first `num_topics` categories in the size-ranked list (sizes are dropped).
top_pairs = sorted_categories[:num_topics]
selected_categories = [name for name, _size in top_pairs]
selected_categories

['earn',
 'acq',
 'money-fx',
 'grain',
 'crude',
 'trade',
 'interest',
 'ship',
 'wheat',
 'corn',
 'dlr',
 'money-supply',
 'oilseed',
 'sugar',
 'coffee',
 'gnp',
 'gold',
 'veg-oil',
 'soybean',
 'bop']

In [9]:
# Despite the name this is a list: it accumulates one
# {'file_id': ..., 'text': ...} record per selected document.
reuters_dict = []

In [10]:
# Collect every document that belongs to at least one selected category.
# A document may carry several categories, so ids are de-duplicated while
# keeping the order in which they are first encountered.
doc_list = []
seen_doc_ids = set()  # constant-time membership test instead of scanning doc_list
reuters_dict = []     # reset here so that re-running this cell does not append duplicate records

for cat in selected_categories:
    docs_ids = reuters.fileids(cat)
    for doc_id in docs_ids:
        if doc_id in seen_doc_ids:
            continue
        seen_doc_ids.add(doc_id)
        doc_list.append(doc_id)
        reuters_dict.append({'file_id': doc_id,
                             'text': reuters.raw(doc_id)})

In [11]:
import pandas as pd

In [12]:
# One row per collected document, with columns 'file_id' and 'text'.
reuters_df = pd.DataFrame(data=reuters_dict)
reuters_df.head(5)

Unnamed: 0,file_id,text
0,test/14859,AMATIL PROPOSES TWO-FOR-FIVE BONUS SHARE ISSUE...
1,test/14860,BOWATER 1986 PRETAX PROFITS RISE 15.6 MLN STG\...
2,test/14872,BOWATER INDUSTRIES PROFIT EXCEED EXPECTATIONS\...
3,test/14873,CITIBANK NORWAY UNIT LOSES SIX MLN CROWNS IN 1...
4,test/14875,VIEILLE MONTAGNE SAYS 1986 CONDITIONS UNFAVOUR...


In [13]:
# (rows, columns): number of retained documents and number of fields.
reuters_df.shape

(9848, 2)

---
## Retrieve labels for each text

First, convert all texts to lowercase.

In [14]:
# Lowercase every document text; the 'text' column is overwritten in place.
lowercased_text = reuters_df['text'].map(str.lower)
reuters_df['text'] = lowercased_text

In [15]:
# For each document, look up all of its Reuters categories and keep only those
# that are among the selected ones, in the order the corpus reader returns them.
reuters_df['labels'] = reuters_df['file_id'].map(
    lambda file_id: [
        category
        for category in reuters.categories(file_id)
        if category in selected_categories
    ]
)

In [16]:
# Preview the frame now that texts are lowercased and labels are attached.
reuters_df.head()

Unnamed: 0,file_id,text,labels
0,test/14859,amatil proposes two-for-five bonus share issue...,[earn]
1,test/14860,bowater 1986 pretax profits rise 15.6 mln stg\...,[earn]
2,test/14872,bowater industries profit exceed expectations\...,[earn]
3,test/14873,citibank norway unit loses six mln crowns in 1...,[earn]
4,test/14875,vieille montagne says 1986 conditions unfavour...,[earn]


---
## Convert the data to the `TrainingCorpus` format

In [17]:
# Dictionary that is filled field by field below and then dumped to a JSON file.
reuters_to_json = {}

### Define the `docs` field

In [18]:
# Document identifiers are the DataFrame's row index values, not the Reuters file ids.
reuters_to_json['docs'] = reuters_df.index.tolist()

### Define the `texts` field

In [19]:
# Lowercased document texts, in the same row order as 'docs'.
reuters_to_json['texts'] = reuters_df['text'].tolist()

### Define the `tokens` field

In [20]:
import os
import sys

In [21]:
# Directory (under root_dir) that is added to sys.path so `training` can be imported.
src_dir = 'src'

In [22]:
# Make the project's source directory importable from this notebook.
src_path = os.path.join(root_dir, src_dir)
# Only add it once, so re-running this cell leaves sys.path unchanged.
if src_path not in sys.path:
    sys.path.append(src_path)

In [23]:
from training import TrainingCorpus

In [24]:
# Run the project's tokenizer over every text; the result is one entry per document,
# in the same row order as 'docs' and 'texts'.
reuters_to_json['tokens'] = (
    reuters_df['text']
    .map(TrainingCorpus.tokenize)
    .tolist()
)

### Define the `labels` field

In [25]:
# The label vocabulary: the selected category names in alphabetical order.
reuters_to_json['labels'] = sorted(selected_categories)

### Define the `target` field

In [26]:
# Mapping from DataFrame row index to that document's list of selected labels.
# NOTE(review): json.dump writes dictionary keys as strings, so these integer
# keys become strings in the file while 'docs' keeps integers; confirm that
# TrainingCorpus.load handles this.
reuters_to_json['target'] = reuters_df['labels'].to_dict()

---
## Save to JSON file

In [27]:
# Output location of the corpus file: <root_dir>/<data_dir>/reuters_corpus.json
dataset_filename = 'reuters_corpus.json'
dataset_filepath = os.path.join(
    os.path.join(root_dir, data_dir),
    dataset_filename,
)

In [28]:
import json

In [29]:
# Serialise the assembled dictionary to dataset_filepath, replacing any existing file.
with open(dataset_filepath, mode='w') as json_file:
    json.dump(obj=reuters_to_json, fp=json_file)

---
## Open the dataset as an instance of the `TrainingCorpus` class

In [30]:
# Re-open the JSON file written above through the project's TrainingCorpus class.
reuters_corpus = TrainingCorpus()
# load() is called for its effect on the instance; its return value is not used here.
reuters_corpus.load(dataset_filepath)

In [31]:
# Spot-check: first 300 characters of the text stored for document 0.
reuters_corpus.get_text(0)[:300]

'amatil proposes two-for-five bonus share issue\n  amatil ltd &lt;amaa.s> said it proposes to\n  make a two-for-five bonus issue out of its revaluation reserve\n  to shareholders registered may 26.\n      shareholders will be asked to approve the issue and an\n  increase in authorised capital to 175 mln s'

In [32]:
# Spot-check: first 15 tokens stored for document 0.
reuters_corpus.get_tokens(0)[:15]

['amatil',
 'proposes',
 'two-for-five',
 'bonus',
 'share',
 'issue',
 'amatil',
 'ltd',
 'lt',
 'amaa',
 'said',
 'proposes',
 'make',
 'two-for-five',
 'bonus']

---
## Compute noun chunks

In [33]:
# Output location of the noun-chunk file: <root_dir>/<data_dir>/reuters_chunks.json
chunks_filename = 'reuters_chunks.json'
chunks_filepath = os.path.join(
    os.path.join(root_dir, data_dir),
    chunks_filename,
)

In [34]:
# Slow step: the recorded run needed roughly four minutes for the 9848 documents.
reuters_corpus.detect_chunks()

100%|██████████| 9848/9848 [04:15<00:00, 38.60it/s]


In [35]:
# Hand the chunk data to TrainingCorpus.save_chunks, targeting chunks_filepath.
reuters_corpus.save_chunks(chunks_filepath)

---
## Load chunks

In [37]:
# NOTE(review): presumably this lets later sessions skip the slow detection step
# by reading the saved chunk file instead; confirm against TrainingCorpus.load_chunks.
reuters_corpus.load_chunks(chunks_filepath)

In [38]:
# Peek at the first three entries of the chunk mapping.
# The integer values look like occurrence counts; TODO confirm.
list(reuters_corpus.noun_chunks.items())[:3]

[('amatil', 2), ('two-for-five_bonus_share_issue', 1), ('amatil_ltd', 1)]

In [40]:
# Document 0 rendered as a space-separated string of its chunk-level items
# (multi-word chunks appear joined with underscores in the recorded output).
# NOTE(review): the meaning of threshold=0 is defined in TrainingCorpus; confirm there.
' '.join(reuters_corpus.get_chunk_document(0, threshold=0))

'amatil proposes two-for-five_bonus_share_issue amatil_ltd lt_amaa said proposes make two-for-five_bonus_issue revaluation_reserve shareholders registered may 26. shareholders asked approve issue increase authorised_capital 175_mln_shares 125 mln general_meeting may 1 said statement new_shares rank dividends declared october 31. amatil b. a. t. industries plc lt bti l holds 41_pct_stake said expect maintain latest_annual_dividend_rate 29_cents share enlarged_capital'

In [42]:
# Full stored text of document 0, for comparison with its chunked rendering.
reuters_corpus.get_text(0)

'amatil proposes two-for-five bonus share issue\n  amatil ltd &lt;amaa.s> said it proposes to\n  make a two-for-five bonus issue out of its revaluation reserve\n  to shareholders registered may 26.\n      shareholders will be asked to approve the issue and an\n  increase in authorised capital to 175 mln shares from 125 mln\n  at a general meeting on may 1, it said in a statement.\n      the new shares will rank for dividends declared after\n  october 31. amatil, in which b.a.t. industries plc &lt;bti.l>\n  holds a 41 pct stake, said it does not expect to maintain its\n  latest annual dividend rate of 29 cents a share on the enlarged\n  capital.\n  \n\n'

---

## Get average document length

In [43]:
# Mean number of tokens per document over the whole corpus.
tokens_per_doc = [
    len(reuters_corpus.get_tokens(doc_id))
    for doc_id in reuters_corpus.docs
]
sum(tokens_per_doc) / len(reuters_corpus.docs)

90.64429325751422

---