In [1]:
# Overview of the nltk.corpus.reuters dataset and its API.
# The quoted block below is reference material only: it is a bare string
# literal, so none of the calls inside it are executed by this cell.

'''
import nltk
from nltk.corpus import reuters
nltk.download('reuters')  # Download the corpus if not already done

reuters.fileids()               # list of all document IDs (file names)
reuters.categories()           # list of all topic categories
reuters.fileids(categories='earn')  # file IDs of docs in the 'earn' category
reuters.categories(fileids='test/14826')  # categories of a specific doc
reuters.raw(fileids='test/14826')         # full raw text of a document
reuters.words(fileids='test/14826')       # tokenized words
'''

# Some documents in the Reuters corpus are multi-labeled.
# To see how many labels each document has:
# [len(reuters.categories(fileids=fileid)) for fileid in reuters.fileids()]
#
# For example, this document has seven labels:
# reuters.categories(fileids=reuters.fileids()[3])

"\nimport nltk\nfrom nltk.corpus import reuters\nnltk.download('reuters')  # Download the corpus if not already done\n\nreuters.fileids()               # list of all document IDs (file names)\nreuters.categories()           # list of all topic categories\nreuters.fileids(categories='earn')  # file IDs of docs in the 'earn' category\nreuters.categories(fileids='test/14826')  # categories of a specific doc\nreuters.raw(fileids='test/14826')         # full raw text of a document\nreuters.words(fileids='test/14826')       # tokenized words\n"

In [2]:
import nltk
from nltk.corpus import reuters
import gensim
from gensim.corpora import Dictionary
# from gensim.utils import simple_preprocess
from collections import Counter

# Fetch the NLTK data packages this notebook relies on; packages that are
# already up to date are reported as such in the download log.
nltk.download('reuters')  # the Reuters corpus read through nltk.corpus.reuters
nltk.download('punkt')    # NOTE(review): presumably required by the tokenizer used in preprocess — confirm

[nltk_data] Downloading package reuters to /home/zheng/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/zheng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# First two filtering stages:
#
# 1. keep only single-labeled documents;
# 2. of those, keep only documents whose label is one of the six most
#    frequent single-label categories.

# Stage 1: collect every document that carries exactly one category.
# `single_labels[i]` is the label of `single_labeled_fileids[i]`.
single_labeled_fileids = []
single_labels = []

for fileid in reuters.fileids():
    labels = reuters.categories(fileid)        # list of category name(s)
    if len(labels) == 1:                       # keep documents with exactly one label
        single_labeled_fileids.append(fileid)
        single_labels.append(labels[0])

# Count how often each label occurs among the single-labeled documents.
label_counts = Counter(single_labels)

# The six most frequent labels, most frequent first.
top_6_categories = [label for label, _ in label_counts.most_common(6)]
print("Top 6 single-label categories:", top_6_categories)

# Stage 2: keep the single-labeled documents whose label is in the top six.
# The labels gathered in stage 1 are index-aligned with the file ids, so they
# are reused here instead of querying the corpus a second time per document.
filtered_fileids = []
filtered_labels = []

for fileid, label in zip(single_labeled_fileids, single_labels):
    if label in top_6_categories:
        filtered_fileids.append(fileid)
        filtered_labels.append(label)

# Number of documents that survive both filters.
len(filtered_fileids)

Top 6 single-label categories: ['earn', 'acq', 'crude', 'trade', 'money-fx', 'interest']


7496

In [4]:
# Note: some documents contain only metadata (such as timestamps) and no
# meaningful body text; this one is an example.
reuters.raw(filtered_fileids[69])

' 8-APR-1987 11:06:39.06\n   8-APR-1987 11:06:39.06\n\n'

In [5]:
import os
import sys

# Make the parent directory importable so the project-local `preprocess`
# module can be found. The membership check keeps this cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Project-local helpers. `preprocess` replaces an earlier helper that wrapped
# gensim's simple_preprocess(doc, deacc=True).
from preprocess import preprocess, dictionarize, bow2coo

# Third filtering: preprocess every remaining document and drop those that
# end up with no tokens at all (e.g. metadata-only documents).
# `texts` is a list of token lists (list of lists of strings), and
# `final_fileids[i]` is the file id that produced `texts[i]`.
texts = []
final_fileids = []
for fileid in filtered_fileids:
    text = preprocess(reuters.raw(fileid))
    if len(text) > 0:
        texts.append(text)
        final_fileids.append(fileid)

# Filter the tokens based on an interval, then build the dictionary and the
# bag-of-words representation.
# NOTE(review): the interval bounds are defined inside dictionarize — confirm there.
tokens, tokens_bow, dictionary = dictionarize(texts)

# Convert the bag-of-words representation into a PyTorch COO sparse matrix.
tokens_sparse = bow2coo(tokens_bow, dictionary)



In [6]:
# Notice: after preprocessing, some documents might have no words at all
# texts[69]

In [7]:
# Shape of the sparse bag-of-words matrix built by bow2coo.
# NOTE(review): presumably (number of documents, vocabulary size) — confirm against bow2coo.
tokens_sparse.shape

torch.Size([7460, 4562])

In [8]:
# Inspect the token list of the first document as returned by dictionarize.
tokens[0]

['exporter',
 'fear',
 'damage',
 'trade',
 'friction',
 'japan',
 'raise',
 'fear',
 'many',
 'asia',
 'export',
 'nation',
 'row',
 'far',
 'reach',
 'economic',
 'damage',
 'businessman',
 'official',
 'tell',
 'reuter',
 'correspondent',
 'asian',
 'capital',
 'move',
 'japan',
 'boost',
 'protectionist',
 'sentiment',
 'lead',
 'curb',
 'american',
 'import',
 'product',
 'exporter',
 'conflict',
 'hurt',
 'long',
 'run',
 'short',
 'term',
 'tokyo',
 'loss',
 'gain',
 'impose',
 'dlrs',
 'tariff',
 'import',
 'japanese',
 'electronic',
 'good',
 'april',
 'retaliation',
 'japan',
 'allege',
 'failure',
 'stick',
 'pact',
 'sell',
 'semiconductor',
 'world',
 'market',
 'cost',
 'japanese',
 'estimate',
 'put',
 'impact',
 'tariff',
 'dlrs',
 'spokesman',
 'major',
 'electronic',
 'firm',
 'virtually',
 'halt',
 'export',
 'product',
 'hit',
 'new',
 'taxis',
 'able',
 'business',
 'spokesman',
 'lead',
 'japanese',
 'electronic',
 'firm',
 'electric',
 'industrial',
 'ltd',
 'tar

In [9]:
# Sanity check: every document should still contain at least one token.
# Reports only the first offending document; prints nothing when all is well.

row = next(
    (idx for idx, words in enumerate(tokens) if len(words) == 0),
    None,
)
if row is not None:
    print(f"doc {row} has no words!")

In [10]:
# label_to_id = {label: idx for idx, label in enumerate(top_6_categories)}
# doc_labels = [label_to_id[label] for label in filtered_labels]

In [11]:
# Look up the (single) category of each document that survived all three
# filters, so that final_labels[i] belongs to final_fileids[i].
final_labels = []
for fileid in final_fileids:
    doc_categories = reuters.categories(fileids=fileid)
    final_labels.append(doc_categories[0])

In [12]:
import torch

# Bundle everything a downstream consumer needs into one tuple and persist it.
# The element order is part of the file format: whoever loads the file must
# unpack it in this same order.
reuters_bundle = (
    final_labels,
    tokens,
    dictionary,
    tokens_bow,
    tokens_sparse,
)
torch.save(reuters_bundle, "reuters_coo.pt")