In [1]:
import pandas as pd
import json
import sys
import os
sys.path.append("../")

from utils import file_util
from tqdm import tqdm
from ast import literal_eval
from sklearn.model_selection import train_test_split

In [2]:
# Directory that holds the input "doc.csv" and receives the exported files.
data_dir = "../data/reuters"
# Passed as random_state to train_test_split so the train/held-out assignment is repeatable.
seed = 6
# Passed as test_size to train_test_split (a float here, i.e. the held-out proportion).
test_size = 0.2

In [3]:
# Load the document table. The "topic" column is stored as the string form of a Python
# list (e.g. "['nat-gas', 'crude']"), so it is parsed with literal_eval while reading;
# this replaces a converter keyed on a column that does not appear in the table and a
# separate post-hoc parsing pass.
df = pd.read_csv(os.path.join(data_dir, "doc.csv"), converters={"topic": literal_eval})
df.head()

Unnamed: 0,path,topic,subset,index,content,lead,tin,retail,fuel,propane,...,soy-meal,earn,sun-oil,instal-debt,cotton,heat,trade,dfl,palladium,iron-steel
0,test/14826,[trade],test,14826,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,test/14828,[grain],test,14828,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,test/14829,"[nat-gas, crude]",test,14829,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test/14832,"[rubber, tin, sugar, corn, rice, grain, trade]",test,14832,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,test/14833,"[palm-oil, veg-oil]",test,14833,INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
print("Data shape:", df.shape)
# Seeded draw so the displayed article (and the normalize demo that reuses
# sample_content) is the same on every run.
sample_content = df.sample(1, random_state=seed)["content"].values[0]
print("A sample news content:")
# Cap the printout at 5000 characters.
print(sample_content[:5000])

Data shape: (10788, 95)
A sample news content:
NATIONAL WESTMINSTER BANK SAYS IT CUTTING BASE LENDING RATE TO 10.5 PCT FROM 11 PCT.

  NATIONAL WESTMINSTER BANK SAYS IT CUTTING BASE LENDING RATE TO 10.5 PCT FROM 11 PCT.
  




### Encode labels

In [5]:
# Every column other than these five is treated as a label column.
non_label_columns = ("path", "topic", "subset", "index", "content")
labels = []
for column in df.columns:
    if column in non_label_columns:
        continue
    labels.append(column)

print("- Number of labels:", len(labels))
print("- Labels:")
print(*labels, sep=", ")

- Number of labels: 90
- Labels:
lead, tin, retail, fuel, propane, crude, income, oat, copra-cake, barley, groundnut, cotton-oil, rand, cpi, lei, cocoa, groundnut-oil, jobs, nkr, livestock, castor-oil, palmkernel, money-fx, sunseed, hog, nat-gas, zinc, coconut-oil, gas, rape-oil, gold, orange, pet-chem, wheat, nickel, jet, interest, carcass, bop, l-cattle, potato, rapeseed, sugar, coffee, soy-oil, money-supply, platinum, yen, wpi, ship, soybean, sorghum, lin-oil, dmk, meal-feed, coconut, rice, dlr, alum, oilseed, acq, reserves, ipi, corn, grain, housing, nzdlr, naphtha, strategic-metal, palm-oil, sun-meal, lumber, tea, rye, rubber, gnp, veg-oil, cpu, silver, copper, soy-meal, earn, sun-oil, instal-debt, cotton, heat, trade, dfl, palladium, iron-steel


In [6]:
# Map each label name to its position in `labels`.
label_ids = dict(zip(labels, range(len(labels))))

### Text normalization

Since each news item is split across multiple lines (separated by `\n`), we first concatenate the body lines into a single paragraph.
The first line, which is the capitalized heading, is separated from the body at the first newline `\n` and returned as the title.

In [7]:
def normalize(text):
    """Split a raw news string into a ``(title, content)`` pair.

    The title is the first line with surrounding whitespace removed. The content
    is every remaining line, each stripped and joined with single spaces, then
    stripped once more as a whole.
    """
    # Everything before the first newline is the heading; the rest is the body.
    heading, _, remainder = text.partition("\n")
    body_lines = [line.strip() for line in remainder.split("\n")]
    return heading.strip(), " ".join(body_lines).strip()

# Quick check on the sample drawn earlier: prints the (title, content) tuple.
normalized_sample = normalize(sample_content)
print(normalized_sample)

('NATIONAL WESTMINSTER BANK SAYS IT CUTTING BASE LENDING RATE TO 10.5 PCT FROM 11 PCT.', 'NATIONAL WESTMINSTER BANK SAYS IT CUTTING BASE LENDING RATE TO 10.5 PCT FROM 11 PCT.')


In [8]:
# Replace each raw article string with its (title, content) tuple from normalize().
# Cells that are already tuples are passed through unchanged, so re-running this
# cell on an already-normalized frame does not fail on tuple.split.
df["content"] = df["content"].apply(
    lambda raw: raw if isinstance(raw, tuple) else normalize(raw)
)

## Export

In [9]:
# Hand the label -> id mapping to the project helper for saving.
# NOTE(review): file_util.dump_json is project-local and not shown here; presumably it
# serializes the object as JSON to the given path — confirm.
label_ids_path = os.path.join(data_dir, "document_label_ids.json")
file_util.dump_json(label_ids, label_ids_path)

In [10]:
# Choose the training rows by position; every other row is marked as not-train.
train_doc_ids, _ = train_test_split(range(df.shape[0]), test_size=test_size, random_state=seed)
# A set makes the per-row membership test O(1); testing against the sequence returned
# by train_test_split would rescan it for every row.
train_doc_id_set = set(train_doc_ids)

# One JSON object per line. ensure_ascii=False writes non-ASCII characters as-is, so the
# encoding is pinned to UTF-8 instead of the platform default, and the `with` block
# closes the file even if a row fails to serialize.
with open(os.path.join(data_dir, "data.ndjson"), "w", encoding="utf-8") as f:
    for rid, row in tqdm(df.iterrows(), desc="Writing", total=df.shape[0]):
        # Each "content" cell holds the (title, content) tuple produced by normalize().
        title, content = row["content"]
        sample = {
            "doc_id": row["index"],
            "title": title,
            "content": content,
            "labels": row["topic"],
            # rid is the DataFrame index label; comparing it with positional ids
            # assumes df still has its default 0..n-1 index — TODO confirm.
            "is_train": rid in train_doc_id_set,
        }
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

Writing: 100%|██████████| 10788/10788 [00:02<00:00, 3979.29it/s]
