# raw material stats

`ls > news_dates.txt` **365** (days) of news folders from `19960820` to `19970819`

`find . -type f -name "*.xml" > id.txt` 

`wc -l id.txt` raw material (**rcv1**) contains in all `806791` news files in `.xml`

# data transformation

`.xml` > 
`raw_text.csv`, 
`rel_info.csv`, 
`rel_info.pkl`, 
`sampled-for-bertal.pkl`

# files involved 

**on 45 categories derived from the authors**

`categories_from_author.csv` tables from original authors (sheet1 with code info, sheet2 clear version)

`categories.csv` 45 categories rearranged from hierarchy based on original author's table

`goldilocks_categories.txt` 45 categories rearranged sequentially from hierarchy

**derived from original rcv1 datasets requested from NIST**

`id.txt` paths of rcv1 news files for getting the news_ids 

`rcv1_path.txt` paths of rcv1 news files for parsing paths of news files for parsing 

`rcv1v2_ids.dat` news_ids in rcv1-v2 (total 804414)

**generaed additional files during data processing**

`rcv1_id.txt` news_ids of rcv1

`rcv1v2.pkl`  dataframe of the rcv1v2 (804414, codes rectified)


`rcv1v2_downsampled.txt` 160833 news_ids of the downsampled for experiment


In [None]:
import os
import pandas as pd
import numpy as np
import xml.dom.minidom
from xml.etree import ElementTree as ET
from tqdm import tqdm
from rcv1_v2_processing import getSubDir, get_xml_data, get_xml_csv

In [67]:
rcv1_ids = [line.strip() for line in open("./goldilocks_reproduce/reuters/id.txt").readlines()]

In [8]:
rcv1v2_ids = [line.strip() for line in open("./goldilocks_reproduce/reuters/rcv1v2-ids.dat").readlines()]

In [165]:
rcv1_path = [line.strip() for line in open("./goldilocks_reproduce/reuters/rcv1_path.txt").readlines()]

remove path of other files in `rcv1_path.txt`

In [166]:
rcv1_path.index('19960820')

807169

In [168]:
del rcv1_path[807169: ]

In [176]:
del rcv1_path[-12:]

In [179]:
del rcv1_path[0]

In [170]:
for date in rcv1_dates:
    date_path = './'+date
    if date_path in rcv1_path:
        idx = rcv1_path.index(date_path)
        del rcv1_path[idx]

In [171]:
rcv1_path.sort()

In [181]:
len(rcv1_path)

806791

In [69]:
len(rcv1_ids)

806791

In [182]:
set(rcv1_path) == set(rcv1_ids)

True

In [9]:
len(rcv1v2_ids)

804414

In [93]:
rcv1_ids[0]

'./19961119/200469newsML.xml'

In [94]:
rcv1_id = []
for _id in tqdm(rcv1_ids):
    name = re.search(r"([^/]*$)", _id).group().strip('newsML.xml')
    rcv1_id.append(name)

100%|█████████████████████████████| 806791/806791 [00:00<00:00, 906433.30it/s]


record the doc ids dropped from rcv1 to get rcv1-v2

In [100]:
drop_ids = set(rcv1_id).difference(set(rcv1v2_ids))

In [101]:
len(drop_ids) == 806791 - 804414

True

In [102]:
with open("./goldilocks_reproduce/reuters/drop_ids.txt", "a") as f:
    for _id in drop_ids:
        f.write(_id+'\n')

In [103]:
with open("./goldilocks_reproduce/reuters/rcv1_id.txt", "a") as f:
    for _id in rcv1_id:
        f.write(_id+'\n')

In [122]:
len(drop_ids)

2377

In [223]:
rcv1v2_path = rcv1_path.copy()

In [363]:
_dict = {"news_id":[],"title":[], "text":[],"code":[],"date":[]}

for path in tqdm(rcv1v2_path):
    get_xml_csv(path,_dict)

100%|███████████████████████████████| 804414/804414 [02:36<00:00, 5147.47it/s]


In [None]:
df = pd.DataFrame.from_dict(_dict, orient='index')

In [371]:
df = df.T
df

Unnamed: 0,news_id,title,text,code,date
0,2286,MEXICO: Recovery excitement brings Mexican mar...,Emerging evidence that Mexico's economy was ba...,"MEX,E11,ECAT,M11,M12,MCAT",1996-08-20
1,2287,USA: Chrysler plans new investments in Latin A...,Chrysler Corp. Tuesday announced $380 million ...,"ARG,BRAZ,USA,I24700,I34320,I35101,I36400,C24,CCAT",1996-08-20
2,2288,"USA: CompuServe reports loss, cutting work force.",CompuServe Corp. Tuesday reported a surprising...,"USA,I83940,C15,C151,CCAT,E41,ECAT,GCAT,GJOB",1996-08-20
3,2289,"USA: CompuServe reports loss, cutting work force.",CompuServe Corp. Tuesday reported a surprising...,"USA,I83940,C15,C151,CCAT",1996-08-20
4,2290,USA: Planet Hollywood launches credit card.,If dining at Planet Hollywood made you feel li...,"USA,I66100,I81501,I83954,C11,C22,CCAT",1996-08-20
...,...,...,...,...,...
804409,810592,USA: Teamsters' Carey to hold news conference.,"The president of the Teamsters union, Ron Care...","USA,I79010,C42,CCAT,E41,ECAT,GCAT,GJOB",1997-08-19
804410,810593,USA: UPS says has deal to end Teamsters' strike.,United Parcel Service said on Monday it had re...,"USA,GJOB",1997-08-19
804411,810594,USA: UPS says has tentative deal to end strike.,United Parcel Service said late Monday night i...,"USA,I79010,C42,CCAT,E41,ECAT,GCAT,GJOB",1997-08-19
804412,810595,JAPAN: Asia currency woes hurt region's oil pr...,This year's rash of Asian currency crises come...,"INDON,JAP,MALAY,PHLNS,THAIL,C21,C24",1997-08-19


In [41]:
df = pd.read_pickle('rcv1v2.pkl')

In [42]:
del df["date"]

In [43]:
df["raw"] = df["title"].str.cat(df["text"])

In [44]:
df["len"] = df["raw"].str.len()

In [46]:
del df["title"]

In [47]:
df = df[["news_id", "raw", "len", "code"]]
df

Unnamed: 0,news_id,raw,len,code
0,2286,MEXICO: Recovery excitement brings Mexican mar...,2521,"MEX,E11,ECAT,M11,M12,MCAT"
1,2287,USA: Chrysler plans new investments in Latin A...,4051,"ARG,BRAZ,USA,I24700,I34320,I35101,I36400,C24,CCAT"
2,2288,"USA: CompuServe reports loss, cutting work for...",3048,"USA,I83940,C15,C151,CCAT,E41,ECAT,GCAT,GJOB"
3,2289,"USA: CompuServe reports loss, cutting work for...",3048,"USA,I83940,C15,C151,CCAT"
4,2290,USA: Planet Hollywood launches credit card.If ...,1012,"USA,I66100,I81501,I83954,C11,C22,CCAT"
...,...,...,...,...
804409,810592,USA: Teamsters' Carey to hold news conference....,1152,"USA,I79010,C42,CCAT,E41,ECAT,GCAT,GJOB"
804410,810593,USA: UPS says has deal to end Teamsters' strik...,699,"USA,GJOB"
804411,810594,USA: UPS says has tentative deal to end strike...,1762,"USA,I79010,C42,CCAT,E41,ECAT,GCAT,GJOB"
804412,810595,JAPAN: Asia currency woes hurt region's oil pr...,3941,"INDON,JAP,MALAY,PHLNS,THAIL,C21,C24"


#### genrate `raw_text.csv`

In [92]:
raw_text = df.copy()
del raw_text["len"]
del raw_text["code"]
raw_text.to_csv("./raw_text.csv")

In [35]:
rel_info = df.copy()

In [78]:
for row in tqdm(rel_info.itertuples()):
    for cat in categories:
        if cat in row.code:
            rel_info.loc[row.Index, cat] = True
        else:
            rel_info.loc[row.Index, cat] = False

804414it [13:35, 986.44it/s]


In [88]:
rel_info

Unnamed: 0,news_id,code,ALG,ASIA,BELG,BUL,BURMA,C12,C182,C33,...,I82000,I83100,INDON,ISRAEL,MCDNIA,MEX,POL,RWANDA,SWED,TAIWAN
0,2286,"MEX,E11,ECAT,M11,M12,MCAT",False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,2287,"ARG,BRAZ,USA,I24700,I34320,I35101,I36400,C24,CCAT",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2288,"USA,I83940,C15,C151,CCAT,E41,ECAT,GCAT,GJOB",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2289,"USA,I83940,C15,C151,CCAT",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2290,"USA,I66100,I81501,I83954,C11,C22,CCAT",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804409,810592,"USA,I79010,C42,CCAT,E41,ECAT,GCAT,GJOB",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
804410,810593,"USA,GJOB",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
804411,810594,"USA,I79010,C42,CCAT,E41,ECAT,GCAT,GJOB",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
804412,810595,"INDON,JAP,MALAY,PHLNS,THAIL,C21,C24",False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


#### generate `rel_info.csv`, `rel_info.pkl`

In [89]:
rel_info1 = rel_info.copy()
del rel_info1["code"]
rel_info1

Unnamed: 0,news_id,ALG,ASIA,BELG,BUL,BURMA,C12,C182,C33,CUBA,...,I82000,I83100,INDON,ISRAEL,MCDNIA,MEX,POL,RWANDA,SWED,TAIWAN
0,2286,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,2287,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2288,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2289,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2290,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804409,810592,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
804410,810593,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
804411,810594,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
804412,810595,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [90]:
rel_info1.to_csv("./rel_info.csv")
rel_info1.to_pickle("./rel_info.pkl")

In [9]:
len_stat = list(df["len"])

In [10]:
len(len_stat)

804414

In [15]:
print(np.argmax(np.bincount(len_stat)))
print(np.mean(len_stat))
print(np.median(len_stat))

568
1477.4750762666986
1119.0


In [401]:
set1 = set(df[df.loc[:,'code'].str.contains('CZ') == True].index)

In [402]:
set2 = set(df[df.loc[:,'code'].str.contains('CZREP') == True].index)

In [415]:
df[df.loc[:,'code'].str.contains('GDR') == True].index

Int64Index([133304], dtype='int64')

In [403]:
set1.difference(set2)

{311735, 599226, 705298}

#### rectify codes from rcv1 to rcv1-v2

refer to **3.2.3 Region codes** of ***RCV1: A New Benchmark Collection for Text Categorization ResearchRCV1: A New Benchmark Collection for Text Categorization Research***


`CZ - CANAL ZONE` (one occurrence: 605159), `CZECH - CZECHOSLOVAKIA` (two occurrences: 711540, 316542), and `GDR - EAST GERMANY` (one occurrence: 137254)

into

`PANA(PANAMA)`, `CZREP(CZECH REPUBLIC)`, and `GFR(GERMANY)`

In [416]:
df.loc[311735,"code"] = 'CRTIA,CZREP,HUNG,POL,SLVAK,SLVNIA,M11,MCAT'

In [426]:
df.loc[311735,"news_id"]

'316542'

In [420]:
df.loc[705298,"code"] = 'CZREP,HUNG,NATO,POL,SPAIN,I36400,I83960,C33,C331,CCAT,GCAT,GDEF,GDIP'

In [427]:
df.loc[705298,"news_id"]

'711540'

In [423]:
df.loc[599226,"code"] = 'PANA,USA,I72603,C13,C24,CCAT,GCAT,GENV,GPOL'

In [428]:
df.loc[599226,"news_id"]

'605159'

In [425]:
df.loc[133304,"code"] = 'GFR,POL,SWITZ,I81402,C12,C13,CCAT,GCAT,GCRIM,GDIP'

In [429]:
df.loc[133304,"news_id"]

'137254'

In [386]:
raw_text = pd.read_csv('./rcv1v2.csv')

In [388]:
del raw_text

In [430]:
df.to_pickle("../rcv1v2.pkl")

#### generate `sampled-for-bertal.pkl` for the 45 categories

In [16]:
categories = []

def cat_div(listTemp, n):
    for i in range(0, len(listTemp),n):
        yield listTemp[i:i+n]

with open("./categories.csv")as f:
    for line in f:
        categories.append(line.split("_")[1].strip("\n"))
    rare_hard = categories[:5]
    rare_medium = categories[5:10]
    rare_easy = categories[10:15]
    medium_hard = categories[15:20]
    medium_medium = categories[20:25]
    medium_easy = categories[25:30]
    common_hard = categories[30:35]
    common_medium = categories[35:40]
    common_easy = categories[40:]

In [20]:
categories

['ASIA',
 'I35102',
 'I65100',
 'I41300',
 'I32200',
 'I24000',
 'I32830',
 'I42600',
 'I8150211',
 'RWANDA',
 'ALG',
 'GUAT',
 'BURMA',
 'CUBA',
 'MCDNIA',
 'GENV',
 'I3302021',
 'C182',
 'GPRO',
 'I82000',
 'I22100',
 'E513',
 'I36400',
 'I42900',
 'I41000',
 'FIN',
 'CZREP',
 'BUL',
 'SWED',
 'TAIWAN',
 'I83100',
 'E12',
 'I81501',
 'C33',
 'I81502',
 'C12',
 'BELG',
 'I21000',
 'I1300003',
 'I14000',
 'INDON',
 'MEX',
 'ISRAEL',
 'POL',
 'EEC']

In [21]:
categories_dict= {"rare_hard":rare_hard, "rare_medium":rare_medium, "rare_easy":rare_easy, "medium_hard":medium_hard, "medium_medium":medium_medium, "medium_easy":medium_easy, "common_hard":common_hard, "common_medium":common_medium, "common_easy":common_easy}

In [22]:
categories_dict

{'rare_hard': ['ASIA', 'I35102', 'I65100', 'I41300', 'I32200'],
 'rare_medium': ['I24000', 'I32830', 'I42600', 'I8150211', 'RWANDA'],
 'rare_easy': ['ALG', 'GUAT', 'BURMA', 'CUBA', 'MCDNIA'],
 'medium_hard': ['GENV', 'I3302021', 'C182', 'GPRO', 'I82000'],
 'medium_medium': ['I22100', 'E513', 'I36400', 'I42900', 'I41000'],
 'medium_easy': ['FIN', 'CZREP', 'BUL', 'SWED', 'TAIWAN'],
 'common_hard': ['I83100', 'E12', 'I81501', 'C33', 'I81502'],
 'common_medium': ['C12', 'BELG', 'I21000', 'I1300003', 'I14000'],
 'common_easy': ['INDON', 'MEX', 'ISRAEL', 'POL', 'EEC']}

In [23]:
import pickle
dump_file = open("./categories_dict.pkl", "wb")
pickle.dump(categories_dict, dump_file)
dump_file.close()

<function BufferedWriter.close>

In [25]:
with open('categories_dict.pkl', 'wb') as file:
    pickle.dump(categories_dict, file)

In [489]:
df[df.loc[:,'code'].str.contains('C12') == True].news_id

9           2295
13          2299
19          2305
20          2306
25          2311
           ...  
803765    809945
803875    810056
804139    810320
804395    810577
804398    810581
Name: news_id, Length: 11944, dtype: object

In [490]:
rare_hard

['ASIA', 'I35102', 'I65100', 'I41300', 'I32200']

In [None]:
rare_hard, rare_medium, rare_easy

In [505]:
def cat_id_dict(cat1, cat2, cat3):
    dic = {}
    for cat in tqdm(zip(cat1, cat2, cat3)):
        for item in cat:
            ids = list(df[df.loc[:,'code'].str.contains(item) == True].news_id)
            dic[item] = ids
    return dic

In [508]:
rare = cat_id_dict(rare_hard, rare_medium, rare_easy)
medium = cat_id_dict(medium_hard, medium_medium, medium_easy)
common = cat_id_dict(common_hard, common_medium, common_easy)

5it [00:02,  2.16it/s]
5it [00:02,  1.98it/s]
5it [00:02,  2.03it/s]


In [516]:
id_45cat = set()

def add_ids(cat_id_dict):
    for cat in cat_id_dict.keys():
        id_45cat.update(cat_id_dict[cat])

In [524]:
add_ids(rare)
add_ids(medium)
add_ids(common)

In [525]:
len(id_45cat)

277814

In [540]:
import random

random.seed(123)
id_45cat_list = list(id_45cat)
random.shuffle(id_45cat_list)
down_sampled = id_45cat_list[:160833]

In [541]:
with open("../rcv1v2_downsampled.txt", "a") as f:
    for _id in down_sampled:
        f.write(_id+'\n')

In [547]:
down_sampled_df = df[df.loc[:,'news_id'].isin(down_sampled)].drop(columns=["date"])

In [559]:
down_sampled_df.reset_index(drop=True,inplace=True)

In [560]:
down_sampled_df.head()

Unnamed: 0,news_id,title,text,code
0,2286,MEXICO: Recovery excitement brings Mexican mar...,Emerging evidence that Mexico's economy was ba...,"MEX,E11,ECAT,M11,M12,MCAT"
1,2287,USA: Chrysler plans new investments in Latin A...,Chrysler Corp. Tuesday announced $380 million ...,"ARG,BRAZ,USA,I24700,I34320,I35101,I36400,C24,CCAT"
2,2298,USA: U.S. Federal Reserve holds interest rates...,"By Rich Miller, Economics CorrespondentThe Fed...","USA,E12,ECAT,M13,M131,MCAT"
3,2303,USA: War hero Colin Powell hits road with Dole...,Gulf War hero Colin Powell lent his prestige a...,"USA,GCAT,GPOL"
4,2305,USA: Decision nears on Indiana tobacco lawsuit.,Another liability challenge to the tobacco ind...,"USA,I42900,C12,CCAT,GCAT,GCRIM"


In [142]:
down_index = list(down_sampled_df.index)

In [143]:
len(down_index)

160833

In [145]:
import pickle
dump_file = open("../rcv1_info/sampled-for-bertal.pkl", "wb")
pickle.dump(down_index, dump_file)
dump_file.close()

<function BufferedWriter.close>