In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, at high ("retina") resolution.
# NOTE(review): none of the cells shown here plot anything — confirm these two
# lines are still needed.
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
import os
import re
import glob
import shutil
import xml.etree.ElementTree as ET

In [3]:
# Input and output locations, relative to the notebook's working directory.
# Both end with '/' because later cells build paths by plain string
# concatenation (e.g. RCV1_root + '199*/*.xml').
# output_folder is deleted and recreated by the conversion cell further down.
RCV1_root = '../rcv1/'
output_folder = '../converted/'

In [4]:
# Lookup table: topic code -> human-readable description.
# NOTE(review): presumably the full Reuters RCV1 topic-code list — confirm
# against the corpus documentation.
# Kept for reference only: no other cell shown here reads `categories`;
# the conversion uses `valid_categories` instead.
categories = {
  '1POL':  'CURRENT NEWS - POLITICS',
  '2ECO':  'CURRENT NEWS - ECONOMICS',
  '3SPO':  'CURRENT NEWS - SPORT',
  '4GEN':  'CURRENT NEWS - GENERAL',
  '6INS':  'CURRENT NEWS - INSURANCE',
  '7RSK':  'CURRENT NEWS - RISK NEWS',
  '8YDB':  'TEMPORARY',
  '9BNX':  'TEMPORARY',
  'ADS10': 'CURRENT NEWS - ADVERTISING',
  'BNW14': 'CURRENT NEWS - BUSINESS NEWS',
  'BRP11': 'CURRENT NEWS - BRANDS',
  'C11':   'STRATEGY/PLANS',
  'C12':   'LEGAL/JUDICIAL',
  'C13':   'REGULATION/POLICY',
  'C14':   'SHARE LISTINGS',
  'C15':   'PERFORMANCE',
  'C151':  'ACCOUNTS/EARNINGS',
  'C1511': 'ANNUAL RESULTS',
  'C152':  'COMMENT/FORECASTS',
  'C16':   'INSOLVENCY/LIQUIDITY',
  'C17':   'FUNDING/CAPITAL',
  'C171':  'SHARE CAPITAL',
  'C172':  'BONDS/DEBT ISSUES',
  'C173':  'LOANS/CREDITS',
  'C174':  'CREDIT RATINGS',
  'C18':   'OWNERSHIP CHANGES',
  'C181':  'MERGERS/ACQUISITIONS',
  'C182':  'ASSET TRANSFERS',
  'C183':  'PRIVATISATIONS',
  'C21':   'PRODUCTION/SERVICES',
  'C22':   'NEW PRODUCTS/SERVICES',
  'C23':   'RESEARCH/DEVELOPMENT',
  'C24':   'CAPACITY/FACILITIES',
  'C31':   'MARKETS/MARKETING',
  'C311':  'DOMESTIC MARKETS',
  'C312':  'EXTERNAL MARKETS',
  'C313':  'MARKET SHARE',
  'C32':   'ADVERTISING/PROMOTION',
  'C33':   'CONTRACTS/ORDERS',
  'C331':  'DEFENCE CONTRACTS',
  'C34':   'MONOPOLIES/COMPETITION',
  'C41':   'MANAGEMENT',
  'C411':  'MANAGEMENT MOVES',
  'C42':   'LABOUR',
  'CCAT':  'CORPORATE/INDUSTRIAL',
  'E11':   'ECONOMIC PERFORMANCE',
  'E12':   'MONETARY/ECONOMIC',
  'E121':  'MONEY SUPPLY',
  'E13':   'INFLATION/PRICES',
  'E131':  'CONSUMER PRICES',
  'E132':  'WHOLESALE PRICES',
  'E14':   'CONSUMER FINANCE',
  'E141':  'PERSONAL INCOME',
  'E142':  'CONSUMER CREDIT',
  'E143':  'RETAIL SALES',
  'E21':   'GOVERNMENT FINANCE',
  'E211':  'EXPENDITURE/REVENUE',
  'E212':  'GOVERNMENT BORROWING',
  'E31':   'OUTPUT/CAPACITY',
  'E311':  'INDUSTRIAL PRODUCTION',
  'E312':  'CAPACITY UTILIZATION',
  'E313':  'INVENTORIES',
  'E41':   'EMPLOYMENT/LABOUR',
  'E411':  'UNEMPLOYMENT',
  'E51':   'TRADE/RESERVES',
  'E511':  'BALANCE OF PAYMENTS',
  'E512':  'MERCHANDISE TRADE',
  'E513':  'RESERVES',
  'E61':   'HOUSING STARTS',
  'E71':   'LEADING INDICATORS',
  'ECAT':  'ECONOMICS',
  'ENT12': 'CURRENT NEWS - ENTERTAINMENT',
  'G11':   'SOCIAL AFFAIRS',
  'G111':  'HEALTH/SAFETY',
  'G112':  'SOCIAL SECURITY',
  'G113':  'EDUCATION/RESEARCH',
  'G12':   'INTERNAL POLITICS',
  'G13':   'INTERNATIONAL RELATIONS',
  'G131':  'DEFENCE',
  'G14':   'ENVIRONMENT',
  'G15':   'EUROPEAN COMMUNITY',
  'G151':  'EC INTERNAL MARKET',
  'G152':  'EC CORPORATE POLICY',
  'G153':  'EC AGRICULTURE POLICY',
  'G154':  'EC MONETARY/ECONOMIC',
  'G155':  'EC INSTITUTIONS',
  'G156':  'EC ENVIRONMENT ISSUES',
  'G157':  'EC COMPETITION/SUBSIDY',
  'G158':  'EC EXTERNAL RELATIONS',
  'G159':  'EC GENERAL',
  'GCAT':  'GOVERNMENT/SOCIAL',
  'GCRIM': 'CRIME, LAW ENFORCEMENT',
  'GDEF':  'DEFENCE',
  'GDIP':  'INTERNATIONAL RELATIONS',
  'GDIS':  'DISASTERS AND ACCIDENTS',
  'GEDU':  'EDUCATION',
  'GENT':  'ARTS, CULTURE, ENTERTAINMENT',
  'GENV':  'ENVIRONMENT AND NATURAL WORLD',
  'GFAS':  'FASHION',
  'GHEA':  'HEALTH',
  'GJOB':  'LABOUR ISSUES',
  'GMIL':  'MILLENNIUM ISSUES',
  'GOBIT': 'OBITUARIES',
  'GODD':  'HUMAN INTEREST',
  'GPOL':  'DOMESTIC POLITICS',
  'GPRO':  'BIOGRAPHIES, PERSONALITIES, PEOPLE',
  'GREL':  'RELIGION',
  'GSCI':  'SCIENCE AND TECHNOLOGY',
  'GSPO':  'SPORTS',
  'GTOUR': 'TRAVEL AND TOURISM',
  'GVIO':  'WAR, CIVIL WAR',
  'GVOTE': 'ELECTIONS',
  'GWEA':  'WEATHER',
  'GWELF': 'WELFARE, SOCIAL SERVICES',
  'M11':   'EQUITY MARKETS',
  'M12':   'BOND MARKETS',
  'M13':   'MONEY MARKETS',
  'M131':  'INTERBANK MARKETS',
  'M132':  'FOREX MARKETS',
  'M14':   'COMMODITY MARKETS',
  'M141':  'SOFT COMMODITIES',
  'M142':  'METALS TRADING',
  'M143':  'ENERGY MARKETS',
  'MCAT':  'MARKETS',
  'MEUR':  'EURO CURRENCY',
  'PRB13': 'CURRENT NEWS - PRESS RELEASE WIRES',
}

# Topic codes that are kept by the conversion, mapped to the coarse label
# written into the output file name. Articles whose codes are all missing
# from this table produce no output file.
# Several codes share one label (e.g. '1POL', 'G12' and 'GPOL' all map to
# 'POLITICS'), so one article can yield the same label more than once.
# Labels must contain neither '-' nor '_': the file name is built as
# "<id>-<label>_<label>..." and later split back on those two characters.
valid_categories = {
  '1POL':  'POLITICS',
  '2ECO':  'ECONOMICS',
  '3SPO':  'SPORTS',
  'ECAT':  'ECONOMICS',
  'ENT12': 'ENTERTAINMENT',
  'G12':   'POLITICS',
  'GENT':  'ENTERTAINMENT',
  'GPOL':  'POLITICS',
  'GSCI':  'TECHNOLOGY',
  'GSPO':  'SPORTS',
}

In [5]:
def reuter_rcv1_parse(filename):
    """ Read one article in XML format and return a dictionary with
        'headline'   (String or None) text of the <headline> element;
                     None when the element is missing or has no text
        'body'       (String) text of every child of <text>, concatenated
                     in document order ('' when <text> is missing)
        'categories' (Array) 'code' attribute of every <code> element found
                     under a <codes class="bip:topics:1.0"> child of <metadata>

        `filename` is passed straight to xml.etree.ElementTree.parse, so it
        may be a path or an open file object.

        Raises xml.etree.ElementTree.ParseError if the input is not
        well-formed XML.
    """
    root = ET.parse(filename).getroot()

    # Compare against None explicitly: an Element without children is falsy,
    # so a plain truth test would wrongly treat it as missing.
    headline_node = root.find('headline')
    text_node = root.find('text')
    metadata_node = root.find('metadata')

    headline = headline_node.text if headline_node is not None else None

    # Empty paragraphs have text None; treat them as '' instead of crashing.
    # NOTE(review): paragraphs are joined with no separator (unchanged from the
    # previous behaviour), so the last word of one paragraph runs into the
    # first word of the next — consider joining with '\n'.
    body = ''
    if text_node is not None:
        body = ''.join(paragraph.text or '' for paragraph in text_node)

    topic_codes = []
    if metadata_node is not None:
        for meta in metadata_node:
            if meta.tag == 'codes' and meta.attrib.get('class') == 'bip:topics:1.0':
                for code in meta.findall('code'):
                    value = code.attrib.get('code')
                    # Skip <code> elements that carry no 'code' attribute.
                    if value is not None:
                        topic_codes.append(value)

    return {'headline': headline, 'body': body, 'categories': topic_codes}

In [6]:
def valid_cats(cats):
    """ Translate topic codes into their labels from `valid_categories`.

        Codes that are not keys of `valid_categories` are dropped. Input order
        is preserved and duplicates are kept, so two codes sharing a label
        yield that label twice.
    """
    return [valid_categories[code] for code in cats if code in valid_categories]
    

In [9]:
def generate_txt_from_xmls(xmlfiles, output_dir=None):
    """ Parse *.xml files and, for each one that has at least one valid
        category, generate a TXT file whose name is:
          {output_dir}/{folder}/{news_id}-{categ1[_categ_n]}

        {folder}  is the name of the directory containing the XML file
        {news_id} is the run of digits at the start of the XML file name
        {categ*}  are the distinct labels returned by valid_cats, sorted

        The file holds the headline on the first line (when there is one)
        followed by the body, encoded as UTF-8. Articles without any valid
        category are skipped.

        xmlfiles   -- iterable of paths to XML files
        output_dir -- root of the generated tree; defaults to the module-level
                      `output_folder`

        Raises ValueError if an XML file name does not start with digits.
    """
    if output_dir is None:
        output_dir = output_folder  # module-level default, read at call time

    for xmlfile in xmlfiles:
        result = reuter_rcv1_parse(xmlfile)

        # Sorted so an article always maps to the same file name: iteration
        # order of a set of strings can differ between interpreter runs.
        labels = sorted(set(valid_cats(result['categories'])))
        if not labels:
            continue

        # Derive the pieces from the path itself instead of a pattern tied to
        # a literal 'rcv1/' prefix and '/' separators.
        folder = os.path.basename(os.path.dirname(xmlfile))
        id_match = re.match(r'\d+', os.path.basename(xmlfile))
        if id_match is None:
            raise ValueError('cannot extract a numeric news id from %r' % (xmlfile,))

        target_dir = os.path.join(output_dir, folder)
        os.makedirs(target_dir, exist_ok=True)
        txt_filename = os.path.join(target_dir, id_match.group(0) + '-' + '_'.join(labels))

        # Explicit encoding: the locale default may be unable to represent
        # characters found in the articles.
        with open(txt_filename, 'w', encoding='utf-8') as f:
            if result['headline']:
                f.write(result['headline'] + "\n")
            if result['body']:
                f.write(result['body'])
            

In [13]:
# Every *.xml file located directly inside a sub-directory of RCV1_root whose
# name starts with '199'. The conversion cell below reads `xml_files`, so this
# cell has to run first.
xml_files = glob.glob(RCV1_root + '199*/*.xml')

In [10]:
# Rebuild the output tree from scratch.
# WARNING: this deletes everything under output_folder. Errors during the
# deletion are ignored; os.mkdir then raises if the folder still exists or if
# its parent directory is missing.
shutil.rmtree(output_folder, ignore_errors=True)
os.mkdir(output_folder)
# Requires `xml_files` from the cell above.
generate_txt_from_xmls(xml_files)

In [11]:
# Count generated files per label. The labels are read back from each file
# name: everything after the last '-' is a '_'-separated list of labels, so a
# file carrying several labels is counted once for each of them.
categ_count = {}
for path in glob.glob(output_folder + '**/*'):
    for label in path.rsplit('-', 1)[-1].split('_'):
        categ_count[label] = categ_count.get(label, 0) + 1


In [15]:
# Number of generated files per label (multi-label files count once per label).
categ_count

{'ECONOMICS': 117539,
 'ENTERTAINMENT': 3801,
 'POLITICS': 56878,
 'SPORTS': 35317,
 'TECHNOLOGY': 2410}