# Download and Preprocess Reuters Corpus
- Import data
- Remove symbols
- Lower case all
- Remove punctuation
- Remove content-specific stop words: financial terms, dates, units
- Collapse country names to acronyms so that each becomes a single token
- Remove numbers: financial text contains many numbers, which often surface as labels if not removed

In [1]:
#!pip install nltk
#!pip install scikit-learn
#!pip install gensim
#!pip install matplotlib
#!pip install networkx
#!pip install kneed

In [2]:
import nltk
from nltk.corpus import reuters
import re
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' package; returns True and does nothing further
# when the package is already up to date (see the cell output below).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gimli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Import Data

In [3]:
# Identifiers of every document in the Reuters corpus
fileids = reuters.fileids()

# For each document, look up its category labels and its raw text
categories = [reuters.categories(doc_id) for doc_id in fileids]
text = [reuters.raw(doc_id) for doc_id in fileids]

# One row per document: id, list of categories, raw text
df = pd.DataFrame({'ids': fileids, 'categories': categories, 'text': text})

### Text Processing

In [4]:
# NOTE: every ``Series.str.replace`` call below passes ``regex`` explicitly.
# The default changed from True to False in pandas 2.0, so relying on it makes
# this cell behave differently depending on the installed version.


def _remove_words(series, words):
    """Return *series* with each word in *words* removed from every string.

    A word is matched only when it has a space on both sides, and the match is
    collapsed to a single space. Words are processed one after another in the
    given order, so the order (and any repeats) of *words* is significant.

    The original cell also tried to remove "word." forms, but punctuation
    (including '.') is stripped before any removal pass runs, so that
    replacement could never match and has been dropped.
    """
    for word in words:
        series = series.str.replace(' ' + word + ' ', ' ', regex=False)
    return series


cleaned = df['text']

# remove line breaks, decode the HTML entities handled here
cleaned = cleaned.str.replace('\n', ' ', regex=False)
cleaned = cleaned.str.replace('&lt;', '<', regex=False)
cleaned = cleaned.str.replace('&amp;', '&', regex=False)

# down case all
cleaned = cleaned.str.lower()

# turn angle brackets into spaces first: the punctuation strip below would
# otherwise delete them outright and glue the neighbouring words together
cleaned = cleaned.str.replace('<', ' ', regex=False)
cleaned = cleaned.str.replace('>', ' ', regex=False)

# remove punctuation (translation table built once, not once per row)
punctuation_table = str.maketrans('', '', string.punctuation)
cleaned = cleaned.apply(lambda doc: doc.translate(punctuation_table))

# delete content specific "stop words"
delete_words = [
    'qtr', 'pct', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
    'sep', 'oct', 'nov', 'dec', 'bil', 'mln',
    'quarter', 'percent', 'million', 'billion', 'january', 'february',
    'march', 'april', 'may', 'june', 'july',
    'august', 'september', 'october', 'november', 'december', 'janurary',
    'said', 'year', 'month', 'v', 'iv', 'vi',
    'shr', 'cts', 'januarys', 'februarys', 'marchs', 'aprils', 'mays',
    'junes', 'julys', 'thous',
    'augusts', 'septembers', 'octobers', 'novembers', 'decembers',
    'previous', 'prev', 'also', 'say', 'says',
    'feet', 'ounces', 'ounce', 'foot', 'ton', 'tons', 'tonnes', 'vs', 'nil',
    'pound', 'thou', 'tonne', 'week', 'wk',
]
cleaned = _remove_words(cleaned, delete_words)

# collapse countries to acronyms so recognized as one concept/token
cleaned = cleaned.str.replace('united states', 'us', regex=False)
cleaned = cleaned.str.replace('new zealand', 'nz', regex=False)
cleaned = cleaned.str.replace('hong kong', 'hk', regex=False)
cleaned = cleaned.str.replace('united kingdom', 'uk', regex=False)

# dollars is sometimes abbreviated as dlrs and sometimes spelled out. Make uniform.
cleaned = cleaned.str.replace('dlrs', 'dollars', regex=False)

# remove every digit; anything attached to a number (e.g. the 'th' of '15th')
# is left behind as its own token
cleaned = cleaned.str.replace(r'\d+', '', regex=True)

# final pass on delete words in case any are now surfaced from other deletions (such as numbers)
cleaned = _remove_words(cleaned, delete_words)

# removing numbers resulted in floating 'th's
cleaned = _remove_words(cleaned, ['th'])

df['text'] = cleaned

In [5]:
# Serialize the cleaned dataframe to a local pickle file named 'reuters_processed'
df.to_pickle(path='reuters_processed')