## Preprocess the NLTK Reuters Corpus and Create a TF-IDF Vector Space Model

In [1]:
#!pip install nltk
#!pip install scikit-learn
#!pip install gensim
#!pip install matplotlib
#!pip install networkx
#!pip install kneed

In [2]:
import nltk
from nltk.corpus import reuters
import re
import numpy as np
import pandas as pd
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import collections
import math
import operator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gimli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import Data

In [3]:
# Every document in the NLTK Reuters corpus is addressed by its file id.
# NOTE(review): this presumably needs the 'reuters' corpus to be available
# locally already (only 'stopwords' is downloaded above) -- confirm.
fileids = reuters.fileids()

# For each document collect its categories and its raw text, keeping both
# lists in the same order as `fileids`.
categories = [reuters.categories(doc_id) for doc_id in fileids]
text = [reuters.raw(doc_id) for doc_id in fileids]

# Assemble the unprocessed corpus into a single DataFrame, `og`:
# one row per document with its id, categories and raw text.
og = pd.DataFrame({'ids': fileids, 'categories': categories, 'text': text})

In [4]:
# Work on a separate copy so the freshly loaded frame `og` is not modified
# by the text processing below.
df = og.copy(deep=True)

### Text Processing

In [5]:
# Clean the article text stored in df['text'].
#
# Steps, in order: flatten line breaks and decode HTML entities, lower-case,
# split on angle brackets / hyphens, drop corpus-specific stop words, strip
# punctuation, normalise a few multi-word names, and remove digits.
#
# Every `str.replace` call passes `regex=` explicitly: the default changed
# from True to False in pandas 2.0, which would otherwise silently turn the
# pattern-based replacements into literal (never matching) searches.

# Work on a local Series and write it back to `df` once at the end.
cleaned = df['text']

# Flatten line breaks and decode the two HTML entities handled here.
for old, new in [('\n', ' '), ('&lt;', '<'), ('&amp;', '&')]:
    cleaned = cleaned.str.replace(old, new, regex=False)

# Lower-case everything so the lower-case word lists below match.
cleaned = cleaned.str.lower()

# Turn angle brackets and hyphens into spaces so the words they join are
# treated as separate tokens.
for symbol in ['<', '>', '-']:
    cleaned = cleaned.str.replace(symbol, ' ', regex=False)

# Content-specific "stop words": units, month names/abbreviations and other
# terms that should not reach the vector space model.
delete_words = [
    'qtr', 'pct', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
    'sep', 'oct', 'nov', 'dec', 'bil', 'mln',
    'quarter', 'percent', 'million', 'billion', 'january', 'february',
    'march', 'april', 'may', 'june', 'july',
    'august', 'september', 'october', 'november', 'december', 'janurary',
    'said', 'year', 'month',
    'shr', 'cts', "january's", "february's", "march's", "april's", "may's",
    "june's", "july's",
    "august's", "september's", "october's", "november's", "december's",
    'feet', 'ounces', 'ounce', 'foot', 'ton', 'tons', 'tonnes',
]

# Remove each stop word wherever it stands as a whole token. The look-arounds
# reject matches that are glued to a letter, digit or apostrophe, so 'year'
# is not cut out of "years" or "year's", while "may's" is still matched by its
# own list entry. Compared with replacing ' word ' / ' word.' one word at a
# time, this also catches words followed by other punctuation ("pct,"), words
# at the very start or end of the text, and repeated neighbours ("pct pct").
# One preceding space is consumed so that "a pct b" becomes "a b".
stop_word_pattern = (
    r" ?(?<![\w'])(?:"
    + '|'.join(re.escape(word) for word in delete_words)
    + r")(?![\w'])"
)
cleaned = cleaned.str.replace(stop_word_pattern, '', regex=True)

# Strip all ASCII punctuation characters.
cleaned = cleaned.str.translate(str.maketrans('', '', string.punctuation))

# Normalise spellings that appear in mixed forms so each concept maps to a
# single token (country names to their acronyms, 'dlrs' to 'dollars').
for phrase, token in [
    ('united states', 'us'),
    ('new zealand', 'nz'),
    ('hong kong', 'hk'),
    ('united kingdom', 'uk'),
    ('dlrs', 'dollars'),
]:
    cleaned = cleaned.str.replace(phrase, token, regex=False)

# Remove all digits (an earlier version of this step removed number words).
# r'\d+' rather than '\d*': a raw string avoids the invalid-escape warning and
# '+' avoids matching the empty string at every position.
cleaned = cleaned.str.replace(r'\d+', '', regex=True)

df['text'] = cleaned

In [6]:
# Persist the processed DataFrame as a pickle file named 'reuters_processed'
# (relative path, resolved against the current working directory).
output_path = 'reuters_processed'
df.to_pickle(output_path)