## Preprocessing the NLTK Reuters Corpus; Creating a TF-IDF Vector Space Model

In [1]:
#!pip install nltk
#!pip install sklearn
#!pip install gensim
#!pip install matplotlib
#!pip install networkx
#!pip install kneed

In [2]:
import nltk
from nltk.corpus import reuters
import re
import numpy as np
import pandas as pd
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import collections
import math
import operator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gimli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import Data

In [3]:
# Every document id in the NLTK Reuters corpus
fileids = reuters.fileids()

# Category labels and raw text, one entry per document id (same order as `fileids`)
categories = [reuters.categories(doc_id) for doc_id in fileids]
text = [reuters.raw(doc_id) for doc_id in fileids]

# Assemble the three parallel lists into the unprocessed dataframe `og`
og = pd.DataFrame({
    'ids': fileids,
    'categories': categories,
    'text': text,
})

In [4]:
# Work on a copy so that reassigning `df.text` during cleaning leaves the loaded frame `og` unchanged
df = og.copy()

### Text Processing

In [9]:
# processing text
#
# Every `str.replace` call states `regex=` explicitly: the pandas default for that
# argument differs between major versions, so omitting it makes the result depend
# on the installed version.

def _whole_token_pattern(words):
    """Compile a regex matching any of `words` as a complete space-delimited token.

    A match begins at the space in front of the token (or at the start of the text)
    and must be followed by a space or the end of the text. Substituting '' therefore
    drops the token but keeps the separator behind it, so one pass clears whole runs
    of back-to-back tokens as well as tokens at either end of the text.

    Parameters
    ----------
    words : iterable of str
        Tokens to match; duplicates are ignored and each token is matched literally.

    Returns
    -------
    re.Pattern
        Compiled pattern suitable for `Series.str.replace(pattern, '', regex=True)`.
    """
    alternatives = '|'.join(re.escape(word) for word in sorted(set(words)))
    return re.compile('(?:^| )(?:' + alternatives + ')(?= |$)')


cleaned = df['text']

# flatten line breaks and decode the two HTML entities handled here
cleaned = cleaned.str.replace('\n', ' ', regex=False)
cleaned = cleaned.str.replace('&lt;', '<', regex=False)
cleaned = cleaned.str.replace('&amp;', '&', regex=False)

# down case all
cleaned = cleaned.str.lower()

# angle brackets are turned into spaces before the punctuation strip below,
# which would otherwise delete them without leaving a separator
cleaned = cleaned.str.replace('<', ' ', regex=False)
cleaned = cleaned.str.replace('>', ' ', regex=False)

# remove punctuation; the translation table is built once instead of once per row
# (string.punctuation contains '-' and '.', so hyphens and periods are removed too)
punctuation_table = str.maketrans('', '', string.punctuation)
cleaned = cleaned.str.translate(punctuation_table)

# delete content specific "stop words"
delete_words = ['qtr', 'pct', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'bil', 'mln',
               'quarter', 'percent', 'million', 'billion', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 
                'august', 'september', 'october', 'november', 'december', 'janurary', 'said', 'year', 'month', 'v', 'iv', 'vi',
               'shr', 'cts', 'januarys', 'februarys', 'marchs', 'aprils', 'mays', 'junes', 'julys', 'thous',
                'augusts', 'septembers', 'octobers', 'novembers', 'decembers', 'previous', 'prev', 'also', 'say', 'says',
               'feet', 'ounces', 'ounce', 'foot', 'ton', 'tons', 'tonnes', 'vs', 'nil', 'pound', 'thou', 'tonne', 'week', 'wk']
# No "word followed by a period" variant is needed: all periods were removed with
# the rest of the punctuation above.
cleaned = cleaned.str.replace(_whole_token_pattern(delete_words), '', regex=True)

# collapse multi-word names to short forms so each is one token, and expand 'dlrs'
# (plain substring replacements, applied in this order)
substitutions = [
    ('united states', 'us'),
    ('new zealand', 'nz'),
    ('hong kong', 'hk'),
    ('united kingdom', 'uk'),
    ('dlrs', 'dollars'),
]
for phrase, replacement in substitutions:
    cleaned = cleaned.str.replace(phrase, replacement, regex=False)

# remove every digit, wherever it occurs (standalone numbers and digits inside words)
cleaned = cleaned.str.replace(r'\d+', '', regex=True)

# final pass on delete words in case any are now surfaced from other deletions;
# 'th' is included because stripping digits turns ordinals such as '4th' into 'th'
cleaned = cleaned.str.replace(_whole_token_pattern(delete_words + ['th']), '', regex=True)

df['text'] = cleaned

KeyboardInterrupt: 

In [6]:
# Save the processed frame as a pickle file at the relative path 'reuters_processed'
df.to_pickle('reuters_processed') 

In [7]:
# Spot-check: distinct processed text values for document 'test/15871'
df.loc[df['ids'] == 'test/15871', 'text'].unique()

array(['net change in export commitments  usda   the us agriculture department gave   the net change in export commitments including sales   cancellations foreign purchases and cumulative exports in the   current seasons through the ended  with   comparisons as follows in except as noted                                             all wheat                           corn                     soybeans                   soy cakemeal                   soybean oil     x                   cottony                xminus total yrunning bales       the indicated totals include reported commitments to both   named and unnamed destinations sales on exporters own account   and optional origin sales plus actual exports already made   during the respective marketing seasons       the usda cautions that reported outstanding sales are   subject to modification deferral or cancellation and it is   unlikely that all reported quantities will be exported       usda gave detailed breakdowns for the  and    

In [8]:
# Spot-check: the full row stored for document 'test/15400'
df.loc[df['ids'] == 'test/15400']

Unnamed: 0,categories,ids,text
299,[yen],test/15400,japan business leaders g accord is worrying ...
