# Feed Forward Neural Network using NLTK Sentiment Analyzer
Loading and Cleaning Reviews
The text data is already pretty clean, so not much preparation is required.
Without getting too much into the details, we will prepare the data using the following method:
* Split tokens on white space.
* Remove all punctuation from words.
* Remove all words that are not purely comprised of alphabetical characters.
* Remove all words that are known stop words.
* Remove all words that have a length <= 1 character.


In [1]:
from typing import Counter

from nltk.corpus import stopwords
import string

In [2]:
print(len(stopwords.words('english')))

179


In [3]:
stopwords.words('english')[:5]

['i', 'me', 'my', 'myself', 'we']

In [5]:
def load_data_from_file(file_name):
    with open(file_name,'r') as f:
        return f.read()

def clean_document(doc):
    tokens = doc.split()
    
    table = str.maketrans("","",string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [w for w in tokens if w.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words and len(w)>1]
    return tokens
    
        

In [6]:
data = load_data_from_file('data/review_polarity/txt_sentoken/pos/cv000_29590.txt')
tokens = clean_document(data)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

In [7]:
print(len(tokens))

402


In [14]:
import os


def update_vocab(directory,vocab,skip=None):
    for fileName in os.listdir(directory):
        fileName = directory+"/"+fileName
        if skip and fileName.startswith(skip):
            continue
            
        doc = load_data_from_file(fileName)
        tokens = clean_document(doc)
        vocab.update(tokens)


In [21]:
from collections import Counter
positive = 'data/review_polarity/txt_sentoken/pos'
negative = 'data/review_polarity/txt_sentoken/neg'
vocab = Counter()


In [22]:
update_vocab(positive,vocab,'cv9')


In [23]:
vocab.most_common(50)

[('film', 4866),
 ('one', 2904),
 ('movie', 2392),
 ('like', 1718),
 ('story', 1215),
 ('also', 1200),
 ('films', 1199),
 ('good', 1193),
 ('even', 1174),
 ('time', 1171),
 ('characters', 1038),
 ('much', 1027),
 ('character', 1013),
 ('would', 993),
 ('life', 984),
 ('well', 968),
 ('first', 963),
 ('two', 960),
 ('see', 954),
 ('way', 915),
 ('get', 884),
 ('best', 809),
 ('many', 780),
 ('really', 776),
 ('make', 772),
 ('little', 770),
 ('people', 769),
 ('great', 745),
 ('movies', 734),
 ('new', 718),
 ('never', 717),
 ('scene', 715),
 ('man', 698),
 ('love', 645),
 ('scenes', 635),
 ('could', 627),
 ('world', 622),
 ('doesnt', 597),
 ('still', 585),
 ('us', 581),
 ('plot', 575),
 ('hes', 571),
 ('know', 569),
 ('however', 566),
 ('makes', 561),
 ('another', 559),
 ('back', 558),
 ('performance', 549),
 ('go', 544),
 ('dont', 538)]

In [24]:
update_vocab(negative,vocab,'cv9')

In [25]:
vocab.most_common(50)

[('film', 8860),
 ('one', 5521),
 ('movie', 5440),
 ('like', 3553),
 ('even', 2555),
 ('good', 2320),
 ('time', 2283),
 ('story', 2118),
 ('films', 2102),
 ('would', 2042),
 ('much', 2024),
 ('also', 1965),
 ('characters', 1947),
 ('get', 1921),
 ('character', 1906),
 ('two', 1825),
 ('first', 1768),
 ('see', 1730),
 ('well', 1694),
 ('way', 1668),
 ('make', 1590),
 ('really', 1563),
 ('little', 1491),
 ('life', 1472),
 ('plot', 1451),
 ('people', 1420),
 ('movies', 1416),
 ('could', 1395),
 ('bad', 1374),
 ('scene', 1373),
 ('never', 1364),
 ('best', 1301),
 ('new', 1277),
 ('many', 1268),
 ('doesnt', 1267),
 ('man', 1266),
 ('scenes', 1265),
 ('dont', 1210),
 ('know', 1207),
 ('hes', 1150),
 ('great', 1141),
 ('another', 1111),
 ('love', 1089),
 ('action', 1078),
 ('go', 1075),
 ('us', 1065),
 ('director', 1056),
 ('something', 1048),
 ('end', 1047),
 ('still', 1038)]

In [26]:
len(vocab)

46557

In [27]:
with open('vocab.txt','w') as f:
    tokens = '\n'.join([k for k,v in vocab.items() if v>=2])
    f.write(tokens)
    