# Feed Forward Neural Network using NLTK Sentiment Analyzer
Loading and Cleaning Reviews
The text data is already pretty clean, so not much preparation is required.
Without getting too much into the details, we will prepare the data using the following method:
* Split tokens on white space.
* Remove all punctuation from words.
* Remove all words that are not purely comprised of alphabetical characters.
* Remove all words that are known stop words.
* Remove all words that have a length <= 1 character.


In [2]:
from keras.src.legacy.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
import string

In [9]:
print(len(stopwords.words("english")))
l = list(stopwords.words("english"))
#print(' '.join(l))
print(string.punctuation)

179
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
print(stopwords.words("Spanish")[:5])
print(stopwords.words("Hinglish")[:5])

['de', 'la', 'que', 'el', 'en']
['a', 'aadi', 'aaj', 'aap', 'aapne']


In [19]:
len(set(stopwords.words("english")))

179

In [25]:
def load_data_from_file(file_name):
    with open(file_name, "r") as f:
        return f.read()


def clean_document(doc):
    tokens = doc.split()

    table = str.maketrans("", "", string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [w for w in tokens if w.isalpha()]
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if not w in stop_words and len(w) > 1]
    return tokens


def get_tokens(file):
    with open(file, "r") as f:
        tokens = f.read()
    tokens = tokens.split()
    table = str.maketrans("", "", string.punctuation)
    stop_words = set(stopwords.words("english"))
    tokens = set(
        [
            w.translate(table)
            for w in tokens
            if (w.isalpha() and len(w) > 1 and w not in stop_words)
        ]
    )
    return tokens

In [35]:
data = load_data_from_file("data/review_polarity/txt_sentoken/pos/cv000_29590.txt")
tokens = set(clean_document(data))
print(len(set(tokens)))
z = get_tokens("data/review_polarity/txt_sentoken/pos/cv000_29590.txt")

341


In [41]:
def get_difference(set1, set2):
    diff = []
    for w in set1:
        if w not in set2:
            diff.append(w)
    return diff


len(z - tokens)
print(get_difference(z, tokens))

[]


In [42]:
import os


def update_vocab(directory, vocab, skip=None):
    for fileName in os.listdir(directory):
        fileName = directory + "/" + fileName
        if skip and fileName.startswith(skip):
            continue

        tokens = get_tokens(fileName)
        vocab.update(tokens)

In [43]:
from collections import Counter

positive = "data/review_polarity/txt_sentoken/pos"
negative = "data/review_polarity/txt_sentoken/neg"
vocab = Counter()

In [44]:
update_vocab(positive, vocab, "cv9")

In [45]:
vocab.most_common(50)

[('film', 883),
 ('one', 881),
 ('movie', 727),
 ('like', 710),
 ('time', 624),
 ('even', 619),
 ('also', 604),
 ('good', 577),
 ('much', 569),
 ('story', 558),
 ('would', 535),
 ('first', 535),
 ('well', 531),
 ('get', 525),
 ('way', 525),
 ('see', 521),
 ('character', 519),
 ('two', 515),
 ('make', 495),
 ('best', 480),
 ('characters', 479),
 ('life', 470),
 ('little', 456),
 ('many', 455),
 ('people', 448),
 ('films', 446),
 ('never', 432),
 ('really', 431),
 ('could', 411),
 ('man', 411),
 ('new', 410),
 ('great', 408),
 ('scene', 398),
 ('makes', 396),
 ('scenes', 390),
 ('another', 389),
 ('still', 382),
 ('back', 373),
 ('director', 369),
 ('go', 368),
 ('plot', 367),
 ('work', 367),
 ('movies', 367),
 ('end', 366),
 ('something', 365),
 ('made', 359),
 ('know', 356),
 ('however', 355),
 ('take', 351),
 ('seen', 351)]

In [46]:
update_vocab(negative, vocab, "cv9")

In [47]:
vocab.most_common(50)

[('one', 1740),
 ('film', 1723),
 ('movie', 1524),
 ('like', 1468),
 ('even', 1289),
 ('time', 1222),
 ('good', 1149),
 ('much', 1128),
 ('would', 1112),
 ('get', 1083),
 ('also', 1070),
 ('story', 1059),
 ('two', 1024),
 ('first', 1019),
 ('character', 1012),
 ('way', 1008),
 ('make', 1003),
 ('well', 987),
 ('see', 967),
 ('characters', 954),
 ('little', 914),
 ('plot', 875),
 ('could', 872),
 ('really', 867),
 ('never', 847),
 ('people', 842),
 ('best', 835),
 ('films', 823),
 ('director', 798),
 ('another', 789),
 ('many', 788),
 ('scene', 777),
 ('life', 777),
 ('scenes', 768),
 ('bad', 760),
 ('man', 760),
 ('new', 758),
 ('know', 748),
 ('end', 737),
 ('go', 734),
 ('made', 728),
 ('movies', 726),
 ('makes', 720),
 ('back', 717),
 ('something', 716),
 ('work', 712),
 ('still', 700),
 ('great', 694),
 ('seems', 686),
 ('better', 678)]

In [49]:
len(vocab)  # 46557
print(len(vocab))  # 46557
# 37589

37589


In [50]:
def save_vocab(file, vocabulary, seperator):
    with open(file, "w") as f:
        contents = seperator.join([k for k, v in vocabulary.items() if v >= 2])
        f.write(contents)

In [52]:
save_vocab("vocab1.txt", vocab, "\n")

In [53]:
def doc_to_line(fileName, vocab):
    """
    This will load a document, clean it, filter out tokens not in the vocabulary, then return the document as a string of white space separated tokens.
    """
    doc = load_data_from_file(fileName)
    tokens = clean_document(doc)
    tokens = [w for w in tokens if w in vocab]
    return " ".join(tokens)

In [54]:
def process_docs(directory, vocab, skip):
    lines = []
    for fileName in os.listdir(directory):
        fileName = directory + "/" + fileName
        if skip and fileName.startswith(skip):
            continue
        line = doc_to_line(fileName, vocab)
        lines.append(line)
    return lines

In [55]:
vocab_filename = "vocab.txt"
vocab = load_data_from_file(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [56]:
positive_lines = process_docs(positive, vocab, "cv9")
negative_lines = process_docs(negative, vocab, "cv9")

In [57]:
print(len(positive_lines), len(negative_lines))

1000 1000


# Movie Reviews to Bag-of-Words Vectors

**We will use the Keras API to convert reviews to encoded document vectors**

Keras provides the Tokenize class that can do some of the cleaning and vocab definition tasks that we took care of in the previous section.

It is better to do this ourselves to know exactly what was done and why. Nevertheless, the Tokenizer class is convenient and will easily transform documents into encoded vectors.First, the Tokenizer must be created, then fit on the text documents in the training dataset.In this case, these are the aggregation of the positive_lines and negative_lines arrays developed in the previous section.

In [61]:
tokenizer = Tokenizer()
print(positive_lines[0])

assume nothing phrase perhaps one used first impressions rumors hardly ever seem phrase especially goes oscar novak architect main focus three tango delightful funny romantic comedy assumptions novak matthew perry shy clumsy chicago based architect along openly gay partner peter steinberg oliver platt fights projects day day one job restoring popular building charles newman dylan mcdermott rich wellknown businessman charles immediately takes liking oscar enjoys personality sense humor seeing oscar someone could trust charles asks watch girlfriend unpredictable adventurous girl named amy post neve campbell makes living blowing glass charles wants know talks goes point make sure shes seeing someone else course oscar gladly takes job meets amy art show sparks fly two get go oscar feels found one meant content idea amy well another popular phrase good things must come end stays true oscar well charles walks amy oscar drink one night oscar amy become great friends doesnt seem mind thinks os

In [63]:
docs = positive_lines + negative_lines
tokenizer.fit_on_texts(docs)

In [64]:
# encode training data set
Xtrain = tokenizer.texts_to_matrix(docs, mode="freq")
print(Xtrain.shape)

(2000, 27140)


In [65]:
print(len(docs))

2000


In [66]:
train, test = docs[:1800], docs[1800:]

In [68]:
Xtrain = tokenizer.texts_to_matrix(train, mode="freq")

In [69]:
Xtrain.shape

(1800, 27140)

In [70]:
Xtrain[0]

array([0.        , 0.00472813, 0.0141844 , ..., 0.        , 0.        ,
       0.        ])

In [71]:
Xtest = tokenizer.texts_to_matrix(test, mode="freq")

In [72]:
Xtest.shape

(200, 27140)

In [73]:
# todo Import Sequential from tensorflow.keras and compile the models

In [6]:
import spacy
import en_core_web_sm

nlp = spacy.load("en_core_web_sm")

doc = nlp("This is a sentence.")
print([(w.text, w.pos_) for w in doc])

[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


# Spacy is not intended for learning but to deploy deep learning models
spaCy is designed specifically for production use and helps you build applications that process and “understand” large volumes of text. It can be used to build information extraction or natural language understanding systems, or to pre-process text for deep learning.

In [3]:
print(len(list(doc.text)))

19


In [4]:
doc.text

'This is a sentence.'