In [1]:
from collections import Counter
import json
import nltk
from nltk.corpus import stopwords
WORD_FREQUENCY_FILE_FULL_PATH = "analysis.vocab"

In [90]:
class MyVocabulary:
    
    def __init__(self, vocabulary, wordFrequencyFilePath):
        self.vocabulary = vocabulary
        self.WORD_FREQUENCY_FILE_FULL_PATH = wordFrequencyFilePath
        self.input_word_index = {}
        self.reverse_input_word_index = {}
        
    def PrepareVocabulary(self,reviews):
        self._prepare_Word_Frequency_Count_File(reviews)
        self._create_Vocab_Indexes()
      
    def Get_Top_Words(self, number_words = None):
        if number_words == None:
            number_words = self.vocabulary
        
        chars = json.loads(open(self.WORD_FREQUENCY_FILE_FULL_PATH).read())
        counter = Counter(chars)
        most_popular_words = {key for key, _value in counter.most_common(number_words)}
        return most_popular_words
    
    def _prepare_Word_Frequency_Count_File(self,reviews):
        counter = Counter()    
        for s in reviews:
            counter.update(s.split(" "))
            
        with open(self.WORD_FREQUENCY_FILE_FULL_PATH, 'w') as output_file:
            output_file.write(json.dumps(counter))
                 
    def _create_Vocab_Indexes(self):
        INPUT_WORDS = self.Get_Top_Words(self.vocabulary)

        #word to int
        self.input_word_index = dict(
            [(word, i) for i, word in enumerate(INPUT_WORDS)])


        #int to word
        self.reverse_input_word_index = dict(
            (i, word) for word, i in self.input_word_index.items())

        
        #self.input_word_index = input_word_index
        #self.reverse_input_word_index = reverse_input_word_index
        #seralize.dump(config.DATA_FOLDER_PATH+"input_word_index.p",input_word_index)
        #seralize.dump(config.DATA_FOLDER_PATH+"reverse_input_word_index.p",reverse_input_word_index)
        
        
        

In [84]:
#Download DataSet
#http://ai.stanford.edu/~amaas/data/sentiment/
#https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks
import os

def GetTextFilePathsInDirectory(directory):
    files = []
    for file in os.listdir(directory):
        if file.endswith(".txt"):
            filePath = os.path.join(directory, file)
            files.append(filePath)
    return files

def GetLinesFromTextFile(filePath):
    with open(filePath,"r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]
    return lines


def RemoveStopWords(line, stopwords):
    words = []
    for word in line.split(" "):
        word = word.strip()
        if word not in stopwords and word != "" and word != "&":
            words.append(word)

    return " ".join(words)


import re
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

#https://gist.github.com/aaronkub/257a1bd9215da3a7221148600d849450#file-clean_movie_reviews-py
def preprocess_reviews(reviews):
    default_stop_words = nltk.corpus.stopwords.words('english')
    stopwords = set(default_stop_words)
    
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    reviews = [RemoveStopWords(line,stopwords) for line in reviews]
    
    return reviews

In [85]:
default_stop_words = nltk.corpus.stopwords.words('english')
stopwords = set(default_stop_words)
RemoveStopWords("this is a very large test",stopwords)

'large test'

<h2>Prepare Data</h2>

In [86]:
positive_files = GetTextFilePathsInDirectory("aclImdb/train/pos/")
negative_files = GetTextFilePathsInDirectory("aclImdb/train/neg/")

reviews_positive = []
for i in range(0,500):
    reviews_positive.extend(GetLinesFromTextFile(positive_files[i]))
    
reviews_negative = []
for i in range(0,500):
    reviews_negative.extend(GetLinesFromTextFile(negative_files[i]))

In [87]:
print("Positive Review---> {0}".format(reviews_positive[5]))
print()
print("Negative Review---> {0}".format(reviews_negative[5]))

print()
reviews_positive = preprocess_reviews(reviews_positive)
print("Processed Positive Review---> {0}".format(reviews_positive[5]))

print()
reviews_negative = preprocess_reviews(reviews_negative)
print("Processed Negative Review---> {0}".format(reviews_negative[5]))

Positive Review---> This isn't the comedic Robin Williams, nor is it the quirky/insane Robin Williams of recent thriller fame. This is a hybrid of the classic drama without over-dramatization, mixed with Robin's new love of the thriller. But this isn't a thriller, per se. This is more a mystery/suspense vehicle through which Williams attempts to locate a sick boy and his keeper.<br /><br />Also starring Sandra Oh and Rory Culkin, this Suspense Drama plays pretty much like a news report, until William's character gets close to achieving his goal.<br /><br />I must say that I was highly entertained, though this movie fails to teach, guide, inspect, or amuse. It felt more like I was watching a guy (Williams), as he was actually performing the actions, from a third person perspective. In other words, it felt real, and I was able to subscribe to the premise of the story.<br /><br />All in all, it's worth a watch, though it's definitely not Friday/Saturday night fare.<br /><br />It rates a 7

<h2>Labeled DataSet</h2>

In [88]:
#Combine DataSets
Reviews_Labeled = [[line,1] for line in reviews_positive]
Reviews_Labeled.extend([[line,-1] for line in reviews_negative])
Reviews_Labeled[10]

['first read armistead maupins story taken human drama displayed gabriel one cares loves said given film version excellent story expected see past gloss hollywood writer armistead maupin director patrick stettner truly succeeded right amount restraint robin williams captures fragile essence gabriel lets us see struggle issues trust personnel lifejess world around himdonna introduced players drama reminded nothing ever seems smallest event change lives irrevocably request review book written young man turns life changing event helps gabriel find strength within carry move forward bad people avoid film say average american probably think robin williams serious role didnt work please give movie chance robin williams touches darkness must find go better people like movie one hour photo stepped actor made another quality piece art oh forget believe bobby cannavale jess steals every scene 1940s leading man looks screen presence hacks opinion could carry movie right s~',
 1]

<h3>Prepare Vocabulary</h3>

In [98]:
vocab = MyVocabulary(100,"analysis.vocab")

reviews_text = [line[0] for line in Reviews_Labeled]
vocab.PrepareVocabulary(reviews_text)

vocab.Get_Top_Words(10)

#vocab.input_word_index["even"]
#vocab.input_word_index

{'even',
 'film',
 'good',
 'like',
 'movie',
 'one',
 'see',
 'story',
 'time',
 'would'}