In [1]:
#http://colah.github.io/posts/2015-08-Understanding-LSTMs/

from collections import Counter
import json
import nltk
from nltk.corpus import stopwords
WORD_FREQUENCY_FILE_FULL_PATH = "analysis.vocab"

In [300]:
class MyVocabulary:
    
    def __init__(self, vocabulary, wordFrequencyFilePath):
        self.vocabulary = vocabulary
        self.WORD_FREQUENCY_FILE_FULL_PATH = wordFrequencyFilePath
        self.input_word_index = {}
        self.reverse_input_word_index = {}
        
        self.input_word_index["START"] = 1
        self.input_word_index["UNKOWN"] = -1
        self.MaxSentenceLength = None
        
    def PrepareVocabulary(self,reviews):
        self._prepare_Word_Frequency_Count_File(reviews)
        self._create_Vocab_Indexes()
        
        self.MaxSentenceLength = max([len(txt.split(" ")) for txt in reviews])
      
    def Get_Top_Words(self, number_words = None):
        if number_words == None:
            number_words = self.vocabulary
        
        chars = json.loads(open(self.WORD_FREQUENCY_FILE_FULL_PATH).read())
        counter = Counter(chars)
        most_popular_words = {key for key, _value in counter.most_common(number_words)}
        return most_popular_words
    
    def _prepare_Word_Frequency_Count_File(self,reviews):
        counter = Counter()    
        for s in reviews:
            counter.update(s.split(" "))
            
        with open(self.WORD_FREQUENCY_FILE_FULL_PATH, 'w') as output_file:
            output_file.write(json.dumps(counter))
                 
    def _create_Vocab_Indexes(self):
        INPUT_WORDS = self.Get_Top_Words(self.vocabulary)

        #word to int
        #self.input_word_index = dict(
        #    [(word, i) for i, word in enumerate(INPUT_WORDS)])
        for i, word in enumerate(INPUT_WORDS):
            self.input_word_index[word] = i
        
        #int to word
        #self.reverse_input_word_index = dict(
        #    (i, word) for word, i in self.input_word_index.items())
        for word, i in self.input_word_index.items():
            self.reverse_input_word_index[i] = word

        #self.input_word_index = input_word_index
        #self.reverse_input_word_index = reverse_input_word_index
        #seralize.dump(config.DATA_FOLDER_PATH+"input_word_index.p",input_word_index)
        #seralize.dump(config.DATA_FOLDER_PATH+"reverse_input_word_index.p",reverse_input_word_index)
        
        
    def TransformSentencesToId(self, sentences):
        vectors = []
        for r in sentences:
            words = r.split(" ")
            vector = np.zeros(len(words))

            for t, word in enumerate(words):
                if word in self.input_word_index:
                    vector[t] = self.input_word_index[word]
                else:
                    pass
                    #vector[t] = 2 #unk
            vectors.append(vector)
            
        return vectors
    
    def ReverseTransformSentencesToId(self, sentences):
        vectors = []
        for r in sentences:
            words = r.split(" ")
            vector = np.zeros(len(words))

            for t, word in enumerate(words):
                if word in self.input_word_index:
                    vector[t] = self.input_word_index[word]
                else:
                    pass
                    #vector[t] = 2 #unk
            vectors.append(vector)
            
        return vectors
        
        
        

In [301]:
#Download DataSet
#http://ai.stanford.edu/~amaas/data/sentiment/
#https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks
import os

def GetTextFilePathsInDirectory(directory):
    files = []
    for file in os.listdir(directory):
        if file.endswith(".txt"):
            filePath = os.path.join(directory, file)
            files.append(filePath)
    return files

def GetLinesFromTextFile(filePath):
    with open(filePath,"r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]
    return lines


def RemoveStopWords(line, stopwords):
    words = []
    for word in line.split(" "):
        word = word.strip()
        if word not in stopwords and word != "" and word != "&":
            words.append(word)

    return " ".join(words)


import re
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

#https://gist.github.com/aaronkub/257a1bd9215da3a7221148600d849450#file-clean_movie_reviews-py
def preprocess_reviews(reviews):
    default_stop_words = nltk.corpus.stopwords.words('english')
    stopwords = set(default_stop_words)
    
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    reviews = [RemoveStopWords(line,stopwords) for line in reviews]
    
    return reviews

In [302]:
default_stop_words = nltk.corpus.stopwords.words('english')
stopwords = set(default_stop_words)
RemoveStopWords("this is a very large test",stopwords)

'large test'

<h2>Prepare Data</h2>

In [303]:
positive_files = GetTextFilePathsInDirectory("aclImdb/train/pos/")
negative_files = GetTextFilePathsInDirectory("aclImdb/train/neg/")

reviews_positive = []
for i in range(0,500):
    reviews_positive.extend(GetLinesFromTextFile(positive_files[i]))
    
reviews_negative = []
for i in range(0,500):
    reviews_negative.extend(GetLinesFromTextFile(negative_files[i]))

In [304]:
print("Positive Review---> {0}".format(reviews_positive[5]))
print()
print("Negative Review---> {0}".format(reviews_negative[5]))

print()
reviews_positive = preprocess_reviews(reviews_positive)
print("Processed Positive Review---> {0}".format(reviews_positive[5]))

reviews_negative = preprocess_reviews(reviews_negative)
print("Processed Negative Review---> {0}".format(reviews_negative[5]))

Positive Review---> This isn't the comedic Robin Williams, nor is it the quirky/insane Robin Williams of recent thriller fame. This is a hybrid of the classic drama without over-dramatization, mixed with Robin's new love of the thriller. But this isn't a thriller, per se. This is more a mystery/suspense vehicle through which Williams attempts to locate a sick boy and his keeper.<br /><br />Also starring Sandra Oh and Rory Culkin, this Suspense Drama plays pretty much like a news report, until William's character gets close to achieving his goal.<br /><br />I must say that I was highly entertained, though this movie fails to teach, guide, inspect, or amuse. It felt more like I was watching a guy (Williams), as he was actually performing the actions, from a third person perspective. In other words, it felt real, and I was able to subscribe to the premise of the story.<br /><br />All in all, it's worth a watch, though it's definitely not Friday/Saturday night fare.<br /><br />It rates a 7

<h2>Labeled DataSet</h2>

In [305]:
Reviews_Labeled = list(zip(reviews_positive, np.ones(len(reviews_positive))))
Reviews_Labeled.extend(list(zip(reviews_negative, np.zeros(len(reviews_negative)))))
Reviews_Labeled[10]

('first read armistead maupins story taken human drama displayed gabriel one cares loves said given film version excellent story expected see past gloss hollywood writer armistead maupin director patrick stettner truly succeeded right amount restraint robin williams captures fragile essence gabriel lets us see struggle issues trust personnel lifejess world around himdonna introduced players drama reminded nothing ever seems smallest event change lives irrevocably request review book written young man turns life changing event helps gabriel find strength within carry move forward bad people avoid film say average american probably think robin williams serious role didnt work please give movie chance robin williams touches darkness must find go better people like movie one hour photo stepped actor made another quality piece art oh forget believe bobby cannavale jess steals every scene 1940s leading man looks screen presence hacks opinion could carry movie right s~',
 1.0)

<h3>Prepare Vocabulary</h3>

In [307]:
TOP_WORDS = 500
vocab = MyVocabulary(TOP_WORDS,"analysis.vocab")

reviews_text = [line[0] for line in Reviews_Labeled]
vocab.PrepareVocabulary(reviews_text)

#vocab.Get_Top_Words(10)

vocab.input_word_index["START"]
#vocab.input_word_index["UNKOWN"]
#vocab.input_word_index

1

<h2>Integer Encode Words</h2>

In [309]:
reviews,labels=zip(*Reviews_Labeled)
reviews_int = vocab.TransformSentencesToId(reviews)

Reviews_Labeled_Int = list(zip(reviews_int,labels))
#print(len(Reviews_Labeled_Int))
print(Reviews_Labeled_Int[5])

(array([354.,   0.,   0., 418.,   0.,   0.,   0., 418.,   0.,   0.,   0.,
         0., 393., 269., 344.,   0.,   0.,   0., 179., 171.,   0., 354.,
         0.,   0.,   0.,   0.,   0.,   0., 418.,   0.,   0.,   0., 396.,
         0., 491.,   0.,   0., 496.,   0.,   0.,   0., 269., 360.,  62.,
       475., 338.,   0.,   0., 418.,  51., 197., 215.,   0.,   0., 431.,
       428.,   1.,   0., 433., 423.,   0.,   0.,   0.,   0.,   0., 268.,
       338., 424., 427., 418., 299.,   0.,   0.,   0.,  21.,   0.,   0.,
       268., 304., 454.,   0.,   0., 307.,  82., 144., 433., 348.,   0.,
         0., 298.,   0.,   0.,   0., 165.,   0.]), 1.0)


<h2>Split Train and Test </h2>

In [310]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(Reviews_Labeled_Int, test_size=0.2)

<h3>Pad Sentences</h3>

In [311]:
X_train, y_train = list(zip(*train))
X_test, y_test = list(zip(*test))

y_train = np.array(y_train)
y_test = np.array(y_test)

#max_review_length = vocab.MaxSentenceLength
max_review_length = 500

# Truncate and pad the review sequences 
from keras.preprocessing import sequence 
max_review_length = 500 
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 

In [312]:
print(type(X_train))
print(type(X_test))

print(X_train.shape)
print(X_test.shape)

print(type(y_train))
print(type(y_test))

print(y_train.shape)
print(y_test.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(800, 500)
(200, 500)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(800,)
(200,)


<h2>Model</h2>

In [313]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM
import keras 

# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(TOP_WORDS, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 32)           16000     
_________________________________________________________________
lstm_11 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 101       
Total params: 69,301
Trainable params: 69,301
Non-trainable params: 0
_________________________________________________________________
None


In [314]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

Train on 800 samples, validate on 200 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2db2f6273c8>

In [289]:
TOP_WORDS

500

In [281]:
w = ["a","b","c"]

for i,w in enumerate(w):
    print(i+2)

2
3
4
