# Sentiment Analysis of movie reviews:
The goal is to build a model that can classify the sentiment (positive or negative) of movie reviews.
Sentiment analysis is the process of computationally categorizing opinions expressed in text to identify whether the attitude is positive or negative.
Inspired by Géron, A. (2019). Hands-on machine learning with Scikit-Learn, Keras, and TensorFlow: Concepts, tools, and techniques to build intelligent systems. O'Reilly; and Brownlee, J. (2017). Deep Learning for Natural Language Processing: Develop Deep Learning Models for your Natural Language Problems. Machine Learning Mastery.

In [None]:
from nltk.corpus import stopwords
from collections import Counter
from os import listdir
import string
import re
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [None]:
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read; it is opened in text mode ('r').

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even when read() raises, which
    # a separate open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()
def clean_doc(doc):
    """Turn a raw review into a list of cleaned word tokens.

    The text is split on whitespace and punctuation is stripped from each
    token. A token is then dropped when it is not purely alphabetic, when it
    is one of NLTK's English stop words, or when it is a single character.

    Args:
        doc: The review text as one string.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    english_stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        word = punctuation_re.sub('', raw)
        if not word.isalpha():
            continue
        if word in english_stop_words:
            continue
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

In [None]:
def add_doc_to_vocab(filename, vocab):
    """Load one file, clean it, and add its tokens to ``vocab``.

    ``vocab`` is modified in place through its ``update`` method; nothing is
    returned.
    """
    vocab.update(clean_doc(load_doc(filename)))
def process_docs(directory, vocab):
    """Add every file in ``directory`` to ``vocab``, except 'cv9*' files.

    Files whose names start with 'cv9' are skipped; every other entry
    returned by ``listdir`` is passed to ``add_doc_to_vocab``.
    """
    for name in listdir(directory):
        if not name.startswith('cv9'):
            add_doc_to_vocab(directory + '/' + name, vocab)

In [None]:
# Accumulate token counts over the positive and negative review folders.
vocab = Counter()
# NOTE(review): these machine-specific paths differ from the
# 'D:/Sheny/review_polarity/...' paths used in load_clean_dataset — confirm
# which location actually holds the dataset.
process_docs('D:/review_polarity.tar/txt_sentoken/pos', vocab)
process_docs('D:/review_polarity.tar/txt_sentoken/neg', vocab)
# Report the number of distinct tokens and the 50 most frequent ones.
print(len(vocab))
print(vocab.most_common(50))

In [None]:
# Keep only the tokens counted at least `min_occurrence` times.
min_occurrence = 2
tokens = [word for word, count in vocab.items() if count >= min_occurrence]
print(len(tokens))

In [None]:
def save_list(lines, filename):
    """Write ``lines`` to ``filename``, one item per line.

    Args:
        lines: Iterable of strings; they are joined with newline characters,
            so the file has no trailing newline.
        filename: Path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager closes the file even if the write fails.
    with open(filename, 'w') as file:
        file.write(data)


# This call belongs at top level. Indented inside save_list it never ran on
# its own (so 'vocab.txt' was never written) and would have recursed without
# end on the first call.
save_list(tokens, 'vocab.txt')

# Extracting features from the reviews ready for modeling
A word embedding is a way of representing text in which each word in the vocabulary is represented by a real-valued vector in a high-dimensional space. The vectors are learned in such a way that words with similar meanings have similar representations in the vector space.
The real-valued vector representation for words can be learned while training the neural network. In the Keras deep learning library this is done with the Embedding layer.

In [None]:
def load_doc(filename):
    """Return the whole contents of a text file as a single string.

    Args:
        filename: Path of the file to read; it is opened in text mode ('r').

    Returns:
        The text read from the file.
    """
    # Using `with` guarantees the handle is closed even if read() raises;
    # a manual open()/close() pair would leak it in that case.
    with open(filename, 'r') as file:
        return file.read()
# Read the saved vocabulary file back as a set of whitespace-separated tokens.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())
print(vocab)

# Clean the document

In [None]:
def clean_doc(doc, vocab):
    """Reduce ``doc`` to the words that appear in ``vocab``.

    Each whitespace-separated token has its punctuation characters removed.
    Tokens that are not members of ``vocab`` afterwards are discarded, and
    the remaining ones are joined with single spaces.

    Args:
        doc: The review text as one string.
        vocab: Container of allowed tokens (membership is tested with ``in``).

    Returns:
        The filtered tokens as one space-separated string.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped = (word.translate(strip_punctuation) for word in doc.split())
    return ' '.join(word for word in stripped if word in vocab)


# Load all the training data reviews
Load the data, clean it, and return it as a list of strings, with one document (review) per string.
We want each document to be a string so that it can easily be encoded as a sequence of integers later.

In [None]:
def process_docs(directory, vocab, is_train):
    """Load and clean the reviews of one split found in ``directory``.

    File names starting with 'cv9' make up the test split; all other files
    make up the training split.

    Args:
        directory: Folder whose entries are listed with ``listdir``.
        vocab: Vocabulary passed through to ``clean_doc``.
        is_train: Truthy to load the training files (names not starting with
            'cv9'); falsy to load only the 'cv9*' files.

    Returns:
        A list with one cleaned document (as returned by ``clean_doc``) per
        selected file, in ``listdir`` order.
    """
    documents = []
    for filename in listdir(directory):
        is_test_file = filename.startswith('cv9')
        # Skip files that belong to the other split: test files when
        # training, non-test files when testing.
        if bool(is_train) == is_test_file:
            continue
        path = directory + '/' + filename
        documents.append(clean_doc(load_doc(path), vocab))
    # A print('Loaded %s' % filename) used to follow the return statement;
    # it could never execute, so it has been removed.
    return documents

## *Movie Reviews to Bag-of Words Vectors*
The training documents have to be encoded as sequences of integers, because the Keras Embedding layer requires integer inputs in which each integer maps to a single token that has a specific real-valued vector representation within the embedding.
We will use the Tokenizer class in the Keras API.
The Keras API is used to convert the reviews to encoded document vectors.
The Tokenizer class makes it easy to transform the documents into encoded vectors:
1. Create the Tokenizer
2. Fit on the text documents in the training dataset

In [None]:
def load_clean_dataset(vocab, is_train, data_dir='D:/Sheny/review_polarity/txt_sentoken'):
    """Load the cleaned negative and positive reviews with their labels.

    Args:
        vocab: Vocabulary passed through to ``process_docs``.
        is_train: Passed through to ``process_docs`` to select the split.
        data_dir: Folder containing the 'neg' and 'pos' sub-folders. The
            default keeps the original hard-coded location, so existing
            two-argument calls behave as before.

    Returns:
        A ``(docs, labels)`` tuple: ``docs`` is the negative documents
        followed by the positive ones, and ``labels`` is an array holding 0
        for each negative document and 1 for each positive document.
    """
    neg = process_docs(data_dir + '/neg', vocab, is_train)
    pos = process_docs(data_dir + '/pos', vocab, is_train)
    docs = neg + pos
    labels = array([0] * len(neg) + [1] * len(pos))
    return docs, labels

In [None]:
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: The texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

To encode the reviews in the training dataset, the `texts_to_sequences()` function on the Tokenizer is called.
For efficient computation by Keras, all the documents should have the same length. Therefore, we
pad all reviews to the length of the longest review in the training dataset; the `max()` function is used to find that length.

The maximum length is used as a parameter when integer-encoding and padding the sequences.
`pad_sequences()` is used to pad the sequences to the maximum length by adding 0 values at the end.

In [None]:
def encode_docs(tokenizer, max_length, docs):
    """Integer-encode ``docs`` and pad the sequences to ``max_length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        max_length: Value passed to ``pad_sequences`` as ``maxlen``.
        docs: The documents to encode.

    Returns:
        The result of ``pad_sequences`` with padding added at the end
        (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

The model will use an Embedding layer as the first hidden layer.
The Embedding layer requires the specification of the vocabulary size, the size of the real-valued vector space, and the maximum length of input documents.
The vocabulary size is the total number of words in our vocabulary, plus one: the Tokenizer never assigns index 0 to a word, and 0 is the value used to pad the sequences.
This could be the vocab set length or the size of the vocab within the tokenizer used to integer-encode the documents.

# Define the neural network model, compile it, and display the summary

In [None]:
# define the model
def define_model(vocab_size, max_length):
    """Build, compile, summarize, and plot the CNN sentiment classifier.

    The network is an Embedding layer (100-dimensional vectors) followed by a
    1-D convolution (64 filters, kernel size 8, ReLU), max pooling,
    flattening, 50% dropout, a 10-unit ReLU dense layer, and a single sigmoid
    output unit. It is compiled with binary cross-entropy loss, the Adam
    optimizer, and accuracy as the metric.

    Args:
        vocab_size: Input dimension of the Embedding layer.
        max_length: Length of each input sequence.

    Returns:
        The compiled model. As side effects, the summary is printed and a
        diagram is written to 'model.png'.
    """
    # Imported locally because the file's top-of-file imports do not include
    # Dropout (it is only imported in a later cell); without this, calling
    # define_model before that cell runs raises NameError.
    from keras.layers import Dropout

    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(64, 8, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [None]:
# Dropout is used inside define_model, so it must be imported before the
# model is built below.
from keras.layers import Dropout
# Reload the vocabulary from 'vocab.txt' as a set of tokens.
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
# Load the cleaned training reviews and their 0/1 labels.
train_docs, ytrain = load_clean_dataset(vocab, True)
# Fit the tokenizer on the training documents only.
tokenizer = create_tokenizer(train_docs)
# Embedding input size: number of words indexed by the tokenizer, plus one.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# The longest training document (in tokens) sets the padded sequence length.
max_length = max([len(s.split()) for s in train_docs])
print('Maximum length: %d' % max_length)
Xtrain = encode_docs(tokenizer, max_length, train_docs)
model = define_model(vocab_size, max_length)
# Train for 10 epochs, then save the model so later cells can reload it.
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
model.save('model.h5')

In [None]:
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    """Classify one review as 'POSITIVE' or 'NEGATIVE'.

    The review is cleaned with ``clean_doc``, encoded and padded with
    ``encode_docs``, and passed to ``model.predict``. The element at
    ``[0, 0]`` of the prediction is used as the score.

    Args:
        review: Raw review text.
        vocab: Vocabulary passed to ``clean_doc``.
        tokenizer: Tokenizer passed to ``encode_docs``.
        max_length: Padding length passed to ``encode_docs``.
        model: Object providing ``predict``.

    Returns:
        ``(score, 'POSITIVE')`` when the rounded score is not 0, otherwise
        ``(1 - score, 'NEGATIVE')``.
    """
    encoded = encode_docs(tokenizer, max_length, [clean_doc(review, vocab)])
    score = model.predict(encoded, verbose=0)[0,0]
    if round(score) != 0:
        return score, 'POSITIVE'
    return (1-score), 'NEGATIVE'

In [None]:
# Load the saved vocabulary as a set of whitespace-separated tokens.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# Load all reviews

In [None]:
# Load both splits. Only the training split is used to fit the tokenizer and
# to determine the padded sequence length.
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
tokenizer = create_tokenizer(train_docs)
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary size: %d' % vocab_size)
max_length = max(len(review.split()) for review in train_docs)
print('Maximum length: %d' % max_length)
# Encode and pad both splits with the same tokenizer and length.
Xtrain = encode_docs(tokenizer, max_length, train_docs)
Xtest = encode_docs(tokenizer, max_length, test_docs)


# Load and evaluate the model

In [None]:
from keras.models import load_model
# Reload the model saved to 'model.h5' by the training cell.
model = load_model('model.h5')
# evaluate() returns (loss, accuracy) here; only the accuracy is reported,
# as a percentage.
_, acc = model.evaluate(Xtrain, ytrain, verbose=0)
print('Train Accuracy: %.2f' % (acc*100))
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %.2f' % (acc*100))


In [None]:
# NOTE(review): this repeats the train/test evaluation already done right
# after load_model('model.h5') — confirm whether the duplicate is intended.
_, acc = model.evaluate(Xtrain, ytrain, verbose=0)
print('Train Accuracy: %.2f' % (acc*100))
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %.2f' % (acc*100))
# Sample review to classify; it is passed to predict_sentiment unchanged
# (including the '<br />' markup) and printed back verbatim below.
text = """
The Karen Carpenter Story shows a little more about singer Karen Carpenter's complex life.
Though it fails in giving accurate facts, and details.<br /><br />Cynthia Gibb (portrays Karen) was not a fine election. 
She is a good actress , but plays a very naive and sort of dumb Karen Carpenter. 
I think that the role needed a stronger character.
Someone with a stronger personality.<br /><br />Louise Fletcher role as Agnes Carpenter is terrific, 
she does a great job as Karen's mother.<br /><br />It has great songs, which could have been included in a soundtrack album.
Unfortunately they weren't, though this movie was on the top of the ratings in USA and other several countries

"""
# predict_sentiment returns (score, label); the score is printed as a
# percentage next to the label.
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text,sentiment, percent*100))
