<a href="https://colab.research.google.com/github/iamlekh/NLP/blob/master/movie_review_sentiment_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Load Text Data**

In [1]:
path = '/content/drive/My Drive/review_polarity/'

In [2]:
import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from pandas import DataFrame
import numpy as np

In [3]:
from os import listdir
# load doc into memory
def load_doc(filename):
  file = open(filename, 'r' )
  text = file.read()
  file.close()
  return text


def process_docs(directory):
  for filename in listdir(directory):
    if not filename.endswith(".txt"):
      next
    path = directory + '/' + filename
    doc = load_doc(path)
    print( 'Loaded %s' % filename)

# directory = path + '/txt_sentoken/neg'
# process_docs(directory)

# **Clean Text Data**

In [4]:
def clean_doc(doc):
  tokens = doc.split()
  re_punc = re.compile( '[%s]' % re.escape(string.punctuation))
  tokens = [re_punc.sub( '' , w) for w in tokens]
  tokens = [word for word in tokens if word.isalpha()]
  stop_words = set(stopwords.words( 'english' ))
  tokens = [w for w in tokens if not w in stop_words]
  tokens = [word for word in tokens if len(word) > 1]
  return tokens

## **Vocab**

In [5]:
def add_doc_to_vocab(filename, vocab):
  doc = load_doc(filename)
  tokens = clean_doc(doc)
  vocab.update(tokens)

In [6]:
def process_docs(directory, vocab):
  for filename in listdir(directory):
    if not filename.endswith(".txt"):
      next
    path = directory + '/' + filename
    add_doc_to_vocab(path, vocab)

In [7]:
# save list to file
def save_list(lines, filename):
  data = '\n' .join(lines)
  file = open(filename, 'w' )
  file.write(data)
  file.close()

In [23]:
# define vocab
vocab = Counter()

process_docs( path + 'txt_sentoken/neg' , vocab)
process_docs( path + 'txt_sentoken/pos' , vocab)

print(len(vocab))

print(vocab.most_common(50))

min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

save_list(tokens, path + 'vocab.txt' )

46557
[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]
14803


In [None]:
!python -m nltk.downloader all

## **Bag-of-Words Representation**

In [8]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
  doc = load_doc(filename)
  tokens = clean_doc(doc)
  tokens = [w for w in tokens if w in vocab]
  return ' ' .join(tokens)

In [9]:
# load all docs in a directory
def process_docs(directory, vocab, is_train):
  lines = list()
  for filename in listdir(directory):
    if is_train and filename.startswith( 'cv9' ):
      continue
    if not is_train and not filename.startswith( 'cv9' ):
      continue
    path = directory + '/' + filename
    line = doc_to_line(path, vocab)
    lines.append(line)
  return lines

In [13]:
# load and clean a dataset
def load_clean_dataset(vocab, is_train):
  neg = process_docs( path + 'txt_sentoken/neg' , vocab, is_train)
  pos = process_docs( path + 'txt_sentoken/pos' , vocab, is_train)
  docs = neg + pos
  labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
  return docs, labels

In [14]:
# fit a tokenizer
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [17]:
# # load the vocabulary
# vocab_filename = path + 'vocab.txt'
# vocab = load_doc(vocab_filename)
# vocab = vocab.split()
# vocab = set(vocab)
# docs, labels = load_clean_dataset(vocab)
# print(len(docs), len(labels))

In [1]:
# load the vocabulary
vocab_filename = path + 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode= 'freq' )
Xtest = tokenizer.texts_to_matrix(test_docs, mode= 'freq' )
print(Xtrain.shape, Xtest.shape)

## **Model**

In [20]:
# define the model
def define_model(n_words):
  model = Sequential()
  model.add(Dense(50, input_shape=(n_words,), activation= 'relu' ))
  model.add(Dense(1, activation= 'sigmoid' ))
  model.compile(loss= 'binary_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy' ])
  return model

# evaluate a neural network model
def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
  scores = list()
  n_repeats = 10
  n_words = Xtest.shape[1]
  for i in range(n_repeats):
    model = define_model(n_words)
    model.fit(Xtrain, ytrain, epochs=10, verbose=0)
    _, acc = model.evaluate(Xtest, ytest, verbose=0)
    scores.append(acc)
  print( '%d accuracy: %s' % ((i+1), acc))
  return scores

In [21]:
# prepare bag of words encoding of docs
def prepare_data(train_docs, test_docs, mode):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(train_docs)
  Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
  Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
  return Xtrain, Xtest



vocab_filename = path + 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)

train_docs = np.asarray(train_docs)
ytrain = np.asarray(ytrain)
test_docs = np.asarray(test_docs)
ytest = np.asarray(ytest)

modes = [ 'binary' , 'count' , 'tfidf' , 'freq' ]
results = DataFrame()
for mode in modes:
  # prepare data for mode
  Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
  # evaluate model on data for mode
  results[mode] = evaluate_mode(Xtrain, ytrain, Xtest, ytest)

print(results.describe())


10 accuracy: 0.9399999976158142
10 accuracy: 0.9049999713897705
10 accuracy: 0.8799999952316284
10 accuracy: 0.8799999952316284
          binary      count      tfidf       freq
count  10.000000  10.000000  10.000000  10.000000
mean    0.931000   0.896500   0.871000   0.869000
std     0.009369   0.007091   0.019408   0.009068
min     0.915000   0.885000   0.835000   0.850000
25%     0.925000   0.895000   0.863750   0.865000
50%     0.935000   0.895000   0.880000   0.872500
75%     0.935000   0.898750   0.883750   0.875000
max     0.945000   0.910000   0.890000   0.880000


In [23]:
tokenizer = create_tokenizer(train_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode= 'binary' )
Xtest = tokenizer.texts_to_matrix(test_docs, mode= 'binary' )
# define network
n_words = Xtrain.shape[1]
model = define_model(n_words)
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Epoch 1/10
57/57 - 0s - loss: 0.4817 - accuracy: 0.7794
Epoch 2/10
57/57 - 0s - loss: 0.0813 - accuracy: 0.9878
Epoch 3/10
57/57 - 0s - loss: 0.0221 - accuracy: 0.9994
Epoch 4/10
57/57 - 0s - loss: 0.0098 - accuracy: 1.0000
Epoch 5/10
57/57 - 0s - loss: 0.0058 - accuracy: 1.0000
Epoch 6/10
57/57 - 0s - loss: 0.0038 - accuracy: 1.0000
Epoch 7/10
57/57 - 0s - loss: 0.0027 - accuracy: 1.0000
Epoch 8/10
57/57 - 0s - loss: 0.0019 - accuracy: 1.0000
Epoch 9/10
57/57 - 0s - loss: 0.0014 - accuracy: 1.0000
Epoch 10/10
57/57 - 0s - loss: 0.0011 - accuracy: 1.0000


<tensorflow.python.keras.callbacks.History at 0x7f28259c0a58>

## **Prediction**

In [26]:
# classify a review as negative or positive
def predict_sentiment(review, vocab, tokenizer, model):
  tokens = clean_doc(review)
  tokens = [w for w in tokens if w in vocab]
  line = ' ' .join(tokens)
  encoded = tokenizer.texts_to_matrix([line], mode= 'binary' )
  yhat = model.predict(encoded, verbose=0)
  percent_pos = yhat[0,0]
  if round(percent_pos) == 0:
    return (1-percent_pos), ' NEGATIVE '
  return percent_pos, ' POSITIVE '

# test positive text
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print( 'Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
# test negative text
text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print( 'Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Best movie ever! It was great, I recommend it.]
Sentiment:  POSITIVE  (58.316%)
Review: [This is a bad movie.]
Sentiment:  NEGATIVE  (64.568%)
