# Sentiment Analysis on Movie Reviews Using a CNN

In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

# Fix the random seed for reproducibility.
# This seeds NumPy's global RNG only.
# NOTE(review): the TensorFlow backend's own RNG is not seeded in this cell,
# so weight initialisation may still differ between runs -- confirm.
seed = 7
np.random.seed(seed)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load Movie Review Dataset
# The CSV is read from the current working directory; later cells read its
# 'review' (text) and 'sentiment' (label) columns.
dataset = pd.read_csv(r'movie_reviews.csv')

In [3]:
# Preview the first rows, then pull the text and label columns out of the
# frame as plain NumPy arrays for the slicing done in the next cell.
print(dataset.head())
reviews, sentiments = (
    np.array(dataset[column]) for column in ('review', 'sentiment')
)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:
# Class balance check: number of reviews per sentiment label.
sentiment_counts = dataset['sentiment'].value_counts()
sentiment_counts

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
# Split the dataset into train and test datasets.
# The first TRAIN_SIZE rows (in file order, no shuffling) are used for
# training; everything after them is held out for testing.
TRAIN_SIZE = 35000
train_reviews, test_reviews = reviews[:TRAIN_SIZE], reviews[TRAIN_SIZE:]
train_sentiments, test_sentiments = sentiments[:TRAIN_SIZE], sentiments[TRAIN_SIZE:]

In [6]:
# Preprocessing
# Tokenizer with the Keras default settings; it maps each word to an integer id.
t = Tokenizer()
# Fit the tokenizer on the training documents only, so the vocabulary is
# built without looking at the held-out test reviews.
t.fit_on_texts(train_reviews)

In [7]:
# Convert both splits from raw text to lists of integer word ids using the
# tokenizer fitted on the training reviews.
train_sequences, test_sequences = map(
    t.texts_to_sequences, (train_reviews, test_reviews)
)

In [8]:
# Report what the tokenizer learned: distinct words and documents seen.
vocabulary_count = len(t.word_counts)
document_count = t.document_count
print("Vocabulary size={}".format(vocabulary_count))
print("Number of Documents={}".format(document_count))

Vocabulary size=105664
Number of Documents=35000


In [9]:
# Bring every review to the same length (in words) so the splits can be fed
# to the network as fixed-size arrays: shorter reviews are padded and longer
# ones are cut down to MAX_SEQUENCE_LENGTH ids.
MAX_SEQUENCE_LENGTH = 500
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH)
X_train = sequence.pad_sequences(train_sequences, **pad_options)
X_test = sequence.pad_sequences(test_sequences, **pad_options)

### Encoding Labels

In [10]:
# Turn the string labels into integers.
num_classes=2 # positive - 1, negative - 0
label_encoder = LabelEncoder()
# Learn the label -> integer mapping from the training labels, then apply
# that same mapping to both splits.
label_encoder.fit(train_sentiments)
y_train = label_encoder.transform(train_sentiments)
y_test = label_encoder.transform(test_sentiments)

In [11]:
# Input dimension for the embedding layer: one row per distinct word the
# tokenizer saw, plus one extra row because the Keras Tokenizer numbers words
# from 1 and index 0 is the fill value pad_sequences uses by default.
VOCAB_SIZE = len(t.word_counts)+1

## Model Training

In [12]:
# Training hyperparameters used by the model definition and fit cells below.
# Width of each word's embedding vector.
EMBED_SIZE = 32
# Number of passes over the training data.
EPOCHS=5
# Number of samples per gradient update.
BATCH_SIZE=128

In [13]:
# Since it is textual data, use 1D convolutions to scan along each review.
# Architecture: embedding -> Conv1D -> max-pool -> flatten -> dense -> sigmoid.
model = Sequential()
# Map each word id to a trainable EMBED_SIZE-dimensional vector.
model.add(Embedding(VOCAB_SIZE, EMBED_SIZE, input_length=MAX_SEQUENCE_LENGTH))
# 32 filters over windows of 3 consecutive words; padding='same' keeps the
# sequence length unchanged.
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# Halve the sequence length by keeping the max of each pair of positions.
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
# Single sigmoid unit: output is the predicted probability of label 1.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the table).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           3381280   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               2000250   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 5,384,885
Trainable params: 5,384,885
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Fit the model.
# The returned History object is kept so the per-epoch loss/accuracy values
# stay available after training (via history.history) instead of being
# discarded and shown only as an object repr.
# validation_split=0.2 holds out the last 20% of the training arrays for
# validation.
history = model.fit(X_train, y_train,
                    validation_split=0.2,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    verbose=1)

Train on 28000 samples, validate on 7000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x19821a43f60>

## Model Accuracy

In [15]:
# Final evaluation of the model on the held-out test split.
# evaluate() returns the loss followed by the compiled metrics, so the
# accuracy is the second entry.
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (test_accuracy*100))

Accuracy: 89.43%
