In [None]:
# Use a 1D convolutional neural network (CNN) for text mining on the IMDB (Internet Movie Database) review dataset to predict the sentiment of movie reviews

In [None]:
import numpy as np
import pandas as pd
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
# Keras ships the IMDB dataset, so it can be loaded directly.
# In this dataset every word has already been replaced by an integer index.
(X_train, y_train), (X_test, y_test) = imdb.load_data()

In [None]:
# Stack the train and test splits back together so the whole dataset can be inspected at once
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])


In [None]:
# Print the shapes of the combined reviews and labels; both hold 50,000 rows in total
print("The shape of the dataset is: ")
print(X.shape, y.shape, sep="\n")

The shape of the dataset is: 
(50000,)
(50000,)


In [None]:
# List the distinct label values. There are exactly two (0 and 1), so this is a
# binary classification problem: each movie review carries good or bad sentiment.
print("Class:", np.unique(y), sep="\n")

Class:
[0 1]


In [None]:
# Count the distinct word indices used across all reviews.
# The whole dataset uses fewer than 100,000 different words.
print(np.unique(np.hstack(X)).size)

88585


In [None]:
# Suppose we are only interested in the top 10,000 words of the dataset.
# Reload the train/test splits with the vocabulary limited to that size.
# Note: X and y (built earlier from the unrestricted load) are not rebuilt here.
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [None]:
# Summarise the review lengths (number of word indices per review) by their mean
# and standard deviation. With a mean of about 235 and a standard deviation of
# about 173, a length of 500 lies roughly 1.5 standard deviations above the mean.
review_len = list(map(len, X))
print("the mean and standard deviation are")
print(np.mean(review_len), np.std(review_len))

the mean and standard deviation are
234.75892 172.91149458735703


In [None]:
# Use a fixed review length of 500: reviews longer than that are truncated and
# shorter ones are padded with 0, so every sequence ends up with the same length.
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)


In [None]:
# Define the model as an empty Keras Sequential container; the layers are added one by one in the following cells
model = Sequential()


In [None]:
# Add the word-embedding layer: each of the top_words word indices is represented
# by a 32-dimensional vector, and every input sequence has max_words entries.
model.add(
    Embedding(
        input_dim=top_words,
        output_dim=32,
        input_length=max_words,
    )
)

In [None]:
# Add the 1D convolution layer: 32 filters of width 3 with ReLU activation.
# 'same' padding keeps the sequence length unchanged (500 in the model summary).
model.add(
    Conv1D(
        filters=32,
        kernel_size=3,
        padding="same",
        activation="relu",
    )
)

In [None]:
# Add max pooling with the Keras default window (pool_size=2, written out explicitly);
# this halves the sequence length from 500 to 250, as the model summary shows.
model.add(MaxPooling1D(pool_size=2))

In [None]:
# Add a Flatten layer so the pooled feature maps become one vector per review
# (250 x 32 = 8000 values, see the model summary) that the dense layers can consume
model.add(Flatten())

In [None]:
# Add the dense layers: a 250-unit ReLU hidden layer followed by a single
# sigmoid unit that outputs the score for the binary sentiment label.
model.add(Dense(units=250, activation="relu"))
model.add(Dense(units=1, activation="sigmoid"))

In [None]:
# Compile the model: Adam optimizer, binary cross-entropy loss (matching the single
# sigmoid output), and accuracy reported as the metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) to check the architecture
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 250)               2000250   
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 251       
Total params: 2,323,605
Trainable params: 2,323,605
Non-trainable params: 0
____________________________________________

In [None]:
# Fit the model for 2 epochs with mini-batches of 128 reviews.
# The History object returned by fit() is kept in `history` so the per-epoch
# loss/accuracy values stay available and its repr is not echoed as cell output.
# NOTE: the test split is passed as validation data, so the validation metrics
# reported after the last epoch are the same numbers the final evaluation cell
# prints; they are not an independent hold-out estimate.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=128,
    verbose=2,
)

Epoch 1/2
196/196 - 26s - loss: 0.4678 - accuracy: 0.7338 - val_loss: 0.2818 - val_accuracy: 0.8806
Epoch 2/2
196/196 - 26s - loss: 0.1946 - accuracy: 0.9266 - val_loss: 0.2785 - val_accuracy: 0.8861


<tensorflow.python.keras.callbacks.History at 0x7fe4e6a179e8>

In [None]:
# Final evaluation of the model on the test split. In the recorded run the model
# reaches an accuracy of 88.61%; the exact figure can differ from run to run.
# evaluate() returns the loss first and the accuracy metric second.
scores = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = scores[0], scores[1]
print("Accuracy: %.2f%%" % (test_accuracy * 100))
print(test_loss)

Accuracy: 88.61%
0.27853327989578247
