IMDB Movie Comments Sentiment Analysis Practice
===
Kenny Hsieh, 2018/3/15

- Construct MLP, RNN, LSTM Model to overcome the task

## Download the IMDB Dataset

In [1]:
import urllib.request
import os
import tarfile

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath = "daclImdb_v1.tar.gz"
if not os.path.isfile(filepath): 
  response = urllib.request.urlretrieve(url, filepath)
  print("Downloaded : ", response)
  
tfile = tarfile.open("daclImdb_v1.tar.gz", 'r:gz')
result = tfile.extractall()

Downloaded :  ('daclImdb_v1.tar.gz', <http.client.HTTPMessage object at 0x7fbe89265b38>)


In [2]:
!ls -ls

total 82164
    4 drwxr-xr-x 4 7297 1000     4096 Jun 26  2011 aclImdb
82156 -rw-r--r-- 1 root root 84125825 Mar 21 13:43 daclImdb_v1.tar.gz
    4 drwxr-xr-x 1 root root     4096 Mar 21 13:42 datalab


## Data Preprocessing : Remove the html Tag

In [3]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import re
def rm_tags(text):
  re_tag = re.compile(r'<[^>]+>')
  return re_tag.sub("", text)

import os
def read_files(filetype):
  path = "aclImdb/"
  file_list = []
  
  positive_path = path + filetype + "/pos/"
  for f in os.listdir(positive_path):
    file_list += [positive_path + f]
    
  negative_path = path + filetype + "/neg/"
  for f in os.listdir(negative_path):
    file_list += [negative_path + f]
    
  print("Read", filetype, 'files :', len(file_list))
  
  all_labels = ([1] * 12500 + [0] * 12500)
  all_texts = []
  for fi in file_list:
    with open(fi, encoding = 'utf-8') as file_input:
      all_texts += [rm_tags("".join(file_input.readlines()))]
      
  return all_labels, all_texts

Using TensorFlow backend.


In [4]:
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")

print("Label : ", y_train[0])
print("Context : ", train_text[0])

Read train files : 25000
Read test files : 25000
Label :  1
Context :  Maria Braun is an extraordinary woman presented fully and very credibly, despite being so obtuse as to border on implausibility. She will do everything to make her marriage work, including shameless opportunism and sexual manipulation. And thus beneath the vicey exterior, she reveals a rather sweet value system. The film suffers from an abrupt and unexpected ending which afterwards feels wholly inadequate, with the convenience familiar from ending your school creative writing exercise with 'and then I woke up'. It is also book-ended at the other end with the most eccentric title sequence I've ever seen, but don't let any of that put you off.


## Construct the Tokenizer from Training Data

In [5]:
token = Tokenizer(num_words = 4000)
token.fit_on_texts(train_text)

print(token.document_count)
#print(token.word_index)

25000


## Tokenlize Text to Sequence

In [6]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

print(train_text[0])
print(x_train_seq[0])

Maria Braun is an extraordinary woman presented fully and very credibly, despite being so obtuse as to border on implausibility. She will do everything to make her marriage work, including shameless opportunism and sexual manipulation. And thus beneath the vicey exterior, she reveals a rather sweet value system. The film suffers from an abrupt and unexpected ending which afterwards feels wholly inadequate, with the convenience familiar from ending your school creative writing exercise with 'and then I woke up'. It is also book-ended at the other end with the most eccentric title sequence I've ever seen, but don't let any of that put you off.
[2902, 6, 31, 2797, 251, 1346, 1310, 2, 51, 462, 108, 34, 13, 5, 3604, 19, 55, 76, 78, 282, 5, 93, 37, 1338, 153, 582, 2, 860, 2, 1339, 1, 55, 2664, 3, 243, 1041, 1103, 1502, 1, 18, 2468, 35, 31, 2, 2067, 273, 59, 3491, 759, 15, 1, 1074, 35, 273, 125, 391, 1514, 483, 3457, 15, 91, 9, 8, 6, 77, 270, 1050, 29, 1, 81, 126, 15, 1, 87, 421, 716, 203, 12

## Padding Sequence to Same Length

In [7]:
x_train = sequence.pad_sequences(x_train_seq, maxlen = 400)
x_test = sequence.pad_sequences(x_test_seq, maxlen = 400)

print("Before Padding : ", len(x_train_seq[0]))
print("After Padding : ", len(x_train[0]))

# Forward add 0 / delete exceed numeric 

Before Padding :  88
After Padding :  400


## Multilayer Perceptron (MLP)

In [8]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding # Numeric list to Vector list

model = Sequential()
model.add(Embedding(output_dim = 32,
                   input_dim = 4000,
                   input_length = 400))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(units = 256,
               activation = 'relu'))
model.add(Dropout(0.35))

model.add(Dense(units = 1,
               activation = 'sigmoid'))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 32)           128000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               3277056   
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 3,405,313
Trainable params: 3,405,313
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam',
             metrics = ['accuracy'])

train_history = model.fit(x_train, y_train,
                          validation_split = 0.2,
                         batch_size = 100,
                         epochs = 10)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
scores = model.evaluate(x_test, y_test, verbose = 1)
scores[1]



0.84156

In [11]:
predict = model.predict_classes(x_test)
predict_classes = predict.reshape(-1)
predict_classes[:10]

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int32)

## Observe the Specific Commits with Actual & Predict Label

In [12]:
Sentiment_dict = {1 : 'Pos', 0 : 'Neg'}
def display_test_sentiment(i):
  print(test_text[i])
  print('Actual Answer : ', Sentiment_dict[y_test[i]])
  print('Predict Answer : ', Sentiment_dict[predict_classes[i]])
  
display_test_sentiment(2)

I've watched this movie on a fairly regular basis for most of my life, and it never gets old. For all the snide remarks and insults (mostly from David Spade), "Tommy Boy" has a giant heart. And that's what keeps this movie funny after all these years.Tommy Callahan (Chris Farley) is the son of Big Tom Callahan (Brian Dennehy), master car parts salesman, and has ridden on that all his life. But after his died dies on his wedding day, Tommy learns that the company is in debt, and about to be bought by Ray Zalinsky (Dan Akroyd), the owner of a huge car parts company. So in order to save the company, Tommy has to go on the road to sell the company's new brake pads. Along for the ride, though not by choice, is Richard Hayden (David Spade) a former classmate of Tommy's who was Big Tom's right-hand man.The movie rides on the chemistry between the two SNL stars (and real-life best friends) Chris Farley and David Spade. The duo has enough comic energy going between them to power the world. It's

## Check User Prompt Whether is Pos/Neg

In [0]:
def prompt_predict(input_text):
  token_text = token.texts_to_sequences([input_text])
  pad_text = sequence.pad_sequences(token_text, maxlen = 400)
  predict_result = model.predict_classes(pad_text)
  
  print("This review sentiment is : " + Sentiment_dict[predict_result[0][0]])

In [14]:
user_input = '''Featuring the very latest in computer generated effects and a plot that would insult a retarded six year old, Ironman is the latest in a long line of Hollywood product culled from the dizzying literary heights of superhero comic books.

Product this disposable doesn't happen by accident. Teams of industry product makers work around the clock for years to make something this tasteless, tedious and utterly bland. Don't even try to characterize this product as a "movie." It simply isn't.

If Ironman was edible, it would be a jar of baby food. Library paste flavored baby food. If it were music, it would be chopsticks. If it were a vehicle, it would be a tricycle. A tricycle with training wheels. If it were...

What? Oh yeah, the product. It's something about the usual gazillionaire playboy who flies around in a sooper-dooper extra-gadgety metal suit and frees Afghanistan from the Evil Brown People, making the country safe for, I dunno, oil companies I guess. The plot is such a pile of moronic sub-juvenile drivel it's impossible to focus on for more than a few minutes before being distracted by something more interesting like, say, the butter-flavored grease stains on the bottom of your popcorn bag.'''

prompt_predict(user_input)

This review sentiment is : Neg


## Recurrent Neural Network (RNN)

In [15]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN

model = Sequential()
model.add(Embedding(output_dim = 32,
                   input_dim = 4000,
                   input_length = 400))
model.add(Dropout(0.2))

model.add(SimpleRNN(units = 16))
model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(0.35))

model.add(Dense(units = 1, activation = 'sigmoid'))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 400, 32)           128000    
_________________________________________________________________
dropout_3 (Dropout)          (None, 400, 32)           0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 133,393
Trainable params: 133,393
Non-trainable params: 0
_________________________________________________________________
None

In [16]:
model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam',
             metrics = ['accuracy'])

train_history = model.fit(x_train, y_train,
                          validation_split = 0.2,
                         batch_size = 100,
                         epochs = 10)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
 3500/20000 [====>.........................] - ETA: 26s - loss: 0.2493 - acc: 0.9066

Epoch 4/10
Epoch 5/10

Epoch 6/10
Epoch 7/10

Epoch 8/10
Epoch 9/10

Epoch 10/10


In [17]:
scores = model.evaluate(x_test, y_test, verbose = 1)
scores[1]



0.8308

## Long Short-term Memory (LSTM)

In [18]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

model = Sequential()
model.add(Embedding(output_dim = 32,
                   input_dim = 4000,
                   input_length = 400))
model.add(Dropout(0.2))

model.add(LSTM(units = 32))
model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(0.35))

model.add(Dense(units = 1, activation = 'sigmoid'))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 400, 32)           128000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 400, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_5 (Dense)              (None, 256)               8448      
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 257       
Total params: 145,025
Trainable params: 145,025
Non-trainable params: 0
_________________________________________________________________
None

In [19]:
model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam',
             metrics = ['accuracy'])

train_history = model.fit(x_train, y_train,
                          validation_split = 0.2,
                          batch_size = 100,
                          epochs = 10)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
 3300/20000 [===>..........................] - ETA: 1:43 - loss: 0.1902 - acc: 0.9321

Epoch 4/10
Epoch 5/10

Epoch 6/10
Epoch 7/10

Epoch 8/10
Epoch 9/10

Epoch 10/10


In [20]:
scores = model.evaluate(x_test, y_test, verbose = 1)
scores[1]



0.83332