# IMDB Sentiment prediction with MLP
* Data Process
```python
import tarfile
import os
import pandas as pd
from keras.preprocessing.text import Tokenizer
```
* MLP
```python
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten, Dense, Dropout
```
* Tools for prediction
```python
import numpy as np
import random
from sklearn.metrics import confusion_matrix
```
* 如果採用序列式(時間相關)模型
* RNN: Recurrent Nueron Network
* 情緒分析的時候，使用RNN的影響不大
```python
from keras.layers import SimpleRNN
```

In [1]:
import tarfile
import os
if not os.path.exists("imdb"):
    tar = tarfile.open("aclImdb_v1.tar.gz")
    tar.extractall("imdb")
    tar.close

In [2]:
import glob
import pandas as pd
base = "imdb/aclImdb/train/"

postxt = []
pos = glob.glob(base + "pos/*.txt")
for p in pos:
    with open(p, "r", encoding="utf-8") as f:
        postxt.append(f.read())

negtxt = []
neg = glob.glob(base + "neg/*.txt")
for n in neg:
    with open(n, "r", encoding="utf-8") as f:
        negtxt.append(f.read())
        
train = pd.DataFrame({
    "content": postxt + negtxt,
    "sentiment":[1]*len(postxt)+[0]*len(negtxt)
    
})

train

Unnamed: 0,content,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1
...,...,...
24995,"My comments may be a bit of a spoiler, for wha...",0
24996,"The ""saucy"" misadventures of four au pairs who...",0
24997,"Oh, those Italians! Assuming that movies about...",0
24998,Eight academy nominations? It's beyond belief....,0


In [3]:
base = "imdb/aclImdb/test/"
postxt = []
pos = glob.glob(base + "pos/*.txt")
for p in pos:
    with open(p, "r", encoding="utf-8") as f:
        postxt.append(f.read())

negtxt = []
neg = glob.glob(base + "neg/*.txt")
for n in neg:
    with open(n, "r", encoding="utf-8") as f:
        negtxt.append(f.read())
        
test = pd.DataFrame({
    "content": postxt + negtxt,
    "sentiment":[1]*len(postxt)+[0]*len(negtxt)
    
})

test

Unnamed: 0,content,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1
...,...,...
24995,With actors like Depardieu and Richard it is r...,0
24996,If you like to get a couple of fleeting glimps...,0
24997,When something can be anything you want it to ...,0
24998,"I had heard good things about ""States of Grace...",0


In [4]:
# Step1. 幫你列出所有出現過的字
from keras.preprocessing.text import Tokenizer
# 先創造你care的詞的詞典，2000頁的辭典(等一下每一個詞都是2000維度的向量)
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(train["content"])


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
tok.word_counts

OrderedDict([('for', 44335),
             ('a', 163040),
             ('movie', 43564),
             ('that', 69787),
             ('gets', 3204),
             ('no', 12690),
             ('respect', 498),
             ('there', 15735),
             ('sure', 2686),
             ('are', 29425),
             ('lot', 3966),
             ('of', 145847),
             ('memorable', 664),
             ('quotes', 97),
             ('listed', 125),
             ('this', 75974),
             ('gem', 359),
             ('imagine', 737),
             ('where', 6321),
             ('joe', 661),
             ('piscopo', 17),
             ('is', 107313),
             ('actually', 4237),
             ('funny', 4276),
             ('maureen', 49),
             ('stapleton', 15),
             ('scene', 5348),
             ('stealer', 14),
             ('the', 336148),
             ('moroni', 4),
             ('character', 6703),
             ('an', 21538),
             ('absolute', 352),
             ('

In [6]:
tok.word_counts.items()



In [7]:
# 如果你要看詞轉化的數字
# tok.word_index

# 根據出現次數做一個排序
sorted(tok.word_counts.items(), key = lambda x:x[1],reverse=True)

[('the', 336148),
 ('and', 164097),
 ('a', 163040),
 ('of', 145847),
 ('to', 135708),
 ('is', 107313),
 ('br', 101871),
 ('in', 93934),
 ('it', 79058),
 ('i', 77142),
 ('this', 75974),
 ('that', 69787),
 ('was', 48195),
 ('as', 46927),
 ('for', 44335),
 ('with', 44122),
 ('movie', 43564),
 ('but', 42594),
 ('film', 39095),
 ('on', 34188),
 ('not', 30610),
 ('you', 29877),
 ('are', 29425),
 ('his', 29366),
 ('have', 27726),
 ('he', 26952),
 ('be', 26948),
 ('one', 26513),
 ('all', 23953),
 ('at', 23507),
 ('by', 22539),
 ('an', 21538),
 ('they', 21139),
 ('who', 20599),
 ('so', 20586),
 ('from', 20494),
 ('like', 20272),
 ('her', 18407),
 ('or', 17994),
 ('just', 17759),
 ('about', 17371),
 ("it's", 17153),
 ('out', 17092),
 ('if', 16790),
 ('has', 16790),
 ('some', 15743),
 ('there', 15735),
 ('what', 15349),
 ('good', 15100),
 ('more', 14246),
 ('when', 14175),
 ('very', 14062),
 ('up', 13274),
 ('no', 12690),
 ('time', 12682),
 ('she', 12657),
 ('even', 12650),
 ('my', 12492),
 ('wou

In [8]:
# Step2. 根據剛才統計的辭典，把每一個詞化成一個數字
# !!! 2000個以外的詞都丟掉了
x_train_num = tok.texts_to_sequences(train["content"])
x_test_num = tok.texts_to_sequences(test["content"])
pd.DataFrame(x_train_num)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710
0,15,3,17,12,211,54,1158,47,249.0,23.0,...,,,,,,,,,,
1,1159,186,17,1058,16,800,1586,18,31.0,300.0,...,,,,,,,,,,
2,3,1153,44,19,14,13,386,58,511.0,170.0,...,,,,,,,,,,
3,42,3,676,544,5,866,581,8,3.0,747.0,...,,,,,,,,,,
4,22,239,29,457,121,11,31,147,18.0,454.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,58,792,200,27,3,224,4,3,1358.0,15.0,...,,,,,,,,,,
24996,1,4,685,34,8,1313,20,1,169.0,248.0,...,,,,,,,,,,
24997,446,145,12,99,41,16,912,8,838.0,2.0,...,,,,,,,,,,
24998,1802,42,721,10,67,61,101,9,13.0,3.0,...,,,,,,,,,,


In [9]:
#(optional)看看數字代表哪一個詞
reverse_index = {v:k for k,v in tok.word_index.items()}
reverse_index[6]

'is'

In [10]:
# Step3. 把每一篇文章截長補短成一樣多的詞數
# 截長：後面截的
# 補短：補0
from keras.preprocessing.sequence import pad_sequences
x_train_num_pad = pad_sequences(x_train_num, maxlen=200)
x_test_num_pad = pad_sequences(x_test_num, maxlen=200)
# 輸入：一篇文章：200詞(出現次數最多2000詞之內)
# 輸出：0~1 (負面 ~ 正面)
pd.DataFrame(x_train_num_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0,0,0,0,0,0,0,0,...,32,1552,103,15,1605,1,1789,14,3,565
1,0,0,0,0,0,0,0,0,0,0,...,31,164,485,34,3,274,16,297,238,36
2,0,0,0,0,0,0,0,0,0,0,...,10,444,1,2,1,134,33,15,5,1
3,24,277,1951,1509,295,9,642,12,26,29,...,17,84,147,10,525,25,5,543,8,98
4,0,0,0,0,0,0,0,0,0,0,...,29,213,43,20,285,46,248,1213,15,883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,672,6,1,956,4,117,89,435,126,55
24996,247,109,620,5,3,1329,8,1313,118,46,...,415,4,19,228,3,749,88,4,1,1550
24997,0,0,0,0,0,0,0,0,0,0,...,273,5,435,130,7,7,1455,155,7,7
24998,6,9,3,729,209,4,112,1,102,2,...,84,12,141,27,345,3,2,80,1,1873


In [11]:
# Embedding:輸入 → 整數；輸出 → n為度的向量
# 通常imput dimention不用選太多：人對於文章的感受沒那麼多種啦
# input length: 200
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten, Dense, Dropout
model = Sequential()
model.add(Embedding(2000, 32, input_length=200))
model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()

# 2000詞 * 32(降為64維度) = 64000


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           64000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 6400)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               1638656   
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 1,702,913
Trainable params: 1,702,913
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(loss="binary_crossentropy",
             optimizer="adam",
             metrics=["accuracy"])

In [13]:
y_train = train["sentiment"]
y_test = test["sentiment"]

In [14]:
model.fit(x_train_num_pad, y_train,
         batch_size=200,
         epochs=1,
         validation_split=0.1,
         verbose=2)

Train on 22500 samples, validate on 2500 samples
Epoch 1/1
 - 6s - loss: 0.4985 - acc: 0.7299 - val_loss: 0.3789 - val_acc: 0.8264


<keras.callbacks.History at 0x1342f6e10>

In [15]:
model.evaluate(x_train_num_pad, y_train)



[0.2555586082267761, 0.89876]

In [16]:
# 挑一篇預測錯的文章來看
import numpy as np
pre = model.predict_classes(x_train_num_pad).reshape(-1)
print("pre type:",type(pre))
y_test_np = np.array(y_test).reshape(-1)
noneq = pre!=y_test_np
idx = np.nonzero(noneq)[0]
print("idx:",idx)
first =idx[0]
print("first=",first)
s = ["負面","正面"]
print("原本情緒：",s[y_test_np[first]])
print("預測情緒：",s[pre[first]])
print("原文",test["content"].iloc[first])

pre type: <class 'numpy.ndarray'>
idx: [   25    27    74 ... 24990 24994 24998]
first= 25
原本情緒： 正面
預測情緒： 負面
原文 The movie was very good. I'm an avid mystery fan and I usually figure out who is going to be killed and who did the killing. While I did figure out who was going to be killed I didn't figure out who did it. I wasn't happy with the portrayal of the Gerda character but given the year the movie was supposed to take place it is possible the woman would have been that 'cloying'. Please know that while these Poirot movies are good, they just don't have the same dynamic to them as the series does because they don't have Japp, Ms. Lemon and especially Hastings! David Suchet is definitely Poirot. I have seen every actor who's played him. The worst was Peter Ustinov!


In [17]:
from sklearn.metrics import confusion_matrix
s = ["負面","正面"]
row = [c + "(答案)" for c in s]
col = [c + "(預測)" for c in s]
pd.DataFrame(confusion_matrix(y_test_np, pre),
             columns = col,
             index = row)


Unnamed: 0,負面(預測),正面(預測)
負面(答案),10908,1592
正面(答案),939,11561


In [18]:
# Word2vec
# such as face_recognition's image feature matrix(找出圖片特徵向量檔)
# distill article's feature matrix(找出文章語意的向量檔)
# 文章轉成embedding後就是feature matrix
# input: 上 + 下文
# output: 中間詞

In [19]:
model.layers

[<keras.layers.embeddings.Embedding at 0x132d486d8>,
 <keras.layers.core.Flatten at 0x132d48f28>,
 <keras.layers.core.Dense at 0x132d48be0>,
 <keras.layers.core.Dropout at 0x132d48160>,
 <keras.layers.core.Dense at 0x132d3b9b0>]

In [20]:
model.layers[0]

<keras.layers.embeddings.Embedding at 0x132d486d8>

In [21]:
# 拿出副產物是大部分語言模型會做的事
w = model.layers[0].get_weights()
w

[array([[-0.0038462 , -0.01468211, -0.00959965, ..., -0.00614225,
          0.01202007, -0.01588892],
        [ 0.0228773 , -0.04527731, -0.03130185, ..., -0.02707363,
          0.02411339, -0.02985743],
        [ 0.04230611, -0.0360918 , -0.04479472, ...,  0.02134322,
          0.03960734,  0.05705769],
        ...,
        [-0.02021505, -0.06563606, -0.01483669, ...,  0.07212427,
          0.03308339,  0.06931505],
        [ 0.02754164, -0.03535805, -0.00468473, ..., -0.03703921,
         -0.02763023, -0.00151417],
        [ 0.03389607, -0.04394251,  0.01910388, ..., -0.00705455,
          0.02410809, -0.00097448]], dtype=float32)]

In [22]:
from keras.models import Sequential
embed = Sequential()
embed.add(Embedding(2000,32,input_length=1))
embed.layers[0].set_weights(w)
embed.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 32)             64000     
Total params: 64,000
Trainable params: 64,000
Non-trainable params: 0
_________________________________________________________________


In [23]:
# from keras.models import Model
# from keras.layers import Input
# i = Input(shape=(1,))
# o = model.layers[0](i)
# embed = Model(input = i, output=o)
# embed.summary()

In [24]:
import random
c = random.randint(1,2000)
print("詞：", reverse_index[c])
print("化成的詞向量：")
print(embed.predict([c]))

詞： disappointed
化成的詞向量：
[[[-0.06019773  0.00288653  0.01935991  0.01943269 -0.03163244
    0.07534344 -0.01938086 -0.0114468   0.0011487   0.0474615
    0.02545121  0.04425084 -0.02650946 -0.01370623 -0.01732771
    0.06566623  0.00499412  0.03274011  0.05466926 -0.00183589
    0.00015002 -0.052589    0.06810324 -0.03124956  0.00577444
   -0.05435186 -0.02393979 -0.00602113 -0.03025211 -0.00572064
   -0.0593499  -0.01897124]]]


In [26]:
# 如果採用序列式(時間相關)模型
# RNN: Recurrent Nueron Network
# 情緒分析的時候，使用RNN的影響不大
from keras.layers import SimpleRNN

model = Sequential()
model.add(Embedding(2000, 32, input_length=200))
model.add(SimpleRNN(16))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()

# 2000詞 * 32(降為64維度) = 64000

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 32)           64000     
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.compile(loss="binary_crossentropy",
             optimizer="adam",
             metrics=["accuracy"])

In [28]:
model.fit(x_train_num_pad, y_train,
         batch_size=200,
         epochs=1,
         validation_split=0.1,
         verbose=2)

Train on 22500 samples, validate on 2500 samples
Epoch 1/1
 - 9s - loss: 0.6421 - acc: 0.6038 - val_loss: 0.3450 - val_acc: 0.8772


<keras.callbacks.History at 0x1342f6be0>

In [29]:
model.evaluate(x_train_num_pad, y_train)



[0.41809886823654174, 0.81104]