In [43]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only). Later cells read the IMDB
# CSV and the GloVe vectors from paths under drive/MyDrive.
# NOTE(review): those later paths are relative, so they presumably rely on the
# working directory being /content — confirm.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Tokenizer練習

In [45]:
sample = ['the cat is fat','the dog eat my the homework']
# num_words caps how many distinct words the Tokenizer will encode.
# NOTE(review): per the Keras docs only the num_words-1 most frequent words are
# kept (index 0 is never assigned to a word), so num_words=9 appears to allow at
# most 8 encoded words, not 9 — confirm. Words beyond the cap are silently
# dropped from the encoded output.
tokenizer = Tokenizer(num_words=9)
# Build the word -> index dictionary from the sample sentences.
tokenizer.fit_on_texts(sample)
# Encode each sentence as a list of integer word indices using that dictionary.
s = tokenizer.texts_to_sequences(sample)
print(s)

[[1, 2, 3, 4], [1, 5, 6, 7, 1, 8]]


In [46]:
# word_index is the word -> integer index dictionary built by fit_on_texts.
index = tokenizer.word_index
print(index)
# Encode each sentence as one fixed-width row with one column per word index.
# mode='binary' marks whether a word is present rather than how often it occurs
# (a repeated word still yields a single 1).
one = tokenizer.texts_to_matrix(sample,mode='binary')
print(one)

{'the': 1, 'cat': 2, 'is': 3, 'fat': 4, 'dog': 5, 'eat': 6, 'my': 7, 'homework': 8}
[[0. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 1. 1. 1.]]


# IMDB電影評論好壞預測，用全連接層分類

In [151]:
# Load the IMDB reviews CSV from the mounted Drive folder.
# NOTE(review): the path is relative, so it presumably depends on the working
# directory being the parent of the `drive` mount point — confirm.
data = pd.read_csv(r'drive/MyDrive/AI專案練習/IMDB.csv')
# Display the column names as the cell's output.
data.columns

Index(['review', 'sentiment'], dtype='object')

In [152]:
from sklearn.preprocessing import LabelEncoder

# The review text is the model input; the sentiment strings are turned into
# integer class ids to serve as targets.
texts = data['review']
labelencoder = LabelEncoder()
labels = labelencoder.fit_transform(data['sentiment'])

# Sanity check: inputs and targets have the same length, and the first ten
# encoded labels can be compared against the original sentiment strings.
print(len(texts), len(labels), sep='\n')
print(data['sentiment'].head(10))
print(labels[:10])

50000
50000
0    positive
1    positive
2    positive
3    negative
4    positive
5    positive
6    positive
7    negative
8    negative
9    positive
Name: sentiment, dtype: object
[1 1 1 0 1 1 1 0 0 1]


In [153]:
# Experiment settings.
max_word = 10000      # vocabulary cap handed to the Tokenizer
maxlen = 100          # token ids kept per review after padding/truncation
train_sample = 200    # number of reviews used for training
val_sample = 10000    # number of reviews held out for validation

# Fit the tokenizer on every review, then encode the reviews as integer sequences.
tokenizer = Tokenizer(num_words=max_word)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequence = tokenizer.texts_to_sequences(texts)
print('tokenizer的分詞字典內容有幾個 : ',len(word_index))

tokenizer的分詞字典內容有幾個 :  124252


In [154]:
# Force every review to exactly `maxlen` token ids. The keyword values below are
# the pad_sequences defaults written out explicitly: reviews longer than `maxlen`
# lose their leading tokens (the LAST `maxlen` ids are kept), and shorter reviews
# are zero-padded at the front.
content = pad_sequences(sequence,maxlen=maxlen,padding='pre',truncating='pre')
labels = np.asarray(labels)
print(content.shape, labels.shape, sep='\n')

(50000, 100)
(50000,)


In [155]:
# Shuffle reviews and labels with one shared permutation so each review keeps
# its own label, then carve out the training and held-out subsets.
# A fixed seed makes the split, and therefore every downstream metric,
# reproducible across kernel restarts.
split_seed = 42
assert train_sample + val_sample <= content.shape[0], \
    'train_sample + val_sample exceeds the number of reviews'
indices = np.random.default_rng(split_seed).permutation(content.shape[0])
# NOTE: `content` and `labels` are rebound to their shuffled versions, so
# re-running only this cell permutes the already shuffled arrays again
# (pairs stay aligned); re-run from the padding cell for the canonical split.
content = content[indices]
labels = labels[indices]
print(content.shape)

# The first `train_sample` shuffled rows train the model; the next `val_sample`
# rows are held out (named x_test / y_test and passed as validation data to fit).
x_train = content[:train_sample]
y_train = labels[:train_sample]
x_test = content[train_sample:train_sample+val_sample]
y_test = labels[train_sample:train_sample+val_sample]
print(x_test.shape)
print(y_train.shape)

(50000, 100)
(10000, 100)
(200,)


In [156]:
# Parse the pre-trained GloVe file into a dict mapping word -> float32 vector.
# Each non-empty line is "<word> <coef_1> ... <coef_n>" separated by whitespace.
embedding_index = {}
embedding_dim = 100
# `with` closes the file even if parsing raises, and the explicit UTF-8 encoding
# keeps the read independent of the platform's default encoding.
with open(r'drive/MyDrive/AI專案練習/glove.6B.100d.txt', encoding='utf-8') as f:
  for line in f:
    value = line.split()
    if not value:
      # Skip blank lines (e.g. a trailing newline) instead of failing on value[0].
      continue
    word = value[0]
    coef = np.asarray(value[1:],dtype='float32')
    embedding_index[word] = coef
print(len(embedding_index))

400000


In [157]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows stay all-zero for indices whose word has no GloVe
# entry.
embedding_matrix = np.zeros((max_word,embedding_dim))
for word,i in word_index.items():
  # Only indices below the vocabulary cap have a row in the matrix.
  if i >= max_word:
    continue
  embedding_vector = embedding_index.get(word)
  if embedding_vector is None:
    continue
  embedding_matrix[i] = embedding_vector

In [158]:
# Binary classifier: Embedding -> Flatten -> Dense(32, relu) -> Dense(1, sigmoid).
model = Sequential([
  # max_word: vocabulary size, i.e. the number of rows in the embedding table.
  # embedding_dim: length of the vector produced for each token (output size).
  # input_length: tokens per review; fixed so the Flatten/Dense layers that
  # follow have a known input size.
  Embedding(max_word,embedding_dim,input_length=maxlen),
  Flatten(),
  Dense(32,activation='relu'),
  Dense(1,activation='sigmoid'),
])
model.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 100, 100)          1000000   
_________________________________________________________________
flatten_19 (Flatten)         (None, 10000)             0         
_________________________________________________________________
dense_37 (Dense)             (None, 32)                320032    
_________________________________________________________________
dense_38 (Dense)             (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [159]:
# Load the pre-built embedding matrix into the first layer (the Embedding) and
# freeze it so training does not update those weights.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [160]:
# Confirm the shape of the training matrix (samples, tokens per review).
print(x_train.shape)

(200, 100)


In [161]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# reported under the name 'acc'.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

# Train on the small training subset and evaluate on the held-out rows after
# every epoch; the returned History object is kept for later inspection.
history = model.fit(
  x=x_train,
  y=y_train,
  epochs=10,
  batch_size=32,
  validation_data=(x_test,y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [162]:
# Compare true labels with model outputs for the first few TRAINING samples.
# Because the model was fitted on these rows, this is only a smoke test of the
# prediction path, not a measure of generalization.
# All samples are predicted in a single batched call: model.predict is built for
# batch processing, so calling it once per sample inside a loop is needlessly slow.
n_preview = 10
preview_pred = model.predict(x_train[:n_preview])
for real, pred in zip(y_train[:n_preview], preview_pred):
  # pred[np.newaxis, :] restores the (1, 1) shape so the printed format matches
  # a single-sample predict call, e.g. [[0.99]].
  print('real value ',real,', predict value ',pred[np.newaxis,:])

real value  1 , predict value  [[0.9959372]]
real value  0 , predict value  [[0.03950259]]
real value  1 , predict value  [[0.99449533]]
real value  1 , predict value  [[0.9666947]]
real value  0 , predict value  [[0.00366661]]
real value  1 , predict value  [[0.9939977]]
real value  0 , predict value  [[0.01439723]]
real value  1 , predict value  [[0.9898896]]
real value  0 , predict value  [[0.01063946]]
real value  0 , predict value  [[0.01433861]]
