# 課前預備/ 預習
## 1. Word2Vec - gensim
> https://www.kaggle.com/jerrykuo7727/word2vec

In [1]:
from gensim.models.word2vec import Word2Vec

#### 語料庫（corpus）、向量（vector）、模型（model）

In [None]:
from gensim import corpora, models, similarities

### 資料前處理
在進入 gensim 建模以前，主要進行兩件事情：斷詞、同義字處理。

以下先**使用已經斷詞好的csv**，以下是中文的處理方式

英文可以參考:
> https://ithelp.ithome.com.tw/articles/10191922

In [2]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the pre-segmented titles and drop every row with a missing value so the
# text column and the label column stay aligned row-for-row.
DF = pd.read_csv('title1-8_final.csv')
DF = DF.dropna(axis=0)

# Noise removed from every title: runs of digits plus ASCII / full-width punctuation.
# The previous alternation contained `\|》\？`, which only matched the literal
# sequence "|》？", so '》' and '？' were never stripped. A character class avoids
# that kind of transposition.
NOISE_PATTERN = re.compile(r"\d+|[?.:《》？「」！：、【】]")


def clean_title(text):
    """Normalise one segmented title string.

    Steps, in order: drop every ideographic space (U+3000) that is immediately
    followed by '1', turn the remaining ideographic spaces into ASCII spaces,
    then delete digits and punctuation.
    """
    text = text.replace("\u30001", "")
    text = text.replace("\u3000", " ")
    return NOISE_PATTERN.sub("", text)


# Cleaned title strings (same index as DF).
segement = DF['Title_CKIP'].astype(str).map(clean_title)

# One list of tokens per title; empty strings left by repeated spaces are dropped.
segement_arr = [
    [token for token in title.strip().split(' ') if token]
    for title in segement
]

# Fresh 0..n-1 index: one list of tokens per row.
segement_df = pd.Series(segement_arr)

In [43]:
# Every record must be a list (one list of tokens per title).
segement_df.head()

0    [捷運, 間隔, 分鐘, 放, 人, 進站, 人潮, 塞爆, 國父, 紀念館, 站]
1       [高捷, 延至, 兩, 點收, 班, 夢時代, 跨年, 人潮, 小時, 散去]
2           [日, 北, 捷運, 量, 萬, 人, 比, 前, 年少, 萬, 人]
3     [北捷, 跨年, 輸運, 減少, 萬, 人次, 是, 因, 假日, 沒, 上班族]
4              [元旦, 連假, 收尾, 高鐵, 烏日站, 午後, 湧, 人潮]
dtype: object

### 以下是其他常用的參數：
> * sg=0：sg=0時以CBOW來訓練，sg=1時以Skip-gram來訓練
>     
>     而在特性上，Skip-gram比CBOW通常對低頻詞有更好的訓練效果 
>
>     基於以上的猜想，我們可以嘗試用Skip-gram來訓練詞向量，看看能否得到更高的準確度
>
> * window=5：CBOW下決定Word2Vec一次取多少詞來預測中間詞（Skip-gram的狀況是反過來的）
>
>     需要多少詞才能預測中間詞呢？要讓多少詞的含意來影響中間詞的含意呢？
>
>     思考完就自己作點實驗吧～（※window只差1就有巨大的影響！請務必微調這個參數）
> * min_count=5：出現次數大於等於min_count的詞，才會納入Word2Vec的詞典中
> * max_vocab_size=None：Word2Vec的詞典容納上限，出現次數最低的詞會優先被剔除
>
>     降低詞典的最大詞數，可能讓模型更容易抓到規則（噪音減少了），但也可能更難抓到規則（認識的詞太少）
> * hs=0：hs=0時採用Negative Sampling，hs=1時採用Hierarchical Softmax
> * negative=5：Negative Sampling的取樣數量，5~20適合小數據，2~5適合大數據
> * workers=3：訓練用的線程數量（可以加快訓練速度）


In [3]:
# Train word vectors on the tokenised titles: 250-dimensional vectors, 10 passes
# over the corpus, 4 worker threads; every other option keeps its library default.
# NOTE(review): `size` / `iter` are the gensim 3.x keyword names (gensim >= 4 calls
# them `vector_size` / `epochs`) — confirm the installed version before upgrading.
model_word2vec = Word2Vec(segement_df, size=250, iter=10, workers=4)

#### 模型儲存/ 開啟

In [None]:
import os

# Load a previously saved model when the file exists in the working directory.
# NOTE(review): the loaded model is bound to `model`, not `model_word2vec`, so the
# cells in this notebook keep using the freshly trained model — confirm this is intended.
if os.path.isfile('word2vec.model'):
    model = Word2Vec.load('word2vec.model')
    
# Uncomment to persist the trained model to disk:
# model_word2vec.save('word2vec.model')

### 作圖(word2vec投影到2D)
若中文顯示有錯誤，可以參考:
> https://medium.com/marketingdatascience/%E8%A7%A3%E6%B1%BApython-3-matplotlib%E8%88%87seaborn%E8%A6%96%E8%A6%BA%E5%8C%96%E5%A5%97%E4%BB%B6%E4%B8%AD%E6%96%87%E9%A1%AF%E7%A4%BA%E5%95%8F%E9%A1%8C-f7b3773a889b

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Every word kept in the Word2Vec vocabulary, as a plain list of strings.
vocab = list(model_word2vec.wv.vocab)
# One embedding row per vocabulary word. Vectors are looked up through `.wv`
# (the same access path used when building the embedding matrix) instead of
# indexing the model object directly, which gensim deprecates.
X = model_word2vec.wv[vocab]
# Fixed seed so the 2-D projection is reproducible between runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)
# x / y coordinates of every word, indexed by the word itself.
df_tsne = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df_tsne['x'], df_tsne['y'])
# Label every point with its word.
for word, x_coord, y_coord in df_tsne.itertuples():
    ax.annotate(word, (x_coord, y_coord))
ax.set_title('Word2Vec vocabulary projected to 2-D with t-SNE')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')

plt.show()

### 比較詞的相似程度（可以一次帶入多個詞）

In [44]:
def most_similar(w2v_model, words, topn=10):
    """Tabulate the nearest neighbours of several words side by side.

    Parameters
    ----------
    w2v_model : object exposing ``wv.most_similar(word, topn=...)``
        The trained Word2Vec model.
    words : iterable of str
        Query words. A word the model does not know is reported on stdout
        and skipped.
    topn : int, default 10
        Number of neighbours requested per query word.

    Returns
    -------
    pandas.DataFrame
        Two columns per known query word: one named after the word (the
        neighbours) and one named ``'cos'`` (their similarity scores).
        Empty when no query word is known.
    """
    per_word_frames = []
    for word in words:
        try:
            neighbours = w2v_model.wv.most_similar(word, topn=topn)
        except KeyError:
            # Only "word not in vocabulary" is expected here; any other error
            # is a real problem and must not be hidden.
            print(word, "not found in Word2Vec model!")
            continue
        per_word_frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))

    if not per_word_frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(per_word_frames, axis=1)

# Show the ten nearest neighbours (the function's default topn) of one query word.
# NOTE(review): in the recorded output every neighbour scores cos ~ 0.9999, and the
# list mixes related terms with very common words — this suggests the vectors have
# barely separated; consider more training passes or different Word2Vec settings.
most_similar(model_word2vec, ['北捷'])

Unnamed: 0,北捷,cos
0,捷運,0.99996
1,站,0.999956
2,不,0.999955
3,機捷,0.999951
4,台北,0.99995
5,營運,0.99995
6,將,0.999949
7,到,0.999948
8,一,0.999947
9,桃捷,0.999946


#### 處理label

In [40]:
# One-hot encode the 'Category' column to use as the training target.
one_hot = OneHotEncoder()
# The encoder output is converted with .toarray() below to obtain a dense array;
# the double brackets pass a one-column DataFrame rather than a Series.
label_ohe = one_hot.fit_transform(DF[['Category']])
label_ohe_array = label_ohe.toarray()

### 將剛訓練好的word_embedding運用在RNN上

In [5]:
# Row 0 is left as an all-zero vector (index 0 is what text_to_index emits for a
# word that is not in the vocabulary); vocabulary words occupy rows 1..N.
vocab_size = len(model_word2vec.wv.vocab)
embedding_matrix = np.zeros((vocab_size + 1, model_word2vec.vector_size))

# (word, vector) pairs in vocabulary iteration order.
vocab_list = [(token, model_word2vec.wv[token]) for token in model_word2vec.wv.vocab]

# word -> row number in embedding_matrix.
word2idx = {}
for row, (token, vector) in enumerate(vocab_list, start=1):
    embedding_matrix[row] = vector
    word2idx[token] = row

根據 model_word2vec 將詞轉成數字/ index

In [6]:
def text_to_index(corpus, word_index=None):
    """Replace every word of every document with its integer vocabulary index.

    Parameters
    ----------
    corpus : iterable of iterables of str
        Tokenised documents.
    word_index : mapping of str to int, optional
        Word -> index lookup. Defaults to the module-level ``word2idx``.

    Returns
    -------
    numpy.ndarray
        A 2-D integer array when every document has the same length;
        otherwise a 1-D object array holding one list of indices per
        document. Words missing from the mapping become 0.
    """
    if word_index is None:
        word_index = word2idx

    indexed = [[word_index.get(word, 0) for word in doc] for doc in corpus]

    # Equal-length documents (or an empty corpus) convert cleanly to a regular array.
    if len({len(doc) for doc in indexed}) <= 1:
        return np.array(indexed)

    # Documents of different lengths: np.array() on ragged nested lists raises on
    # current NumPy releases, so fill a 1-D object array explicitly.
    ragged = np.empty(len(indexed), dtype=object)
    for position, doc in enumerate(indexed):
        ragged[position] = doc
    return ragged

#### 確保每筆資料長度相同

In [8]:
from keras.preprocessing.sequence import pad_sequences

# Convert every tokenised title to vocabulary indices (0 = word not in word2idx).
X_train = text_to_index(segement_df)
# Bring every sequence to exactly 15 indices.
# NOTE(review): no padding/truncating arguments are given, so the Keras defaults
# apply (believed to pad and truncate at the front of the sequence) — confirm that
# dropping the first words of long titles is acceptable.
X_train = pad_sequences(X_train, maxlen=15)

Using TensorFlow backend.


### Embedding Layer

In [9]:
from keras.layers.embeddings import Embedding

# Embedding layer initialised from the Word2Vec matrix and frozen (trainable=False),
# so the pretrained vectors are not updated while the classifier trains.
# NOTE(review): mask_zero is not set, so index 0 (the all-zero row) is presumably
# fed to the recurrent layer as an ordinary timestep — consider mask_zero=True.
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)

W0628 16:49:00.412173 19364 deprecation_wrapper.py:119] From c:\users\wade\.virtualenvs\19'_summer_vacation-ib8vnh7u\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [10]:
from keras.layers.recurrent import SimpleRNN
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten

# The softmax layer needs exactly one unit per one-hot label column; derive the
# count from the encoded labels instead of hard-coding it, so the model still
# matches the targets if the set of categories changes.
num_classes = label_ohe_array.shape[1]

model_RNN = Sequential()

# Frozen embedding layer built from the Word2Vec vectors.
model_RNN.add(embedding_layer)

# A single recurrent layer summarises each title into a 64-dimensional state.
model_RNN.add(SimpleRNN(64))
model_RNN.add(Dense(units=32, activation='relu'))

# Class probabilities.
model_RNN.add(Dense(units=num_classes, activation='softmax'))

W0628 16:49:08.121798 19364 deprecation_wrapper.py:119] From c:\users\wade\.virtualenvs\19'_summer_vacation-ib8vnh7u\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0628 16:49:08.140827 19364 deprecation_wrapper.py:119] From c:\users\wade\.virtualenvs\19'_summer_vacation-ib8vnh7u\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0628 16:49:08.147852 19364 deprecation_wrapper.py:119] From c:\users\wade\.virtualenvs\19'_summer_vacation-ib8vnh7u\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0628 16:49:08.148853 19364 deprecation_wrapper.py:119] From c:\users\wade\.virtualenvs\19'_summer_vacation-ib8vnh7u\lib\site-packages\keras\backend\tensorflow_backend.py:181: The name tf.ConfigProto is d

In [42]:
# Multi-class setup: categorical cross-entropy against the one-hot labels, Adam
# optimiser, accuracy reported every epoch.
model_RNN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for 10 epochs in batches of 30, holding out 20% of the rows for validation.
# NOTE(review): validation_split is believed to take the held-out rows from the end
# of the arrays without shuffling; if the CSV is ordered by category the validation
# set would be skewed — confirm, or shuffle the data first.
train_history_RNN = model_RNN.fit(X_train, label_ohe_array, batch_size=30, epochs=10,verbose=2, validation_split=0.2)

Train on 1160 samples, validate on 291 samples
Epoch 1/10
 - 1s - loss: 1.1782 - acc: 0.4957 - val_loss: 1.2741 - val_acc: 0.4261
Epoch 2/10
 - 0s - loss: 1.0838 - acc: 0.5500 - val_loss: 1.2882 - val_acc: 0.4089
Epoch 3/10
 - 0s - loss: 1.0857 - acc: 0.5233 - val_loss: 1.3517 - val_acc: 0.4399
Epoch 4/10
 - 0s - loss: 1.0727 - acc: 0.5534 - val_loss: 1.3076 - val_acc: 0.4399
Epoch 5/10
 - 0s - loss: 1.0543 - acc: 0.5681 - val_loss: 1.3261 - val_acc: 0.4502
Epoch 6/10
 - 0s - loss: 1.0495 - acc: 0.5707 - val_loss: 1.3122 - val_acc: 0.4364
Epoch 7/10
 - 0s - loss: 1.0390 - acc: 0.5707 - val_loss: 1.3618 - val_acc: 0.4433
Epoch 8/10
 - 0s - loss: 1.0406 - acc: 0.5629 - val_loss: 1.2786 - val_acc: 0.4502
Epoch 9/10
 - 0s - loss: 1.0260 - acc: 0.5724 - val_loss: 1.2986 - val_acc: 0.4570
Epoch 10/10
 - 0s - loss: 1.0176 - acc: 0.5862 - val_loss: 1.2840 - val_acc: 0.4570
