# gensim을 이용한 FastText

In [1]:
# Mount Google Drive so later cells can read and write files under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Download the tab-separated NSMC ratings file (ratings_train.txt) straight from GitHub
# and report its shape as (rows, columns).

import pandas as pd

df = pd.read_csv('https://github.com/e9t/nsmc/raw/master/ratings_train.txt', sep='\t')
print(df.shape)

(150000, 3)


In [3]:
# Preview the first rows of the downloaded reviews (columns: id, document, label).
df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [4]:
# Data Cleaning

import re

def GetKoreanIntoList(text):
    """Return every maximal run of Korean characters found in ``text``.

    A run is one or more consecutive characters from the ranges ㄱ-ㅎ or 가-힣.
    Any other character (Latin letters, digits, punctuation, whitespace, ...)
    ends the current run and is dropped.

    Args:
        text: the string to scan.

    Returns:
        A list of the matched runs, in order of appearance; empty if none match.
    """
    korean_run = re.compile(r'[ㄱ-ㅎ가-힣]+')
    return korean_run.findall(text)

# Tokenize every non-null review into its list of Korean character runs.
data_input = df['document'].dropna().map(GetKoreanIntoList)

# Sanity check: number of tokenized reviews, then the token lists at index labels 0 and 1.
print(data_input.shape)
for row_label in (0, 1):
    print(data_input[row_label])

(149995,)
['아', '더빙', '진짜', '짜증나네요', '목소리']
['흠', '포스터보고', '초딩영화줄', '오버연기조차', '가볍지', '않구나']


In [5]:
# Save Into File

def ConcatenateKorean(text):
    """Return the Korean character runs of ``text`` joined by single spaces.

    Args:
        text: the string to clean.

    Returns:
        One string containing only the runs found by ``GetKoreanIntoList``,
        separated by a space; an empty string if there are none.
    """
    korean_tokens = GetKoreanIntoList(text)
    return ' '.join(korean_tokens)

# Build one cleaned, space-separated string per non-null review.
data_input_concat = df['document'].dropna().map(ConcatenateKorean)

# Sanity check: number of cleaned reviews, then the strings at index labels 0 and 1.
print(data_input_concat.shape)
for row_label in (0, 1):
    print(data_input_concat[row_label])

(149995,)
아 더빙 진짜 짜증나네요 목소리
흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나


In [7]:
FolderPath = '/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data'
FName = 'nsmc.txt'

# Save the cleaned corpus as UTF-8 text, one review per line.
corpus_path = '/'.join((FolderPath, FName))
with open(corpus_path, mode='w', encoding='utf8') as f:
    f.write('\n'.join(data_input_concat))

In [8]:
# FastText Model 학습

from gensim.models.fasttext import FastText
from gensim.models.word2vec import Word2Vec

FastText 모형을 만든다. 설정값에는 다음과 같은 것들이 있다.

* size: 임베딩의 크기 (기본값 100). gensim 4.0 이상에서는 이름이 vector_size로 바뀌었다.
* sg: 0이면 CBOW(기본값), 1이면 Skip-gram
* alpha: 학습률 (기본값 0.025)
* min_alpha: 최소 학습률. FastText는 학습과정에서 학습률을 이 수준까지 점점 낮춘다. (기본값 0.0001)
* window: 문장 내에서 주변 단어와 대상 단어의 최대 거리(기본값 5)
* min_count: 임베딩을 학습할 단어의 최소 출현 빈도 (기본값 5)
* 참고: 위 설정값은 아니지만, Word2Vec도 사용방법은 같다.

In [9]:
model = FastText(size=16) # 16-dimensional embeddings. gensim >= 4.0: pass vector_size=16 instead of size=16.

# Scan the corpus to build the vocabulary. If the corpus was saved to a file, pass corpus_file='nsmc.txt' instead of sentences=data_input.

model.build_vocab(sentences=data_input) # gensim >= 4.0: pass corpus_iterable=data_input instead of sentences=data_input.

model.train(
    sentences=data_input, # gensim >= 4.0: pass corpus_iterable=data_input instead of sentences=data_input.
    epochs=5,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words
)

In [10]:
# Save the trained model to Google Drive, then load it back from the same file.

FolderPath = '/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data'
FName = 'nsmc.fasttext'

model_path = '/'.join((FolderPath, FName))
model.save(model_path)
model = FastText.load(model_path)

# FastText Embedding

In [11]:
# Load the saved FastText model from Google Drive (same file the previous cell wrote).

FolderPath = '/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data'
FName = 'nsmc.fasttext'

model = FastText.load('/'.join((FolderPath, FName)))

In [12]:
# Word embedding lookup

# Is 히어로 in the model's vocabulary?
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim >= 4.0 exposes `wv.key_to_index` instead — confirm against the installed version.
'히어로' in model.wv.vocab

True

In [13]:
# Embedding vector of the in-vocabulary word 히어로.
model.wv['히어로']

array([ 0.20867114, -0.5223027 , -0.1795259 ,  0.37305787,  0.07951393,
       -0.1785026 , -0.7062732 ,  0.56880057,  0.9015204 ,  0.82078373,
        0.448159  , -0.1970236 , -0.1619406 , -0.185235  , -0.5008343 ,
       -0.12032036], dtype=float32)

In [14]:
# Is 슈퍼히어로 in the model's vocabulary?
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim >= 4.0 exposes `wv.key_to_index` instead — confirm against the installed version.
'슈퍼히어로' in model.wv.vocab

False

In [15]:
# Even so, FastText returns an embedding for this out-of-vocabulary word by combining the embeddings of its subword tokens.
model.wv['슈퍼히어로']

array([ 0.32333004, -0.527832  , -0.29030827,  0.56260544,  0.23507488,
       -0.10828726, -0.8223953 ,  0.813443  ,  0.96966064,  0.8640916 ,
        0.3718171 , -0.26532453, -0.25231934, -0.22725323, -0.55477047,
       -0.19241847], dtype=float32)

In [16]:
# Another lookup; presumably 김진석천재 is not in the vocabulary and is built from subwords as well — not checked in this notebook.
model.wv['김진석천재']

array([ 0.05084642, -0.25860977, -0.02021028,  0.15942974,  0.14562488,
        0.03249365, -0.49203897,  0.4486512 ,  0.55074024,  0.49481142,
        0.31952596, -0.1617882 , -0.08090141, -0.22547024, -0.30492228,
        0.1061475 ], dtype=float32)

In [17]:
# Similarity

# 슈퍼히어로 and 히어로 have a high similarity.
model.wv.similarity('슈퍼히어로', '히어로')

0.98523986

In [18]:
# 히어로 and 평론가 have a lower similarity than the pair above.
model.wv.similarity('히어로', '평론가')

0.6648355

In [25]:
# Words most similar to 평론가
model.wv.most_similar('평론가')

[('높은거야', 0.9932022094726562),
 ('평론', 0.9897940754890442),
 ('기자', 0.9889047145843506),
 ('점대야', 0.9868314266204834),
 ('평론가들', 0.9867388010025024),
 ('점대지', 0.986352801322937),
 ('높은거지', 0.9862076044082642),
 ('점대라', 0.9859142303466797),
 ('점이라니', 0.9851040244102478),
 ('점대면', 0.9847214221954346)]

In [26]:
# Data cleaning: keep only the rows that actually contain a review text.

df = df.dropna(subset=['document'])

# Split into a training set (80%) and a held-out set (20%); the fixed seed keeps the split reproducible.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    df['document'], df['label'], test_size=0.2, random_state=1234
)
train_doc, test_doc, train_target, test_target = split_parts

# Shapes of (documents, labels) for each part.
for docs, targets in ((train_doc, train_target), (test_doc, test_target)):
    print(docs.shape, targets.shape)

(119996,) (119996,)
(29999,) (29999,)


In [27]:
import re
import numpy as np



def GetKoreanIntoList(text):
    """Return every maximal run of Korean characters found in ``text``.

    A run is one or more consecutive characters from the ranges ㄱ-ㅎ or 가-힣.
    Any other character ends the current run and is dropped.

    Args:
        text: the string to scan.

    Returns:
        A list of the matched runs, in order of appearance; empty if none match.
    """
    korean_run = re.compile(r'[ㄱ-ㅎ가-힣]+')
    return korean_run.findall(text)

def _MeanWordEmbedding(doc, word_vectors, dim):
    """Average the embeddings of the Korean words in one document.

    Args:
        doc: raw review text.
        word_vectors: object supporting ``word_vectors[word]`` that returns a
            vector of length ``dim`` and raises ``KeyError`` when it cannot
            produce one for that word.
        dim: length of the returned vector.

    Returns:
        A 1-D array of length ``dim``: the mean of the word vectors that could
        be looked up, or all zeros when the document yields none.
    """
    vectors = []
    for word in GetKoreanIntoList(doc):
        try:
            vectors.append(word_vectors[word])
        except KeyError:
            # Skip only the word that has no embedding. The previous bare
            # `except:` discarded the whole document and hid every other error.
            continue
    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)


# Read the embedding width from the trained model instead of repeating the literal 16.
embedding_dim = model.wv.vector_size

train_input = np.zeros((
    train_doc.shape[0],  # one row per document
    embedding_dim        # one column per embedding dimension
))

# For each document, find its Korean words, look up their embeddings and store the per-document mean.

for idx, doc in enumerate(train_doc):
    train_input[idx] = _MeanWordEmbedding(doc, model.wv, embedding_dim)

In [28]:
# Each row of train_input is the mean embedding of the words in one document.
train_input[0]

array([ 0.11174098, -0.27238566,  0.75638545,  0.5578711 ,  0.38306081,
        0.28719813, -2.32267499,  1.55786109,  2.33526325,  1.10593116,
        1.57947898, -0.94467002, -1.5340426 , -1.08389604, -1.6771946 ,
        2.27855158])

In [30]:
# Model training: a small feed-forward classifier on the averaged document embeddings.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# NOTE: rebinding `model` here replaces the FastText model used in the cells above.
model = Sequential([
    Dense(16, activation='sigmoid'),  # hidden layer
    Dense(1, activation='sigmoid'),   # single sigmoid output for the binary label
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Train for 20 epochs, setting aside 20% of the training rows for validation.
model.fit(train_input, train_target.values, epochs=20, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd2e41d7450>

In [31]:
# Load a second dataset (imdb.zip) from GitHub. This rebinds `df`, so the NSMC frame
# is no longer reachable under that name.
import pandas as pd

df = pd.read_csv('https://github.com/euphoris/datasets/raw/master/imdb.zip')
print(df.shape)

(1000, 2)


In [32]:
import joblib

FolderPath = '/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data'
FName = 'tokenizer.pkl'

# Load a previously saved tokenizer; this notebook does not show how it was fitted.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load files from a source you trust.
tokenizer = joblib.load(FolderPath + '/' + FName)

In [33]:
from sklearn.model_selection import train_test_split

# 80/20 split of the reviews and their sentiment labels, seeded for reproducibility.
split_parts = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=1234
)
train_doc, test_doc, train_target, test_target = split_parts

# Shapes of (documents, labels) for each part.
for docs, targets in ((train_doc, train_target), (test_doc, test_target)):
    print(docs.shape, targets.shape)

(800,) (800,)
(200,) (200,)


In [37]:
# Convert each review into a list of integer token ids with the loaded tokenizer
# (presumably a Keras Tokenizer — verify against the notebook that saved it).
train_seqlist = tokenizer.texts_to_sequences(train_doc)
test_seqlist = tokenizer.texts_to_sequences(test_doc)
# Show the first encoded review of each part.
print(train_seqlist[0])
print(test_seqlist[0])

[154, 356]
[2, 14, 311, 525, 10, 2, 1, 114, 11, 2, 297, 5, 329, 587, 53, 30, 104, 163, 1, 38, 30, 141, 10, 30, 321]


In [39]:
# Forward RNN

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Pad the sequences. Shorter sequences are filled with zeros at the front (padding='pre');
# maxlen sets the target length, and if it is omitted the longest sequence's length is used.
# truncating='pre' cuts sequences longer than maxlen at the front; use 'post' to cut at the end.

pre_padding = dict(maxlen=50, padding='pre', truncating='pre')
train_seqlist_padding = pad_sequences(train_seqlist, **pre_padding)
test_seqlist_padding = pad_sequences(test_seqlist, **pre_padding)

# Vocabulary size handed to the Embedding layers as input_dim.
word_cnt = tokenizer.num_words + 1

# Show the first padded sequence of each part.
for padded in (train_seqlist_padding, test_seqlist_padding):
    print(padded[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0 154 356]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   2  14 311 525  10   2   1 114  11   2 297
   5 329 587  53  30 104 163   1  38  30 141  10  30 321]


In [40]:
# With mask_zero=True in the Embedding layer, the zero-padded positions are masked,
# so predictions at those positions are not reflected in the loss.

model = Sequential([
    Embedding(input_dim=word_cnt, output_dim=8, mask_zero=True),
    LSTM(8),
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 8)           16008     
_________________________________________________________________
lstm (LSTM)                  (None, 8)                 544       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 16,561
Trainable params: 16,561
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Train the forward LSTM classifier for 20 epochs with Adam on binary cross-entropy, tracking accuracy.
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['acc'],
}
model.compile(**compile_options)

model.fit(train_seqlist_padding, train_target, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd2e468ba50>

In [43]:
# Backward RNN

# Pad at the end this time; the LSTM below is created with go_backwards=True.
post_padding = dict(maxlen=50, padding='post')
train_seqlist_padding = pad_sequences(train_seqlist, **post_padding)
test_seqlist_padding = pad_sequences(test_seqlist, **post_padding)

model = Sequential([
    Embedding(input_dim=word_cnt, output_dim=8, mask_zero=True),
    LSTM(8, go_backwards=True),
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 8)           16008     
_________________________________________________________________
lstm_2 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 9         
Total params: 16,561
Trainable params: 16,561
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Bidirectional RNN: the LSTM is wrapped in Bidirectional, which doubles the recurrent
# output width (8 -> 16 in the summary below).

from tensorflow.keras.layers import Bidirectional

model = Sequential([
    Embedding(input_dim=word_cnt, output_dim=8, mask_zero=True),
    Bidirectional(LSTM(8)),
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 8)           16008     
_________________________________________________________________
bidirectional (Bidirectional (None, 16)                1088      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 17        
Total params: 17,113
Trainable params: 17,113
Non-trainable params: 0
_________________________________________________________________
