#### 필요한 라이브러리 import

In [None]:
import numpy as np
import pandas as pd
import gensim
import os
import re
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## *Get the data*

In [None]:
# Load the dataset from cleaned_hm.csv and preview its first rows.
csv_path = "cleaned_hm.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


## *Basic data cleaning*

In [None]:
# Count how many rows carry each predicted_category value.
data["predicted_category"].value_counts()

affection           34168
achievement         33993
enjoy_the_moment    11144
bonding             10727
leisure              7458
nature               1843
exercise             1202
Name: predicted_category, dtype: int64

In [None]:
# Count how often each num_sentence value occurs.
data["num_sentence"].value_counts()

In [None]:
# Keep only the rows with at most 10 sentences and store them as mod_data.
# .copy() makes mod_data an independent DataFrame instead of a slice of data,
# so assigning to one of its columns later modifies mod_data itself and does
# not trigger pandas' SettingWithCopyWarning.
mod_data = data.loc[data['num_sentence'] <= 10].copy()
mod_data["predicted_category"].value_counts()

affection           34020
achievement         33966
enjoy_the_moment    11115
bonding             10700
leisure              7458
nature               1839
exercise             1202
Name: predicted_category, dtype: int64

In [None]:
# Mapping used to turn each category name into an integer label.
# The position of a name in the tuple below is its integer code.
_category_names = (
    "affection",
    "achievement",
    "bonding",
    "enjoy_the_moment",
    "leisure",
    "nature",
    "exercise",
)
encode = {name: code for code, name in enumerate(_category_names)}

In [None]:
# Replace every category name with the integer code assigned to it in `encode`.
category_codes = mod_data["predicted_category"].apply(lambda label: encode[label])
mod_data["predicted_category"] = category_codes
mod_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,0
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,0
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,6
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,2
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,0


### 전처리

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# One raw text string per row of mod_data.
lines = mod_data["cleaned_hm"].values.tolist()

# Translation table that deletes every ASCII punctuation character. It does
# not depend on the line being processed, so it is built once here instead of
# once per loop iteration.
table = str.maketrans('', '', string.punctuation)

# Split every line into word tokens; each entry appended to happy_lines is the
# cleaned word list of one line.
happy_lines = list()
for line in lines:
    # tokens: the words of this line, all lowercased
    tokens = [w.lower() for w in word_tokenize(line)]
    # strip commas, exclamation marks, etc. from every token
    stripped = [w.translate(table) for w in tokens]
    # keep only purely alphabetic tokens (this also drops tokens that became empty)
    words = [word for word in stripped if word.isalpha()]
    happy_lines.append(words)

len(happy_lines)

100300

### Train, test 데이터 나누기

In [None]:
validation_split = 0.2
max_length = 55  # passed to pad_sequences as maxlen: every sequence is brought to this length

tokenizer_obj = Tokenizer()  # create the tokenizer object
tokenizer_obj.fit_on_texts(happy_lines)  # build the word index from the token lists in happy_lines
sequences = tokenizer_obj.texts_to_sequences(happy_lines)  # turn each sentence into a list of integer word indices using that word index

word_index = tokenizer_obj.word_index  # dict with the word as key and its integer index as value
print("단어 사전:",word_index)
print("unique tokens - "+str(len(word_index)))
vocab_size = len(tokenizer_obj.word_index) + 1  # used as the first argument (input_dim) of the Embedding layer
print('vocab_size - '+str(vocab_size))

# Turn the lists of integers into one 2D array (one row per sentence, max_length columns).
lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
print("문장 2D 텐서 예시:", lines_pad[0])
category = mod_data['predicted_category'].values  # labels as an array, one entry per sentence

indices = np.arange(lines_pad.shape[0])  # 0 .. n-1, n being the number of sentences
np.random.shuffle(indices)  # random order
lines_pad = lines_pad[indices]  # shuffle the sentences ...
category = category[indices]  # ... and the labels with the same permutation so they stay aligned

# One-hot encode the categories: the result has as many columns as there are
# categories (largest label value + 1). For every sentence only the column of
# its own category is 1, all others are 0.
n_values = np.max(category) + 1  # largest category value + 1
Y = np.eye(n_values)[category]
print(Y, Y.shape[0], Y.shape[1])

num_validation_samples = int(validation_split * lines_pad.shape[0])
# Split at an explicit index instead of slicing with -num_validation_samples:
# when num_validation_samples is 0, "[:-0]" yields an empty training set and
# "[-0:]" puts every sample into the test set. max(..., 0) keeps the previous
# behaviour (empty training set) if the count exceeds the number of samples.
split_at = max(lines_pad.shape[0] - num_validation_samples, 0)

X_train_pad = lines_pad[:split_at]
y_train = Y[:split_at]
X_test_pad = lines_pad[split_at:]
y_test = Y[split_at:]
# lines_pad.shape[0] == Y.shape[0] == number of sentences

unique tokens - 26183
vocab_size - 26184
문장 2D 텐서 예시: [   1   23   16    3  716  300   13  282    1   91 9793    5 2148   13
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]
[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]] 100300 7


In [None]:
# Report the array shapes of the training and test splits.
for message, array in (
    ('Shape of X_train_pad:', X_train_pad),
    ('Shape of y_train:', y_train),
    ('Shape of X_test_pad:', X_test_pad),
    ('Shape of y_test:', y_test),
):
    print(message, array.shape)

Shape of X_train_pad: (80240, 55)
Shape of y_train: (80240, 7)
Shape of X_test_pad: (20060, 55)
Shape of y_test: (20060, 7)


## *GloVe Model*
### https://nlp.stanford.edu/projects/glove/

In [None]:
# Prepare a word -> vector dictionary that is used to map words to their embedding vectors.
# This loads already-built, pre-trained GloVe embeddings; the embedding dimension is stated in the file name.
embedding_index = {}
embedding_dim = 100 # set to match the dimension of the downloaded vectors
# glove_dir = "D:\Pre-trained Word Vectors\glove.twitter.27B"
# NOTE: f is left open here; it is read and closed by the parsing loop further down.
f = open('/glove.6B.100d.txt', encoding = "utf-8")
# What does the data look like? (one sample line of the file follows)
# the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062


In [None]:
# Parse every line of the GloVe file into a word and its vector.
# "with f" guarantees that the file handle is closed even if parsing a line
# raises, which a trailing f.close() did not.
with f:
    for line in f:
        values = line.split()
        word = values[0]
        # coeff is the embedding vector of this word
        coeff = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeff

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Build the 2D weight matrix for the embedding layer: one row per word index,
# embedding_dim columns. The "+ 1" makes the row count equal to vocab_size.
n_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_rows, embedding_dim))
# Walk the word -> index dictionary and copy each word's GloVe vector into the
# row given by its index, so row i holds the vector of the word with index i.
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        # no GloVe vector for this word: its row stays all zeros
        continue
    embedding_matrix[i] = embedding_vector
print(embedding_matrix[1])

[-0.046539    0.61966002  0.56647003 -0.46584001 -1.18900001  0.44599
  0.066035    0.31909999  0.14679    -0.22119001  0.79238999  0.29905
  0.16073     0.025324    0.18678001 -0.31000999 -0.28108001  0.60514998
 -1.0654      0.52476001  0.064152    1.03579998 -0.40779001 -0.38011
  0.30801001  0.59964001 -0.26991001 -0.76034999  0.94221997 -0.46919
 -0.18278     0.90652001  0.79671001  0.24824999  0.25713     0.6232
 -0.44768     0.65357     0.76902002 -0.51229    -0.44332999 -0.21867
  0.38370001 -1.14830005 -0.94397998 -0.15062     0.30012    -0.57805997
  0.20175    -1.65910006 -0.079195    0.026423    0.22051001  0.99713999
 -0.57538998 -2.72659993  0.31448001  0.70521998  1.43809998  0.99125999
  0.13976     1.34739995 -1.1753      0.0039503   1.02980006  0.064637
  0.90886998  0.82871997 -0.47003001 -0.10575     0.5916     -0.42210001
  0.57331002 -0.54114002  0.10768     0.39783999 -0.048744    0.064596
 -0.61436999 -0.28600001  0.50669998 -0.49757999 -0.81569999  0.16407999
 

In [None]:
# Create the embedding layer.
# Its weight matrix is the GloVe matrix built above.
# input_length is set to match the padded sequence length.
embedding_layer = Embedding(vocab_size, # total number of words, i.e. the size of the vocabulary of the text data
                            embedding_dim, # size of the vector space
                            weights=[embedding_matrix],# load the GloVe matrix by passing it as the layer weights
                            input_length=max_length,# length of the input texts (max_length, set to 55 earlier)
                            trainable=False) # use the pre-trained word embeddings as they are, without training them any further

In [None]:
# Define the classifier: GloVe embedding layer -> LSTM -> 7-way softmax.
model_glove = Sequential()
model_glove.add(embedding_layer)
model_glove.add(LSTM(units=32,  dropout=0.2, recurrent_dropout=0.25))
model_glove.add(Dense(7, activation='softmax'))  # 7 output units, matching the 7 one-hot label columns

model_glove.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model_glove.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 55, 100)           2618400   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 231       
Total params: 2,635,655
Trainable params: 17,255
Non-trainable params: 2,618,400
_________________________________________________________________
None


In [None]:
# Checkpoint callback configured to monitor val_loss and keep only the best model (save_best_only=True).
# NOTE(review): this callback is not passed to model_glove.fit() in the training cell, so as written nothing is saved — confirm whether callbacks=[checkpoint] was intended.
# NOTE(review): the file name template reads the "val_acc" log key; some Keras versions log this metric as "val_accuracy" — verify against the installed version before wiring the callback in.
checkpoint = ModelCheckpoint('model-{epoch:03d}-{val_acc:03f}.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')

In [None]:
# Train for 10 epochs with batch size 32; (X_test_pad, y_test) is passed as the validation data.
history_glove = model_glove.fit(X_train_pad, y_train, batch_size=32, epochs=10, validation_data=(X_test_pad, y_test), verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate on (X_test_pad, y_test) and report the accuracy as a percentage.
test_metrics = model_glove.evaluate(X_test_pad, y_test, verbose=0)
loss, accuracy = test_metrics
accuracy_percent = accuracy * 100
print('Accuracy: %f' % accuracy_percent)

Accuracy: 87.068796


### glove 모델을 import 해서 사용하기

In [None]:
pip install glove_python

Collecting glove_python
[?25l  Downloading https://files.pythonhosted.org/packages/3e/79/7e7e548dd9dcb741935d031117f4bed133276c2a047aadad42f1552d1771/glove_python-0.1.0.tar.gz (263kB)
[K     |████████████████████████████████| 266kB 4.3MB/s 
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python: filename=glove_python-0.1.0-cp36-cp36m-linux_x86_64.whl size=700246 sha256=8f43a06e3130f7684d68b08be50090ae552f9624c7237625635b349740af885d
  Stored in directory: /root/.cache/pip/wheels/88/4b/6d/10c0d2ad32c9d9d68beec9694a6f0b6e83ab1662a90a089a4b
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0


In [None]:
class Glove(object):
    """
    Class for estimating GloVe word embeddings using the
    corpus co-occurrence matrix.

    This is only an outline of the interface: both methods consist of a
    docstring and nothing else, so calling them has no effect. The cell
    that follows imports a class of the same name from the ``glove``
    package, which replaces this one.
    """

    def __init__(self, no_components=30, learning_rate=0.05,
                 alpha=0.75, max_count=100, max_loss=10.0,
                 random_state=None):
        """
        Parameters:
        - int no_components: number of latent dimensions
        - float learning_rate: learning rate for SGD estimation.
        - float alpha, float max_count: parameters for the weighting function (see the paper).
        - float max_loss: the maximum absolute value of calculated
                          gradient for any single co-occurrence pair.
                          Only try setting to a lower value if you
                          are experiencing problems with numerical
                          stability.
        - random_state: random state used to initialize optimization
        """

    def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
        """
        Estimate the word embeddings.
        Parameters:
        - scipy.sparse.coo_matrix matrix: co-occurrence matrix
        - int epochs: number of training epochs
        - int no_threads: number of training threads
        - bool verbose: print progress messages if True
        """

In [None]:
from glove import Corpus, Glove
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
from nltk.corpus import gutenberg

# For every sentence of Hamlet keep only the tokens that start with an ASCII
# letter, lowercased.
starts_with_letter = re.compile('^[a-zA-Z]+').match
sentences = [
    [word.lower() for word in sentence if starts_with_letter(word)]
    for sentence in gutenberg.sents('shakespeare-hamlet.txt')
]
print(sentences[3])

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['enter', 'barnardo', 'and', 'francisco', 'two', 'centinels']


In [None]:
# Use Corpus to build, from the training sentences, the co-occurrence matrix that GloVe trains on.
corpus = Corpus() 
corpus.fit(sentences, window=5)
# window is the length of the context taken into account when building the matrix

print(type(corpus.matrix))
print(len(corpus.dictionary), corpus.matrix.shape[0])

glove = Glove(no_components=100, learning_rate=0.05) # learning rate 0.05, output vector dimension 100 (the word-embedding dimension)
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
# the co-occurrence matrix is used as the input of fit

<class 'scipy.sparse.coo.coo_matrix'>
4699 4699
Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [None]:
glove.save('glove_model')
# Glove.load is a classmethod that returns a new instance. Calling it on the
# existing object and discarding the result reloaded nothing, so the returned
# model is now assigned back to glove.
glove = Glove.load('glove_model')

glove.add_dictionary(corpus.dictionary)  # attach the dictionary (word -> int)
print(corpus.dictionary)

# Return the list of words most similar to the given word.
# NOTE(review): the recorded output lists 4 neighbours for number=5; the count
# appears to include the query word itself — confirm against glove_python.
glove.most_similar('king',number = 5)



[('the', 0.9986006096030661),
 ('queene', 0.9968731918744674),
 ('matter', 0.9953247177253361),
 ('world', 0.9947444694920317)]