# 1. Import

In [1]:
# Required Imports
import re
import numpy as np
import pandas as pd
from pprint import pprint
import multiprocessing
import nltk
import matplotlib.pyplot as plt

In [2]:
!pip install gensim



In [3]:
# Gensim for text preprocessing and LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
!pip install spacy



In [5]:
# spacy for lemmatization ans Stemming
import spacy

In [6]:
!pip install pyLDAvis



In [7]:

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


In [8]:
# Suppress all the warnings
import warnings
warnings.filterwarnings("ignore")

In [9]:
# Latent Dirichlet Allocation (LDA), using all CPU cores to parallelize and speed up model training
cores = multiprocessing.cpu_count()

In [10]:
!pip install konlpy



In [11]:
import konlpy
from konlpy.tag import Kkma
Kkma = Kkma()

In [12]:
import time
import datetime
start = time.time()

# 2. Load Data

## 공통 전처리 과정을 끝낸 파일 로드

In [13]:
# Read the data set from training file => pre_1992_2020.tsv
df = pd.read_csv('pre_1992_2020.tsv', sep ='\t', encoding = 'utf-8')

In [14]:
# Check the loaded data 
df.head(2)

Unnamed: 0.1,Unnamed: 0,년도,가수,제목,성별,장르,가사
0,0,19920000,잼,난 멈추지 않는다,mixed,댄스,"['이제', '모든걸', '다시', '시작해', '내겐', '아직도', '시간이',..."
1,1,19920000,잼,우리 모두 사랑하자,mixed,댄스,"['워', '우리', '모두', '사랑하자', '우리의', '젊은날을', '위하여'..."


In [15]:
# For LDA, we are only concerned with the lyrics
# therefore we will be selecting 'lyrics.text' column as training data
# Converting data to a list for further text preprocessing
data = df['가사'].tolist()

In [16]:
# Print the number of rows/ elements in list
len(data)

5776

# 3. Preprocess Data
### LDA 모델 적용을 위한 추가 전처리


In [17]:
# print the first element of the list
#data[:1]

## 3-1. 정규표현식 및 Konlpy 이용한 명사 추출

In [18]:
from konlpy.tag import Kkma
kkma = Kkma()

In [19]:
import time
import datetime
start = time.time()

### (데이터 전처리 다시 4-2에 해당)

In [87]:
# 내가, 나의, 나도, 무엇,...
data2 = []
for i in range(len(data)):
    data2.append(re.sub('내가',' ',data[i]))
data3 = []
for i in range(len(data)):
    data3.append(re.sub('무엇',' ',data2[i]))
data4 = []
for i in range(len(data)):
    data4.append(re.sub('나의',' ',data3[i]))
data5 = []
for i in range(len(data)):
    data5.append(re.sub('무엇',' ',data4[i]))
data6 = []
for i in range(len(data)):
    data6.append(re.sub('나는',' ',data5[i]))
data7 = []
for i in range(len(data)):
    data7.append(re.sub('나',' ',data6[i]))
data8 = []
for i in range(len(data)):
    data8.append(re.sub('내',' ',data7[i]))
data9 = []
for i in range(len(data)):
    data9.append(re.sub('(\',\', \'SP\')',' ',data8[i]))
data10 = []
for i in range(len(data)):
    data10.append(re.sub('(\',\', \'SS\')',' ',data9[i]))
data11 = []
for i in range(len(data)):
    data11.append(re.sub('너',' ',data10[i]))
data = data11
#data

In [88]:
dataset = []
# 가사에서 명사만 추출
for i in range(len(data)):
    #print(i)
    dataset.append(kkma.nouns(data[i]))
    #print(dataset[i])

# 로직이 실행되는 시간 파악을 위해 time 사용
#sec = time.time() - start
#times = str(datetime.timedate(seconds=sec)).spilit(".")
#times = times[0]
#times

# 동사 형용사 추가하기
## 동사 : VV
## 형용사 : VA

In [89]:
data[0]

"['이제', '모든걸', '다시', '시작해', ' 겐', '아직도', '시간이', '있어', '때론', '상처가', '좌절로', '남아', '돌이킬수', '없는', '후회도', '하고', '그러 ', '우리', '잊어선', '안돼', '지금의', ' ', ' ', '아닌걸', '신문에', '실려온', '얘기들', '헝클어진', '우리들을', '탓할순', '없어', '이제', '모든걸', '다시', '시작해', '이렇게', '여기서', '끝낼순', '없어', ' 겐', '아직도', '시간이', '있어', '지금', '이렇게', '지금', '멈출수는', '없어', '신문에', '실려온', '얘기들', '헝클어진', '우리들을', '탓할순', '없어', '이제', '모든걸', '다시', '시작해', '이렇게', '여기서', '끝낼순', '없어', ' 겐', '아직도', '시간이', '있어', '지금', '이렇게', '지금', ' ', '여기서', '멈출순', '없어', '이제', '모든걸', '다시', '시작해', ' ', '여기서', '멈출순', '없어', '이제', '모든걸', '다시', '시작해']"

In [90]:
len(data)

5776

In [92]:
data[0]

"['이제', '모든걸', '다시', '시작해', ' 겐', '아직도', '시간이', '있어', '때론', '상처가', '좌절로', '남아', '돌이킬수', '없는', '후회도', '하고', '그러 ', '우리', '잊어선', '안돼', '지금의', ' ', ' ', '아닌걸', '신문에', '실려온', '얘기들', '헝클어진', '우리들을', '탓할순', '없어', '이제', '모든걸', '다시', '시작해', '이렇게', '여기서', '끝낼순', '없어', ' 겐', '아직도', '시간이', '있어', '지금', '이렇게', '지금', '멈출수는', '없어', '신문에', '실려온', '얘기들', '헝클어진', '우리들을', '탓할순', '없어', '이제', '모든걸', '다시', '시작해', '이렇게', '여기서', '끝낼순', '없어', ' 겐', '아직도', '시간이', '있어', '지금', '이렇게', '지금', ' ', '여기서', '멈출순', '없어', '이제', '모든걸', '다시', '시작해', ' ', '여기서', '멈출순', '없어', '이제', '모든걸', '다시', '시작해']"

In [93]:
data_pos = []
for i in range(1000):
    print(i)
    data_pos.append(kkma.pos(data[i]))
    #print(data_pos[i])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [94]:
for i in range(len(data_pos)):
    print(i)
    for k in range(len(data_pos[i])):
         if data_pos[i][k][1] == 'VV':
            print(data_pos[i][k])
            dataset.append(data_pos[i][k][0])
         elif data_pos[i][k][1] == 'VA':
            print(data_pos[i][k])
            dataset.append(data_pos[i][k][0])


0
('걸', 'VV')
('시작하', 'VV')
('있', 'VV')
('남', 'VV')
('돌이키', 'VV')
('없', 'VA')
('하', 'VV')
('그', 'VV')
('잊', 'VV')
('안되', 'VV')
('아니', 'VV')
('실려오', 'VV')
('헝클어지', 'VV')
('없', 'VA')
('걸', 'VV')
('시작하', 'VV')
('이렇', 'VA')
('끝내', 'VV')
('없', 'VA')
('있', 'VV')
('이렇', 'VA')
('멈추', 'VV')
('없', 'VA')
('실려오', 'VV')
('헝클어지', 'VV')
('없', 'VA')
('걸', 'VV')
('시작하', 'VV')
('이렇', 'VA')
('끝내', 'VV')
('없', 'VA')
('있', 'VV')
('이렇', 'VA')
('멈추', 'VV')
('없', 'VA')
('걸', 'VV')
('시작하', 'VV')
('멈추', 'VV')
('없', 'VA')
('걸', 'VV')
('시작하', 'VV')
1
('젊', 'VA')
('아무렇', 'VA')
('입', 'VV')
('걷', 'VV')
('스쳐가', 'VV')
('받', 'VV')
('갖', 'VV')
('있', 'VV')
('젊', 'VA')
('좋아하', 'VV')
('좋아하', 'VV')
('오', 'VV')
('좋', 'VA')
('이렇', 'VA')
('있', 'VV')
('떠들', 'VV')
('그러', 'VV')
('없', 'VA')
('젊', 'VA')
('날으', 'VV')
('하', 'VV')
('보', 'VV')
('괜찮', 'VA')
('말하', 'VV')
('힘들', 'VA')
('꾸밈없', 'VA')
('늦', 'VA')
('젊', 'VA')
('젊', 'VA')
2
('드리우', 'VV')
('지치', 'VV')
('밀려오', 'VV')
('재우', 'VV')
('헝클어지', 'VV')
('길', 'VA')
('하', 'VV')
('취하', 'VV'

('만나', 'VV')
('하', 'VV')
('많', 'VA')
('말하', 'VV')
('싶', 'VV')
('가', 'VV')
('싶', 'VV')
('이렇', 'VA')
('가', 'VV')
('오', 'VV')
('작', 'VA')
('돌아오', 'VV')
('같', 'VA')
('만나', 'VV')
('하', 'VV')
('많', 'VA')
('말하', 'VV')
('싶', 'VV')
('가', 'VV')
('싶', 'VV')
('이렇', 'VA')
('가', 'VV')
('오', 'VV')
33
('돌아가', 'VV')
('없', 'VA')
('만나', 'VV')
('남기', 'VV')
('작', 'VA')
('잊', 'VV')
('끄', 'VV')
('만나', 'VV')
('오', 'VV')
('때', 'VV')
('낯설', 'VA')
('보이', 'VV')
('느끼', 'VV')
('지나', 'VV')
('아쉽', 'VA')
('하', 'VV')
('아니', 'VV')
('끄', 'VV')
('하', 'VV')
('오', 'VV')
('되', 'VV')
('끄', 'VV')
('잊', 'VV')
('흐르', 'VV')
('하', 'VV')
('있', 'VV')
('우리', 'VV')
('아니', 'VV')
('끄', 'VV')
('하', 'VV')
('오', 'VV')
('되', 'VV')
('아니', 'VV')
('끄', 'VV')
('하', 'VV')
('오', 'VV')
('되', 'VV')
('끄', 'VV')
34
('아', 'VV')
('그러', 'VV')
('아', 'VV')
('낄리', 'VV')
('뜨', 'VV')
('가', 'VV')
('아', 'VV')
('그러', 'VV')
('아', 'VV')
('낄리', 'VV')
('뜨', 'VV')
('가', 'VV')
('아', 'VV')
('그러', 'VV')
('아', 'VV')
('낄리', 'VV')
('뜨', 'VV')
('가', 'VV')
('아', 'VV')
('그러'

('떨리', 'VV')
('늘', 'VV')
('알', 'VV')
('있', 'VV')
('슬퍼지', 'VV')
('힘들', 'VV')
('돌', 'VV')
('보', 'VV')
('스', 'VV')
('있', 'VV')
('갈', 'VV')
('힘들', 'VV')
('쉬', 'VV')
('때', 'VV')
('보', 'VV')
('오', 'VV')
('있', 'VV')
('그립', 'VA')
('힘들', 'VV')
('돌', 'VV')
('보', 'VV')
('스', 'VV')
('있', 'VV')
('갈', 'VV')
('힘들', 'VV')
('쉬', 'VV')
('때', 'VV')
('봐주', 'VV')
57
('찾', 'VV')
('쓰', 'VV')
('걸', 'VV')
('쉬', 'VV')
('흘르', 'VV')
('가늘', 'VA')
('있', 'VV')
('해지', 'VV')
('막', 'VV')
('길', 'VA')
('버리', 'VV')
('없었', 'VV')
('없었', 'VV')
('크', 'VA')
('더하', 'VV')
('가', 'VV')
('되', 'VV')
('사라지', 'VV')
('뜨', 'VV')
('거칠', 'VA')
('뜨', 'VV')
('태어나', 'VV')
('없', 'VA')
('힘겹', 'VA')
('마르', 'VV')
('날리', 'VV')
('둘러보', 'VV')
('기다리', 'VV')
('있', 'VV')
('되', 'VV')
('날', 'VV')
('우리', 'VV')
('젊', 'VA')
('괜찮', 'VA')
('있', 'VV')
('차갑', 'VA')
('닦', 'VV')
('뜨', 'VV')
('거칠', 'VA')
('뜨', 'VV')
('터지', 'VV')
('같', 'VA')
('미치', 'VV')
('만들', 'VV')
('같', 'VA')
('깨닫', 'VV')
('아', 'VV')
('뜨', 'VV')
('거칠', 'VA')
('뜨', 'VV')
('찾', 'VV')
('쓰', 'VV')


('살아나', 'VV')
('느끼', 'VV')
('있', 'VV')
('잊', 'VV')
('갈', 'VV')
('늘', 'VV')
('울리', 'VV')
('않', 'VV')
('떠오르', 'VV')
('지우', 'VV')
('없', 'VA')
('하', 'VV')
('늘', 'VV')
('하', 'VV')
('어울리', 'VV')
('헤어지', 'VV')
('머물', 'VV')
('하', 'VV')
('되돌아가', 'VV')
('걸어가', 'VV')
('뜨', 'VV')
('찾', 'VV')
('하', 'VV')
('살', 'VV')
('있', 'VV')
('알', 'VV')
('없', 'VA')
('하', 'VV')
('있', 'VV')
('원하', 'VV')
('잊', 'VV')
('갈', 'VV')
('늘', 'VV')
('울리', 'VV')
('않', 'VV')
('떠오르', 'VV')
('지우', 'VV')
('없', 'VA')
('하', 'VV')
('늘', 'VV')
('하', 'VV')
('어울리', 'VV')
('헤어지', 'VV')
('머물', 'VV')
('하', 'VV')
('되돌', 'VV')
('가', 'VV')
('걸어가', 'VV')
('되돌', 'VV')
('가', 'VV')
('걸어가', 'VV')
87
('스', 'VV')
('꾸', 'VV')
('알', 'VV')
('없', 'VA')
('피우', 'VV')
('그려지', 'VV')
('어리', 'VV')
('섞이', 'VV')
('해지', 'VV')
('갈', 'VV')
('깊', 'VA')
('스', 'VV')
('듣', 'VV')
('갈', 'VV')
('보', 'VV')
('웃', 'VV')
('커지', 'VV')
('돌', 'VV')
('보', 'VV')
('스', 'VV')
('꾸', 'VV')
('버리', 'VV')
('걷', 'VV')
('없었', 'VV')
('이렇', 'VA')
('머무', 'VV')
('피우', 'VV')
('그려지', 'VV')
('

('재잘거리', 'VV')
('해맑', 'VA')
('찾', 'VV')
('없', 'VA')
('되', 'VV')
('없', 'VA')
('슬프', 'VA')
('보이', 'VV')
('되', 'VV')
('숨', 'VV')
('피하', 'VV')
('작', 'VA')
('듣', 'VV')
('하', 'VV')
('달라', 'VV')
('거칠', 'VA')
('묻', 'VV')
('오', 'VV')
('어', 'VV')
('디', 'VV')
('가', 'VV')
('보이', 'VV')
('숨', 'VV')
('있', 'VV')
('깨', 'VV')
('기다리', 'VV')
('보', 'VV')
('없', 'VA')
('하', 'VV')
('없었', 'VV')
('흘리', 'VV')
('들킬', 'VV')
('녀', 'VV')
('찾', 'VV')
('없다', 'VA')
('스쳐가', 'VV')
('어', 'VV')
('디', 'VV')
('가', 'VV')
('없다', 'VA')
('가', 'VV')
('되', 'VV')
('말', 'VV')
('하얗', 'VA')
('만들', 'VV')
('눕', 'VV')
('불르', 'VV')
('보', 'VV')
('보', 'VV')
('보', 'VV')
('녹', 'VV')
('버리', 'VV')
('되', 'VV')
('가리', 'VV')
('뜨', 'VV')
('없', 'VA')
('살', 'VV')
('거칠', 'VA')
('찾아오', 'VV')
('마', 'VV')
('데리', 'VV')
('보', 'VV')
('없', 'VA')
('하', 'VV')
('알', 'VV')
('닿', 'VA')
('하얗', 'VA')
('녀', 'VV')
('찾', 'VV')
('없다', 'VA')
('스쳐가', 'VV')
('어', 'VV')
('디', 'VV')
('가', 'VV')
('없다', 'VA')
('가', 'VV')
('되', 'VV')
('말', 'VV')
('보', 'VV')
('없', 'VA')
('하', '

('싸', 'VV')
('들어가', 'VV')
('쉬', 'VV')
('하', 'VV')
('감', 'VV')
('빠지', 'VV')
('잃어버리', 'VV')
('남', 'VV')
('하', 'VV')
('없', 'VA')
('기쁘', 'VA')
('하', 'VV')
('없', 'VA')
('그러', 'VV')
('보이', 'VV')
('눈부시', 'VA')
('눈부시', 'VA')
('가지', 'VV')
('잃', 'VV')
('없', 'VA')
('눕', 'VV')
('보', 'VV')
('아름답', 'VA')
('쉽', 'VA')
('있', 'VV')
('그리하', 'VV')
('갈', 'VV')
('원하', 'VV')
('멍들', 'VV')
('깨지', 'VV')
('부서지', 'VV')
('아파하', 'VV')
('뒹굴', 'VV')
('흘르', 'VV')
('가', 'VV')
('견디', 'VV')
('힘들', 'VV')
('이렇', 'VA')
('움직이', 'VV')
('오', 'VV')
('거두어들이', 'VV')
('찾', 'VV')
('꾸미', 'VV')
('작', 'VA')
('모이', 'VV')
('그', 'VV')
('어', 'VV')
('있', 'VV')
('알', 'VV')
('흘르', 'VV')
('되', 'VV')
('되', 'VV')
('하', 'VV')
('살', 'VV')
('있', 'VV')
('그리하', 'VV')
('하', 'VV')
('찾', 'VV')
('꾸미', 'VV')
('작', 'VA')
('모이', 'VV')
('그', 'VV')
('어', 'VV')
('있', 'VV')
('알', 'VV')
('흘르', 'VV')
145
('깨우', 'VV')
('일으키', 'VV')
('반짝이', 'VV')
('다가오', 'VV')
('하', 'VV')
('하', 'VV')
('망설이', 'VV')
('있', 'VV')
('보', 'VV')
('되', 'VV')
('반짝이', 'VV')
('수많', 'VA')
('버리

('거리', 'VV')
('마', 'VV')
('보', 'VV')
('까', 'VV')
('마', 'VV')
('보', 'VV')
('거리', 'VV')
('있', 'VV')
('떨리', 'VV')
('보', 'VV')
('알', 'VV')
('보', 'VV')
('이렇', 'VA')
('없', 'VA')
('오', 'VV')
('마주치', 'VV')
('쑥스럽', 'VA')
('버리', 'VV')
('스치', 'VV')
('어쩌', 'VV')
('모르', 'VV')
('이렇', 'VA')
('달라지', 'VV')
('있', 'VV')
('모르', 'VV')
('이렇', 'VA')
('마', 'VV')
('되', 'VV')
('만나', 'VV')
('아니', 'VV')
('마', 'VV')
('되', 'VV')
('하', 'VV')
('하', 'VV')
('되', 'VV')
('이렇', 'VA')
('우리', 'VV')
('보', 'VV')
('없', 'VA')
('싫', 'VV')
('괜찮', 'VA')
('까', 'VV')
('보', 'VV')
('참으', 'VV')
('내', 'VV')
('있', 'VV')
('감', 'VV')
('보', 'VV')
('있', 'VV')
('떠나', 'VV')
('몰르', 'VV')
('가', 'VV')
('그렇', 'VA')
('이렇', 'VA')
('달라지', 'VV')
('있', 'VV')
('모르', 'VV')
('이렇', 'VA')
('이렇', 'VA')
('달라지', 'VV')
('괜찮', 'VA')
('하', 'VV')
('마', 'VV')
('바라보', 'VV')
('싶', 'VV')
('되', 'VV')
('같', 'VA')
('삐지', 'VV')
('같', 'VA')
('보', 'VV')
('있', 'VV')
176
('같', 'VA')
('같', 'VA')
('되', 'VV')
('없', 'VA')
('잃', 'VV')
('어떻', 'VA')
('드', 'VV')
('하', 'VV')
('없', 'VV

('쉽', 'VA')
('정해지', 'VV')
('버리', 'VV')
('돌리', 'VV')
('아프', 'VA')
('저리', 'VA')
('감추', 'VV')
('쉽', 'VA')
('정해지', 'VV')
('버리', 'VV')
('돌리', 'VV')
('아니', 'VV')
('크', 'VA')
('남', 'VV')
('말하', 'VV')
('없', 'VA')
('해하', 'VV')
202
('믿', 'VV')
('지우', 'VV')
('믿', 'VV')
('이렇', 'VA')
('말하', 'VV')
('슬프', 'VA')
('듣', 'VV')
('없', 'VA')
('남기', 'VV')
('없', 'VA')
('싫', 'VV')
('싫', 'VV')
('이렇', 'VA')
('저렇', 'VA')
('아무렇', 'VA')
('없', 'VA')
('힘들', 'VA')
('꿈꾸', 'VV')
('오', 'VV')
('있', 'VV')
('믿', 'VV')
('키', 'VV')
('없', 'VA')
('되', 'VV')
('하', 'VV')
('있', 'VV')
('아', 'VV')
('갈', 'VV')
('없', 'VA')
('찾', 'VV')
('이렇', 'VA')
('같', 'VA')
('슬프', 'VA')
('느끼', 'VV')
('없', 'VA')
('하', 'VV')
('지치', 'VV')
('가', 'VV')
('있', 'VV')
('견디', 'VV')
('있', 'VV')
('보내', 'VV')
('그러', 'VV')
('되', 'VV')
('까', 'VV')
('하', 'VV')
('되', 'VV')
('힘들', 'VA')
('꿈꾸', 'VV')
('오', 'VV')
('있', 'VV')
('믿', 'VV')
('키', 'VV')
('없', 'VA')
('되', 'VV')
203
('없', 'VA')
('늘', 'VV')
('모르', 'VV')
('바라보', 'VV')
('있', 'VV')
('헤어지', 'VV')
('하', 'VV')
('꾸',

('잊', 'VV')
('하', 'VV')
('없애', 'VV')
('있', 'VV')
('대', 'VV')
('없', 'VA')
('보', 'VV')
('차가워지', 'VV')
('비웃', 'VV')
('마르', 'VV')
('지우', 'VV')
('가', 'VV')
('믿', 'VV')
('오', 'VV')
('따', 'VV')
('없', 'VA')
('떠오르', 'VV')
('잊', 'VV')
('하', 'VV')
('없애', 'VV')
('있', 'VV')
('대', 'VV')
('떠올리', 'VV')
('말', 'VV')
('보내', 'VV')
('그', 'VV')
('놓치', 'VV')
('없었', 'VV')
('지나', 'VV')
('잊', 'VV')
('하', 'VV')
('겹쳐지', 'VV')
('비웃', 'VV')
('믿', 'VV')
('오', 'VV')
('따', 'VV')
('없', 'VA')
('떠오르', 'VV')
('잊', 'VV')
('하', 'VV')
('없애', 'VV')
('있', 'VV')
('떠오르', 'VV')
('잊', 'VV')
('하', 'VV')
('없애', 'VV')
('있', 'VV')
('떠오르', 'VV')
('잊', 'VV')
('하', 'VV')
('없애', 'VV')
('있', 'VV')
235
('보', 'VV')
('마', 'VV')
('보', 'VV')
('있었', 'VV')
('모르', 'VV')
('늘', 'VV')
('가지', 'VV')
('그', 'VV')
('이렇', 'VA')
('있', 'VV')
('가', 'VV')
('득하', 'VV')
('열', 'VV')
('받', 'VV')
('있', 'VV')
('지키', 'VV')
('보', 'VV')
('주', 'VV')
('줄', 'VV')
('있', 'VV')
('되', 'VV')
('기대', 'VV')
('하', 'VV')
('되', 'VV')
('있', 'VV')
('기다리', 'VV')
('변함없', 'VA')
('그', 'VV

('깨닫', 'VV')
('하', 'VV')
('채', 'VV')
('오', 'VV')
('있', 'VV')
('흘르', 'VV')
('가', 'VV')
('알', 'VV')
('하', 'VV')
('힘들', 'VV')
('채', 'VV')
('오', 'VV')
('힘들', 'VV')
('채', 'VV')
('오', 'VV')
('아니', 'VV')
('힘들', 'VV')
('길', 'VA')
('힘들', 'VV')
('깨닫', 'VV')
('채', 'VV')
('오', 'VV')
261
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('아니', 'VV')
('뜨', 'VV')
('하', 'VV')
('빠지', 'VV')
('빠져들', 'VV')
('하', 'VV')
('그렇', 'VA')
('살아가', 'VV')
('가', 'VV')
('없', 'VA')
('되', 'VV')
('같', 'VA')
('그렇', 'VA')
('알', 'VV')
('없', 'VA')
('살', 'VV')
('없', 'VA')
('입', 'VV')
('놓치', 'VV')
('싫어하', 'VV')
('어딨', 'VA')
('오', 'VV')
('하', 'VV')
('없', 'VA')
('날', 'VV')
('그러', 'VV')
('그', 'VV')
('알', 'VV')
('이러', 'VV')
('알', 'VV')
('되', 'VV')
('돌아가', 'VV')
('알', 'VV')
('보', 'VV')
('좋', 'VA')
('그', 'VV')
('좋', 'VA')
('좋', 'VA')
('이루', 'VV')
('질', 'VV')
('믿', 'VV')
('마', 'VV')
('그', 'V

('쓰라리', 'VV')
('그러', 'VV')
('흘르', 'VV')
('그렇', 'VA')
('받아들', 'VV')
('없', 'VA')
('보', 'VV')
('살아가', 'VV')
('있', 'VV')
('말하', 'VV')
('보', 'VV')
('그', 'VV')
('아니', 'VV')
('걸', 'VV')
('지우', 'VV')
('싶', 'VV')
('잊', 'VV')
('없', 'VA')
('달라지', 'VV')
('되', 'VV')
('이렇', 'VA')
('없', 'VA')
('찾', 'VV')
('끄', 'VV')
('쉽', 'VA')
('그러', 'VV')
('없', 'VA')
('없', 'VA')
('오', 'VV')
('같', 'VA')
('하', 'VV')
('없', 'VA')
('그리하', 'VV')
('머무', 'VV')
('채우', 'VV')
('하', 'VV')
('지치', 'VV')
('말하', 'VV')
('보', 'VV')
('그', 'VV')
('아니', 'VV')
('걸', 'VV')
('지우', 'VV')
('싶', 'VV')
('잊', 'VV')
('없', 'VA')
('버리', 'VV')
('말하', 'VV')
('그러', 'VV')
('보', 'VV')
('하', 'VV')
('하', 'VV')
('안기', 'VV')
('자리', 'VV')
('아', 'VV')
('얼', 'VV')
('알', 'VV')
('되', 'VV')
('말하', 'VV')
('보', 'VV')
('그', 'VV')
('아니', 'VV')
('걸', 'VV')
('지우', 'VV')
('싶', 'VV')
('잊', 'VV')
('없', 'VA')
('아', 'VV')
('달라지', 'VV')
('이렇', 'VA')
('아니', 'VV')
('안되', 'VV')
('돌', 'VV')
('오', 'VV')
('기다리', 'VV')
('끄', 'VV')
287
('느껴지', 'VV')
('않', 'VV')
('이렇', 'VA')
('숨막히'

('같', 'VA')
('그', 'VV')
('좋아하', 'VV')
('있', 'VV')
('다가오', 'VV')
('보', 'VV')
('좋', 'VA')
('그렇', 'VA')
('말하', 'VV')
('묻', 'VV')
('싶', 'VV')
('다가오', 'VV')
('좋', 'VA')
('그러', 'VV')
('하', 'VV')
('망치', 'VV')
('틀리', 'VV')
('그렇', 'VA')
('전하', 'VV')
('젊', 'VA')
('치', 'VV')
('많', 'VA')
('가', 'VV')
('이기', 'VV')
('이기', 'VV')
('우러러보', 'VV')
('내', 'VV')
('숨기', 'VV')
('하', 'VV')
('알', 'VV')
('있', 'VV')
('오', 'VV')
('같', 'VA')
('그', 'VV')
('좋아하', 'VV')
('있', 'VV')
('다가오', 'VV')
('보', 'VV')
('좋', 'VA')
('그렇', 'VA')
('말하', 'VV')
('묻', 'VV')
('싶', 'VV')
('다가오', 'VV')
('좋', 'VA')
('그러', 'VV')
('맞추', 'VV')
('치', 'VV')
('불르', 'VV')
('오', 'VV')
('주', 'VV')
('보', 'VV')
('오', 'VV')
('시', 'VV')
('추', 'VV')
('좋', 'VA')
('그렇', 'VA')
('말하', 'VV')
('묻', 'VV')
('싶', 'VV')
('다가오', 'VV')
('좋', 'VA')
('그러', 'VV')
('맞추', 'VV')
('불르', 'VV')
('오', 'VV')
('오', 'VV')
('시', 'VV')
('추', 'VV')
325
('오', 'VV')
('빠지', 'VV')
('안되', 'VV')
('벌쓰', 'VV')
('차', 'VV')
('보이', 'VV')
('넘치', 'VV')
('미치', 'VV')
('하', 'VV')
('보', 'VV')
('되',

('머뭇거리', 'VV')
('돌', 'VV')
('망설이', 'VV')
('알', 'VV')
('없', 'VA')
('기다리', 'VV')
('마', 'VV')
('가', 'VV')
('바라보', 'VV')
('싫', 'VV')
('가', 'VV')
('안', 'VV')
('아', 'VV')
('좋아하', 'VV')
('두렵', 'VA')
('믿', 'VV')
('있', 'VV')
('보', 'VV')
('기다리', 'VV')
('마', 'VV')
('가', 'VV')
('바라보', 'VV')
('싫', 'VV')
('가', 'VV')
('안', 'VV')
('아', 'VV')
358
('정해지', 'VV')
('아프', 'VV')
('변함없', 'VA')
('슬프', 'VA')
('하', 'VV')
('있', 'VV')
('없잖', 'VA')
('말하', 'VV')
('더느', 'VV')
('감추', 'VV')
('아니', 'VV')
('안기', 'VV')
('있', 'VV')
('보', 'VV')
('떠나', 'VV')
('깊', 'VA')
('있', 'VV')
('힘들', 'VA')
('하', 'VV')
('뜨', 'VV')
('싶', 'VV')
('미워하', 'VV')
('하', 'VV')
('빌', 'VV')
('하', 'VV')
('있', 'VV')
('없잖', 'VA')
('말하', 'VV')
('더느', 'VV')
('감추', 'VV')
('아니', 'VV')
('안기', 'VV')
('있', 'VV')
('보', 'VV')
('떠나', 'VV')
('깊', 'VA')
('있', 'VV')
('힘들', 'VA')
('하', 'VV')
('뜨', 'VV')
('싶', 'VV')
('돋', 'VV')
('보', 'VV')
('말', 'VV')
('달려가', 'VV')
('잡', 'VV')
('이렇', 'VA')
('오', 'VV')
('기다리', 'VV')
('알', 'VV')
359
('없다', 'VA')
('그렇', 'VA')
('하', 'VV

('잊', 'VV')
('그', 'VV')
('쉽', 'VA')
('않', 'VV')
('지르', 'VV')
('살아가', 'VV')
('이렇', 'VA')
('모르', 'VV')
('늘', 'VV')
('머', 'VA')
('가지', 'VV')
('가버리', 'VV')
('하', 'VV')
('하', 'VV')
('잊', 'VV')
('없', 'VA')
('잊', 'VV')
('피우', 'VV')
('해지', 'VV')
('갈', 'VV')
('묻', 'VV')
('있', 'VV')
('어', 'VV')
('돌아오', 'VV')
('주', 'VV')
('버리', 'VV')
('가', 'VV')
('담', 'VV')
('두', 'VV')
('쓰', 'VV')
('끄', 'VV')
('태우', 'VV')
('울', 'VV')
('더느', 'VV')
('가지', 'VV')
('없', 'VA')
('버리', 'VV')
('없', 'VA')
('가엽', 'VA')
('잊', 'VV')
('말하', 'VV')
('잊', 'VV')
('그', 'VV')
('쉽', 'VA')
('않', 'VV')
('지르', 'VV')
('살아가', 'VV')
('이렇', 'VA')
('모르', 'VV')
('늘', 'VV')
('머', 'VA')
('알', 'VV')
('되', 'VV')
('늦', 'VA')
('돌아오', 'VV')
('않', 'VV')
('이렇', 'VA')
('슬프', 'VA')
('모르', 'VV')
('보', 'VV')
('울', 'VV')
('이렇', 'VA')
('되', 'VV')
('주', 'VV')
('돌아오', 'VV')
('이렇', 'VA')
('모르', 'VV')
('늘', 'VV')
('머', 'VA')
('가지', 'VV')
('가버리', 'VV')
('이렇', 'VA')
('모르', 'VV')
('늘', 'VV')
('머', 'VA')
('않', 'VV')
('알', 'VV')
('되', 'VV')
('어', 'VV')
('돌아오', 'VV')

('가', 'VV')
('가리', 'VV')
('향하', 'VV')
('뻗', 'VV')
('보', 'VV')
('이루', 'VV')
('없', 'VA')
('닿', 'VA')
('없', 'VA')
('늘', 'VV')
('늘', 'VV')
('알', 'VV')
('모르', 'VV')
('믿', 'VV')
('떠들', 'VV')
('살', 'VV')
('그', 'VV')
('뛰', 'VV')
('있', 'VV')
('늘', 'VV')
('만들', 'VV')
('늘', 'VV')
('차리', 'VV')
('떠나', 'VV')
('가', 'VV')
('가리', 'VV')
('향하', 'VV')
('뻗', 'VV')
('보', 'VV')
('이루', 'VV')
('없', 'VA')
('닿', 'VA')
('없', 'VA')
409
('오', 'VV')
('자', 'VV')
('있', 'VV')
('자', 'VV')
('걸리', 'VV')
('얻어터지', 'VV')
('서', 'VV')
('커지', 'VV')
('하', 'VV')
('커지', 'VV')
('걸리', 'VV')
('얻어터지', 'VV')
('이렇', 'VA')
('미치', 'VV')
('버리', 'VV')
('하', 'VV')
('않', 'VV')
('기다리', 'VV')
('보', 'VV')
('바꾸', 'VV')
('미치', 'VV')
('걸', 'VV')
('빠지', 'VV')
('없', 'VA')
('미치', 'VV')
('아니', 'VV')
('빼먹', 'VV')
('갈', 'VV')
('가득차', 'VV')
('바라', 'VV')
('이렇', 'VA')
('미치', 'VV')
('버리', 'VV')
('하', 'VV')
('않', 'VV')
('기다리', 'VV')
('보', 'VV')
('바꾸', 'VV')
('원하', 'VV')
('바래', 'VV')
('꿈꾸', 'VV')
('말', 'VV')
('비뚜', 'VV')
('바라보', 'VV')
('그만두', 'VV')
('없었', 'VV'

('보', 'VV')
('하', 'VV')
('흘르', 'VV')
('감추', 'VV')
('있', 'VV')
('그렇', 'VA')
('스쳐가', 'VV')
('남', 'VV')
('있', 'VV')
('묻', 'VV')
441
('지르', 'VV')
('던지', 'VV')
('차갑', 'VA')
('말하', 'VV')
('믿', 'VV')
('없', 'VA')
('웃', 'VV')
('헤어지', 'VV')
('뜨', 'VV')
('가', 'VV')
('덜', 'VV')
('돋', 'VV')
('보', 'VV')
('걸', 'VV')
('울', 'VV')
('있', 'VV')
('하', 'VV')
('어떻', 'VA')
('망설이', 'VV')
('없', 'VA')
('있', 'VV')
('하', 'VV')
('그리하', 'VV')
('걸', 'VV')
('주', 'VV')
('쉽', 'VA')
('하', 'VV')
('하', 'VV')
('있', 'VV')
('어떻', 'VA')
('그렇', 'VA')
('쉽', 'VA')
('갈', 'VV')
('알', 'VV')
('말하', 'VV')
('보내', 'VV')
('말하', 'VV')
('있', 'VV')
('버리', 'VV')
('말하', 'VV')
('그', 'VV')
('잊', 'VV')
('없', 'VA')
('버리', 'VV')
('없', 'VA')
('지우', 'VV')
('없', 'VA')
('하', 'VV')
('그립', 'VA')
('어떻', 'VA')
('보', 'VV')
('이렇', 'VA')
('아프', 'VA')
('하', 'VV')
('하', 'VV')
('하', 'VV')
('있', 'VV')
('어떻', 'VA')
('그렇', 'VA')
('쉽', 'VA')
('갈', 'VV')
('헤어지', 'VV')
('뜨', 'VV')
('가', 'VV')
('덜', 'VV')
('돋', 'VV')
('보', 'VV')
('걸', 'VV')
('울', 'VV')
('어떻', 'VA')
('

('되', 'VV')
('하', 'VV')
('떠오', 'VV')
('이렇', 'VA')
('뜨', 'VV')
('가', 'VV')
('오', 'VV')
('뜨', 'VV')
('가', 'VV')
('이렇', 'VA')
('헤어지', 'VV')
('이렇', 'VA')
('뜨', 'VV')
('가', 'VV')
('오', 'VV')
('뜨', 'VV')
('가', 'VV')
465
('다가오', 'VV')
('줄', 'VV')
('믿', 'VV')
('고맙', 'VA')
('하', 'VV')
('지키', 'VV')
('믿', 'VV')
('되', 'VV')
('끄', 'VV')
('좋', 'VA')
('있', 'VV')
('좋', 'VA')
('그', 'VV')
('오', 'VV')
('갈', 'VV')
('봐', 'VV')
('보', 'VV')
('하', 'VV')
('아름답', 'VA')
('느껴지', 'VV')
('달라지', 'VV')
('찾', 'VV')
('줄', 'VV')
('받', 'VV')
('하', 'VV')
('같', 'VA')
('믿', 'VV')
('고맙', 'VA')
('하', 'VV')
('오', 'VV')
('이루', 'VV')
('갈', 'VV')
('늘', 'VV')
('믿', 'VV')
('믿', 'VV')
('어떠하', 'VA')
('힘들', 'VA')
('우리', 'VV')
('이기', 'VV')
('내', 'VV')
('있', 'VV')
('힘들', 'VA')
('보', 'VV')
('느끼', 'VV')
('지키', 'VV')
('하', 'VV')
('없', 'VA')
('설레이', 'VV')
('가득차', 'VV')
('그러', 'VV')
('그렇', 'VA')
('다그', 'VV')
('가', 'VV')
('위하', 'VV')
('갖', 'VV')
('위하', 'VV')
('하', 'VV')
('다가오', 'VV')
('믿', 'VV')
('고맙', 'VA')
('하', 'VV')
('보', 'VV')
('죽이', 'VV

('보', 'VV')
('보', 'VV')
('그렇', 'VA')
('같', 'VA')
('좋', 'VA')
('웃', 'VV')
('스', 'VV')
('말하', 'VV')
('없', 'VA')
('않', 'VV')
485
('보', 'VV')
('멈추', 'VV')
('보', 'VV')
('늘', 'VV')
('멋지', 'VV')
('이러', 'VV')
('말', 'VV')
('쳐다보', 'VV')
('쭐', 'VV')
('있었', 'VV')
('있었', 'VV')
('보', 'VV')
('미치', 'VV')
('끌려가', 'VV')
('모르', 'VV')
('오', 'VV')
('모르', 'VV')
('마', 'VV')
('모르', 'VV')
('괜찮', 'VA')
('이렇', 'VA')
('쉽', 'VA')
('끝내', 'VV')
('없', 'VA')
('이렇', 'VA')
('좋', 'VA')
('죽', 'VV')
('뛰', 'VV')
('알', 'VV')
('알', 'VV')
('없었', 'VV')
('끌려가', 'VV')
('모르', 'VV')
('오', 'VV')
('모르', 'VV')
('마', 'VV')
('모르', 'VV')
('괜찮', 'VA')
('오', 'VV')
('모르', 'VV')
('오', 'VV')
('모르', 'VV')
('마', 'VV')
('모르', 'VV')
('괜찮', 'VA')
('오', 'VV')
486
('다가오', 'VV')
('살', 'VV')
('흔들', 'VV')
('놓', 'VV')
('가', 'VV')
('꾸', 'VV')
('시작하', 'VV')
('괜찮', 'VA')
('담기', 'VV')
('되', 'VV')
('담기', 'VV')
('오', 'VV')
('되', 'VV')
('멋지', 'VV')
('오', 'VV')
('잡', 'VV')
('흔들', 'VV')
('괜찮', 'VA')
('담기', 'VV')
('되', 'VV')
('담기', 'VV')
('오', 'VV')
('되', 'VV')
(

('달려가', 'VV')
('끄', 'VV')
('보', 'VV')
('하', 'VV')
('가', 'VV')
('바뀌', 'VV')
('같애', 'VV')
('오', 'VV')
('잡', 'VV')
('하', 'VV')
('오', 'VV')
('꾸미', 'VV')
('뜨', 'VV')
('어떻', 'VA')
('되', 'VV')
('걸', 'VV')
('듣', 'VV')
('멋있', 'VA')
('달리', 'VV')
('넘어지', 'VV')
('일으키', 'VV')
('괜찮', 'VA')
('괜찮', 'VA')
('어쩌', 'VV')
('다치', 'VV')
('다치', 'VV')
('기다리', 'VV')
('설레이', 'VV')
('하', 'VV')
('부르', 'VV')
('달려가', 'VV')
('멀', 'VA')
('되', 'VV')
('모르', 'VV')
513
('갇히', 'VV')
('갇히', 'VV')
('갇히', 'VV')
('갇히', 'VV')
('갇히', 'VV')
('갇히', 'VV')
('갇히', 'VV')
('갇히', 'VV')
('그립', 'VA')
('지리', 'VV')
('그립', 'VA')
('지리', 'VV')
('헤어지', 'VV')
('우리', 'VV')
('갈', 'VV')
('없', 'VA')
('커지', 'VV')
('갈', 'VV')
('버리', 'VV')
('하', 'VV')
('뜨', 'VV')
('가리', 'VV')
('하', 'VV')
('하', 'VV')
('가', 'VV')
('있', 'VV')
('그렇', 'VA')
('바라보', 'VV')
('알', 'VV')
('없', 'VA')
('좋', 'VA')
('보이', 'VV')
('잊', 'VV')
('부르', 'VV')
('어떻', 'VA')
('드', 'VV')
('모르', 'VV')
('치', 'VV')
('찾', 'VV')
('어떻', 'VA')
('드', 'VV')
('죽', 'VV')
('찾', 'VV')
('찾', 'VV')
('위하', 'V

('없', 'VA')
('멈추', 'VV')
('달려오', 'VV')
('있', 'VV')
('없', 'VA')
('그', 'VV')
('이렇', 'VA')
('피하', 'VV')
('없', 'VA')
('믿', 'VV')
('쉽', 'VA')
('견디', 'VV')
('오', 'VV')
('되', 'VV')
('지키', 'VV')
('봐주', 'VV')
('멈추', 'VV')
('없', 'VA')
('걷', 'VV')
('꿇', 'VV')
('보이', 'VV')
('하', 'VV')
('있', 'VV')
('가지', 'VV')
('죽', 'VV')
('울', 'VV')
('슬프', 'VA')
('웃', 'VV')
('있', 'VV')
('듣', 'VV')
('잊', 'VV')
('끝내', 'VV')
('시작하', 'VV')
('없', 'VA')
('부딪히', 'VV')
('아파하', 'VV')
('있', 'VV')
('따르', 'VV')
('없', 'VA')
('않', 'VV')
('치', 'VV')
('쓰', 'VV')
('일어서', 'VV')
('그렇', 'VA')
('없', 'VA')
('지키', 'VV')
('보', 'VV')
('멈추', 'VV')
('없', 'VA')
('걷', 'VV')
('꿇', 'VV')
('보이', 'VV')
('하', 'VV')
('있', 'VV')
('가지', 'VV')
('죽', 'VV')
('울', 'VV')
('슬프', 'VA')
('웃', 'VV')
('있', 'VV')
('듣', 'VV')
('잊', 'VV')
('보', 'VV')
('치', 'VV')
('힘들', 'VV')
('향하', 'VV')
('벌리', 'VV')
('두렵', 'VV')
('하', 'VV')
('피하', 'VV')
('하', 'VV')
('믿', 'VV')
('끄', 'VV')
('걷', 'VV')
('꺽이', 'VV')
('않', 'VV')
('안', 'VV')
('죽', 'VV')
('울', 'VV')
('슬프', 'VA')
('웃',

('대', 'VV')
('들어가', 'VV')
('받', 'VV')
('들이', 'VV')
('안기', 'VV')
('흘르', 'VV')
('하', 'VV')
('맺히', 'VV')
('말하', 'VV')
('싶', 'VV')
('싶', 'VV')
('흘르', 'VV')
('가', 'VV')
('아니', 'VV')
('타', 'VV')
('맡', 'VV')
('오', 'VV')
('비치', 'VV')
('같', 'VA')
('아니', 'VV')
('보', 'VV')
('하', 'VV')
('있', 'VV')
('맞이하', 'VV')
('입', 'VV')
('받', 'VV')
('까', 'VV')
('까', 'VV')
('묻히', 'VV')
('보', 'VV')
('마', 'VV')
('마', 'VV')
('마', 'VV')
('마', 'VV')
('보', 'VV')
('비치', 'VV')
('보', 'VV')
('묻히', 'VV')
('숨쉬', 'VV')
('메마르', 'VA')
('기다리', 'VV')
('맴돌', 'VV')
('말하', 'VV')
('싶', 'VV')
('붙잡', 'VV')
('싶', 'VV')
('가두', 'VV')
('싶', 'VV')
('아무렇', 'VA')
('까', 'VV')
('까맣', 'VA')
('이렇', 'VA')
('타', 'VV')
('타', 'VV')
('사라지', 'VV')
('빠르', 'VA')
('스며들', 'VV')
('알', 'VV')
('없', 'VA')
('아무렇', 'VA')
('숨쉬', 'VV')
('메마르', 'VA')
('기다리', 'VV')
('맴돌', 'VV')
('말하', 'VV')
('싶', 'VV')
('붙잡', 'VV')
('싶', 'VV')
('가두', 'VV')
('급하', 'VA')
('보', 'VV')
('치', 'VV')
('뜨', 'VV')
('그러', 'VV')
('보', 'VV')
('마', 'VV')
('보', 'VV')
('멈추', 'VV')
('없', 'VA')
('까'

('되', 'VV')
('떠나', 'VV')
('보', 'VV')
('이러', 'VV')
('보', 'VV')
('드', 'VV')
('알', 'VV')
('얼', 'VV')
('잊', 'VV')
('보', 'VV')
('닮', 'VV')
('숙이', 'VV')
('슬프', 'VA')
('새기', 'VV')
('대', 'VV')
('안', 'VA')
('되', 'VV')
('괜찮', 'VA')
('고맙', 'VA')
('가', 'VV')
('하', 'VV')
('숨기', 'VV')
('다시', 'VA')
('보', 'VV')
('없', 'VA')
('어루만지', 'VV')
('못하', 'VA')
('머무', 'VV')
('바라', 'VV')
('놔주', 'VV')
('뜨', 'VV')
('듣', 'VV')
('웃', 'VV')
('지키', 'VV')
('태어나', 'VV')
('오', 'VV')
('더느', 'VV')
('모르', 'VV')
('뜨', 'VV')
('지키', 'VV')
599
('만나', 'VV')
('떠오', 'VV')
('걷', 'VV')
('늘', 'VV')
('없', 'VA')
('하얗', 'VA')
('덜', 'VV')
('떠오', 'VV')
('흘르', 'VV')
('뜨', 'VV')
('느끼', 'VV')
('있', 'VV')
('없', 'VA')
('크', 'VA')
('들', 'VV')
('있', 'VV')
('많', 'VA')
('지새', 'VV')
('흘리', 'VV')
('떠오', 'VV')
('모르', 'VV')
('하', 'VV')
('있', 'VV')
('덜', 'VV')
('뜨', 'VV')
('덜', 'VV')
('없', 'VA')
('크', 'VA')
('느껴지', 'VV')
('남', 'VV')
('취하', 'VV')
('치', 'VV')
('없', 'VA')
('뜨', 'VV')
('뜨', 'VV')
('느끼', 'VV')
('있', 'VV')
('없', 'VA')
('크', 'VA')
('들', 'VV')


('돋', 'VV')
('서', 'VV')
('아름답', 'VA')
('남', 'VV')
('많', 'VA')
('예쁘', 'VA')
('하', 'VV')
('고맙', 'VA')
('믿', 'VV')
('싶', 'VV')
('이러', 'VV')
('보', 'VV')
('없', 'VA')
('잘하', 'VV')
('짧', 'VA')
('남기', 'VV')
('마', 'VV')
('안되', 'VV')
631
('날으', 'VV')
('오르', 'VV')
('잠들', 'VV')
('가', 'VV')
('싶', 'VV')
('뽀얗', 'VA')
('안기', 'VV')
('하', 'VV')
('듣', 'VV')
('되', 'VV')
('따르', 'VV')
('대', 'VV')
('잊', 'VV')
('하', 'VV')
('같', 'VA')
('깨우', 'VV')
('사라지', 'VV')
('가', 'VV')
('늘', 'VV')
('그렇', 'VA')
('쉽', 'VA')
('웃', 'VV')
('돌', 'VV')
('보', 'VV')
('울', 'VV')
('대', 'VV')
('잊', 'VV')
('하', 'VV')
('같', 'VA')
('웃', 'VV')
('안', 'VV')
('하', 'VV')
('하', 'VV')
('울', 'VV')
('돌아오', 'VV')
('기다리', 'VV')
632
('작', 'VA')
('어', 'VV')
('죽', 'VV')
('갈', 'VV')
('어리', 'VA')
('기다리', 'VV')
('있', 'VV')
('다르', 'VA')
('그', 'VV')
('없었', 'VV')
('이렇', 'VA')
('살', 'VV')
('날으', 'VV')
('오르', 'VV')
('숨차', 'VV')
('어', 'VV')
('되', 'VV')
('되', 'VV')
('차갑', 'VA')
('대', 'VV')
('찾', 'VV')
('내', 'VV')
('날으', 'VV')
('오르', 'VV')
('헤엄치', 'VV')
('숨차', '

('되', 'VV')
('끄', 'VV')
('걸', 'VV')
('하', 'VV')
('웃', 'VV')
('원하', 'VV')
('하', 'VV')
('되', 'VV')
('달르', 'VA')
('끄', 'VV')
('보', 'VV')
('말하', 'VV')
('다가오', 'VV')
('하', 'VV')
('물어보', 'VV')
('대', 'VV')
('원하', 'VV')
('하', 'VV')
('만하', 'VV')
('없', 'VA')
('걸', 'VV')
('하', 'VV')
('웃', 'VV')
('원하', 'VV')
('하', 'VV')
('되', 'VV')
('알', 'VV')
('말하', 'VV')
('다가오', 'VV')
('알려주', 'VV')
('하', 'VV')
665
('알', 'VV')
('있', 'VV')
('갖', 'VV')
('어쩌', 'VV')
('떵떵거리', 'VV')
('시키', 'VV')
('아니', 'VV')
('아끼', 'VV')
('그러', 'VV')
('줄', 'VV')
('가지', 'VV')
('있', 'VV')
('알', 'VV')
('있', 'VV')
('갖', 'VV')
('아니', 'VV')
('바치', 'VV')
('기쁘', 'VA')
('하', 'VV')
('그', 'VV')
('멋지', 'VV')
('위하', 'VV')
('주', 'VV')
('버리', 'VV')
('하', 'VV')
('걸', 'VV')
('알', 'VV')
('있', 'VV')
('갖', 'VV')
('하', 'VV')
('있', 'VV')
('걸', 'VV')
('걷', 'VV')
('버리', 'VV')
('마', 'VV')
('줄', 'VV')
('있', 'VV')
('아프', 'VA')
('없', 'VA')
('버리', 'VV')
('알', 'VV')
('있', 'VV')
('갖', 'VV')
666
('타오르', 'VV')
('커지', 'VV')
('오', 'VV')
('그리하', 'VV')
('많', 'VA')
('많', 

('안', 'VV')
('웃', 'VV')
('되', 'VV')
('오', 'VV')
('치', 'VV')
('믿', 'VV')
('오', 'VV')
('지키', 'VV')
('마', 'VV')
('아프', 'VA')
('아니', 'VV')
('아니', 'VV')
('지키', 'VV')
('우리', 'VV')
('없', 'VA')
('다가오', 'VV')
('오', 'VV')
('치', 'VV')
('믿', 'VV')
('오', 'VV')
696
('돌아보', 'VV')
('그러', 'VV')
('만들', 'VV')
('바라보', 'VV')
('만들', 'VV')
('바라보', 'VV')
('그러', 'VV')
('뜨', 'VV')
('없', 'VA')
('걸어가', 'VV')
('위하', 'VV')
('위하', 'VV')
('버티', 'VV')
('말', 'VV')
('넓', 'VA')
('하', 'VV')
('없', 'VA')
('많', 'VA')
('알', 'VV')
('없', 'VA')
('많', 'VA')
('휘둘', 'VV')
('없', 'VA')
('가지', 'VV')
('줄', 'VV')
('위하', 'VV')
('위하', 'VV')
('크', 'VV')
('없', 'VA')
('걸어가', 'VV')
('타오르', 'VV')
('닮', 'VV')
('바래', 'VV')
('앞서가', 'VV')
('바래', 'VV')
('잊', 'VV')
('치', 'VV')
('슬프', 'VA')
('합치', 'VV')
('되', 'VV')
('맞서', 'VV')
('멈추', 'VV')
('위하', 'VV')
('위하', 'VV')
('위하', 'VV')
('위하', 'VV')
697
('이렇', 'VA')
('있', 'VV')
('수줍', 'VA')
('잡', 'VV')
('떨리', 'VV')
('하', 'VV')
('설레이', 'VV')
('상처받', 'VV')
('아파하', 'VV')
('없', 'VA')
('없', 'VA')
('스', 'VV')
('벅차

('드', 'VV')
('치', 'VV')
('갈', 'VV')
('있', 'VV')
('있', 'VV')
('줄', 'VV')
('하', 'VV')
('괜찮', 'VA')
('기다리', 'VV')
725
('이르', 'VV')
('대', 'VV')
('바래다주', 'VV')
('그렇', 'VA')
('기다리', 'VV')
('모르', 'VV')
('이렇', 'VA')
('불', 'VV')
('대', 'VV')
('모르', 'VV')
('많', 'VA')
('파', 'VV')
('그렇', 'VA')
('눈물겹', 'VA')
('알', 'VV')
('있', 'VV')
('기다리', 'VV')
('오', 'VV')
('보', 'VV')
('잊', 'VV')
('갈', 'VV')
('있', 'VV')
('오', 'VV')
('주', 'VV')
('모르', 'VV')
('이렇', 'VA')
('불', 'VV')
('대', 'VV')
('모르', 'VV')
('많', 'VA')
('기', 'VV')
('대', 'VV')
('기다리', 'VV')
('믿', 'VV')
('모르', 'VV')
('힘겹', 'VA')
('대', 'VV')
('모르', 'VV')
('있', 'VV')
('대', 'VV')
('오', 'VV')
('모르', 'VV')
('힘겹', 'VA')
('대', 'VV')
('모르', 'VV')
('있', 'VV')
('대', 'VV')
('오', 'VV')
726
('헤어지', 'VV')
('구차하', 'VA')
('줄', 'VV')
('어렵', 'VV')
('어떻', 'VA')
('드', 'VV')
('보', 'VV')
('보', 'VV')
('헤어지', 'VV')
('헤어지', 'VV')
('힘들', 'VA')
('하', 'VV')
('되', 'VV')
('들리', 'VV')
('많', 'VA')
('같', 'VA')
('이렇', 'VA')
('떠올르', 'VV')
('알', 'VV')
('있', 'VV')
('알', 'VV')
('이렇', 'VA')

('빠지', 'VV')
('만', 'VV')
('지키', 'VV')
('같', 'VA')
('믿', 'VV')
('다하', 'VV')
('되', 'VV')
('없', 'VA')
('낫', 'VV')
('되', 'VV')
('맞', 'VV')
('되', 'VV')
('헤어지', 'VV')
('슬프', 'VA')
('잊', 'VV')
('흘르', 'VV')
('갈', 'VV')
('더하', 'VV')
('갈', 'VV')
('있', 'VV')
('대이', 'VV')
('지키', 'VV')
('아파하', 'VV')
('믿', 'VV')
('마', 'VV')
('믿', 'VV')
('일어서', 'VV')
('지키', 'VV')
('보', 'VV')
('힘들', 'VA')
('그', 'VV')
('받', 'VV')
('들이', 'VV')
('힘들', 'VV')
('지치', 'VV')
('뒤돌아보', 'VV')
('달라지', 'VV')
('시작하', 'VV')
('대이', 'VV')
('지키', 'VV')
('아파하', 'VV')
('믿', 'VV')
('마', 'VV')
('믿', 'VV')
('일어서', 'VV')
('대이', 'VV')
('지키', 'VV')
('아파하', 'VV')
('믿', 'VV')
('마', 'VV')
('믿', 'VV')
('일어서', 'VV')
('지키', 'VV')
('보', 'VV')
('이루', 'VV')
('그렇', 'VA')
('그러', 'VV')
('흔해빠지', 'VV')
('만', 'VV')
('지키', 'VV')
('같', 'VA')
('믿', 'VV')
755
('뜨', 'VV')
('가', 'VV')
('아무렇', 'VA')
('아름답', 'VA')
('벌', 'VA')
('돌아오', 'VV')
('않', 'VV')
('이렇', 'VA')
('쉽', 'VA')
('다가오', 'VV')
('감하', 'VV')
('없', 'VA')
('덜', 'VV')
('그렇', 'VA')
('헤어지', 'VV')
('하', 'VV')
(

('마르', 'VV')
('시들어가', 'VV')
('길', 'VA')
('알', 'VV')
('줄', 'VV')
('잃', 'VV')
('머무', 'VV')
('남기', 'VV')
('아프', 'VA')
('살', 'VV')
('하', 'VV')
('하', 'VV')
('어떻', 'VA')
('잊', 'VV')
('살아가', 'VV')
('울', 'VV')
('아프', 'VV')
('견디', 'VV')
('서', 'VV')
('슬프', 'VA')
('울', 'VV')
('들려주', 'VV')
('곱', 'VA')
('같', 'VA')
('기다리', 'VV')
('지치', 'VV')
('갈', 'VV')
('오', 'VV')
('날아가', 'VV')
('찾', 'VV')
('멀', 'VA')
('가엽', 'VA')
('살', 'VV')
('하', 'VV')
('하', 'VV')
('어떻', 'VA')
('잊', 'VV')
('살아가', 'VV')
('울', 'VV')
('아프', 'VV')
('견디', 'VV')
('서', 'VV')
('슬프', 'VA')
('울', 'VV')
('태우', 'VV')
('헤매', 'VV')
('없', 'VA')
('살', 'VV')
('하', 'VV')
('하', 'VV')
('어떻', 'VA')
('잊', 'VV')
('살아가', 'VV')
('울', 'VV')
('아프', 'VV')
('견디', 'VV')
('서', 'VV')
('슬프', 'VA')
('울', 'VV')
('믿', 'VV')
('덮', 'VV')
('없', 'VA')
('지치', 'VV')
('갈', 'VV')
('묻어두', 'VV')
('없', 'VA')
('남기', 'VV')
809
('울', 'VV')
('말', 'VV')
('이렇', 'VA')
('뜨', 'VV')
('아프', 'VV')
('보', 'VV')
('없었', 'VV')
('안', 'VV')
('하', 'VV')
('하', 'VV')
('하', 'VV')
('흔들리', 'VV')
('만들

('없', 'VA')
('있', 'VV')
('바라', 'VV')
('있', 'VV')
('피우', 'VV')
('주', 'VV')
('하', 'VV')
('달리', 'VV')
('있', 'VV')
('알', 'VV')
('싶', 'VV')
('이렇', 'VA')
('싫', 'VA')
('느', 'VV')
('하', 'VV')
('느끼', 'VV')
('키', 'VV')
('없잖', 'VA')
('싫', 'VA')
('느', 'VV')
('하', 'VV')
('느끼', 'VV')
('키', 'VV')
('없잖', 'VA')
('달리', 'VV')
('있', 'VV')
('알', 'VV')
('싶', 'VV')
('이렇', 'VA')
('달리', 'VV')
('있', 'VV')
('알', 'VV')
('싶', 'VV')
('이렇', 'VA')
837
('보', 'VV')
('만들', 'VV')
('늘', 'VV')
('사라지', 'VV')
('까', 'VV')
('보', 'VV')
('듣', 'VV')
('재', 'VV')
('중하', 'VA')
('무디', 'VA')
('흘르', 'VV')
('하', 'VV')
('가', 'VV')
('아', 'VV')
('채우', 'VV')
('죽', 'VV')
('하', 'VV')
('없', 'VA')
('흘르', 'VV')
('까', 'VV')
('없', 'VA')
('파고들', 'VV')
('아프', 'VA')
('더느', 'VV')
('참으', 'VV')
('없', 'VA')
('바로잡', 'VV')
('있', 'VV')
('말', 'VV')
('비추', 'VV')
('되', 'VV')
('없다', 'VA')
('하', 'VV')
('바라보', 'VV')
('보이', 'VV')
('있', 'VV')
('드', 'VV')
('걸', 'VV')
('말하', 'VV')
('버리', 'VV')
('남', 'VV')
('남', 'VV')
('즐기', 'VV')
('보', 'VV')
('되', 'VV')
('보여주', 'VV')

('보', 'VV')
('해지', 'VV')
('다가오', 'VV')
('달', 'VV')
('죽', 'VV')
('죽', 'VV')
('아프', 'VV')
('괴롭', 'VA')
('기울이', 'VV')
('보', 'VV')
('그러', 'VV')
('뜨', 'VV')
('날', 'VV')
('버리', 'VV')
('외치', 'VV')
('보', 'VV')
('해지', 'VV')
('다가오', 'VV')
('달', 'VV')
('그러', 'VV')
('뜨', 'VV')
('털', 'VV')
('날', 'VV')
('버리', 'VV')
('외치', 'VV')
('보', 'VV')
('해지', 'VV')
('다가오', 'VV')
('달', 'VV')
877
('없', 'VA')
('하', 'VV')
('가지', 'VV')
('없', 'VA')
('잊', 'VV')
('시작하', 'VV')
('울', 'VV')
('되', 'VV')
('보', 'VV')
('좋', 'VA')
('놓치', 'VV')
('안되', 'VV')
('말', 'VV')
('그리하', 'VV')
('가', 'VV')
('더느', 'VV')
('남기', 'VV')
('말', 'VV')
('알', 'VV')
('같', 'VA')
('대놓', 'VV')
('하', 'VV')
('대', 'VV')
('모', 'VV')
('말', 'VV')
('울', 'VV')
('울', 'VV')
('안되', 'VV')
('착하', 'VA')
('다치', 'VV')
('하', 'VV')
('하', 'VV')
('잊', 'VV')
('말', 'VV')
('그리하', 'VV')
('헤어지', 'VV')
('지', 'VV')
('돌아오', 'VV')
878
('없', 'VA')
('눈부시', 'VA')
('웃', 'VV')
('헤어지', 'VV')
('모르', 'VV')
('슬프', 'VA')
('일', 'VV')
('데려가', 'VV')
('못하', 'VA')
('하', 'VA')
('깊', 'VA')
('박히', 'V

917
('잊', 'VV')
('맞', 'VV')
('걷', 'VV')
('걷', 'VV')
('있', 'VV')
('잊', 'VV')
('없', 'VA')
('오', 'VV')
('하', 'VV')
('없', 'VA')
('잊', 'VV')
('없', 'VA')
('하', 'VV')
('맞', 'VV')
('걷', 'VV')
('걷', 'VV')
('있', 'VV')
('같', 'VA')
('느끼', 'VV')
('보', 'VV')
('흐르', 'VV')
('있', 'VV')
('잊', 'VV')
('없', 'VA')
('오', 'VV')
('하', 'VV')
('있', 'VV')
('잊', 'VV')
('없', 'VA')
('오', 'VV')
('하', 'VV')
('없', 'VA')
('잊', 'VV')
('없', 'VA')
('하', 'VV')
918
('스미', 'VV')
('오', 'VV')
('아니', 'VV')
('하', 'VV')
('돋', 'VV')
('다가서', 'VV')
('길', 'VA')
('수많', 'VA')
('잊히', 'VV')
('않', 'VV')
('슬프', 'VA')
('많', 'VA')
('드', 'VV')
('있', 'VV')
('스', 'VV')
('잊', 'VV')
('그립', 'VA')
('보이', 'VV')
('살아가', 'VV')
('보', 'VV')
('없', 'VA')
('되', 'VV')
('보', 'VV')
('쉬', 'VV')
('갈', 'VV')
('스', 'VV')
('잊', 'VV')
('그립', 'VA')
('보이', 'VV')
('깊', 'VA')
('빠지', 'VV')
('오', 'VV')
('바라', 'VV')
('하', 'VV')
('뜨', 'VV')
('아프', 'VA')
('기다리', 'VV')
('많', 'VA')
('흘르', 'VV')
('지우', 'VV')
('없', 'VA')
('그리', 'VV')
('보', 'VV')
('없', 'VA')
('힘들', 'VV')
('기다리', 

('어쩌', 'VV')
('되', 'VV')
('모르', 'VV')
('떠나', 'VV')
('모르', 'VV')
('알', 'VV')
('채', 'VV')
('같', 'VA')
('오', 'VV')
('마주치', 'VV')
('돌아오', 'VV')
('힘들', 'VV')
('하', 'VV')
('깊', 'VA')
('빠지', 'VV')
('알', 'VV')
('뜨', 'VV')
('있', 'VV')
('버리', 'VV')
('지나', 'VV')
('돌아가', 'VV')
('뜨', 'VV')
('갈', 'VV')
('겪', 'VV')
('같', 'VA')
('질', 'VV')
('원하', 'VV')
('하', 'VV')
('잡', 'VV')
('없', 'VA')
('아프', 'VA')
('하', 'VV')
('잊히', 'VV')
('미워하', 'VV')
('달', 'VV')
('그렇', 'VA')
('하', 'VV')
('알', 'VV')
('있', 'VV')
('하', 'VV')
('벌', 'VA')
('버리', 'VV')
('벌', 'VA')
('버리', 'VV')
('쉽', 'VA')
('뜨', 'VV')
('있', 'VV')
('버리', 'VV')
('지나', 'VV')
('아', 'VV')
('있', 'VV')
('뜨', 'VV')
('있', 'VV')
('버리', 'VV')
('지나', 'VV')
('돌아가', 'VV')
('뜨', 'VV')
('갈', 'VV')
('겪', 'VV')
('같', 'VA')
('질', 'VV')
('아', 'VV')
('없', 'VA')
950
('새롭', 'VA')
('다가오', 'VV')
('향하', 'VV')
('열', 'VV')
('다가오', 'VV')
('느끼', 'VV')
('있', 'VV')
('다가오', 'VV')
('밝히', 'VV')
('짙', 'VV')
('어지럽', 'VA')
('하', 'VV')
('살', 'VV')
('채우', 'VV')
('다가오', 'VV')
('다가오', 'VV')
('밝

('대', 'VV')
976
('놓치', 'VV')
('없', 'VA')
('키우', 'VV')
('놓치', 'VV')
('없', 'VA')
('오', 'VV')
('기다리', 'VV')
('하', 'VV')
('가지', 'VV')
('다가오', 'VV')
('하', 'VV')
('짓', 'VV')
('앉', 'VV')
('짓', 'VV')
('보', 'VV')
('알', 'VV')
('놀리', 'VV')
('그렇', 'VA')
('어렵', 'VV')
('스치', 'VV')
('질', 'VV')
('놓치', 'VV')
('없', 'VA')
('기다리', 'VV')
('숨기', 'VV')
('이렇', 'VA')
('놓치', 'VV')
('없', 'VA')
('받', 'VV')
('바래', 'VV')
('이렇', 'VA')
('날으', 'VV')
('외롭', 'VA')
('바래', 'VV')
('이렇', 'VA')
('외롭', 'VA')
('기다리', 'VV')
('들키', 'VV')
('놀', 'VV')
('갖', 'VV')
('바래', 'VV')
('까', 'VV')
('스치', 'VV')
('질', 'VV')
('놓치', 'VV')
('없', 'VA')
('기다리', 'VV')
('숨기', 'VV')
('이렇', 'VA')
('놓치', 'VV')
('없', 'VA')
('받', 'VV')
('바래', 'VV')
('기다리', 'VV')
('있', 'VV')
('섣부르', 'VV')
('거두', 'VV')
('가', 'VV')
('수많', 'VA')
('보', 'VV')
('되', 'VV')
('되', 'VV')
('작', 'VA')
('안기', 'VV')
('되', 'VV')
('스치', 'VV')
('질', 'VV')
('놓치', 'VV')
('없', 'VA')
('기다리', 'VV')
('숨기', 'VV')
('이렇', 'VA')
('놓치', 'VV')
('없', 'VA')
('받', 'VV')
('바래', 'VV')
('스치', 'VV')
('질', '

# 4. gensim을 이용한 언어모델 생성
## 4-1. 빈도수가 높은 단어들 중에서 토픽 선정과는 상관없는 단어 제거 


### 각 데이터셋에 단어별 빈도 수 체크

In [95]:
# Calculate the frequency of each word in dataset
from collections import defaultdict
frequency = defaultdict(int)
for text in dataset:
    for token in text:
        frequency[token] += 1

In [96]:
# Print the list of most frequent occuring words
import operator
most_frq = sorted(frequency.items(),key = operator.itemgetter(1), reverse=True)

# Top 100 most frquent words
for i in range(100):
    print(i+1,"word :",most_frq[i][0], " | ", "frequency:",most_frq[i][1])

1 word : 하  |  frequency: 7841
2 word : 나  |  frequency: 4330
3 word : 지  |  frequency: 4282
4 word : 날  |  frequency: 3662
5 word : 보  |  frequency: 3557
6 word : 수  |  frequency: 3512
7 word : 오  |  frequency: 3387
8 word : 리  |  frequency: 3295
9 word : 말  |  frequency: 3191
10 word : 그  |  frequency: 3186
11 word : 없  |  frequency: 3120
12 word : 사랑  |  frequency: 2971
13 word : 있  |  frequency: 2832
14 word : 널  |  frequency: 2737
15 word : 아  |  frequency: 2687
16 word : 맘  |  frequency: 2590
17 word : 때  |  frequency: 2453
18 word : 눈  |  frequency: 2449
19 word : 무  |  frequency: 2393
20 word : 우리  |  frequency: 2366
21 word : 안  |  frequency: 2346
22 word : 속  |  frequency: 2326
23 word : 니  |  frequency: 2294
24 word : 이  |  frequency: 2292
25 word : 가  |  frequency: 2279
26 word : 의  |  frequency: 2188
27 word : 마  |  frequency: 2171
28 word : 걸  |  frequency: 2155
29 word : 너  |  frequency: 2049
30 word : 만  |  frequency: 1958
31 word : 시간  |  frequency: 1746
32 word : 어  |

## 4-2. 선 토픽모델링 시각화 후 다시 전처리


In [97]:
# 내가, 나의, 나도, 무엇,...
data2 = []
for i in range(len(data)):
    data2.append(re.sub('내가',' ',data[i]))
data3 = []
for i in range(len(data)):
    data3.append(re.sub('무엇',' ',data2[i]))
data4 = []
for i in range(len(data)):
    data4.append(re.sub('나의',' ',data3[i]))
data5 = []
for i in range(len(data)):
    data5.append(re.sub('무엇',' ',data4[i]))
data6 = []
for i in range(len(data)):
    data6.append(re.sub('나는',' ',data5[i]))
data7 = []
for i in range(len(data)):
    data7.append(re.sub('나',' ',data6[i]))
data8 = []
for i in range(len(data)):
    data8.append(re.sub('내',' ',data7[i]))
data9 = []
for i in range(len(data)):
    data9.append(re.sub('(\',\', \'SP\')',' ',data8[i]))
data10 = []
for i in range(len(data)):
    data10.append(re.sub('(\',\', \'SS\')',' ',data9[i]))
data11 = []
for i in range(len(data)):
    data11.append(re.sub('너',' ',data10[i]))
#data6

# 5. dataset으로 gensim 모델 학습 시키기
- 데이터를 dictionary 형태로 명사 리스트 만들기
- 명사 형태의 문서별로 말뭉치 만들기

In [98]:
from gensim import corpora, models
import gensim

In [99]:
high_score_reviews = dataset

In [100]:
# gensim 라이브러리 : LDA 적용을 위한 텍스트의 벡터화
# 문자열 길이가 1이 아닐 때, y를 x만큼 넣고, x를 high_score_review에 리스트에 하나씩 저장
high_score_reviews = [[y for y in x if not len(y)==1]
                        for x in high_score_reviews]
dictionary = corpora.Dictionary(high_score_reviews)
corpus = [dictionary.doc2bow(text) for text in high_score_reviews]
                

# 6. Perplexity 및 Coherence을 통한 모델 평가 및 토픽 최적화

## 6-1. CoherenceModel을 통한 토픽 최적화

### 의미 : 토픽이 얼마나 의미론적으로 일관성 있는지 판단, 높을수록 의미론적 일관성 높음
### 주용도 : 해당 모델이 얼마나 실제로 의미 있는 결론을 내는지 확인

- 기존에 언어 모델 평가로 해당 모델만을 사용 후 원하는 토픽 개수의 모델을 지속 학습 시켜 토픽 할당
- 추후에는 두 가지 모델 함께 적용해보는 것과 좀 더 정밀한 사용이 필요

In [None]:
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel

coherence_values = []
for i in range(2,15):
    print(i)
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = i, id2word = dictionary)
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=high_score_reviews, dictionary= dictionary, topn=10)
    coherence_lda = coherence_model_lda.get_coherence()
    coherence_values.append(coherence_lda)

2


In [None]:
x = range(2,15)
plt.plot(x, coherence_values)
plt.xlabel("number of topics")
plt.ylabel("coherence score")
plt.show()

## 6-2 언어 모델 평가 방법 : Perplexity (PPL)
### PPL은 선정된 토픽 개수마다 학습시켜 가장 낮은 값을 보이는 구간을 찾아 최적화된 토픽의 개수 선정 가능

- 의미 : 확률 모델이 겨로가를 얼마나 정확하게 예측하는지 판단. 낮을 수록 정확하게 예측
- 주용도 : 동일 모델 내 파라미터에 따른 성능 평가할 때 주로 사용
- 한계 : ppl이 낮다고 해서 결과가 해석 용이하다는 의미가 아님

In [None]:
import matplotlib.pyplot as plt
perplexity_values = []
for i in range(2,20):
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = i, id2word = dictionary)
    perplexity_values.append(ldamodel.log_perplexity(corpus))
    

In [None]:
x = range(2,20)
plt.plot(x, perplexity_values)
plt.xlabel("number of topics")
plt.ylabel("perplexity score")
plt.show()

# 7. 하이퍼 파라미터 선정 및 LDA 시각화
- 학습된 코퍼스로 토픽 개수를 선정하고 다양한 파라미터 적용 가능

In [None]:
# 위의 결과들 바탕으로 갯수 수정하기
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=8, alpha = 0.1, id2word=dictionary)
ldamodel.print_topics(num_words=20)

## 7-1. pyLDAvis를 불러온 뒤 학습된 모델 시각화 진행
- 내가 원하는 주제들을 설명할 수 있는 단어들을 끌어내기 위해 파라미터 값을 설정하기도 하므로 해당 단어들의 이해도나 문서에 담긴 도메인 이해도가 중요
- 추가로 해당 토픽에 묶인 단어들이 사용자 사전에 정의되지 않았다면 사용자 사전을 추가하여 단어를 등록하는 과정도 필요

### LDAvis
1. 왼쪽에 출력되는 topic의 2차원 embedding vector로 비슷한 위치에 존재하는 토픽들은 서로 비슷한 문맥을 지니고 있음
2. 오른쪽에 출력되는 각 토픽의 키워드

In [None]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
vis

In [None]:
kk = ldamodel.show_topic(7,topn=80000)
type(ldamodel.show_topic)
kk2 = pd.DataFrame(kk)
kk2.head(10)

# 8. 토픽에 할당된 가사 및 키워드 추출
## 8-1. 토픽에 할당된 키워드 추출
1. 각 토픽에 할당된 단어와 단어별 토픽 차지 비율 추출
2. 데이터 프레임화
3. 토픽별 추출

## 8-2. 토픽에 할당된 가사 추출
- 각 가사별로 토픽에 할당되는 토픽 번호와 차지하는 비중을 만들기 위한 코드
- 해당 코드를 통해 문서 개별로 가장 크게 할당된 토픽의 번호와 비율 확인 가능
- 여러 토픽에 중첩 할당된 경우 개별 할당된 값도 확인 가능

### 분석 과정에서의 ldea
1. 단어별로 토픽 모델링 결과를 잘 나타냄
2. 문서별로 다시 묶음
3. 각 주제에 해당하는 문서들끼리만 토픽 모델링 결과 내기
4. 하나의 주체에서 또 다르게 얘기하는 주제들을 끄집어 낼 수 있음

In [None]:
def make_topictable_per_lyrics(ldamodel, corpus):
    topic_table = pd.DataFrame()
    
    # 몇 번째 가사인지를 의미하는 가사 번호와 해당 가사의 토픽 비중을 한줄씩 꺼내오기
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        doc = sorted(doc, key = lambda x : (x[1]), reverse = True)
        # 각 가사들에 대해서 비중이 높은 토픽순으로 토픽 정렬
        
        # 모든 가사에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): # 몇 번 토픿인지와 비중을 나눠서 저장한다
            if j == 0: # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_list]), ignore_index = True)
            else:
                break
        return(topic_table)
    

In [None]:
topictable = make_topictable_per_lyrics(ldamodel, corpus)
topictable = topictable.reset_index()
topictable.columns= ['가사','가장 비중이 높은 토픽','가장 높은 토픽의 비중','각 토픽의 비중']
topictable[:100]