In [1]:
import pandas as pd
# Load the tab-separated training file (presumably the NSMC training set,
# going by the file name); later cells read its `document` and `label` columns.
nsmc = pd.read_csv('../dataset/nsmc_train.csv', sep='\t')

In [2]:
from kiwipiepy import Kiwi
# Module-level analyzer instance; extract_keywords() below looks it up by name.
kiwi = Kiwi()
# NOTE(review): prepare() is called once before the first analyze(); newer
# kiwipiepy releases may no longer need this call - confirm against the
# installed version.
kiwi.prepare()

0

In [3]:
# Use the first document as a sample input for trying out the analyzer.
text = nsmc.document[0]

In [4]:
# Raw analyzer output for the sample; extract_keywords() reads the token list
# from position [0][0] of this structure.
result = kiwi.analyze(text)

In [5]:
def extract_keywords(text):
    """Yield ``'form/TAG'`` strings for tokens of ``text`` whose tag starts with N or V.

    The text is analyzed with the module-level ``kiwi`` object and only the
    token list found at ``[0][0]`` of the analysis result is used. Each token
    is unpacked into four fields; the last two (position information) are
    ignored. Because this is a generator, the analysis does not run until the
    first item is requested.
    """
    morphemes = kiwi.analyze(text)[0][0]
    for form, tag, _start, _extent in morphemes:
        # Skip everything whose tag does not begin with 'N' or 'V'.
        if tag[0] not in 'NV':
            continue
        yield f'{form}/{tag}'

In [8]:
# Materialize the generator so the extracted tokens of the first document are displayed.
list(extract_keywords(nsmc.document[0]))

['더빙/NNG', '짜증/NNG', '나/VV', '목소리/NNG']

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [10]:
# Bag-of-words over the rows labelled 0..1999 (label-based slice, both ends
# included), tokenized by extract_keywords and capped at 1,000 vocabulary terms.
cv = CountVectorizer(tokenizer=extract_keywords, max_features=1000)
dtm = cv.fit_transform(nsmc['document'].loc[0:1999])

In [14]:
# Learn IDF weights from the count matrix, then produce the TF-IDF weighted
# copy; the raw counts in `dtm` are kept for later use.
trans = TfidfTransformer().fit(dtm)
dtm2 = trans.transform(dtm)

In [15]:
# Vocabulary in column order of `dtm`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back on old releases.
# The result is normalized to a plain list so the saved object has the same
# type on every scikit-learn version.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out().tolist()
else:
    words = cv.get_feature_names()

In [16]:
import joblib
# Cache the vocabulary and both matrices so the sections below can start
# without re-running the tokenization.
# NOTE(review): the '.pki' extension looks like a typo for '.pkl'; it is left
# as-is because the loading cell reads the same file name.
joblib.dump({'words': words, 'dtm': dtm, 'dtm2': dtm2}, 'nsmc.pki')

['nsmc.pki']

## 감성분석 실습

In [17]:
import joblib
# Restore the cached dict written by the joblib.dump cell above.
# joblib.load unpickles its input, so only load files you produced yourself.
data = joblib.load('nsmc.pki')

In [18]:
# Bind the cached objects to explicit names instead of writing into locals():
# mutating that dict is discouraged by the Python documentation and hides from
# the reader which variables this cell creates.
words = data['words']
dtm = data['dtm']
dtm2 = data['dtm2']

In [19]:
# Show the restored count matrix's summary (shape, dtype, stored elements).
dtm

<2000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 12479 stored elements in Compressed Sparse Row format>

In [20]:
import pandas as pd
# Re-read the source table so this section also works from a fresh kernel;
# only the `label` column is used below.
nsmc = pd.read_csv('../dataset/nsmc_train.csv', sep='\t')

In [21]:
# Features are the raw term counts (dtm), not the TF-IDF weighted dtm2.
x = dtm

In [22]:
# Labels of the first 2,000 rows, i.e. the same rows the count matrix was built from.
y = nsmc['label'].iloc[:2000].values

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1984
)

In [25]:
import tensorflow as tf

In [26]:
# A single sigmoid unit on top of the token features (a logistic-regression
# style classifier), so each input column ends up with exactly one weight.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [27]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# so evaluate() returns [loss, accuracy].
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [28]:
# Train for 10 epochs. x_train is still a SciPy CSR matrix at this point.
# NOTE(review): whether fit() accepts SciPy sparse input directly depends on
# the TensorFlow/Keras version - confirm, or convert as the later cells do.
model.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x27e644e7880>

In [29]:
# Evaluate on the held-out rows; returns [loss, accuracy].
# toarray() replaces the `.A` shortcut, which is deprecated in recent SciPy
# releases; both produce the same dense ndarray.
model.evaluate(x_test.toarray(), y_test)



[0.6110315918922424, 0.7200000286102295]

In [30]:
# Persist the trained model; per the log output below, this run wrote a
# directory ('nsmc.krs' containing 'assets') rather than a single file.
# NOTE(review): newer Keras versions may require a '.keras' or '.h5' suffix -
# confirm before upgrading.
model.save('nsmc.krs')

INFO:tensorflow:Assets written to: nsmc.krs\assets


## 가중치 분석

In [31]:
# Reload the model saved above so the weight analysis can run on its own.
model = tf.keras.models.load_model('nsmc.krs')

In [32]:
# The model exposes exactly two weight variables here; for the single Dense
# layer these are presumably the kernel (w) and the bias (b) - TODO confirm order.
w, b = model.weights

In [33]:
# Pair each vocabulary token with its learned weight. This relies on `words`
# being in the same order as the feature columns the model was trained on.
word_sent = pd.DataFrame({'토큰': words, '가중치': w.numpy().flat})

In [34]:
# Ten tokens with the lowest weights (pushing the prediction toward label 0).
word_sent.sort_values(by='가중치').iloc[:10]

Unnamed: 0,토큰,가중치
537,아깝/VA,-0.340692
756,재미없/VA,-0.306921
331,뭐/NP,-0.291046
593,없/VA,-0.273475
206,돈/NNG,-0.254213
582,어이없/VA,-0.239978
225,드럽/VA,-0.224282
883,최악/NNG,-0.223266
776,점/NNB,-0.219449
914,터/NNB,-0.219275


In [35]:
# Ten tokens with the highest weights, shown in ascending order.
word_sent.sort_values(by='가중치').iloc[-10:]

Unnamed: 0,토큰,가중치
601,여운/NNG,0.212376
949,필요/NNG,0.218571
503,시절/NNG,0.22552
164,눈물/NNG,0.228728
757,재미있/VA,0.239358
62,괜찮/VA,0.255047
72,귀엽/VA,0.266097
304,명작/NNG,0.315355
759,재밌/VA,0.37266
881,최고/NNG,0.403372


## 희소행렬 변환

In [36]:
# Confirm which container the training features are stored in before converting.
type(x_train)

scipy.sparse.csr.csr_matrix

In [37]:
# Convert to COO format, which exposes parallel row / col / data arrays for
# the stored entries - the pieces needed to build a tf.SparseTensor below.
x_coo = x_train.tocoo()

In [38]:
# Row index of each stored entry.
x_coo.row

array([   0,    0,    0, ..., 1599, 1599, 1599])

In [39]:
# Column (vocabulary) index of each stored entry.
x_coo.col

array([  1, 119, 140, ..., 754, 951, 952])

In [40]:
# Stored values (token counts), aligned with the row/col arrays above.
x_coo.data

array([1, 1, 1, ..., 1, 1, 7], dtype=int64)

In [41]:
import numpy as np

In [42]:
# One (row, col) pair per stored entry, as an (nnz, 2) array.
index = np.stack((x_coo.row, x_coo.col), axis=1)

In [43]:
# Inspect the paired (row, col) coordinates.
index

array([[   0,    1],
       [   0,  119],
       [   0,  140],
       ...,
       [1599,  754],
       [1599,  951],
       [1599,  952]])

In [44]:
# Assemble the TensorFlow sparse tensor from the COO coordinates, values and shape.
x_train_sparse = tf.SparseTensor(
    indices=index,
    values=x_coo.data,
    dense_shape=x_coo.shape,
)

In [45]:
# Put the indices into canonical row-major order, which many TensorFlow sparse
# ops expect. The coordinates displayed above already look sorted, so this is
# presumably a no-op safeguard here.
x_train_sparse = tf.sparse.reorder(x_train_sparse)

In [46]:
# Continue training the reloaded model for 5 more epochs, this time feeding
# the features as a tf.SparseTensor instead of a SciPy matrix.
model.fit(x_train_sparse, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x27e7498eb20>

## Early Stopping

In [47]:
# Train with early stopping: 10% of the training rows are held out for
# validation and training halts once val_accuracy stops improving. With the
# callback's default patience the run below stopped after the second epoch.
# toarray() replaces the `.A` shortcut, which is deprecated in recent SciPy
# releases; both produce the same dense ndarray.
model.fit(x_train.toarray(), y_train, epochs=100, validation_split=0.1,
          callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy')])

Epoch 1/100
Epoch 2/100


<tensorflow.python.keras.callbacks.History at 0x27e77779bb0>