In [1]:
# Mount Google Drive into the Colab filesystem so later cells can read/write under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# 1. Sentiment Analysis Methodology

* 문장의 감성(sentiment)을 예측
    * 감성: 긍정 vs 부정
    * 찬성 vs 반대, 좋다 vs 싫다 등
* c.f., 정서(emotion): 기쁨, 슬픔, 분노 등

**Sentiment Analysis**
1. 사전 기반
2. ML 기반


## (1) 사전 기반 Sentiment Analysis

* 단어별로 긍정/부정을 분류하여 감성 사전을 만든다.
* 긍정단어: 좋다, 만족한다, 뛰어나다 등
* 부정단어: 나쁘다, 불만이다, 뒤떨어진다 등
* 문장에서 긍정단어의 수와 부정단어의 수를 세서 많은 쪽으로 결정

1. 장점
    * 배경지식이 있다면 감성 사전을 만들 수 있음
    * 복잡한 통계나 ML 지식 필요 X
    * 프로그램으로 만들기 간단함
2. 단점
    * 배경지식(도메인 지식)이 필요함
    * 사전으로 만드는 노력이 필요함
    * 문장의 어순을 고려 X

## (2) ML 기반 Sentiment Analysis

1. 장점
    * 감성 사전보다 높은 성능
    * 배경지식이 불필요
    * Model에 따라 문장의 어순 고려할 수 있음
    * 감성 사전을 자동으로 만들 수도 있음
2. 단점
    * 대량의 Training Data가 필요함
    * Training Data에 긍정/부정 Labeling하는 노력이 필요함
    * Stats와 ML에 대한 지식이 필요함

# 2. Linear Model

* Linear Model: y = wx + b

    * x: Document 내 특정 term의 frequency
    * y: Document의 긍정/부정 (1 or 0)
    * w: weight
    * b: bias

* w > 0 : x가 증가할수록 y도 증가 (긍정단어)
* w < 0 : x가 증가할수록 y는 감소 (부정단어)

# 3. Logistic Model

* y = logistic(wx + b)
* Linear Model에 Logistic Function을 결합하여, y가 0~1 범위를 가지게 함

**Logistic Function**
* sigmoid(x) = 1 / (1 + e^(-x))

# 4. Gradient Descent

* Learning(학습): Model의 Parameter(w, b)를 추정하는 과정임
* Model Prediction과 Actual Target의 차이를 Cost Function으로 측정하여, Cost Function의 값을 Minimize하는 Parameter를 찾는다.
* 주로, Gradient Descent라는 Algorithm을 사용함

# 5. Cross Entropy

* ML에서 사용할 수 있는 Cost Function에는 여러 가지가 있음 (c.f., Gradient Descent는 Cost Function이 아니라 Cost를 최소화하는 Optimization Algorithm임)
* Sentiment Analysis와 같은 Binary Classification 문제일 경우,
* Cross Entropy를 Cost Function으로 사용함!
* H(p, q) = - sum (p(x) * log(q(x)))
* p: 실제분포, q: 추정분포
* p와 q 두 확률 분포가 비슷할수록 작아짐!

# 6. Likelihood

* 어떤 Model을 가정했을 때, 우리가 가진 Sample Data가 관찰될 가능성
* 우도(Likelihood)가 높으면, 우리의 가정이 맞다고 생각할 수 있다. (최대우도법)
* 우도에 로그를 씌운 것이 Log 우도
* [Log-우도 최대화]의 의미 = [Cross Entropy 최소화]의 의미

# 7. kiwi를 이용한 Sentiment Analysis

In [2]:
# Fetch the Naver movie review training file from GitHub (tab-separated text).
import pandas as pd

movie_df = pd.read_csv(
    'https://github.com/e9t/nsmc/raw/master/ratings_train.txt',
    sep='\t',
)
movie_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [3]:
# Sanity-check the size of the loaded frame as (rows, columns).
print(movie_df.shape)

(150000, 3)


In [5]:
! pip install kiwipiepy
from kiwipiepy import Kiwi

kiwi = Kiwi()
kiwi.prepare()

Collecting kiwipiepy
  Downloading kiwipiepy-0.10.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (495 kB)
[K     |████████████████████████████████| 495 kB 4.1 MB/s 
[?25hCollecting kiwipiepy-model~=0.10
  Downloading kiwipiepy_model-0.10.0.tar.gz (25.7 MB)
[K     |████████████████████████████████| 25.7 MB 60 kB/s 
[?25hBuilding wheels for collected packages: kiwipiepy-model
  Building wheel for kiwipiepy-model (setup.py) ... [?25l[?25hdone
  Created wheel for kiwipiepy-model: filename=kiwipiepy_model-0.10.0-py3-none-any.whl size=25882656 sha256=036c7b5f000fc7f02850c3b5143bb1ae06ea125e52e620daef08d340192276f3
  Stored in directory: /root/.cache/pip/wheels/42/2f/c9/c82156eb15ed5ebb5475002723d1ab319947aca25bd663cd1d
Successfully built kiwipiepy-model
Installing collected packages: kiwipiepy-model, kiwipiepy
Successfully installed kiwipiepy-0.10.0 kiwipiepy-model-0.10.0


  """


일단, Document 1개를 통해 어떻게 접근해야 할지 구상부터 해보자.

In [6]:
# Start with a single review to see what Kiwi's analyze() returns.
doc_1 = movie_df.loc[0, 'document']
result_1 = kiwi.analyze(doc_1)
# Displayed below: a list holding one (token list, float score) tuple.
result_1

[([Token(form='아', tag='IC', start=0, len=1),
   Token(form='더빙', tag='NNG', start=2, len=2),
   Token(form='..', tag='SF', start=4, len=2),
   Token(form='진짜', tag='MAG', start=7, len=2),
   Token(form='짜증', tag='NNG', start=10, len=2),
   Token(form='나', tag='VV', start=12, len=1),
   Token(form='네요', tag='EF', start=13, len=2),
   Token(form='목소리', tag='NNG', start=16, len=3)],
  -66.60433959960938)]

In [7]:
# Token list of the first (here, only) analysis candidate; result_1[0][1] is its score.
result_1[0][0]

[Token(form='아', tag='IC', start=0, len=1),
 Token(form='더빙', tag='NNG', start=2, len=2),
 Token(form='..', tag='SF', start=4, len=2),
 Token(form='진짜', tag='MAG', start=7, len=2),
 Token(form='짜증', tag='NNG', start=10, len=2),
 Token(form='나', tag='VV', start=12, len=1),
 Token(form='네요', tag='EF', start=13, len=2),
 Token(form='목소리', tag='NNG', start=16, len=3)]

In [8]:
# Show only the tokens whose POS tag begins with N or V (e.g. NNG, VV), as form/tag.
for form, tag, _start, _length in result_1[0][0]:
    if tag[0] == 'N' or tag[0] == 'V':
        print(f'{form}/{tag}')

더빙/NNG
짜증/NNG
나/VV
목소리/NNG


위 과정을 User Defined Function으로 만들어보자!

In [9]:
def ExtractNV(doc):
    """Yield ``form/tag`` strings for tokens whose POS tag starts with ``N`` or ``V``.

    The document is analyzed with the notebook-level ``kiwi`` analyzer and only
    the first analysis candidate is used.
    """
    first_candidate_tokens = kiwi.analyze(doc)[0][0]
    yield from (
        f'{form}/{tag}'
        for form, tag, _, _ in first_candidate_tokens
        if tag[0] in ('N', 'V')
    )

In [10]:
# Materialize the generator to check ExtractNV on the first review.
list(ExtractNV(movie_df.loc[0, 'document']))

['더빙/NNG', '짜증/NNG', '나/VV', '목소리/NNG']

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Count-based vectorizer that tokenizes with ExtractNV rather than the default
# tokenizer, and caps the vocabulary at 1000 terms.
cv = CountVectorizer(
    max_features=1000,
    tokenizer=ExtractNV
)

# .loc slicing is label-inclusive, so rows 0..9999 (10,000 reviews) are vectorized.
tdm = cv.fit_transform(
    movie_df.loc[:9999, 'document']
)

print(tdm.shape)

(10000, 1000)


In [18]:
# Re-weight the raw term counts into TF-IDF scores; the matrix shape is unchanged.
tfidf_trans = TfidfTransformer()
tdm_tfidf = tfidf_trans.fit_transform(tdm)

print(tdm_tfidf.shape)

(10000, 1000)


In [19]:
# Vocabulary terms in column order of the term-document matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2, so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    term_list = cv.get_feature_names_out().tolist()
else:
    term_list = cv.get_feature_names()

In [20]:
# Persist tdm, tdm_tfidf and term_list to disk so they can be reloaded without recomputing.

import joblib

# Target directory on the mounted Google Drive (absolute, user-specific path).
FolderPath = '/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data'

# All three objects are stored together in a single dict inside one pickle file.
joblib.dump({
    'term_list': term_list,
    'tdm': tdm,
    'tdm_tfidf': tdm_tfidf
}, FolderPath + '/' + 'movie_df.pkl')

['/content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data/movie_df.pkl']

In [21]:
# Reload the cached artifacts. joblib.load unpickles the file, which can execute
# arbitrary code, so only load files you wrote yourself.
data = joblib.load(FolderPath + '/' + 'movie_df.pkl')

# Bind each entry explicitly instead of locals().update(data): mutating locals()
# only takes effect at module/notebook top level and hides where the names come from.
term_list = data['term_list']
tdm = data['tdm']
tdm_tfidf = data['tdm_tfidf']

In [22]:
# Features: the TF-IDF matrix. Targets: the labels of the same leading rows of movie_df.
data_input = tdm_tfidf
# Slice by the row count of data_input rather than a hard-coded 10000 so features and
# labels stay aligned if the sample size changes. `.values` turns the Series into a NumPy array.
data_target = movie_df['label'].values[:data_input.shape[0]]

print(data_input.shape)
print(data_target.shape)

(10000, 1000)
(10000,)


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
train_input, test_input, train_target, test_target = \
    train_test_split(
        data_input,
        data_target,
        test_size=0.2,
        random_state=1234
    )

print(train_input.shape, train_target.shape)
print(test_input.shape, test_target.shape)

(8000, 1000) (8000,)
(2000, 1000) (2000,)


# 8. Logistic Regression Practice

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Logistic regression expressed as a network: one Dense unit with a sigmoid activation,
# fed by one input per vocabulary term (the last dimension of data_input).
model = Sequential()
model.add(Dense(
    1,
    activation='sigmoid',
    input_shape=(data_input.shape[-1], )
))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1)                 1001      
Total params: 1,001
Trainable params: 1,001
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Binary cross-entropy is the cost function for this two-class problem; accuracy is tracked as 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc']
)

# Densify with .toarray(): `.A` is a legacy shorthand that scipy's newer sparse array
# types no longer provide, while .toarray() is the explicit, portable equivalent.
# 20% of the training rows are set aside by Keras for validation.
model.fit(
    train_input.toarray(),
    train_target,
    epochs=10,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6711769c10>

In [26]:
# Returns [loss, acc] on the held-out test set. .toarray() replaces the legacy `.A`
# shorthand, which scipy's newer sparse array types no longer provide.
model.evaluate(
    test_input.toarray(),
    test_target
)



[0.5785617232322693, 0.7559999823570251]

In [27]:
# Save the trained model under the Drive data folder; the log below shows it is
# written as a directory (with an assets/ subfolder) rather than a single file.
model.save(FolderPath + '/' + 'movie_df_model.krs')

INFO:tensorflow:Assets written to: /content/drive/MyDrive/03. Kookmin AI Big Data MBA/Semester 3_032021-062021/1. Text Data Analytics/Lecture Notes Review/data/movie_df_model.krs/assets


# 9. Weights Analysis

In [28]:
from tensorflow.keras.models import load_model

# Reload the saved model from Drive; this rebinds `model` to the loaded copy.
model = load_model(FolderPath + '/' + 'movie_df_model.krs')

In [29]:
# The single Dense layer contributes two variables: w (one weight per term) and b (the bias).
w, b = model.weights

print(w.shape)
print(b.shape)

(1000, 1)
(1,)


In [31]:
# Pair each vocabulary term with its learned weight and rank from most positive to
# most negative, renumbering the index after sorting.
TermWeight_df = (
    pd.DataFrame({'term': term_list, 'weight': w.numpy().flat})
    .sort_values('weight', ascending=False)
    .reset_index(drop=True)
)
TermWeight_df.head()

Unnamed: 0,term,weight
0,재밌/VA,1.271532
1,최고/NNG,1.194343
2,감동/NNG,0.913389
3,좋/VA,0.88125
4,재미있/VA,0.84767


In [32]:
# Bottom of the descending-sorted frame: the terms with the most negative weights.
TermWeight_df.tail()

Unnamed: 0,term,weight
995,최악/NNG,-0.893685
996,없/VA,-0.923763
997,재미없/VA,-0.980357
998,쓰레기/NNG,-0.999975
999,아깝/VA,-1.03957


# 10. Transformation to Sparse Tensor

In [33]:
# Shape of the training feature matrix as (rows, terms).
print(train_input.shape)

(8000, 1000)


In [34]:
# Check the container type: it is a scipy sparse matrix, not a dense array.
type(train_input)

scipy.sparse.csr.csr_matrix

In [35]:
# The repr reports how many elements are actually stored in the sparse matrix.
train_input

<8000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 49488 stored elements in Compressed Sparse Row format>

* 8,000,000개(8,000 × 1,000) 원소 중에서, 오직 49,488개만 0이 아닌 값이고 나머지는 모두 0인 Sparse Matrix이다.
* 따라서, train_input.A를 통해 Dense Array로 변환하면 오히려 메모리 낭비이므로, 다르게 접근을 해보자.

CSR → COO

In [36]:
# Convert CSR to COO format, which exposes parallel row / col / data arrays
# for the stored elements (used in the following cells).
train_input_coo = train_input.tocoo()
test_input_coo = test_input.tocoo()

In [37]:
# Row indices, column indices and values of the stored elements (training set).
print(train_input_coo.row)
print(train_input_coo.col)
print(train_input_coo.data)

# Same three arrays for the test set.
print(test_input_coo.row)
print(test_input_coo.col)
print(test_input_coo.data)

[   0    0    0 ... 7999 7999 7999]
[989 891 699 ... 605 388  96]
[0.67006266 0.54542713 0.43419313 ... 0.25519177 0.19360394 0.47571246]
[   1    1    1 ... 1999 1999 1999]
[764 130  70 ... 310  61   2]
[0.42262555 0.67367916 0.60625411 ... 0.55520417 0.58292774 0.45088987]


In [39]:
import numpy as np

# Build the three pieces a SparseTensor is constructed from below:
# a 2-column array of [row, col] index pairs, the matching values, and the dense shape.
train_input_idx = np.column_stack([train_input_coo.row, train_input_coo.col])
train_input_data = train_input_coo.data
train_input_shape = train_input_coo.shape

test_input_idx = np.column_stack([test_input_coo.row, test_input_coo.col])
test_input_data = test_input_coo.data
test_input_shape = test_input_coo.shape

In [40]:
from tensorflow import SparseTensor
from tensorflow import sparse

# Wrap the (indices, values, dense shape) triples as TensorFlow sparse tensors.
train_input_sparse = SparseTensor(
    train_input_idx,
    train_input_data,
    train_input_shape
)
test_input_sparse = SparseTensor(
    test_input_idx,
    test_input_data,
    test_input_shape
)

# Reorder the entries into TensorFlow's canonical index ordering before use.
# NOTE(review): presumably required because the COO ordering is not guaranteed sorted — confirm.
train_input_sparse = sparse.reorder(train_input_sparse)
test_input_sparse = sparse.reorder(test_input_sparse)

In [42]:
# Train directly on the sparse tensor instead of a densified array.
# Note: `model` here is the one reloaded from disk earlier, so this continues training
# that model for 10 more epochs, and no validation split is used.
model.fit(
    train_input_sparse,
    train_target,
    epochs=10
)

Epoch 1/10


  "shape. This may consume a large amount of memory." % value)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f66c6dad8d0>

# 11. Early Stopping

In [43]:
from sklearn.model_selection import train_test_split

# Re-create the train/test split for the early-stopping experiment
# (80/20 with a fixed random_state, so the split is repeatable).
train_input, test_input, train_target, test_target = \
    train_test_split(
        data_input,
        data_target,
        test_size=0.2,
        random_state=1234
    )

In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fresh, untrained logistic-regression model (one sigmoid unit); rebinds `model`.
model = Sequential()
model.add(Dense(1, activation='sigmoid', input_shape=(data_input.shape[-1], )))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 1)                 1001      
Total params: 1,001
Trainable params: 1,001
Non-trainable params: 0
_________________________________________________________________


In [46]:
from tensorflow.keras.callbacks import EarlyStopping

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc']
)

# Stop once validation accuracy stops improving. With the default patience=0, training
# halts at the very first epoch that fails to improve and keeps the last-epoch weights,
# which is too twitchy for a noisy metric. Allow a few epochs of slack, state the
# direction explicitly, and roll back to the best weights seen.
STP = EarlyStopping(
    monitor='val_acc',
    mode='max',
    patience=3,
    restore_best_weights=True
)

# epochs=100 is only an upper bound; the callback decides when to stop.
# .toarray() replaces the legacy `.A` shorthand, which scipy's newer sparse array
# types no longer provide.
model.fit(
    train_input.toarray(),
    train_target,
    epochs=100,
    validation_split=0.2,
    callbacks=[STP]
)

Epoch 1/100


<tensorflow.python.keras.callbacks.History at 0x7f66c5385f50>