In [44]:
# from google.colab import drive
# drive.mount('/content/drive')

In [45]:
# !pip install koreanize-matplotlib
# !pip install konlpy
# !apt update
# !bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
# !pip install mecab-python3

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.utils import to_categorical
import koreanize_matplotlib
from konlpy.tag import Mecab

In [47]:
# Training-set load, left disabled; the visible cells only use the test set.
# train_data = pd.read_csv("https://raw.githubusercontent.com/haram4th/data4mdai/main/hotelscom_review_train.csv")
# Test reviews are read from a path relative to the notebook's working directory.
test_data = pd.read_csv("../06machine_learning/data/hotelscom_review_test.csv")

In [48]:
# Preview the first rows to check the columns (description text, isgood label).
test_data.head()

Unnamed: 0,description,isgood
0,쾌적한 시설과 특히 금진온천은 저에게는 특별한추억이었습니다,1
1,관광호텔로 알고 예약 후 직접 방문시 주변이 모두 모텔주변에 위치하였으며 관광호텔급...,0
2,신축이라 그런지 아주 청결합니다 조명은 여러가지 색깔이 들어오는데 다 켜면 잘 조화...,1
3,맨몸으로들어간 호텔에 치솔과 면도기가 없어서 황당했습니다 해외에도 다있고 국내 모텔...,1
4,도어락이 제대로 작동되지 않았고텔레비전도 아주 낡아서 중간에 소리도 안 들리고 화면...,0


In [49]:
# Check row count, dtypes and non-null counts.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35977 entries, 0 to 35976
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  35977 non-null  object
 1   isgood       35977 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 562.3+ KB


In [50]:
# NOTE(review): repeats the previous cell's test_data.info() call unchanged; candidate for removal.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35977 entries, 0 to 35976
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  35977 non-null  object
 1   isgood       35977 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 562.3+ KB


# 토큰화

In [51]:
# Review texts to be tokenized (a pandas Series).
docs = test_data['description']

In [52]:
# Create the Mecab morphological analyzer and try it on the first review.
mecab = Mecab()
mecab.morphs(docs[0])

['쾌적',
 '한',
 '시설',
 '과',
 '특히',
 '금진',
 '온천',
 '은',
 '저',
 '에게',
 '는',
 '특별',
 '한',
 '추억',
 '이',
 '었',
 '습니다']

In [53]:
# Confirm docs is a Series, so Series.apply can be used in the next step.
type(docs)

pandas.core.series.Series

In [54]:
%%time
# Tokenize every review with Mecab and store the token lists in tokenized_docs
tokenized_docs = docs.apply(mecab.morphs)

CPU times: user 3.7 s, sys: 72.4 ms, total: 3.78 s
Wall time: 3.78 s


In [55]:
# import joblib
# joblib.dump(tokenized_docs, "./model/hotels_tokenized_docs")

In [56]:
# tokenized_docs = joblib.load("./model/hotels_tokenized_docs")

In [57]:
# tokenized_docs[0]

In [58]:
# Build the word index
# NOTE(review): this Tokenizer is fitted on the test reviews themselves, so its
# word -> integer mapping is specific to this data. The pre-trained model loaded
# further down presumably expects the mapping produced at training time; if so,
# the tokenizer fitted/saved during training must be used here instead — confirm.
token = Tokenizer(lower=False)
token.fit_on_texts(tokenized_docs)
print(len(token.word_index))

25045


In [59]:
# Vectorize the sentences: map each token list to a list of word-index integers
X = token.texts_to_sequences(tokenized_docs)
print(X[0])

[282, 17, 49, 56, 217, 8973, 787, 10, 284, 321, 6, 1117, 17, 884, 1, 16, 7]


In [60]:
# Ground-truth labels (isgood column) used by the evaluation reports below.
y = test_data['isgood']
y

0        1
1        0
2        1
3        1
4        0
        ..
35972    1
35973    1
35974    1
35975    1
35976    1
Name: isgood, Length: 35977, dtype: int64

In [61]:
# Length of the longest encoded review in this data set
max_len = max(map(len, X))
print("가장 긴 문장의 길이(패딩에 사용): ", max_len)

가장 긴 문장의 길이(패딩에 사용):  734


In [62]:
# Sequence length used for padding below.
# NOTE(review): 738 is presumably the maximum sequence length from the training
# notebook (i.e. the input length the saved model expects) — confirm, and consider
# deriving it from the loaded model rather than hard-coding it.
train_max_len = 738

In [63]:
# # 패딩
# X_padded = pad_sequences(X, maxlen=max_len, padding='post')
# print(X_padded[0])

In [64]:
# Padding: append zeros at the end ('post') so every sequence has length train_max_len
X_padded = pad_sequences(X, maxlen=train_max_len, padding='post')
print(X_padded[0])

[ 282   17   49   56  217 8973  787   10  284  321    6 1117   17  884
    1   16    7    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [65]:
# Number of words fed to the embedding: vocabulary size + 1
# NOTE(review): word_size is not referenced again in the visible cells.
word_size = len(token.word_index) + 1
print(word_size)

25046


# 학습된 모델을 통해서 테스트 데이터 분석

In [66]:
from tensorflow.keras.models import load_model

In [67]:
# Load the previously saved Keras model from disk (path is relative to the notebook).
attention_model = load_model("./model/hotels_review_Attention.keras")

In [68]:
# Run the model on the padded test sequences; the displayed output is one float per review.
pred = attention_model.predict(X_padded)
pred

[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 50ms/step


array([[0.27431607],
       [0.9297263 ],
       [0.96125114],
       ...,
       [0.9567775 ],
       [0.8109563 ],
       [0.45815223]], dtype=float32)

In [69]:
# Convert the model outputs into hard 0/1 labels.
# A score strictly greater than 0.5 becomes 1, anything else 0. The comparison
# runs on the whole column at once instead of calling a Python lambda per row;
# the explicit int64 cast keeps the same integer dtype the per-row version gave.
pred = pd.DataFrame(pred)
pred[0] = (pred[0] > 0.5).astype("int64")
pred

Unnamed: 0,0
0,0
1,1
2,1
3,1
4,1
...,...
35972,0
35973,1
35974,1
35975,1


In [70]:
# Wrap the labels in a one-column DataFrame so they can be joined with pred.
# Note: this rebinds y from a Series to a DataFrame for every later cell.
y = pd.DataFrame(y)
y

Unnamed: 0,isgood
0,1
1,0
2,1
3,1
4,0
...,...
35972,1
35973,1
35974,1
35975,1


In [71]:
# Side-by-side table of predicted label (column 0) and true label (isgood), joined on the index.
result = pred.join(y)
result

Unnamed: 0,0,isgood
0,0,1
1,1,0
2,1,1
3,1,1
4,1,0
...,...,...
35972,0,1
35973,1,1
35974,1,1
35975,1,1


In [72]:
from sklearn.metrics import classification_report

In [73]:
# Per-class precision/recall/F1 for the Keras model; arguments are (true labels, predicted labels).
print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.32      0.22      0.26      7873
           1       0.80      0.87      0.84     28104

    accuracy                           0.73     35977
   macro avg       0.56      0.54      0.55     35977
weighted avg       0.70      0.73      0.71     35977



In [74]:
import joblib

In [75]:
# Raw review texts for the TF-IDF pipeline.
# Note: X previously held the integer sequences for the Keras model and is rebound here.
X = test_data['description']

토크나이저 로딩

In [76]:
# Replace the Mecab() instance created earlier with an object restored from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code — only load trusted files.
mecab = joblib.load("./model/mecab_hotels_model")

In [77]:
def tokenizer(x):
    """Split ``x`` into morphemes with the module-level ``mecab`` object.

    Returns whatever ``mecab.morphs(x)`` returns, unchanged.
    """
    return mecab.morphs(x)

In [78]:
# X = X.apply(tokenizer2)

훈련시 사용한 TfidfVectorizer로 전처리

In [79]:
# Restore the saved vectorizer and transform the raw review strings with it.
# NOTE(review): presumably the pickled vectorizer refers to the tokenizer() function
# defined above, so that definition must exist before this load — confirm.
tfidf_cv = joblib.load("./model/tfidf_cv_hotels_model")
X_tfidf = tfidf_cv.transform(X)

나이브베이즈 모델 로딩 및 테스트 데이터 분석

In [80]:
# Restore the saved classifier (the file name indicates a Naive Bayes model).
mnb = joblib.load("./model/Naive Bayes_hotels_model")

In [81]:
# Predict labels from the TF-IDF features and report per-class metrics against y.
mnb_pred = mnb.predict(X_tfidf)
print(classification_report(y, mnb_pred))

              precision    recall  f1-score   support

           0       0.91      0.26      0.41      7873
           1       0.83      0.99      0.90     28104

    accuracy                           0.83     35977
   macro avg       0.87      0.63      0.66     35977
weighted avg       0.85      0.83      0.79     35977

