In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten, Embedding
from tensorflow.keras.utils import to_categorical
import koreanize_matplotlib
from konlpy.tag import Okt

2024-09-13 16:25:44.651578: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-13 16:25:46.535582: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdirectml.d6f03b303ac3c4f2eeb8ca631688c9757b361310.so
2024-09-13 16:25:46.535779: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdxcore.so
2024-09-13 16:25:46.540688: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libd3d12.so
2024-09-13 16:25:46.832110: I tensorflow/c/logging.cc:34] DirectML device enumeration: found 1 compatible adapters.


In [2]:
# Locations of the hotel-review CSV files.
# NOTE(review): the training set is fetched from GitHub while the test set is
# read from a relative local path -- confirm that file exists before running.
TRAIN_CSV_URL = "https://raw.githubusercontent.com/haram4th/data4mdai/main/hotelscom_review_train.csv"
TEST_CSV_PATH = "../07_DeepLearning/hotelscom_review_test.csv"

train_data = pd.read_csv(TRAIN_CSV_URL)
test_data = pd.read_csv(TEST_CSV_PATH)

In [3]:
# Preview the first rows of the training reviews and their labels.
train_data.head()

Unnamed: 0,description,isgood
0,사람이 너무 많고 작은 수영장과 조식 수용한계로 모두 포기하고 옆의 아이파크몰에서 ...,0
1,방도크고 깨끗하여 아주 좋았어요,1
2,매년여름휴가철마다 찾는곳이예요 너무 좋아요 점점 더 좋아지는듯 직원분들도 너무 친절...,1
3,여수에서는 제일 유명한 호텔이래요 호텔 stay 가 필요하다면 소노캄 여수도 괜찮은...,1
4,가격대비 훌륭하지만 아무래도 오래된 느낌이 많이 드네요겉이불은 세탁하니까 깨끗히나 ...,1


In [4]:
# Inspect column dtypes and non-null counts of the training set.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53964 entries, 0 to 53963
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  53964 non-null  object
 1   isgood       53964 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 843.3+ KB


In [5]:
# Inspect column dtypes and non-null counts of the test set.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35977 entries, 0 to 35976
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  35977 non-null  object
 1   isgood       35977 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 562.3+ KB


In [6]:
# Plan: tokenize -> one-hot encode

# Tokenization

In [7]:
# Review texts of the training set, used as the tokenization input.
docs = train_data.loc[:, "description"]


In [8]:
# The TensorFlow tokenizer splits only on whitespace, so Okt is used for tokenization instead.

In [9]:
# Create the Okt analyzer and try it on the first review to see
# how a sentence is split into morphemes.
okt = Okt()
first_review = docs.iloc[0]
okt.morphs(first_review)

['사람',
 '이',
 '너무',
 '많고',
 '작은',
 '수영장',
 '과',
 '조식',
 '수',
 '용한',
 '계',
 '로',
 '모두',
 '포기',
 '하고',
 '옆',
 '의',
 '아이파크몰',
 '에서',
 '그냥',
 '식사',
 '함']

In [10]:
# Unlike Mecab, Okt is used here, so tokenization is somewhat slow.
# Tokenize once and store the result for reuse.

In [11]:
# Check which container type docs is before mapping the tokenizer over it.
type(docs)

pandas.core.series.Series

In [12]:
# Tokenize every review with Okt and keep the result in tokenized_docs.
# docs is a pandas Series, so the analyzer can be mapped over it element-wise.
tokenized_docs = docs.map(okt.morphs)

In [13]:
# Build the word index from the tokenized reviews.
# lower=False disables the tokenizer's lowercasing.
token = Tokenizer(lower=False)
token.fit_on_texts(tokenized_docs)

# Number of distinct tokens found in the training reviews.
vocab_count = len(token.word_index)
print(vocab_count)

63571


In [14]:
# Vectorize the reviews: replace each token with its integer index.
X = token.texts_to_sequences(tokenized_docs)

# Show the encoded form of the first review as a sanity check.
first_encoded = X[0]
print(first_encoded)

[147, 1, 10, 362, 381, 124, 24, 22, 39, 8795, 2621, 20, 126, 2252, 36, 179, 13, 3939, 12, 145, 258, 190]


In [20]:
# Target labels taken from the 'isgood' column of the training set.
y = train_data.loc[:, "isgood"]
y

0        0
1        1
2        1
3        1
4        1
        ..
53959    1
53960    1
53961    1
53962    1
53963    1
Name: isgood, Length: 53964, dtype: int64

In [15]:
# Length (in tokens) of the longest encoded review; used as the padding length.
max_len = max(map(len, X))
print("가장 긴 문장의 길이(패딩에 사용): ", max_len)

가장 긴 문장의 길이(패딩에 사용):  554


In [16]:
# Caveat: when a few reviews are unusually long, padding to their length fills the other sequences with many zeros.
# For now we proceed as is.


In [17]:
# Pad every encoded review with trailing zeros up to max_len so that all
# sequences share one length.
# NOTE(review): the original note here said the embedding needs no "+1";
# the vocabulary size given to the Embedding layer later is computed as
# len(token.word_index) + 1, so confirm what that note was meant to cover.
X_padded = pad_sequences(sequences=X, padding="post", maxlen=max_len)

# Show the padded form of the first review.
first_padded = X_padded[0]
print(first_padded)

[ 147    1   10  362  381  124   24   22   39 8795 2621   20  126 2252
   36  179   13 3939   12  145  258  190    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [18]:
#홀드아웃
from sklearn.model_selection import train_test_split

In [22]:
# Hold-out split: 70% training / 30% validation, stratified on the label so
# both parts keep the same class ratio; the seed is fixed for repeatability.
split_parts = train_test_split(
    X_padded,
    y,
    test_size=0.3,
    random_state=10,
    stratify=y,
)
X_train, X_valid, y_train, y_valid = split_parts

In [23]:
# Number of word indices fed to the Embedding layer: one slot per entry of
# the tokenizer's word index plus one extra slot.
word_size = 1 + len(token.word_index)
print(word_size)

63572


양방향 RNN 네트워크를 이용해 텍스트 분석

In [24]:
from tensorflow.keras.layers import Dense,Flatten, Embedding, Dropout, SimpleRNN, Bidirectional


In [26]:
# Bidirectional SimpleRNN classifier over the zero-padded token-id sequences.
#
# The reviews are post-padded with zeros up to max_len, so most timesteps of
# a typical sequence are padding. mask_zero=True makes the Embedding layer
# emit a mask for index 0; the recurrent layers then skip padded timesteps,
# and the final SimpleRNN returns its state at the last real token instead of
# after a long run of padding. The vocabulary size already includes one extra
# slot (word_size = len(word_index) + 1), so index 0 is free for padding.
birnn = Sequential()
birnn.add(Embedding(word_size, 64, input_length=max_len, mask_zero=True))
# return_sequences=True keeps the per-timestep outputs for the next RNN layer.
birnn.add(Bidirectional(SimpleRNN(128, return_sequences=True, activation='tanh')))
birnn.add(Dropout(0.5))
# Second recurrent layer collapses the sequence into a single vector.
birnn.add(SimpleRNN(64, activation='tanh'))
birnn.add(Dropout(0.5))
birnn.add(Dense(32, activation='relu'))
# Single sigmoid unit for the binary isgood label.
birnn.add(Dense(1, activation='sigmoid'))
birnn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 554, 64)           4068608   
                                                                 
 bidirectional (Bidirectiona  (None, 554, 256)         49408     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 554, 256)          0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 64)                20544     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                      

In [28]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [29]:
# Where the best model seen during training is written.
modelpath = "./model/hotels_review_birnn.keras"

# Binary classification setup: sigmoid output trained with binary
# cross-entropy, Adam optimizer, accuracy reported as the metric.
birnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Both callbacks watch the validation loss (the Keras default, spelled out
# here): the checkpoint only overwrites the file when it improves, and
# training stops after 10 epochs without improvement.
checkpoint = ModelCheckpoint(
    filepath=modelpath,
    monitor='val_loss',
    save_best_only=True,
)
earlystop = EarlyStopping(monitor='val_loss', patience=10)