# **QnA Classification**

## 모델링 및 평가

### 라이브러리 불러오기

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
import scipy.stats as spst
from scipy.sparse import load_npz

import os

from sklearn.metrics import *
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Embedding, SimpleRNN
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

2023-08-15 17:21:02.949971: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-15 17:21:03.115679: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-15 17:21:03.116919: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 데이터 불러오기

In [3]:
# df1_x : 불용어 처리하지 않은 데이터셋
# df2_x : 불용어 처리 완료한 데이터셋
df1_cnt_x_train = load_npz('df1_cnt_x_train.npz')
df1_bigram_x_train = load_npz('df1_bigram_x_train.npz')
df1_ngram_x_train = load_npz('df1_ngram_x_train.npz')
df1_tfidf_x_train = load_npz('df1_tfidf_x_train.npz')

df1_cnt_x_val = load_npz('df1_cnt_x_val.npz')
df1_bigram_x_val = load_npz('df1_bigram_x_val.npz')
df1_ngram_x_val = load_npz('df1_ngram_x_val.npz')
df1_tfidf_x_val = load_npz('df1_tfidf_x_val.npz')

df2_cnt_x_train = load_npz('df2_cnt_x_train.npz')
df2_bigram_x_train = load_npz('df2_bigram_x_train.npz')
df2_ngram_x_train = load_npz('df2_ngram_x_train.npz')
df2_tfidf_x_train = load_npz('df2_tfidf_x_train.npz')

df2_cnt_x_val = load_npz('df2_cnt_x_val.npz')
df2_bigram_x_val = load_npz('df2_bigram_x_val.npz')
df2_ngram_x_val = load_npz('df2_ngram_x_val.npz')
df2_tfidf_x_val = load_npz('df2_tfidf_x_val.npz')

In [4]:
y_train = pd.read_csv('y_train.csv')
y_val = pd.read_csv('y_val.csv')

In [5]:
# x 리스트 정의
df1_x_train_list = [df1_cnt_x_train, df1_bigram_x_train, df1_ngram_x_train, df1_tfidf_x_train]
df1_x_val_list = [df1_cnt_x_val, df1_bigram_x_val, df1_ngram_x_val, df1_tfidf_x_val]

df2_x_train_list = [df2_cnt_x_train, df2_bigram_x_train, df2_ngram_x_train, df2_tfidf_x_train]
df2_x_val_list = [df2_cnt_x_val, df2_bigram_x_val, df2_ngram_x_val, df2_tfidf_x_val]

### ML

#### RandomForest

In [59]:
# 함수정의
def rf(x_train, x_val, y_train=y_train, y_val=y_val):
    rf = RandomForestClassifier()
    rf.fit(x_train, y_train.values.ravel())
    y_pred = rf.predict(x_val)
    result = accuracy_score(y_val, y_pred)
    return result

In [60]:
df1_acc_rf = []
df2_acc_rf = []

for i in range(len(df1_x_train_list)):
    x_train = df1_x_train_list[i]
    x_val = df1_x_val_list[i]
    df1_acc_rf.append(rf(x_train, x_val))

for i in range(len(df2_x_train_list)):
    x_train = df2_x_train_list[i]
    x_val = df2_x_val_list[i]
    df2_acc_rf.append(rf(x_train, x_val))

print(df1_acc_rf)
print(df2_acc_rf)

[0.7628032345013477, 0.5795148247978437, 0.7115902964959568, 0.7506738544474394]
[0.7425876010781671, 0.5889487870619946, 0.7048517520215634, 0.738544474393531]


RandomForest의 경우, df1와 df2 모두 cnt가 가장 좋은 결과를 나타냄

#### LGBM

In [66]:
# 함수정의
def lgb(x_train, x_val, y_train=y_train, y_val=y_val):
    lgb = LGBMClassifier()
    x_train_float = x_train.astype(np.float64)
    x_val_float = x_val.astype(np.float64)
    lgb.fit(x_train_float, y_train.values.ravel())
    y_pred = lgb.predict(x_val_float)
    result = accuracy_score(y_val, y_pred)
    return result

In [67]:
df1_acc_lgb = []
df2_acc_lgb = []

for i in range(len(df1_x_train_list)):
    x_train = df1_x_train_list[i]
    x_val = df1_x_val_list[i]
    df1_acc_lgb.append(lgb(x_train, x_val))

for i in range(len(df2_x_train_list)):
    x_train = df2_x_train_list[i]
    x_val = df2_x_val_list[i]
    df2_acc_lgb.append(lgb(x_train, x_val))

print(df1_acc_lgb)
print(df2_acc_lgb)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1612
[LightGBM] [Info] Number of data points in the train set: 2964, number of used features: 252
[LightGBM] [Info] Start training from score -0.855428
[LightGBM] [Info] Start training from score -1.612479
[LightGBM] [Info] Start training from score -1.638187
[LightGBM] [Info] Start training from score -1.863068
[LightGBM] [Info] Start training from score -3.650490
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 720
[LightGBM] [Info] Number of data points in the train set: 2964, number of used features: 32
[LightGBM] [Info] Start training from score -0.855428
[LightGBM] [Info] Start training from score -1.612479
[LightGBM] [Info] Start training from score -1.638187
[LightGBM] [Info] Start training from score -1.863068
[LightGBM] [Info] 

LightGBM의 경우, df1와 df2 모두 cnt가 가장 좋은 결과를 나타냄

#### XGBoost

In [70]:
# 함수정의
def xgb(x_train, x_val, y_train=y_train, y_val=y_val):
    xgb = XGBClassifier()
    xgb.fit(x_train, y_train.values.ravel())
    y_pred = xgb.predict(x_val)
    result = accuracy_score(y_val, y_pred)
    return result

In [69]:
df1_acc_xgb = []
df2_acc_xgb = []

for i in range(len(df1_x_train_list)):
    x_train = df1_x_train_list[i]
    x_val = df1_x_val_list[i]
    df1_acc_xgb.append(xgb(x_train, x_val))

for i in range(len(df2_x_train_list)):
    x_train = df2_x_train_list[i]
    x_val = df2_x_val_list[i]
    df2_acc_xgb.append(xgb(x_train, x_val))

print(df1_acc_xgb)
print(df2_acc_xgb)

[0.7668463611859838, 0.6226415094339622, 0.7695417789757413, 0.7601078167115903]
[0.7722371967654986, 0.623989218328841, 0.7601078167115903, 0.7506738544474394]


XGBoost의 경우, df1은 ngram, df2는 cnt가 가장 좋은 결과를 나타냄

#### CatBoost

In [26]:
# 함수정의
def cat(x_train, x_val, y_train=y_train, y_val=y_val):
    cat = CatBoostClassifier()
    cat.fit(x_train, y_train.values.ravel())
    y_pred = cat.predict(x_val)
    result = accuracy_score(y_val, y_pred)
    return result

In [72]:
df1_acc_cat = []
df2_acc_cat = []

for i in range(len(df1_x_train_list)):
    x_train = df1_x_train_list[i]
    x_val = df1_x_val_list[i]
    df1_acc_cat.append(cat(x_train, x_val))

for i in range(len(df2_x_train_list)):
    x_train = df2_x_train_list[i]
    x_val = df2_x_val_list[i]
    df2_acc_cat.append(cat(x_train, x_val))

print(df1_acc_cat)
print(df2_acc_cat)

Learning rate set to 0.083635
0:	learn: 1.5388130	total: 61.7ms	remaining: 1m 1s
1:	learn: 1.4783731	total: 72.6ms	remaining: 36.2s
2:	learn: 1.4278325	total: 84.5ms	remaining: 28.1s
3:	learn: 1.3819693	total: 95.5ms	remaining: 23.8s
4:	learn: 1.3429185	total: 106ms	remaining: 21s
5:	learn: 1.3187773	total: 116ms	remaining: 19.2s
6:	learn: 1.2915791	total: 125ms	remaining: 17.7s
7:	learn: 1.2621237	total: 134ms	remaining: 16.6s
8:	learn: 1.2372742	total: 143ms	remaining: 15.8s
9:	learn: 1.2177443	total: 153ms	remaining: 15.2s
10:	learn: 1.2034392	total: 163ms	remaining: 14.7s
11:	learn: 1.1849500	total: 172ms	remaining: 14.2s
12:	learn: 1.1642447	total: 183ms	remaining: 13.9s
13:	learn: 1.1458589	total: 192ms	remaining: 13.5s
14:	learn: 1.1310975	total: 203ms	remaining: 13.3s
15:	learn: 1.1183922	total: 214ms	remaining: 13.2s
16:	learn: 1.1065188	total: 223ms	remaining: 12.9s
17:	learn: 1.0977197	total: 233ms	remaining: 12.7s
18:	learn: 1.0843788	total: 244ms	remaining: 12.6s
19:	learn

Catboost의 경우, df1과 df2 모두 cnt가 가장 좋은 결과를 나타냄

#### SVM

In [73]:
# 함수정의
def svm(x_train, x_val, y_train=y_train, y_val=y_val):
    svm = SVC()
    svm.fit(x_train, y_train.values.ravel())
    y_pred = svm.predict(x_val)
    result = accuracy_score(y_val, y_pred)
    return result

In [75]:
df1_acc_svm = []
df2_acc_svm = []

for i in range(len(df1_x_train_list)):
    x_train = df1_x_train_list[i]
    x_val = df1_x_val_list[i]
    df1_acc_svm.append(svm(x_train, x_val))

for i in range(len(df2_x_train_list)):
    x_train = df2_x_train_list[i]
    x_val = df2_x_val_list[i]
    df2_acc_svm.append(svm(x_train, x_val))

print(df1_acc_svm)
print(df2_acc_svm)

[0.4973045822102426, 0.49595687331536387, 0.49595687331536387, 0.49595687331536387]
[0.49595687331536387, 0.49595687331536387, 0.49595687331536387, 0.49595687331536387]


SVM의 경우, 모든 부분에서 기대 이하의 성능을 나타냄

#### Grid search

In [84]:
# 전체 결과(svm제외) 확인
print("-"*40, "rf", "-"*40)
print(df1_acc_rf)
print(df2_acc_rf)
print("-"*40, "lgb", "-"*40)
print(df1_acc_lgb)
print(df2_acc_lgb)
print("-"*40, "xgb", "-"*40)
print(df1_acc_xgb)
print(df2_acc_xgb)
print("-"*40, "cat", "-"*40)
print(df1_acc_cat)
print(df2_acc_cat)

---------------------------------------- rf ----------------------------------------
[0.7628032345013477, 0.5795148247978437, 0.7115902964959568, 0.7506738544474394]
[0.7425876010781671, 0.5889487870619946, 0.7048517520215634, 0.738544474393531]
---------------------------------------- lgb ----------------------------------------
[0.7533692722371967, 0.5444743935309974, 0.7520215633423181, 0.7304582210242587]
[0.7493261455525606, 0.555256064690027, 0.7479784366576819, 0.7331536388140162]
---------------------------------------- xgb ----------------------------------------
[0.7668463611859838, 0.6226415094339622, 0.7695417789757413, 0.7601078167115903]
[0.7722371967654986, 0.623989218328841, 0.7601078167115903, 0.7506738544474394]
---------------------------------------- cat ----------------------------------------
[0.77088948787062, 0.6145552560646901, 0.7520215633423181, 0.7547169811320755]
[0.7762803234501348, 0.6064690026954178, 0.7574123989218329, 0.7681940700808625]


상위권을 차지한 XGBoost와 CatBoost에서 df1_cnt, df2_cnt를 사용하여 그리드 서치 진행

In [16]:
# 파라미터 선택
xgb_param_grid = {
    'n_estimators': [300, 400],
    'max_depth': [2, 3],
    'learning_rate': [0.15, 0.2, 0.25]
}

cat_param_grid = {
    'depth': [7, 8],
    'learning_rate': [0.1, 0.15]
}

In [18]:
# 함수정의
def grid_search(model, x_train, y_train, cv=3):
    if model == "xgb":
        grid_model = XGBClassifier()
        param = xgb_param_grid
    elif model == "cat":
        grid_model = CatBoostClassifier()
        param = cat_param_grid
    else:
        return "Invalid model. Please restart with 'xgb' or 'cat'."

    grid_search = GridSearchCV(
        grid_model, param, cv=cv, n_jobs=-1, scoring="accuracy", verbose=0
    )
    
    grid_search.fit(x_train, y_train)

    return grid_search.best_params_, grid_search.best_score_

In [20]:
df1_xgb_best_params, df1_xgb_best_score = grid_search("xgb", df1_cnt_x_train, y_train)
print("xgb 최적 파라미터: ", df1_xgb_best_params)
print("xgb 최고 성능: ", df1_xgb_best_score)

df1_cat_best_params, df1_cat_best_score = grid_search("cat", df1_cnt_x_train, y_train)
print("cat 최적 파라미터: ", df1_cat_best_params)
print("cat 최고 성능: ", df1_cat_best_score)

xgb 최적 파라미터:  {'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 300}
xgb 최고 성능:  0.7304318488529015
0:	learn: 1.4928919	total: 116ms	remaining: 1m 55s
0:	learn: 1.4931784	total: 191ms	remaining: 3m 10s
0:	learn: 1.5296978	total: 253ms	remaining: 4m 12s
0:	learn: 1.4902176	total: 321ms	remaining: 5m 21s
1:	learn: 1.4607827	total: 320ms	remaining: 2m 39s
0:	learn: 1.5276099	total: 332ms	remaining: 5m 31s
1:	learn: 1.4024502	total: 317ms	remaining: 2m 38s
1:	learn: 1.4025529	total: 395ms	remaining: 3m 17s
0:	learn: 1.5291329	total: 327ms	remaining: 5m 26s
0:	learn: 1.5294744	total: 391ms	remaining: 6m 31s
0:	learn: 1.5295503	total: 428ms	remaining: 7m 7s
2:	learn: 1.3350481	total: 596ms	remaining: 3m 18s
1:	learn: 1.4606539	total: 621ms	remaining: 5m 9s
1:	learn: 1.4569069	total: 591ms	remaining: 4m 54s
2:	learn: 1.4130648	total: 648ms	remaining: 3m 35s
0:	learn: 1.5300663	total: 468ms	remaining: 7m 47s
0:	learn: 1.4937429	total: 399ms	remaining: 6m 38s
2:	learn: 1.4037571	total: 728

In [21]:
df2_xgb_best_params, df2_xgb_best_score = grid_search("xgb", df2_cnt_x_train, y_train)
print("xgb 최적 파라미터: ", df2_xgb_best_params)
print("xgb 최고 성능: ", df2_xgb_best_score)

df2_cat_best_params, df2_cat_best_score = grid_search("cat", df2_cnt_x_train, y_train)
print("cat 최적 파라미터: ", df2_cat_best_params)
print("cat 최고 성능: ", df2_cat_best_score)

xgb 최적 파라미터:  {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 400}
xgb 최고 성능:  0.7344804318488528
0:	learn: 1.4772153	total: 198ms	remaining: 3m 18s
0:	learn: 1.5188159	total: 188ms	remaining: 3m 8s
0:	learn: 1.5187731	total: 234ms	remaining: 3m 54s
0:	learn: 1.4724300	total: 242ms	remaining: 4m 1s
0:	learn: 1.5156600	total: 288ms	remaining: 4m 48s
0:	learn: 1.5155069	total: 419ms	remaining: 6m 58s
1:	learn: 1.3955120	total: 401ms	remaining: 3m 19s
0:	learn: 1.4772772	total: 293ms	remaining: 4m 53s
0:	learn: 1.4889042	total: 364ms	remaining: 6m 3s
1:	learn: 1.4564227	total: 471ms	remaining: 3m 55s
0:	learn: 1.5268163	total: 490ms	remaining: 8m 9s
0:	learn: 1.5268263	total: 377ms	remaining: 6m 16s
1:	learn: 1.4408882	total: 621ms	remaining: 5m 9s
1:	learn: 1.4697521	total: 580ms	remaining: 4m 49s
0:	learn: 1.4888901	total: 376ms	remaining: 6m 15s
1:	learn: 1.3772719	total: 531ms	remaining: 4m 24s
1:	learn: 1.3963730	total: 589ms	remaining: 4m 54s
0:	learn: 1.4726498	total: 581ms	

In [25]:
print("-"*10, "xgb", "-"*10)
print(df1_xgb_best_score)
print(df2_xgb_best_score)
print("-"*10, "cat", "-"*10)
print(df1_cat_best_score)
print(df2_cat_best_score)

---------- xgb ----------
0.7304318488529015
0.7344804318488528
---------- cat ----------
0.7321187584345479
0.7307692307692308


파라미터를 수없이 바꿔봤지만 기본 모델보다 더 나은 성능이 나오지 못 함

#### ML 최종

In [27]:
ML_final_score = cat(df2_cnt_x_train, df2_cnt_x_val)

Learning rate set to 0.083635
0:	learn: 1.5365350	total: 12.3ms	remaining: 12.3s
1:	learn: 1.4796089	total: 22.5ms	remaining: 11.2s
2:	learn: 1.4364620	total: 33.6ms	remaining: 11.2s
3:	learn: 1.3902299	total: 43.7ms	remaining: 10.9s
4:	learn: 1.3473765	total: 54.7ms	remaining: 10.9s
5:	learn: 1.3169267	total: 64.3ms	remaining: 10.6s
6:	learn: 1.2836209	total: 75.5ms	remaining: 10.7s
7:	learn: 1.2567025	total: 85.3ms	remaining: 10.6s
8:	learn: 1.2356546	total: 96.4ms	remaining: 10.6s
9:	learn: 1.2148788	total: 108ms	remaining: 10.7s
10:	learn: 1.1947413	total: 120ms	remaining: 10.8s
11:	learn: 1.1748596	total: 131ms	remaining: 10.8s
12:	learn: 1.1606668	total: 140ms	remaining: 10.7s
13:	learn: 1.1455892	total: 151ms	remaining: 10.7s
14:	learn: 1.1298255	total: 161ms	remaining: 10.5s
15:	learn: 1.1152652	total: 171ms	remaining: 10.5s
16:	learn: 1.1049699	total: 184ms	remaining: 10.6s
17:	learn: 1.0961177	total: 197ms	remaining: 10.7s
18:	learn: 1.0840326	total: 211ms	remaining: 10.9s
19

In [29]:
print(ML_final_score)

0.7762803234501348


### DL

#### Model1

In [6]:
dense_df1_cnt_x_train = df1_cnt_x_train.toarray()
dense_df1_cnt_x_val = df1_cnt_x_val.toarray()
dense_df2_cnt_x_train = df1_cnt_x_train.toarray()
dense_df2_cnt_x_val = df1_cnt_x_val.toarray()

In [7]:
# 원핫인코딩
y_train_one_hot = to_categorical(y_train)
y_val_one_hot = to_categorical(y_val)

In [8]:
df1_input_size = dense_df1_cnt_x_train.shape[1]
df2_input_size = dense_df2_cnt_x_train.shape[1]
output_size = y_train_one_hot.shape[1]

In [14]:
# 모델 설계
tf.keras.backend.clear_session()

model1 = Sequential([
    Dense(1024, activation="relu", input_shape=(df1_input_size,)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(512, activation="relu", input_shape=(df1_input_size,)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(256, activation="relu", input_shape=(df1_input_size,)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(128, activation="relu", input_shape=(df1_input_size,)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(64, activation="relu"),
    BatchNormalization(),
    Dropout(0.5),

    Dense(32, activation="relu"),
    BatchNormalization(),
    Dropout(0.5),

    Dense(output_size, activation="softmax")
])

In [15]:
# 모델 컴파일
model1.compile(optimizer='adam',
               loss='categorical_crossentropy', 
               metrics=['accuracy'])

In [21]:
m1_df1_hist = model1.fit(dense_df1_cnt_x_train, 
                     y_train_one_hot, 
                     validation_data=(dense_df1_cnt_x_val, y_val_one_hot), 
                     epochs=500, 
                     batch_size=128,
                     callbacks=[mc, reduce_lr])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [22]:
m1_df1_test_loss, m1_df1_test_accuracy = model1.evaluate(dense_df1_cnt_x_val, y_val_one_hot)
print("테스트 손실:", m1_df1_test_loss)
print("테스트 정확도:", m1_df1_test_accuracy)

테스트 손실: 1.0963698625564575
테스트 정확도: 0.7371967434883118


In [24]:
m1_df2_history = model1.fit(dense_df2_cnt_x_train, 
                     y_train_one_hot, 
                     validation_data=(dense_df2_cnt_x_val, y_val_one_hot), 
                     epochs=500, 
                     batch_size=128,
                     callbacks=[mc, reduce_lr])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [25]:
m1_df2_test_loss, m1_df2_test_accuracy = model1.evaluate(dense_df2_cnt_x_val, y_val_one_hot)
print("테스트 손실:", m1_df2_test_loss)
print("테스트 정확도:", m1_df2_test_accuracy)

테스트 손실: 1.1849002838134766
테스트 정확도: 0.7425876259803772


#### Model2

In [26]:
max_length1 = max([len(sentence) for sentence in dense_df1_cnt_x_train])
max_length2 = max([len(sentence) for sentence in dense_df2_cnt_x_train])

print("max_length1: ", max_length1)
print("max_length2: ", max_length2)

max_length1:  3681
max_length2:  3681


In [27]:
# 패딩 수행
max_sequence_length = 4000 # 최대 문장길이보다 약간 여유있게

pad_df1_x_train = pad_sequences(dense_df1_cnt_x_train, maxlen=max_sequence_length)
pad_df1_x_val = pad_sequences(dense_df1_cnt_x_val, maxlen=max_sequence_length)

pad_df2_x_train = pad_sequences(dense_df2_cnt_x_train, maxlen=max_sequence_length)
pad_df2_x_val = pad_sequences(dense_df2_cnt_x_val, maxlen=max_sequence_length)

In [28]:
# 모델 설계
vocab_size = 10000 # 단어 집합의 크기
embedding_dim = 128 # 단어 임베딩의 차원 수
rnn_units = 64 # RNN 셀에서 사용할 유닛 수

def create_model():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
        SimpleRNN(rnn_units, return_sequences=True),
        SimpleRNN(rnn_units),
        Dense(output_size, activation='softmax')
    ])
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [29]:
# callback 지정
es = EarlyStopping(
    monitor = "val_loss",
    min_delta = 0,
    patience = 5,
    restore_best_weights = True,
    start_from_epoch = 10
)

mc = ModelCheckpoint(
    'best_model.h5', 
    monitor='val_loss', 
    save_best_only=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.2, 
    patience=5, 
    min_lr=1e-5
)

In [30]:
# 학습 및 평가
model2 = create_model()

m2_df1_hist = model2.fit(pad_df1_x_train, 
                     y_train_one_hot, 
                     epochs=100, 
                     batch_size=128, 
                     validation_data=(pad_df1_x_val, y_val_one_hot),
                     callbacks=[es, mc, reduce_lr])

Epoch 1/100
Epoch 2/100


  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


In [31]:
# 학습 및 평가
m2_df2_hist = model2.fit(pad_df2_x_train, 
                     y_train_one_hot, 
                     epochs=100, 
                     batch_size=128, 
                     validation_data=(pad_df2_x_val, y_val_one_hot),
                     callbacks=[es, mc, reduce_lr])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [33]:
# 모델 저장
def cat_save_model(x_train, y_train, model_path):
    cat = CatBoostClassifier()
    cat.fit(x_train, y_train.values.ravel())
    cat.save_model(model_path)

In [34]:
model_path = 'catboost_model.cbm'
cat_save_model(df2_cnt_x_train, y_train, model_path)

Learning rate set to 0.083635
0:	learn: 1.5365350	total: 58ms	remaining: 57.9s
1:	learn: 1.4796089	total: 68.5ms	remaining: 34.2s
2:	learn: 1.4364620	total: 80.7ms	remaining: 26.8s
3:	learn: 1.3902299	total: 90.8ms	remaining: 22.6s
4:	learn: 1.3473765	total: 101ms	remaining: 20.1s
5:	learn: 1.3169267	total: 112ms	remaining: 18.5s
6:	learn: 1.2836209	total: 121ms	remaining: 17.2s
7:	learn: 1.2567025	total: 131ms	remaining: 16.2s
8:	learn: 1.2356546	total: 141ms	remaining: 15.5s
9:	learn: 1.2148788	total: 151ms	remaining: 14.9s
10:	learn: 1.1947413	total: 161ms	remaining: 14.5s
11:	learn: 1.1748596	total: 173ms	remaining: 14.3s
12:	learn: 1.1606668	total: 184ms	remaining: 14s
13:	learn: 1.1455892	total: 196ms	remaining: 13.8s
14:	learn: 1.1298255	total: 208ms	remaining: 13.6s
15:	learn: 1.1152652	total: 220ms	remaining: 13.5s
16:	learn: 1.1049699	total: 230ms	remaining: 13.3s
17:	learn: 1.0961177	total: 240ms	remaining: 13.1s
18:	learn: 1.0840326	total: 253ms	remaining: 13.1s
19:	learn: 

In [36]:
# 모델 불러오기 및 평가
def cat_load_evaluate(x_val, y_val, model_path):
    cat = CatBoostClassifier()
    cat.load_model(model_path)
    y_pred = cat.predict(x_val)
    result = accuracy_score(y_val, y_pred)
    return result

ML_final_score = cat_load_evaluate(df2_cnt_x_val, y_val, model_path)
print(ML_final_score)

0.7762803234501348
