# Code for model training & test

# lib & load_data

In [121]:
#### installation ###

!pip install git+https://github.com/ssut/py-hanspell.git
!pip install konlpy

Collecting git+https://github.com/ssut/py-hanspell.git
  Cloning https://github.com/ssut/py-hanspell.git to /tmp/pip-req-build-6ti368ej
  Running command git clone -q https://github.com/ssut/py-hanspell.git /tmp/pip-req-build-6ti368ej
Building wheels for collected packages: py-hanspell
  Building wheel for py-hanspell (setup.py) ... [?25l[?25hdone
  Created wheel for py-hanspell: filename=py_hanspell-1.1-cp36-none-any.whl size=4854 sha256=d983e78fbe171602d3ee9588a5e89973aa6fa3d040421038bae74d341d2aa134
  Stored in directory: /tmp/pip-ephem-wheel-cache-6xksqvwx/wheels/0a/25/d1/e5e96476dbb1c318cc26c992dd493394fe42b0c204b3e65588
Successfully built py-hanspell


In [123]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.datasets import reuters
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt, Kkma
okt = Okt()
kkma = Kkma()
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
from hanspell import spell_checker

# path setting

In [151]:
train_data_path = "/content/drive/MyDrive/Programming/Project/Hel_ri_celus (AI voice_bot for delivery_riders)/data/train_data.csv"
test_data_path = "/content/drive/MyDrive/Programming/Project/Hel_ri_celus (AI voice_bot for delivery_riders)/data/test_data.csv"

token_ls_save_path = "/content/drive/MyDrive/Programming/Project/Hel_ri_celus (AI voice_bot for delivery_riders)/token_ls.csv"

model_save_path = '/content/drive/MyDrive/Programming/Project/Hel_ri_celus (AI voice_bot for delivery_riders)/model/best_model.h5'


# Preprocessing

In [152]:
df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(test_data_path)

## 정규 표현식으로 치환
분, 주문번호 변환

In [125]:
# 정규표현식 함수 정의

def re_sub(df):
    convert_ls =[]
    for idx in (df['text']):
        idx = re.sub("\d\d\d\d"," @",idx)
        idx = re.sub("\d\d분"," #분",idx)
        
        ## 띄어쓰기, 맞춤법
        spelled_sent = spell_checker.check(idx)
        hanspell_sent = spelled_sent.checked
        
        convert_ls.append(hanspell_sent)

    convert_txt = pd.Series(convert_ls, name = 'convert_ls')
    df = pd.concat([df,convert_txt],axis = 1)

    return df

In [126]:
df_train = re_sub(df_train)
df_test = re_sub(df_test)

### train_data label 확인

In [127]:
# label
pd.Series.unique(df_train['intent'])

array(['운행시작', '가게전화', '가게도착', '픽업완료', '영수증번호', '소요시간선택', '배달완료'],
      dtype=object)

## 형태소로 분리 , df에 열로 추가

### tokenizer 테스트

In [128]:
# okt_ls = []
# for i in range(len(text_data)):
#     okt_text = okt.pos(text_data['convert_ls'][i])
#     okt_ls = okt_ls + okt_text

# pd.unique(okt_ls)

In [129]:
# kkma_ls = []
# for i in range(len(text_data)):
#     kkma_text = kkma.pos(text_data['convert_ls'][i])
#     kkma_ls = kkma_ls + kkma_text

# pd.unique(kkma_ls)

### kkma가 더 적합해보임

토큰화 / 불용어 제거 / token_len_max

In [130]:
# kkma 토큰화 함수정의

def kkma_tokenizer(input_df):
    
    valid_pos = ['NNG','VV','SW','MAG']
    
    input_df['token_text'] = np.nan

    token_ls =[]
    for i in range(len(input_df)):
        
        # tokenize
        token_text = kkma.pos(input_df['convert_ls'][i])
        
        # 불용어 제거
        ls = []
        for token in token_text:
            
            if token[1] in valid_pos:
                ls.append(token[0])
                token_ls.append(token[0])
            
            input_df['token_text'][i] = ls

In [131]:
kkma_tokenizer(df_train)
kkma_tokenizer(df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [132]:
df_train

Unnamed: 0,text,intent,label,convert_ls,token_text
0,운행시작해,운행시작,0,운행 시작해,"[운행, 시작하]"
1,운행시작하자,운행시작,0,운행 시작하자,"[운행, 시작]"
2,운행하자,운행시작,0,운행하자,[운행]
3,운행해,운행시작,0,운행해,[운행]
4,운행시작해주세요,운행시작,0,운행 시작해주세요,"[운행, 시작하]"
...,...,...,...,...,...
120,배달끝났어,배달완료,6,배달 끝났어,"[배달, 끝나]"
121,음식배달완료,배달완료,6,음식 배달 완료,"[음식, 배달, 완료]"
122,음식배달완료했어,배달완료,6,음식 배달 완료했어,"[음식, 배달, 완료]"
123,음식배달끝냈어,배달완료,6,음식 배달 끝냈어,"[음식, 배달, 끝내]"


## Inter encode

In [133]:
# 토크나이저 최적화
token_ls = []

for i in range(len(df_train)):
    
    token_ls = token_ls + df_train['token_text'][i]

token_ls = pd.Series(token_ls, name = 'token')

# tokenizer fit
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_ls)

# 글자 묶음 크기
unq_token = pd.unique(token_ls)
vocab_size = len(unq_token)


token_ls.to_csv(token_ls_save_path)

In [149]:
len(tokenizer.word_index)
vocab_size

30

In [134]:
## df의 정수인코딩 column 생성 및 반영 함수화
# tokenizer.fit_on_texts 에서 특수문자 @,# 을 자체적으로 없앰, 이 부분 보완 필요

def int_encode(df, token_ls):

    tokenizer.fit_on_texts(token_ls)

    df['integer_encode'] = np.nan
    
    for i in range(len(df)):
        
        # integer encode
        seq = tokenizer.texts_to_sequences(df['token_text'])

        df['integer_encode'] = seq

In [135]:
int_encode(df_train,token_ls = token_ls)
int_encode(df_test,token_ls = token_ls)

## Padding

In [136]:
df_train = df_train[['integer_encode','label']]
df_test = df_test[['integer_encode','label']]

In [137]:
max_len = 8

train_padded = pad_sequences(df_train['integer_encode'], maxlen=max_len)
test_padded = pad_sequences(df_test['integer_encode'], maxlen=max_len)

# train_test_split

In [138]:
x_data_train,x_data_valid, y_data_train, y_data_valid = \
train_test_split(train_padded,
                 df_train['label'],
                 test_size=0.3,
                 random_state=0,
                 shuffle = True)

# Modeling

In [139]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SimpleRNN
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

## label one_hot_encoding

In [140]:
from tensorflow.keras.utils import to_categorical

y_data_train = to_categorical(y_data_train) # 훈련용 레이블의 원-핫 인코딩
y_data_valid = to_categorical(y_data_valid) # valid용 레이블의 원-핫 인코딩

##call_back

In [141]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint(model_save_path, monitor='val_acc', mode='max', verbose=1, save_best_only=True)

## modeling

In [142]:
intent = pd.unique(df_train['label'])
print(intent)
intent_count = intent.shape[0]
intent_count

[0 1 2 3 4 5 6]


7

In [143]:
model = Sequential()
model.add(Embedding(vocab_size, 120))
model.add(LSTM(120))
model.add(Dense(intent_count, activation='softmax'))

In [144]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [145]:
history = model.fit(x_data_train,
                    y_data_train,
                    batch_size = 10,
                    epochs=20,
                    callbacks=[es, mc],
                    
                    validation_data=(x_data_valid,
                                     y_data_valid))

Epoch 1/20
1/9 [==>...........................] - ETA: 0s - loss: 1.9395 - acc: 0.3000
Epoch 00001: val_acc improved from -inf to 0.47368, saving model to /content/drive/MyDrive/Programming/Project/Hel_ri_celus (AI voice_bot for delivery_riders)/model/best_model.h5
Epoch 2/20
1/9 [==>...........................] - ETA: 0s - loss: 1.8105 - acc: 0.8000
Epoch 00002: val_acc did not improve from 0.47368
Epoch 3/20
1/9 [==>...........................] - ETA: 0s - loss: 1.6920 - acc: 0.1000
Epoch 00003: val_acc did not improve from 0.47368
Epoch 4/20
1/9 [==>...........................] - ETA: 0s - loss: 1.4127 - acc: 0.7000
Epoch 00004: val_acc did not improve from 0.47368
Epoch 5/20
1/9 [==>...........................] - ETA: 0s - loss: 1.5787 - acc: 0.4000
Epoch 00005: val_acc did not improve from 0.47368
Epoch 6/20
1/9 [==>...........................] - ETA: 0s - loss: 1.4320 - acc: 0.3000
Epoch 00006: val_acc improved from 0.47368 to 0.52632, saving model to /content/drive/MyDrive/Progr

# predict

In [146]:
def pred(input_data):
    pred = model.predict(input_data)
    pred_result = [np.argmax(i) for i in pred]
    return pred_result

In [147]:
import sklearn
print(sklearn.metrics.classification_report(df_test['label'],pred(test_padded)))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         2
           6       1.00      0.50      0.67         2

    accuracy                           0.93        14
   macro avg       0.95      0.93      0.92        14
weighted avg       0.95      0.93      0.92        14

