# lib & load_data

In [None]:
!pip install git+https://github.com/ssut/py-hanspell.git

In [None]:
!pip install konlpy

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.datasets import reuters
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt, Kkma
okt = Okt()
kkma = Kkma()
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
from hanspell import spell_checker

In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/Helricelus/data/train_data.csv")
df_test = pd.read_csv("/content/drive/MyDrive/Helricelus/data/test_data.csv")


# 전처리

## 정규 표현식으로 치환
분, 주문번호 변환

In [None]:
# 정규표현식 함수 정의

def re_sub(df):
    convert_ls =[]
    for idx in (df['text']):
        idx = re.sub("\d\d\d\d"," @",idx)
        idx = re.sub("\d\d분"," #분",idx)
        
        ## 띄어쓰기, 맞춤법
        spelled_sent = spell_checker.check(idx)
        hanspell_sent = spelled_sent.checked
        
        convert_ls.append(hanspell_sent)

    convert_txt = pd.Series(convert_ls, name = 'convert_ls')
    df = pd.concat([df,convert_txt],axis = 1)

    return df

In [None]:
df_train = re_sub(df_train)
df_test = re_sub(df_test)

### train_data label 확인

In [None]:
# label
pd.Series.unique(df_train['intent'])

## 형태소로 분리 , df에 열로 추가

### tokenizer 테스트

In [None]:
# okt_ls = []
# for i in range(len(text_data)):
#     okt_text = okt.pos(text_data['convert_ls'][i])
#     okt_ls = okt_ls + okt_text

# pd.unique(okt_ls)

In [None]:
# kkma_ls = []
# for i in range(len(text_data)):
#     kkma_text = kkma.pos(text_data['convert_ls'][i])
#     kkma_ls = kkma_ls + kkma_text

# pd.unique(kkma_ls)

### kkma가 더 적합해보임

토큰화 / 불용어 제거 / token_len_max

In [None]:
# okt 토큰화 함수정의
valid_pos = ['NNG','VV','SW', 'NNG','MAG']
def okt_tokenizer(input_df,valid_pos):
    input_df['token_text'] = np.nan

    token_ls =[]
    for i in range(len(input_df)):
        
        # tokenize
        token_text = kkma.pos(input_df['convert_ls'][i])
        
        # 불용어 제거
        ls = []
        for token in token_text:
            
            if token[1] in valid_pos:
                ls.append(token[0])
                token_ls.append(token[0])
            
            input_df['token_text'][i] = ls

In [None]:
okt_tokenizer(df_train,valid_pos)
okt_tokenizer(df_test,valid_pos)

## 정수 인코딩

In [None]:
# 토크나이저 최적화
token_ls = []

for i in range(len(df_train)):
    
    token_ls = token_ls + df_train['token_text'][i]

token_ls = pd.Series(token_ls)

# tokenizer fit
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_ls)

vocab_size = len(pd.unique(token_ls))

token_ls.to_csv("/content/drive/MyDrive/Helricelus/token_ls.csv")

In [None]:
## df의 정수인코딩 column 생성 및 반영 함수화


def int_encode(df, token_ls):

    tokenizer.fit_on_texts(token_ls)

    df['integer_encode'] = np.nan
    
    for i in range(len(df)):
        
        # integer encode
        seq = tokenizer.texts_to_sequences(df['token_text'])

        df['integer_encode'] = seq

In [None]:
int_encode(df_train,token_ls = token_ls)
int_encode(df_test,token_ls = token_ls)

## 패딩

In [75]:
df_train = df_train[['integer_encode','label']]
df_test = df_test[['integer_encode','label']]


In [None]:
max_len = 8

train_padded = pad_sequences(df_train['integer_encode'], maxlen=max_len)
test_padded = pad_sequences(df_test['integer_encode'], maxlen=max_len)

# train_test_split

In [78]:
x_data_train,x_data_valid, y_data_train, y_data_valid = \
train_test_split(train_padded,
                 df_train['label'],
                 test_size=0.3,
                 random_state=0,
                 shuffle = True)

In [None]:
# x_data_train,x_data_valid, y_data_train, y_data_valid = \
# train_test_split(df_train.drop('label', axis=1, inplace=False),
#                  train_data['label'],
#                  test_size=0.3,
#                  random_state=0,
#                  shuffle = True)

# Modeling

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SimpleRNN
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

## label one_hot_encoding

In [None]:
from tensorflow.keras.utils import to_categorical

y_data_train = to_categorical(y_data_train) # 훈련용 레이블의 원-핫 인코딩
y_data_valid = to_categorical(y_data_valid) # valid용 레이블의 원-핫 인코딩

##call_back

In [None]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

## modeling

In [81]:
intent = pd.unique(train_data['label'])
print(intent)
intent_count = intent.shape[0]
intent_count

[0 1 2 3 4 5 6]


7

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 120))
model.add(LSTM(120))
model.add(Dense(intent_count, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
history = model.fit(x_data_train,
                    y_data_train,
                    batch_size = 10,
                    epochs=10,
                    callbacks=[es, mc],
                    validation_data=(x_data_valid,
                                     y_data_valid))

# predict

In [None]:
pred = model.predict(test_padded)
y_pred = [np.argmax(i) for i in pred]

0     0
1     0
2     1
3     1
4     2
5     2
6     3
7     3
8     4
9     4
10    5
11    5
12    0
13    6
dtype: int64

In [95]:
pd.concat([test_data,pd.Series(y_pred,name= 'prediction')],axis=1)

Unnamed: 0,padded,label,prediction
0,"[0, 0, 0, 0, 0, 0, 13, 15]",0,0
1,"[0, 0, 0, 0, 0, 0, 3, 15]",0,0
2,"[0, 0, 0, 0, 0, 0, 5, 7]",1,1
3,"[0, 0, 0, 0, 0, 0, 4, 7]",1,1
4,"[0, 0, 0, 0, 0, 0, 4, 17]",2,2
5,"[0, 0, 0, 0, 0, 0, 5, 1]",2,2
6,"[0, 0, 0, 0, 0, 0, 14, 9]",3,3
7,"[0, 0, 0, 0, 0, 0, 0, 14]",3,3
8,"[0, 0, 0, 0, 0, 0, 11, 2]",4,4
9,"[0, 0, 0, 0, 0, 0, 0, 10]",4,4


In [None]:
import sklearn
print(sklearn.metrics.classification_report(test_data['label'],y_pred))