In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
import warnings 
warnings.filterwarnings(action='ignore')

In [3]:
import tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout,Flatten,Conv1D,MaxPooling1D,BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
df = pd.read_csv('최종 전처리.csv')
df.head()

Unnamed: 0,digit_1,digit_2,digit_3,Full Text,clean_text
0,S,95.0,952.0,카센터에서 자동차부분정비 타이어오일교환,카 센터 자동차 부분 정비 타이어 오일 교환
1,G,47.0,472.0,상점내에서 일반인을 대상으로 채소 과일판매,상점 내 일반인 대상 채소 과일 판매
2,G,46.0,467.0,절단하여사업체에도매 공업용고무를가지고 합성고무도매,절단 하여사 업체 에도 매 공업 용 고무 가지 고 합성 고무 도매
3,G,47.0,475.0,영업점에서 일반소비자에게 열쇠잠금장치,영업 점 일 반 소비자 열쇠 잠금장치
4,Q,87.0,872.0,어린이집 보호자의 위탁을 받아 취학전아동보육,어린이집 보호자 위탁 받아 취학 전 아동 보육


In [5]:
train = df[:1000000]
train.head()

Unnamed: 0,digit_1,digit_2,digit_3,Full Text,clean_text
0,S,95.0,952.0,카센터에서 자동차부분정비 타이어오일교환,카 센터 자동차 부분 정비 타이어 오일 교환
1,G,47.0,472.0,상점내에서 일반인을 대상으로 채소 과일판매,상점 내 일반인 대상 채소 과일 판매
2,G,46.0,467.0,절단하여사업체에도매 공업용고무를가지고 합성고무도매,절단 하여사 업체 에도 매 공업 용 고무 가지 고 합성 고무 도매
3,G,47.0,475.0,영업점에서 일반소비자에게 열쇠잠금장치,영업 점 일 반 소비자 열쇠 잠금장치
4,Q,87.0,872.0,어린이집 보호자의 위탁을 받아 취학전아동보육,어린이집 보호자 위탁 받아 취학 전 아동 보육


In [6]:
encoder1 = LabelEncoder()
train['digit_1'] = encoder1.fit_transform(train['digit_1'])
train['digit_1'].nunique()

19

In [7]:
encoder2 = LabelEncoder()
train['digit_2'] = encoder2.fit_transform(train['digit_2'])
train['digit_2'].nunique()

74

In [8]:
encoder3 = LabelEncoder()
train['digit_3'] = encoder3.fit_transform(train['digit_3'])
train['digit_3'].nunique()

225

In [9]:
text = np.load('text.npy') # Sentence-Piece Tokenizer 전처리 완료한 데이터 

# 전처리 진행

In [10]:
max_len = 21

text = tf.keras.preprocessing.sequence.pad_sequences(text, maxlen=max_len)
text

array([[   0,    0,    0, ...,  633, 1943, 3832],
       [   0,    0,    0, ...,  296,  147,   12],
       [   0,    0,    0, ..., 1304,  608,   23],
       ...,
       [   0,    0,    0, ..., 1444, 1030,   12],
       [   0,    0,    0, ..., 1434, 1725,   49],
       [   0,    0,    0, ...,   44, 1310,  203]])

In [11]:
train_data = text[:1000000]
test_data = text[1000000:]
target = train['digit_1']

X_train, X_test, y_train, y_test = train_test_split(train_data, target, test_size=0.2, shuffle=True, stratify=target)

### CNN - LSTM (digit1)

In [12]:
vocab_size = 10760
embedding_dim = 128
hidden_units = 128
num_classes = 19 # 대분류 19개 카테고리

In [13]:
model1 = Sequential()
# Convolutional Layer1
model1.add(Embedding(vocab_size, embedding_dim,input_length= max_len))
model1.add(Conv1D(filters=32, kernel_size=2, activation='relu',padding='valid',strides=1))
model1.add(MaxPooling1D(pool_size=2))
model1.add(BatchNormalization())

# LSTM Layer 
model1.add(LSTM(hidden_units, return_sequences=True,activation='tanh'))
model1.add(LSTM(hidden_units, return_sequences=True,activation='tanh'))
model1.add(Dropout(0.5))
model1.add(Flatten())
model1.add(Dropout(0.2))
model1.add(Dense(num_classes, activation='softmax'))

In [14]:
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 21, 128)           1377280   
_________________________________________________________________
conv1d (Conv1D)              (None, 20, 32)            8224      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 10, 32)            0         
_________________________________________________________________
batch_normalization (BatchNo (None, 10, 32)            128       
_________________________________________________________________
lstm (LSTM)                  (None, 10, 128)           82432     
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           131584    
_________________________________________________________________
dropout (Dropout)            (None, 10, 128)           0

In [15]:
es = EarlyStopping(monitor='val_loss', mode = "auto", verbose=1, patience=4)
mc = ModelCheckpoint('best_model1.h5', monitor='val_loss', mode = "auto", verbose=1, save_best_only=True)

In [16]:
model1.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model1.fit(X_train, y_train, batch_size=32, epochs=5, callbacks=[es, mc], validation_split=0.2)

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.11536, saving model to best_model1.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.11536 to 0.10419, saving model to best_model1.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.10419 to 0.10277, saving model to best_model1.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.10277 to 0.09953, saving model to best_model1.h5
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.09953


In [17]:
print("테스트 정확도: %.4f" % (model1.evaluate(X_test, y_test)[1]))

테스트 정확도: 0.9732


In [18]:
pred1 = model1.predict(X_test)
pred1

array([[2.9546719e-08, 4.6803317e-09, 1.9415420e-05, ..., 2.3403267e-08,
        5.2224351e-08, 4.2582610e-07],
       [1.1820782e-10, 2.3775701e-10, 3.0921835e-06, ..., 1.4734679e-10,
        2.3098746e-08, 1.7536028e-09],
       [1.1707170e-09, 9.9615527e-10, 3.7998993e-06, ..., 3.7487107e-09,
        5.8047152e-07, 4.8575082e-08],
       ...,
       [1.7905883e-11, 3.9168720e-11, 4.5640851e-09, ..., 2.2388877e-10,
        2.0952044e-10, 3.0624112e-09],
       [4.6483842e-10, 1.7043931e-09, 4.5281186e-07, ..., 2.1544286e-06,
        3.4170162e-07, 9.9998760e-01],
       [4.8403018e-03, 1.1898049e-03, 6.7741022e-02, ..., 8.3854934e-03,
        3.4097157e-02, 1.6319806e-02]], dtype=float32)

In [19]:
predict1 = []
for i in range(pred1.shape[0]):
    predict1.append(pred1[i].argmax())
predict1 = np.array(predict1)
predict1

array([ 6,  8,  8, ...,  7, 18, 15], dtype=int64)

In [20]:
print ('Here is the classification report:')
print (classification_report(y_test, predict1))

Here is the classification report:
              precision    recall  f1-score   support

           0       0.85      0.69      0.76       213
           1       0.87      0.79      0.83        85
           2       0.95      0.96      0.96     21038
           3       0.94      0.87      0.91       151
           4       0.90      0.84      0.87       451
           5       0.95      0.94      0.94      7010
           6       0.98      0.98      0.98     49294
           7       0.99      0.99      0.99     19608
           8       0.99      0.99      0.99     37485
           9       0.91      0.93      0.92      2172
          10       0.98      0.97      0.98      2076
          11       0.99      0.98      0.98      8028
          12       0.94      0.93      0.93      5687
          13       0.85      0.89      0.87      3540
          14       0.88      0.93      0.91       593
          15       0.98      0.97      0.97      9322
          16       0.97      0.98      0.97   

### 모델개발용 데이터 예측

In [22]:
digit1_pred = model1.predict(test_data)
digit1_pred

array([[9.03244632e-11, 3.20108141e-11, 5.13272835e-06, ...,
        1.10988732e-10, 5.43994751e-08, 2.35793829e-09],
       [2.91725513e-07, 1.15970185e-08, 3.39414203e-03, ...,
        9.23644961e-07, 9.14432519e-07, 3.31375259e-06],
       [6.03968786e-08, 2.15418243e-08, 7.81658237e-05, ...,
        2.76987284e-05, 4.00899626e-05, 9.99415874e-01],
       ...,
       [9.56981457e-05, 1.45059421e-05, 1.57141674e-03, ...,
        3.65797132e-05, 2.61703157e-04, 2.36944208e-04],
       [3.18315233e-06, 8.25528730e-08, 3.41778991e-06, ...,
        3.69546673e-04, 9.88729537e-01, 3.63292085e-04],
       [9.51939946e-05, 3.09158131e-05, 1.09350694e-04, ...,
        2.55075167e-04, 5.51725563e-04, 2.53769546e-03]], dtype=float32)

In [23]:
digit1_predict = []
for i in range(digit1_pred.shape[0]):
    digit1_predict.append(digit1_pred[i].argmax())
digit1_predict = np.array(digit1_predict)
digit1_predict

array([ 8,  6, 18, ...,  6, 17, 11], dtype=int64)

In [24]:
digit1_predict = encoder1.inverse_transform(digit1_predict)
digit1_predict

array(['I', 'G', 'S', ..., 'G', 'R', 'L'], dtype=object)

In [25]:
len(digit1_predict)

100000

### CNN - LSTM (digit2)

In [26]:
target2 = train['digit_2']

X_train, X_test, y_train, y_test = train_test_split(train_data, target2, test_size=0.2, shuffle=True)

In [27]:
vocab_size = 10760
embedding_dim = 128
hidden_units = 128
num_classes = 74 

In [28]:
model2 = Sequential()
# Convolutional Layer1
model2.add(Embedding(vocab_size, embedding_dim,input_length= max_len))
model2.add(Conv1D(filters=32, kernel_size=2, activation='relu',padding='valid',strides=1))
model2.add(MaxPooling1D(pool_size=2))
model2.add(BatchNormalization())

# LSTM Layer 
model2.add(LSTM(hidden_units, return_sequences=True,activation='tanh'))
model2.add(LSTM(hidden_units, return_sequences=True,activation='tanh'))
model2.add(Dropout(0.5))
model2.add(Flatten())
model2.add(Dropout(0.2))
model2.add(Dense(num_classes, activation='softmax'))

In [29]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 21, 128)           1377280   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 20, 32)            8224      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 10, 32)            0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 10, 32)            128       
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 128)           82432     
_________________________________________________________________
lstm_3 (LSTM)                (None, 10, 128)           131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 128)          

In [30]:
mc2 = ModelCheckpoint('best_model2.h5', monitor='val_loss', mode = "auto", verbose=1, save_best_only=True)

In [31]:
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history2 = model2.fit(X_train, y_train, batch_size=32, epochs=5, callbacks=[es, mc2], validation_split=0.2)

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.27289, saving model to best_model2.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.27289 to 0.24793, saving model to best_model2.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.24793 to 0.23863, saving model to best_model2.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.23863 to 0.23683, saving model to best_model2.h5
Epoch 5/5

Epoch 00005: val_loss improved from 0.23683 to 0.23502, saving model to best_model2.h5


In [32]:
print("테스트 정확도: %.4f" % (model2.evaluate(X_test, y_test)[1]))

테스트 정확도: 0.9376


In [33]:
pred2 = model2.predict(X_test)
pred2

array([[5.45849502e-07, 1.04273061e-08, 1.86578518e-07, ...,
        1.36696315e-07, 2.61645056e-07, 7.02760374e-08],
       [7.17393050e-05, 2.41151383e-06, 7.69458438e-06, ...,
        9.97355019e-05, 1.42155832e-03, 3.26807203e-04],
       [3.32512108e-11, 9.63055261e-11, 2.42635144e-12, ...,
        2.14673985e-08, 1.81168858e-08, 9.99996424e-01],
       ...,
       [2.56060389e-06, 3.58855190e-09, 1.10603565e-07, ...,
        7.79994707e-07, 3.01738346e-05, 2.60156885e-05],
       [5.38527775e-06, 2.10963194e-07, 2.79878295e-05, ...,
        1.30842159e-06, 1.64081300e-07, 4.36250775e-06],
       [4.96126418e-10, 2.32369141e-11, 2.14486172e-11, ...,
        5.25102280e-07, 6.64239367e-12, 1.47263568e-09]], dtype=float32)

In [34]:
predict2 = []
for i in range(pred2.shape[0]):
    predict2.append(pred2[i].argmax())
predict2 = np.array(predict2)
predict2

array([40, 41, 73, ..., 41, 47, 57], dtype=int64)

In [35]:
print ('Here is the classification report:')
print (classification_report(y_test, predict2))

Here is the classification report:
              precision    recall  f1-score   support

           0       0.78      0.55      0.64       159
           1       0.82      0.48      0.61        29
           2       1.00      0.57      0.73        28
           5       0.88      0.70      0.78        83
           7       0.88      0.93      0.90      2945
           8       1.00      0.23      0.38        60
          10       0.88      0.78      0.83      1060
          11       0.91      0.90      0.91      1142
          12       0.85      0.79      0.82       258
          13       0.78      0.67      0.72       342
          14       0.84      0.80      0.82       332
          15       0.87      0.89      0.88       818
          16       0.00      0.00      0.00        11
          17       0.64      0.69      0.66       651
          18       0.56      0.07      0.12        72
          19       0.70      0.78      0.74      1198
          20       0.79      0.80      0.80   

### 모델개발용 데이터 예측

In [36]:
digit2_pred = model2.predict(test_data)
digit2_pred

array([[1.4500429e-10, 2.0278322e-11, 1.6345071e-08, ..., 6.5445240e-11,
        6.4859663e-12, 9.7628194e-10],
       [8.3109808e-05, 2.2012517e-07, 2.1425194e-06, ..., 5.2826297e-05,
        2.8792188e-05, 5.2629483e-05],
       [1.6163108e-06, 1.1555074e-08, 2.7449646e-08, ..., 9.9993038e-01,
        6.5426564e-10, 1.8014912e-07],
       ...,
       [5.3636136e-06, 9.8136965e-08, 1.1713762e-06, ..., 3.8461239e-06,
        5.2268613e-05, 2.0972757e-05],
       [2.2151289e-06, 7.4453618e-07, 3.1578804e-06, ..., 1.6305519e-03,
        6.9220177e-06, 1.8463249e-04],
       [8.2490951e-06, 1.2492598e-06, 1.1992653e-06, ..., 3.4571681e-04,
        2.9781427e-06, 3.2825134e-05]], dtype=float32)

In [37]:
digit2_predict = []
for i in range(digit2_pred.shape[0]):
    digit2_predict.append(digit2_pred[i].argmax())
digit2_predict = np.array(digit2_predict)
digit2_predict

array([47, 40, 71, ..., 41, 69, 57], dtype=int64)

In [38]:
digit2_predict = encoder2.inverse_transform(digit2_predict)
digit2_predict

array([56., 46., 94., ..., 47., 90., 68.])

In [39]:
len(digit2_predict)

100000

### CNN - LSTM (digit3)

In [40]:
target3 = train['digit_3']

X_train, X_test, y_train, y_test = train_test_split(train_data, target3, test_size=0.2, shuffle=True)

In [41]:
vocab_size = 10760
embedding_dim = 128
hidden_units = 128
num_classes = 225

In [42]:
model3 = Sequential()
# Convolutional Layer1
model3.add(Embedding(vocab_size, embedding_dim,input_length= max_len))
model3.add(Conv1D(filters=32, kernel_size=2, activation='relu',padding='valid',strides=1))
model3.add(MaxPooling1D(pool_size=2))
model3.add(BatchNormalization())

# LSTM Layer 
model3.add(LSTM(hidden_units, return_sequences=True,activation='tanh'))
model3.add(LSTM(hidden_units, return_sequences=True,activation='tanh'))
model3.add(Dropout(0.5))
model3.add(Flatten())
model3.add(Dropout(0.2))
model3.add(Dense(num_classes, activation='softmax'))

In [43]:
mc3 = ModelCheckpoint('best_model3.h5', monitor='val_loss', mode = "auto", verbose=1, save_best_only=True)

In [44]:
model3.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history3 = model3.fit(X_train, y_train, batch_size=32, epochs=5, callbacks=[es, mc3], validation_split=0.2)

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.44303, saving model to best_model3.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.44303 to 0.39536, saving model to best_model3.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.39536 to 0.38026, saving model to best_model3.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.38026 to 0.37058, saving model to best_model3.h5
Epoch 5/5

Epoch 00005: val_loss improved from 0.37058 to 0.36674, saving model to best_model3.h5


In [45]:
print("테스트 정확도: %.4f" % (model3.evaluate(X_test, y_test)[1]))

테스트 정확도: 0.9071


In [46]:
pred3 = model3.predict(X_test)
pred3

array([[1.1774032e-10, 1.5438582e-10, 2.2651525e-11, ..., 1.7291192e-09,
        1.3650728e-09, 7.1947248e-08],
       [3.2516841e-08, 5.0423692e-09, 5.7901400e-10, ..., 2.5989505e-07,
        9.7591374e-07, 3.0793447e-07],
       [1.1112172e-09, 8.7802174e-09, 2.4129050e-09, ..., 1.6536006e-09,
        1.7339855e-07, 5.4877808e-07],
       ...,
       [2.1244948e-06, 1.6433320e-06, 2.1831215e-06, ..., 1.3676956e-06,
        4.8580878e-05, 1.7833547e-05],
       [3.6728711e-08, 5.0499311e-09, 1.0734736e-08, ..., 9.0353608e-10,
        4.9443849e-09, 2.5050818e-08],
       [4.8379728e-10, 1.4503644e-09, 5.8748645e-10, ..., 9.5291677e-11,
        3.0911245e-08, 6.1527072e-10]], dtype=float32)

In [47]:
predict3 = []
for i in range(pred3.shape[0]):
    predict3.append(pred3[i].argmax())
predict3 = np.array(predict3)
predict3

array([135, 132, 148, ..., 147,  18, 147], dtype=int64)

In [48]:
print ('Here is the classification report:')
print (classification_report(y_test, predict3))

Here is the classification report:
              precision    recall  f1-score   support

           0       0.81      0.70      0.75        80
           1       0.83      0.80      0.81        44
           2       0.67      0.13      0.22        31
           3       0.81      0.72      0.76        18
           4       0.00      0.00      0.00         4
           5       0.94      0.56      0.70        27
           9       0.74      0.51      0.61        39
          10       1.00      0.81      0.90        43
          11       0.00      0.00      0.00         2
          12       0.79      0.57      0.66       138
          13       0.86      0.81      0.83       185
          14       0.73      0.57      0.64       221
          15       0.95      0.79      0.86       149
          16       0.00      0.00      0.00        13
          17       0.82      0.78      0.80       259
          18       0.82      0.90      0.86      1888
          19       0.79      0.75      0.77   

#### 모델개발용 데이터 예측 (digit_3)

In [49]:
digit3_pred = model3.predict(test_data)
digit3_pred

array([[3.28453442e-09, 9.39776257e-09, 4.28662750e-09, ...,
        1.31704592e-09, 9.62051772e-08, 7.10673609e-09],
       [6.32077857e-08, 5.95717164e-09, 1.11455858e-08, ...,
        1.04431751e-06, 3.57007188e-08, 2.20646204e-08],
       [1.57847694e-06, 1.36167642e-07, 1.03913443e-07, ...,
        5.52972779e-08, 1.75750647e-05, 1.31764546e-05],
       ...,
       [1.27368639e-05, 2.44962075e-06, 1.21168148e-06, ...,
        1.57710947e-05, 5.73136713e-05, 6.29236456e-05],
       [3.04460656e-07, 9.52211565e-07, 8.24323979e-06, ...,
        1.03302604e-07, 2.26771706e-04, 5.85728503e-06],
       [6.26271790e-09, 2.07695479e-08, 1.56772060e-07, ...,
        1.79854514e-08, 3.33998105e-06, 1.54950194e-05]], dtype=float32)

In [50]:
digit3_predict = []
for i in range(digit3_pred.shape[0]):
    digit3_predict.append(digit3_pred[i].argmax())
digit3_predict = np.array(digit3_predict)
digit3_predict

array([147, 122, 219, ..., 132, 214, 169], dtype=int64)

In [51]:
digit3_predict = encoder3.inverse_transform(digit3_predict)
digit3_predict

array([561., 466., 949., ..., 478., 902., 682.])

In [52]:
len(digit3_predict)

100000

### 답안 작성용 파일

In [53]:
submission = pd.read_csv('문서분류/답안 작성용 파일.csv', encoding='cp949')

In [54]:
len(digit1_predict)

100000

In [55]:
len(digit2_predict)

100000

In [56]:
len(digit3_predict)

100000

In [57]:
submission['digit_1'] = digit1_predict

In [58]:
submission['digit_2'] = digit2_predict

In [59]:
submission['digit_3'] = digit3_predict

In [60]:
submission.to_csv('답안 작성용 파일(CNN-LSTM).csv', index=False, encoding='utf-8-sig')