In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
import warnings 
warnings.filterwarnings(action='ignore')

In [3]:
import tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
df = pd.read_csv('최종 전처리.csv')
df.head()

Unnamed: 0,digit_1,digit_2,digit_3,Full Text,clean_text
0,S,95.0,952.0,카센터에서 자동차부분정비 타이어오일교환,카 센터 자동차 부분 정비 타이어 오일 교환
1,G,47.0,472.0,상점내에서 일반인을 대상으로 채소 과일판매,상점 내 일반인 대상 채소 과일 판매
2,G,46.0,467.0,절단하여사업체에도매 공업용고무를가지고 합성고무도매,절단 하여사 업체 에도 매 공업 용 고무 가지 고 합성 고무 도매
3,G,47.0,475.0,영업점에서 일반소비자에게 열쇠잠금장치,영업 점 일 반 소비자 열쇠 잠금장치
4,Q,87.0,872.0,어린이집 보호자의 위탁을 받아 취학전아동보육,어린이집 보호자 위탁 받아 취학 전 아동 보육


In [5]:
train = df[:1000000]
train.head()

Unnamed: 0,digit_1,digit_2,digit_3,Full Text,clean_text
0,S,95.0,952.0,카센터에서 자동차부분정비 타이어오일교환,카 센터 자동차 부분 정비 타이어 오일 교환
1,G,47.0,472.0,상점내에서 일반인을 대상으로 채소 과일판매,상점 내 일반인 대상 채소 과일 판매
2,G,46.0,467.0,절단하여사업체에도매 공업용고무를가지고 합성고무도매,절단 하여사 업체 에도 매 공업 용 고무 가지 고 합성 고무 도매
3,G,47.0,475.0,영업점에서 일반소비자에게 열쇠잠금장치,영업 점 일 반 소비자 열쇠 잠금장치
4,Q,87.0,872.0,어린이집 보호자의 위탁을 받아 취학전아동보육,어린이집 보호자 위탁 받아 취학 전 아동 보육


In [6]:
encoder1 = LabelEncoder()
train['digit_1'] = encoder1.fit_transform(train['digit_1'])
train['digit_1'].nunique()

19

In [7]:
encoder2 = LabelEncoder()
train['digit_2'] = encoder2.fit_transform(train['digit_2'])
train['digit_2'].nunique()

74

In [8]:
encoder3 = LabelEncoder()
train['digit_3'] = encoder3.fit_transform(train['digit_3'])
train['digit_3'].nunique()

225

In [9]:
text = np.load('text.npy')

# 전처리 진행

In [10]:
max_len = 21

text = tf.keras.preprocessing.sequence.pad_sequences(text, maxlen=max_len)
text

array([[   0,    0,    0, ...,  633, 1943, 3832],
       [   0,    0,    0, ...,  296,  147,   12],
       [   0,    0,    0, ..., 1304,  608,   23],
       ...,
       [   0,    0,    0, ..., 1444, 1030,   12],
       [   0,    0,    0, ..., 1434, 1725,   49],
       [   0,    0,    0, ...,   44, 1310,  203]])

In [11]:
train_data = text[:1000000]
test_data = text[1000000:]
target = train['digit_1']

X_train, X_test, y_train, y_test = train_test_split(train_data, target, test_size=0.2, shuffle=True, stratify=target)

### Bidirectional - LSTM (digit1)

In [12]:
from tensorflow.keras.layers import LSTM,Bidirectional,GRU

In [13]:
vocab_size = 10760
embedding_dim = 128
hidden_units = 128
num_classes = 19 # 대분류 19개 카테고리

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Bidirectional(LSTM(hidden_units, return_sequences=True)))
model.add(Bidirectional(LSTM(hidden_units)))
model.add(Dropout(0.2))
model.add(Dense(hidden_units, activation='tanh'))
model.add(Dense(num_classes, activation='softmax'))

In [15]:
es = EarlyStopping(monitor='val_loss', mode = "auto", verbose=1, patience=4)
mc = ModelCheckpoint('best_model1.h5', monitor='val_loss', mode = "auto", verbose=1, save_best_only=True)

In [16]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, batch_size=32, epochs=5, callbacks=[es, mc], validation_split=0.2)

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.10475, saving model to best_model1.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.10475 to 0.09364, saving model to best_model1.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.09364 to 0.09313, saving model to best_model1.h5
Epoch 4/5

Epoch 00004: val_loss did not improve from 0.09313
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.09313


In [20]:
print("테스트 정확도: %.4f" % (model.evaluate(X_test, y_test)[1]))

테스트 정확도: 0.9739


In [20]:
pred1 = model.predict(X_test)
pred1

array([[2.1877358e-06, 8.4878636e-08, 1.6217272e-06, ..., 1.1509170e-04,
        2.0777001e-05, 5.3187392e-05],
       [2.0993060e-05, 1.0097981e-05, 9.3919371e-06, ..., 4.0944320e-05,
        7.2567753e-05, 6.6762143e-03],
       [3.7637064e-06, 2.9767534e-07, 2.6332580e-03, ..., 3.1503259e-06,
        1.0072624e-05, 1.0060821e-04],
       ...,
       [2.9552095e-05, 1.8878640e-06, 6.1975163e-04, ..., 1.0876021e-04,
        1.7698061e-04, 3.7307595e-04],
       [8.4484651e-07, 1.1765148e-06, 1.5032321e-03, ..., 2.5571619e-06,
        6.5585191e-06, 1.8651126e-04],
       [1.5346949e-07, 5.9876740e-08, 7.9369964e-04, ..., 2.9972682e-07,
        3.1281827e-06, 2.7468255e-05]], dtype=float32)

In [21]:
predict1 = []
for i in range(pred1.shape[0]):
    predict1.append(pred1[i].argmax())
predict1 = np.array(predict1)
predict1

array([15, 11,  6, ...,  8,  6,  6], dtype=int64)

In [22]:
print ('Here is the classification report:')
print (classification_report(y_test, predict1))

Here is the classification report:
              precision    recall  f1-score   support

           0       0.75      0.77      0.76       213
           1       0.94      0.74      0.83        85
           2       0.96      0.96      0.96     21038
           3       0.97      0.86      0.91       151
           4       0.84      0.86      0.85       451
           5       0.94      0.95      0.94      7010
           6       0.98      0.98      0.98     49294
           7       1.00      0.99      0.99     19608
           8       0.99      0.99      0.99     37485
           9       0.94      0.92      0.93      2172
          10       0.98      0.99      0.98      2076
          11       0.98      0.98      0.98      8028
          12       0.94      0.94      0.94      5687
          13       0.87      0.90      0.88      3540
          14       0.93      0.93      0.93       593
          15       0.96      0.98      0.97      9322
          16       0.98      0.97      0.97   

### 모델개발용 데이터 예측

In [23]:
digit1_pred = model.predict(test_data)
digit1_pred

array([[2.24906444e-06, 2.70694525e-08, 1.70390165e-04, ...,
        1.18408852e-05, 6.79882487e-06, 7.30940747e-06],
       [5.86889655e-07, 8.53980453e-07, 1.41269830e-03, ...,
        3.63686979e-07, 4.15976820e-06, 2.27669407e-05],
       [2.39920723e-06, 4.04215371e-07, 3.74364354e-05, ...,
        1.21279256e-04, 1.45669794e-03, 9.97923374e-01],
       ...,
       [1.06746527e-04, 9.11065513e-07, 6.14050718e-04, ...,
        6.51831488e-06, 8.33224694e-05, 4.89315789e-05],
       [1.60639356e-05, 1.18197795e-06, 1.66292721e-05, ...,
        2.14294778e-05, 9.97276962e-01, 1.20382691e-04],
       [1.06915322e-05, 7.36309175e-07, 2.75153634e-05, ...,
        8.53607253e-06, 3.38268874e-05, 4.23385529e-04]], dtype=float32)

In [24]:
digit1_predict = []
for i in range(digit1_pred.shape[0]):
    digit1_predict.append(digit1_pred[i].argmax())
digit1_predict = np.array(digit1_predict)
digit1_predict

array([ 8,  6, 18, ...,  6, 17, 11], dtype=int64)

In [25]:
digit1_predict = encoder1.inverse_transform(digit1_predict)
digit1_predict

array(['I', 'G', 'S', ..., 'G', 'R', 'L'], dtype=object)

In [26]:
len(digit1_predict)

100000

### Bidirectional - LSTM (digit2)

In [40]:
target2 = train['digit_2']

X_train, X_test, y_train, y_test = train_test_split(train_data, target2, test_size=0.2, shuffle=True)

In [41]:
vocab_size = 10760
embedding_dim = 128
hidden_units = 128
num_classes = 74 

model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_dim))
model2.add(Bidirectional(LSTM(hidden_units, return_sequences=True)))
model2.add(Bidirectional(LSTM(hidden_units)))
model2.add(Dropout(0.2))
model2.add(Dense(hidden_units, activation='tanh'))
model2.add(Dense(num_classes, activation='softmax'))

In [42]:
mc2 = ModelCheckpoint('best_model2.h5', monitor='val_loss', mode = "auto", verbose=1, save_best_only=True)

In [43]:
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history2 = model2.fit(X_train, y_train, batch_size=32, epochs=5, callbacks=[es, mc2], validation_split=0.2)

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.24453, saving model to best_model1.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.24453 to 0.21838, saving model to best_model1.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.21838 to 0.21424, saving model to best_model1.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.21424 to 0.21331, saving model to best_model1.h5
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.21331


In [44]:
print("테스트 정확도: %.4f" % (model2.evaluate(X_test, y_test)[1]))

테스트 정확도: 0.9412


In [45]:
pred2 = model2.predict(X_test)
pred2

array([[1.8914500e-07, 1.6674186e-07, 1.2438110e-07, ..., 7.6858465e-05,
        1.4080516e-06, 1.3811493e-04],
       [3.4404434e-06, 5.1579121e-07, 2.1519449e-07, ..., 2.6983022e-05,
        8.4983090e-05, 6.3103707e-06],
       [3.2960767e-07, 3.7960756e-06, 1.3680912e-09, ..., 1.0017654e-06,
        9.3332266e-05, 1.2250978e-06],
       ...,
       [3.0018998e-06, 2.5148944e-08, 1.6724691e-07, ..., 9.9977249e-01,
        3.8668549e-07, 2.4822093e-05],
       [5.2099719e-10, 1.8329423e-08, 2.9562949e-11, ..., 3.0474068e-06,
        5.5468852e-10, 1.2819409e-07],
       [4.0422859e-07, 1.6686792e-07, 5.6620333e-08, ..., 1.6117730e-07,
        6.1884562e-06, 1.9596766e-06]], dtype=float32)

In [46]:
predict2 = []
for i in range(pred2.shape[0]):
    predict2.append(pred2[i].argmax())
predict2 = np.array(predict2)
predict2

array([70, 24, 29, ..., 71, 57, 41], dtype=int64)

In [47]:
print ('Here is the classification report:')
print (classification_report(y_test, predict2))

Here is the classification report:
              precision    recall  f1-score   support

           0       0.84      0.61      0.70       173
           1       0.95      0.72      0.82        25
           2       0.77      0.68      0.72        25
           4       0.00      0.00      0.00         4
           5       0.90      0.84      0.87        88
           7       0.91      0.91      0.91      2893
           8       0.78      0.70      0.74        90
           9       0.00      0.00      0.00         1
          10       0.88      0.79      0.83      1023
          11       0.94      0.84      0.89      1068
          12       0.87      0.78      0.82       287
          13       0.81      0.67      0.73       383
          14       0.81      0.80      0.81       348
          15       0.87      0.91      0.89       840
          16       0.54      0.37      0.44        19
          17       0.72      0.66      0.69       591
          18       0.52      0.22      0.31   

### 모델개발용 데이터 예측

In [48]:
digit2_pred = model2.predict(test_data)
digit2_pred

array([[2.1989308e-09, 1.4497396e-08, 3.0837860e-08, ..., 5.2259033e-08,
        4.2988749e-08, 6.8238796e-07],
       [4.8637266e-05, 8.7785349e-07, 8.7951355e-07, ..., 9.8136203e-05,
        4.1451171e-04, 8.1442113e-06],
       [2.0434825e-05, 5.2360431e-08, 6.9749564e-07, ..., 9.9898499e-01,
        5.2941255e-07, 3.8075395e-05],
       ...,
       [1.2916678e-06, 7.9112351e-07, 1.2844343e-07, ..., 5.0421213e-06,
        9.7873353e-06, 1.2135895e-05],
       [1.3988320e-07, 1.9368488e-06, 3.7168946e-09, ..., 2.1473532e-05,
        1.0832193e-07, 7.4764524e-05],
       [5.2166695e-07, 2.1029943e-07, 1.1458310e-09, ..., 4.9704329e-05,
        8.3851614e-07, 4.5041388e-06]], dtype=float32)

In [49]:
digit2_predict = []
for i in range(digit2_pred.shape[0]):
    digit2_predict.append(digit2_pred[i].argmax())
digit2_predict = np.array(digit2_predict)
digit2_predict

array([47, 40, 71, ..., 41, 69, 57], dtype=int64)

In [50]:
digit2_predict = encoder2.inverse_transform(digit2_predict)
digit2_predict

array([56., 46., 94., ..., 47., 90., 68.])

In [56]:
len(digit2_predict)

100000

### Bidirectional - LSTM (digit3)

In [35]:
target3 = train['digit_3']

X_train, X_test, y_train, y_test = train_test_split(train_data, target3, test_size=0.2, shuffle=True)

In [36]:
vocab_size = 10760
embedding_dim = 128
hidden_units = 128
num_classes = 225

model3 = Sequential()
model3.add(Embedding(vocab_size, embedding_dim))
model3.add(Bidirectional(LSTM(hidden_units, return_sequences=True)))
model3.add(Bidirectional(LSTM(hidden_units)))
model3.add(Dropout(0.2))
model3.add(Dense(hidden_units, activation='tanh'))
model3.add(Dense(num_classes, activation='softmax'))

In [37]:
mc3 = ModelCheckpoint('best_model3.h5', monitor='val_loss', mode = "auto", verbose=1, save_best_only=True)

In [38]:
model3.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history3 = model3.fit(X_train, y_train, batch_size=32, epochs=5, callbacks=[es, mc3], validation_split=0.2)

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.40633, saving model to best_model3.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.40633 to 0.35587, saving model to best_model3.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.35587 to 0.34364, saving model to best_model3.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.34364 to 0.34259, saving model to best_model3.h5
Epoch 5/5

Epoch 00005: val_loss improved from 0.34259 to 0.34070, saving model to best_model3.h5


In [39]:
print("테스트 정확도: %.4f" % (model3.evaluate(X_test, y_test)[1]))

테스트 정확도: 0.9118


In [40]:
pred3 = model3.predict(X_test)
pred3

array([[2.9504253e-07, 2.9637954e-07, 1.5003279e-07, ..., 7.9812999e-08,
        2.8413565e-08, 5.1654251e-08],
       [1.9241886e-06, 2.5428091e-07, 6.3702167e-08, ..., 2.8238164e-06,
        1.2378317e-06, 3.4912918e-07],
       [2.2280405e-10, 5.8984290e-11, 8.4589186e-10, ..., 9.7125906e-08,
        1.2082144e-07, 3.7002297e-07],
       ...,
       [6.0838302e-06, 4.1361324e-07, 2.7208223e-05, ..., 3.0871908e-07,
        7.8518035e-08, 4.3670065e-05],
       [4.9525781e-05, 2.9938876e-06, 7.0327784e-05, ..., 2.8361457e-08,
        2.4947444e-06, 9.6627969e-07],
       [3.8094641e-10, 2.0471983e-08, 4.6213361e-10, ..., 3.2652761e-10,
        2.4746083e-07, 3.8060600e-07]], dtype=float32)

In [41]:
predict3 = []
for i in range(pred3.shape[0]):
    predict3.append(pred3[i].argmax())
predict3 = np.array(predict3)
predict3

array([126, 159, 212, ..., 119,  18, 147], dtype=int64)

In [42]:
print ('Here is the classification report:')
print (classification_report(y_test, predict3))

Here is the classification report:
              precision    recall  f1-score   support

           0       0.82      0.75      0.78        75
           1       0.92      0.79      0.85        56
           2       0.34      0.53      0.41        36
           3       0.76      0.73      0.74        22
           4       0.00      0.00      0.00         2
           5       0.86      0.67      0.75        18
           8       0.00      0.00      0.00         1
           9       0.81      0.50      0.62        42
          10       0.95      0.91      0.93        45
          12       0.70      0.80      0.75       148
          13       0.71      0.86      0.78       182
          14       0.75      0.66      0.70       229
          15       0.92      0.92      0.92       147
          16       1.00      0.21      0.35        19
          17       0.81      0.78      0.79       274
          18       0.89      0.88      0.89      1892
          19       0.91      0.65      0.76   

#### 모델개발용 데이터 예측 (digit_3)

In [43]:
digit3_pred = model3.predict(test_data)
digit3_pred

array([[5.7048184e-09, 2.6058706e-07, 1.0722601e-08, ..., 4.5377151e-09,
        4.1760060e-08, 3.7099915e-07],
       [1.6224116e-09, 1.0008424e-08, 4.4535620e-09, ..., 6.9600619e-06,
        2.9999883e-09, 9.7333280e-11],
       [1.4325600e-10, 2.2990873e-10, 1.2305840e-11, ..., 7.9351048e-07,
        8.5486533e-07, 8.4058996e-07],
       ...,
       [2.0446165e-05, 7.9516710e-08, 1.4600730e-06, ..., 3.4340376e-06,
        1.6763615e-05, 5.0918852e-06],
       [2.0826040e-07, 5.0908028e-08, 4.7005601e-06, ..., 3.7785099e-07,
        3.0968982e-05, 3.6230758e-05],
       [3.4283247e-09, 1.6284558e-08, 4.7453199e-08, ..., 1.3401958e-06,
        3.1349387e-07, 8.3375307e-06]], dtype=float32)

In [44]:
digit3_predict = []
for i in range(digit3_pred.shape[0]):
    digit3_predict.append(digit3_pred[i].argmax())
digit3_predict = np.array(digit3_predict)
digit3_predict

array([147, 122, 219, ..., 132, 214, 169], dtype=int64)

In [45]:
digit3_predict = encoder3.inverse_transform(digit3_predict)
digit3_predict

array([561., 466., 949., ..., 478., 902., 682.])

In [46]:
len(digit3_predict)

100000

### 답안 작성용 파일

In [47]:
submission = pd.read_csv('문서분류/답안 작성용 파일.csv', encoding='cp949')

In [28]:
len(digit1_predict)

100000

In [None]:
len(digit2_predict)

In [48]:
len(digit3_predict)

100000

In [32]:
submission['digit_1'] = digit1_predict

In [55]:
submission['digit_2'] = digit2_predict

In [49]:
submission['digit_3'] = digit3_predict

In [50]:
submission.to_csv('답안 작성용 파일.csv', index=False, encoding='utf-8-sig')