<a href="https://colab.research.google.com/github/hyeryn/Natural-Language/blob/master/09_2_email_classfn_CNN_1D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import print_function

import pandas as pd

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb #일련의 감정 데이터셋

from sklearn.metrics import accuracy_score,classification_report


# Settings: vocabulary size passed to the loader (6000 words) and the
# fixed per-review length (400) used later when padding.
max_features = 6000
max_length = 400
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split
# Report the number of observations in the training and test splits.
print(len(x_train), 'train observations')
print(len(x_test), 'test observations')


# Word <-> integer mapping: `wind` maps word -> index, `revind` is its inverse.
wind = imdb.get_word_index()
# Python 3 uses items() where Python 2 code used iteritems().
revind = {index: word for word, index in wind.items()}
# The raw sample is shown as integer ids rather than English words.
print (x_train[0])
print (y_train[0])

#디코딩: 역매핑딕셔너리
def decode(sent_list, index_to_word=None, index_from=3):
    """Turn a sequence of encoded word ids back into a space-joined string.

    Sequences returned by ``imdb.load_data`` (with its default arguments) do
    not use the raw ranks from ``imdb.get_word_index``: ids 0, 1 and 2 are
    reserved for padding, start-of-sequence and out-of-vocabulary markers, and
    every real word rank is shifted up by ``index_from``.  Looking ids up
    directly in the inverse word index therefore yields the wrong words, so
    the offset is removed here before the lookup.

    Args:
        sent_list: Iterable of integer word ids (one encoded review).
        index_to_word: Mapping from word rank to word.  Defaults to the
            module-level ``revind`` dictionary.
        index_from: Offset that was added to word ranks when the data was
            encoded (3 is the ``imdb.load_data`` default).

    Returns:
        The decoded text.  Reserved ids are rendered as ``[PAD]``,
        ``[START]`` and ``[OOV]``; ids with no known word become ``[OOV]``.
    """
    if index_to_word is None:
        index_to_word = revind
    # Reserved ids under the imdb.load_data defaults (pad=0, start=1, oov=2).
    reserved = {0: "[PAD]", 1: "[START]", 2: "[OOV]"}
    new_words = []
    for i in sent_list:
        if i in reserved:
            new_words.append(reserved[i])
        else:
            # Undo the encoding offset; unknown ranks fall back to [OOV]
            # instead of raising KeyError.
            new_words.append(index_to_word.get(i - index_from, "[OOV]"))
    return " ".join(new_words)
# Convert the first review's integer ids back to text and print it.
print (decode(x_train[0]))


# Pad the sequences for efficient computation: every observation is brought
# to one fixed length (max_length = 400) so batches can be processed together.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=max_length)
    for split in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)


# Deep Learning architecture parameters -> used below to build the Keras 1D CNN model
batch_size = 32  # batch size passed to model.fit and to the prediction calls
embedding_dims = 60  # output dimension of the Embedding layer
num_kernels = 260  # number of filters in the Conv1D layer
kernel_size = 3  # kernel size of the Conv1D layer
hidden_dims = 300  # units in the hidden Dense layer
epochs = 3  # number of epochs passed to model.fit


# Building the model: Embedding -> Conv1D -> global max pooling -> Dense -> sigmoid
model = Sequential()

# Embed each of the max_features word ids into embedding_dims dimensions.
model.add(Embedding(max_features,embedding_dims,input_length=max_length))
model.add(Dropout(0.2))

# Convolve over the sequence, then keep the maximum of each filter.
model.add(Conv1D(num_kernels,kernel_size,padding='valid',activation='relu',strides=1))
model.add(GlobalMaxPooling1D())

# Hidden fully connected layer with dropout.
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))

# Single sigmoid unit for the binary label.
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()
# Author's note: training accuracy rises while validation accuracy drops ->
# increase the architecture size rather than the number of epochs.
# validation_split=0.2 holds out 20% of the training data for validation.
model.fit(x_train, y_train,batch_size=batch_size,epochs=epochs,validation_split=0.2)


# Model prediction
# Sequential.predict_classes was removed in TensorFlow 2.6.  For a single
# sigmoid output it was equivalent to thresholding the predicted probability
# at 0.5, which is done explicitly here so the script runs on old and new
# Keras versions alike.
y_train_prob = model.predict(x_train,batch_size=batch_size)
y_test_prob = model.predict(x_test,batch_size=batch_size)

# Threshold to 0/1 labels and reshape to match the label arrays, so the
# predictions can be compared element-wise with y_train / y_test.
y_train_predclass = (y_train_prob > 0.5).astype("int32").reshape(y_train.shape)
y_test_predclass = (y_test_prob > 0.5).astype("int32").reshape(y_test.shape)


# Model accuracies & metrics calculation -> overfitting occurs
# Training split: accuracy, per-class report, confusion matrix.
train_accuracy = round(accuracy_score(y_train, y_train_predclass), 3)
print ("\n\nCNN 1D  - Train accuracy:", train_accuracy)
train_report = classification_report(y_train, y_train_predclass)
print ("\nCNN 1D of Training data\n", train_report)
train_confusion = pd.crosstab(
    y_train,
    y_train_predclass,
    rownames=["Actuall"],
    colnames=["Predicted"],
)
print ("\nCNN 1D - Train Confusion Matrix\n\n", train_confusion)

# Test split: same three metrics.
test_accuracy = round(accuracy_score(y_test, y_test_predclass), 3)
print ("\nCNN 1D  - Test accuracy:", test_accuracy)
test_report = classification_report(y_test, y_test_predclass)
print ("\nCNN 1D of Test data\n", test_report)
test_confusion = pd.crosstab(
    y_test,
    y_test_predclass,
    rownames=["Actuall"],
    colnames=["Predicted"],
)
print ("\nCNN 1D - Test Confusion Matrix\n\n", test_confusion)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


25000 train observations
25000 test observations
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1
the as you w





CNN 1D  - Train accuracy: 0.962

CNN 1D of Training data
               precision    recall  f1-score   support

           0       0.95      0.98      0.96     12500
           1       0.98      0.95      0.96     12500

    accuracy                           0.96     25000
   macro avg       0.96      0.96      0.96     25000
weighted avg       0.96      0.96      0.96     25000


CNN 1D - Train Confusion Matrix

 Predicted      0      1
Actuall                
0          12222    278
1            660  11840

CNN 1D  - Test accuracy: 0.883

CNN 1D of Test data
               precision    recall  f1-score   support

           0       0.86      0.91      0.89     12500
           1       0.91      0.85      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000


CNN 1D - Test Confusion Matrix

 Predicted      0      1
Actuall                
0          11427   