In [4]:
# TF_IDF 생성 후 심층 신경망을 이용한 이메일 분류
# 20개의 사전 분류된 범주 중 하나로 이메일을 분류 하기 위해 DNN(심층 신경망)을 사용(281p)
from sklearn.datasets import fetch_20newsgroups

In [5]:
newsgroups_train = fetch_20newsgroups(subset='train') # 처음 다운로드에 많은 시간이 소요
newsgroups_test = fetch_20newsgroups(subset='test')  

In [6]:
x_train = newsgroups_train.data
x_test = newsgroups_test.data

In [7]:
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [8]:
print ("20개 카테고리 전체 목록:")
print (newsgroups_train.target_names)
print ("\n")
print ("샘플 이메일:")
print (x_train[0])
print ("샘플 타겟 카테고리:")
print (y_train[0])
print (newsgroups_train.target_names[y_train[0]])

20개 카테고리 전체 목록:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


샘플 이메일:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, o

In [10]:
# 데이터 전처리에 사용
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer

In [12]:
def preprocessing(text): # (284p ~ 289p))
    # 표준 문장부호가 있다면 빈칸으로 바꾸고 아니면 공백으로 바꾸지 않는다. 
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in 
              nltk.word_tokenize(sent)] # 공백에 따라 단어로 토큰화하고 추가 단계를 적용하기 위한 리스트로 함께 묶는다.
    
    tokens = [word.lower() for word in tokens] # 모든 문자를 소문자로 변환해 말뭉치에서 중복을 제거
    
    stopwds = stopwords.words('english') 
    tokens = [token for token in tokens if token not in stopwds] # 불용어 제거
     
    tokens = [word for word in tokens if len(word)>=3] # 길이가 3이상인 단어만 남기고 다른 단어 제거
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens] # 접미사가 나오는 단어에 PorterStemmer 함수를 통해 스테밍
        
    tagged_corpus = pos_tag(tokens) # 품사 태깅
    
    Noun_tags = ['NN','NNP','NNPS','NNS'] # NN: 명사 일반 단수, NNP: 명사 보통 복수  
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ'] # VB: 동사 기본, VBD 동사 현재, VBN: 동사 과거
    lemmatizer = WordNetLemmatizer()                 # VBP: 동사 현재 3인칭 단수가 아닌 것, VBZ: 동사 현재 3인칭 단수

    def prat_lemmatize(token,tag): # tag값이 leematize 함수의 값이 일치하지 않는 경우
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text

In [13]:
# 학습 및 데이터 전처리 , 실행이 오래걸림
x_train_preprocessed  = []
for i in x_train:
    x_train_preprocessed .append(preprocessing(i))

x_test_preprocessed = []
for i in x_test:
    x_test_preprocessed.append(preprocessing(i))

# TFIDF 벡터라이저(vectorizer) 구축
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),  stop_words='english', 
                             max_features= 10000,strip_accents='unicode',  norm='l2')

x_train_2 = vectorizer.fit_transform(x_train_preprocessed).todense()
x_test_2 = vectorizer.transform(x_test_preprocessed).todense()

In [24]:
len(x_train_preprocessed[0]), len(x_train_preprocessed[1]) # 교재에는 없지만 조교님이 데이터 확인을 위해 쓰신 코드같음.

(391, 564)

In [25]:
x_train_preprocessed[0]

'lerxst wam umd edu thing subject car nntp post host rac3 wam umd edu organ univers maryland colleg park line wonder anyon could enlighten car saw day door sport car look late 60 earli 70 call bricklin door realli small addit front bumper separ rest bodi know anyon tellm model name engin spec year product car make histori whatev info funki look car plea mail thank bring neighborhood lerxst'

In [26]:
x_train_2[1]

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [27]:
# 딥러닝 모듈 (286p~ )
#!pip install tensorflow  // keras가 tensorflow를 필요로 해서 설치
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adadelta,Adam,RMSprop
from keras.utils import np_utils

In [28]:
np.random.seed(1337) 
nb_classes = 20 # 클래스 20
batch_size = 64 # 일괄 처리 사이즈 64
nb_epochs = 20  # 학습할 에포크 수는 20으로 지정

In [29]:
Y_train = np_utils.to_categorical(y_train, nb_classes)

In [30]:
Y_train[0] # 역시 교재에는 없는 코드

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

In [31]:
# 케라스에서의 딥 러이어 (심층) 모델 구축
model = Sequential()

model.add(Dense(1000,input_shape= (10000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(500))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

print (model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              10001000  
_________________________________________________________________
activation (Activation)      (None, 1000)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               500500    
_________________________________________________________________
activation_1 (Activation)    (None, 500)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2

In [32]:
model.fit(x_train_2, Y_train, batch_size=batch_size, epochs=nb_epochs,verbose=1) # 모델 학습 # 각 에포크마다 학습 하기에 오래 걸림

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2239dba7fd0>

In [34]:
# keras 모듈 때문인지 데이터 때문인지를 모르겠는데 위의 loss 값이 교재와 다른 것을 확인할 수 있다.

y_train_predclass = model.predict_classes(x_train_2, batch_size=batch_size)
y_test_predclass = model.predict_classes(x_test_2, batch_size=batch_size)

In [38]:
# 그래서 그런 것인지 위의 모델 학습 이후의 코드의 결과가 교재와는 다른 점들이 보인다.
from sklearn.metrics import accuracy_score,classification_report

print ("\n\nDeep Neural Network  - Train accuracy:"),(round(accuracy_score(y_train,y_train_predclass),3))
print ("\nDeep Neural Network  - Test accuracy:"),(round(accuracy_score(y_test,y_test_predclass),3))



Deep Neural Network  - Train accuracy:

Deep Neural Network  - Test accuracy:


(None, 0.809)

In [40]:
print ("\nDeep Neural Network  - Train Classification Report")
print (classification_report(y_train,y_train_predclass))


Deep Neural Network  - Train Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       584
           2       1.00      1.00      1.00       591
           3       1.00      0.99      1.00       590
           4       1.00      1.00      1.00       578
           5       1.00      1.00      1.00       593
           6       1.00      0.99      1.00       585
           7       1.00      1.00      1.00       594
           8       1.00      1.00      1.00       598
           9       1.00      1.00      1.00       597
          10       1.00      1.00      1.00       600
          11       1.00      1.00      1.00       595
          12       0.99      1.00      1.00       591
          13       1.00      1.00      1.00       594
          14       1.00      1.00      1.00       593
          15       1.00      1.00      1.00       599
          16       1.00      

In [41]:
print ("\nDeep Neural Network  - Test Classification Report")
print (classification_report(y_test,y_test_predclass))


Deep Neural Network  - Test Classification Report
              precision    recall  f1-score   support

           0       0.80      0.76      0.78       319
           1       0.66      0.73      0.69       389
           2       0.72      0.68      0.70       394
           3       0.70      0.67      0.68       392
           4       0.79      0.75      0.77       385
           5       0.85      0.76      0.80       395
           6       0.83      0.78      0.80       390
           7       0.88      0.85      0.86       396
           8       0.85      0.93      0.89       398
           9       0.90      0.91      0.90       397
          10       0.94      0.96      0.95       399
          11       0.95      0.88      0.91       396
          12       0.59      0.79      0.68       393
          13       0.85      0.82      0.83       396
          14       0.91      0.90      0.91       394
          15       0.88      0.86      0.87       398
          16       0.78      0