In [1]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB #다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score #정확도 계산
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def makeDtmTfIdf(x_train, x_test):
  # train dtm vector만들기
  dtmvector = CountVectorizer()
  x_train_dtm = dtmvector.fit_transform(x_train)
  tfidf_transformer = TfidfTransformer()
  tfidfv = tfidf_transformer.fit_transform(x_train_dtm)

  x_test_dtm = dtmvector.transform(x_test) #테스트 데이터를 DTM으로 변환
  tfidfv_test = tfidf_transformer.transform(x_test_dtm) #DTM을 TF-IDF 행렬로 변환
  
  return tfidfv, tfidfv_test

def MultinomialNaiveBayesClassifier(x_train, y_train, x_test, y_test):
  
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  mod = MultinomialNB()
  mod.fit(tfidfv, y_train)
  predicted = mod.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Multinomial NB 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def ComplementNaiveBayesClasifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  cb = ComplementNB()
  cb.fit(tfidfv, y_train)
  predicted = cb.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Complement Naive Bayes 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교


def LogisticRegressionClassifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)

  lr = LogisticRegression(C=10000, penalty='l2')
  lr.fit(tfidfv, y_train)
  predicted = lr.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Logistic Regression 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def LinearSVM(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)

  lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
  lsvc.fit(tfidfv, y_train)

  predicted = lsvc.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("LinearSVM 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def DTClassifier(x_train, y_train, x_test, y_test):
  tree = DecisionTreeClassifier(max_depth=10, random_state=0)
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  tree.fit(tfidfv, y_train)
  predicted = tree.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Decesion Tree Classifier 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def RFClassifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  forest = RandomForestClassifier(n_estimators=5, random_state=0)
  forest.fit(tfidfv, y_train)
  predicted = forest.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("RandomForest 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def GBClassifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  grbt = GradientBoostingClassifier(random_state=0) # verbose=3
  grbt.fit(tfidfv, y_train)
  predicted = grbt.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Gradient Boosting 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def VTClassifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
        ], voting='soft', n_jobs=-1)
  voting_classifier.fit(tfidfv, y_train)

  predicted = voting_classifier.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Voting Classifier 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def RunAllClasifier(x_train, y_train, x_test, y_test):
  
  cf_list= [MultinomialNaiveBayesClassifier, ComplementNaiveBayesClasifier, 
            LogisticRegressionClassifier, LinearSVM, DTClassifier, RFClassifier, 
            GBClassifier]
  for f in cf_list:
    f(x_train, y_train, x_test, y_test)



def AddSpecialIndex(index_to_word, data):

  decoded = []
  for i in range(len(data)):
    t = ' '.join([index_to_word[index] for index in data[i]])
    decoded.append(t)

  return decoded


# 1. 모든 단어 사용


In [2]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = {index + 3 : word for word, index in word_index.items()}
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
    index_to_word[index]=token

x_train = AddSpecialIndex(index_to_word, x_train)
x_test = AddSpecialIndex(index_to_word, x_test)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
RunAllClasifier(x_train, y_train, x_test, y_test)

Multinomial NB 정확도: 0.5997328584149599
Complement Naive Bayes 정확도: 0.7649154051647373


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 정확도: 0.813446126447017




LinearSVM 정확도: 0.7880676758682101
Decesion Tree Classifier 정확도: 0.6211041852181657
RandomForest 정확도: 0.6544968833481746
Gradient Boosting 정확도: 0.7684772929652716


# 빈도수 상위 5000개의 단어만 사용


In [4]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = {index + 3 : word for word, index in word_index.items()}
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
    index_to_word[index]=token

x_train = AddSpecialIndex(index_to_word, x_train)
x_test = AddSpecialIndex(index_to_word, x_test)
RunAllClasifier(x_train, y_train, x_test, y_test)


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


Multinomial NB 정확도: 0.6731967943009796
Complement Naive Bayes 정확도: 0.7707034728406055


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 정확도: 0.8058771148708815




LinearSVM 정확도: 0.7666963490650045
Decesion Tree Classifier 정확도: 0.6179875333926982
RandomForest 정확도: 0.701246660730187
Gradient Boosting 정확도: 0.769813000890472


In [9]:
x_train[:10]

['<sos> <unk> <unk> said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3',
 '<sos> generale de banque sa lt <unk> <unk> and lt heller overseas corp of chicago have each taken 50 pct stakes in <unk> company sa <unk> factors generale de banque said in a statement it gave no financial details of the transaction sa <unk> <unk> turnover in 1986 was 17 5 billion belgian francs reuter 3',
 '<sos> shr 3 28 dlrs vs 22 cts shr diluted 2 99 dlrs vs 22 cts net 46 0 mln vs 3 328 000 avg shrs 14 0 mln vs 15 2 mln year shr 5 41 dlrs vs 1 56 dlrs shr diluted 4 94 dlrs vs 1 50 dlrs net 78 2 mln vs 25 9 mln avg shrs 14 5 mln vs 15 1 mln note earnings per share reflect the two fo

# 딥러닝 모델 적용하기 (LSTM 사용)


In [7]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

vocab_size = 5000

def train_LSTM(X_train, y_train):

    model = Sequential()
    model.add(Embedding(vocab_size, 100))
    model.add(LSTM(128))
    model.add(Dense(46, activation='sigmoid')) # 46개의 class를 분류해야한다.

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
    mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

    model.compile(optimizer='rmsprop', loss='CategoricalCrossentropy', metrics=['acc'])
    history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)

    return model, history

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=vocab_size, test_split=0.2)
index_to_word = {index + 3 : word for word, index in word_index.items()}
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
    index_to_word[index]=token

x_train = AddSpecialIndex(index_to_word, x_train)
x_test = AddSpecialIndex(index_to_word, x_test)

tokenizer = Tokenizer(vocab_size, oov_token = 'OOV')
tokenizer.fit_on_texts(x_train)
X_train = tokenizer.texts_to_sequences(x_train)
X_test = tokenizer.texts_to_sequences(x_test)
max_len = 50
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [20]:
from keras.utils import to_categorical

y_train = to_categorical(y_train)

model, history = train_LSTM(X_train, y_train)

Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.44073, saving model to best_model.h5
Epoch 2/15

Epoch 00002: val_acc improved from 0.44073 to 0.53923, saving model to best_model.h5
Epoch 3/15

Epoch 00003: val_acc improved from 0.53923 to 0.57485, saving model to best_model.h5
Epoch 4/15

Epoch 00004: val_acc improved from 0.57485 to 0.58820, saving model to best_model.h5
Epoch 5/15

Epoch 00005: val_acc improved from 0.58820 to 0.63216, saving model to best_model.h5
Epoch 6/15

Epoch 00006: val_acc improved from 0.63216 to 0.63662, saving model to best_model.h5
Epoch 7/15

Epoch 00007: val_acc improved from 0.63662 to 0.65331, saving model to best_model.h5
Epoch 8/15

Epoch 00008: val_acc did not improve from 0.65331
Epoch 9/15

Epoch 00009: val_acc improved from 0.65331 to 0.65665, saving model to best_model.h5
Epoch 10/15

Epoch 00010: val_acc improved from 0.65665 to 0.66778, saving model to best_model.h5
Epoch 11/15

Epoch 00011: val_acc did not improve from 0.66778
Epoc

In [21]:
y_test = to_categorical(y_test)

results = model.evaluate(X_test,  y_test, verbose=2)


71/71 - 0s - loss: 1.5660 - acc: 0.6385


# 회고
LSTM으로 돌려보면 기존 머신러닝 방법보다 성능이 못하다.
이유는 tokenization과정을 공백으로만 했고, 영어의 경우 a, the와 같은 불용어를 빼주지 않은 이유도 있을거 같다.
그리고 단어를 단순히 숫자로만 변경했다. w2v를 이용하면 성능이 좋아질거 같은데 귀찮...