In [42]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB #다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score #정확도 계산
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def makeDtmTfIdf(x_train, x_test):
  # train dtm vector만들기
  dtmvector = CountVectorizer()
  x_train_dtm = dtmvector.fit_transform(x_train)
  tfidf_transformer = TfidfTransformer()
  tfidfv = tfidf_transformer.fit_transform(x_train_dtm)

  x_test_dtm = dtmvector.transform(x_test) #테스트 데이터를 DTM으로 변환
  tfidfv_test = tfidf_transformer.transform(x_test_dtm) #DTM을 TF-IDF 행렬로 변환
  
  return tfidfv, tfidfv_test

def MultinomialNaiveBayesClassifier(x_train, y_train, x_test, y_test):
  
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  mod = MultinomialNB()
  mod.fit(tfidfv, y_train)
  predicted = mod.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Multinomial NB 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def ComplementNaiveBayesClasifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  cb = ComplementNB()
  cb.fit(tfidfv, y_train)
  predicted = cb.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Complement Naive Bayes 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교


def LogisticRegressionClassifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)

  lr = LogisticRegression(C=10000, penalty='l2')
  lr.fit(tfidfv, y_train)
  predicted = lr.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Logistic Regression 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def LinearSVM(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)

  lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
  lsvc.fit(tfidfv, y_train)

  predicted = lsvc.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("LinearSVM 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def DTClassifier(x_train, y_train, x_test, y_test):
  tree = DecisionTreeClassifier(max_depth=10, random_state=0)
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  tree.fit(tfidfv, y_train)
  predicted = tree.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Decesion Tree Classifier 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def RFClassifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  forest = RandomForestClassifier(n_estimators=5, random_state=0)
  forest.fit(tfidfv, y_train)
  predicted = forest.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("RandomForest 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def GBClassifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  grbt = GradientBoostingClassifier(random_state=0) # verbose=3
  grbt.fit(tfidfv, y_train)
  predicted = grbt.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Gradient Boosting 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def VTClassifier(x_train, y_train, x_test, y_test):
  tfidfv, tfidfv_test = makeDtmTfIdf(x_train, x_test)
  voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
        ], voting='soft', n_jobs=-1)
  voting_classifier.fit(tfidfv, y_train)

  predicted = voting_classifier.predict(tfidfv_test) #테스트 데이터에 대한 예측
  print("Voting Classifier 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

def RunAllClasifier(x_train, y_train, x_test, y_test):
  
  cf_list= [MultinomialNaiveBayesClassifier, ComplementNaiveBayesClasifier, 
            LogisticRegressionClassifier, LinearSVM, DTClassifier, RFClassifier, 
            GBClassifier]
  for f in cf_list:
    f(x_train, y_train, x_test, y_test)



def AddSpecialIndex(index_to_word, data):

  decoded = []
  for i in range(len(data)):
    t = ' '.join([index_to_word[index] for index in data[i]])
    decoded.append(t)

  return decoded


# 1. 모든 단어 사용


In [43]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = {index + 3 : word for word, index in word_index.items()}
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
    index_to_word[index]=token

x_train = AddSpecialIndex(index_to_word, x_train)
x_test = AddSpecialIndex(index_to_word, x_test)


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [44]:
RunAllClasifier(x_train, y_train, x_test, y_test)

Multinomial NB 정확도: 0.5997328584149599
Complement Naive Bayes 정확도: 0.7649154051647373


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 정확도: 0.813446126447017




LinearSVM 정확도: 0.780053428317008
Decesion Tree Classifier 정확도: 0.6211041852181657
RandomForest 정확도: 0.6544968833481746
Gradient Boosting 정확도: 0.7684772929652716


# 빈도수 상위 5000개의 단어만 사용


In [None]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = {index + 3 : word for word, index in word_index.items()}
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
    index_to_word[index]=token

x_train = AddSpecialIndex(index_to_word, x_train)
x_test = AddSpecialIndex(index_to_word, x_test)
RunAllClasifier(x_train, y_train, x_test, y_test)


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


Multinomial NB 정확도: 0.6731967943009796
Complement Naive Bayes 정확도: 0.7707034728406055


# 딥러닝 모델 적용하기 (LSTM 사용)


In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

vocab_size = 

def train_LSTM(X_train, y_train):

    model = Sequential()
    model.add(Embedding(vocab_size, 100))
    model.add(LSTM(128))
    model.add(Dense(46, activation='sigmoid')) # 46개의 class를 분류해야한다.

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
    mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

    model.compile(optimizer='rmsprop', loss='CategoricalCrossentropy', metrics=['acc'])
    history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)

    return model, history