In [16]:
# 필요한 모듈을 임포트하고, 데이터를 다운로드 하기

# data 및 전처리 
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score


# Vectorization 모듈
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


# 모델링 리스트
from sklearn.naive_bayes import MultinomialNB # 다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

# 조건 1 : 모든 단어로 단어장 구성하여 결과 보기

In [None]:
# Condition 1: load the Reuters dataset without a vocabulary cap (num_words=None),
# holding out 20% of the samples as the test split.
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    test_split=0.2,
    num_words=None,
)

# Show the first sample of each split as a sanity check; the samples are
# sequences of word indices at this point (they are decoded to text later).
for first_sample in (x_train[0], x_test[0]):
  print(first_sample)

In [6]:
# Convert the train/test samples from word-index sequences back to text.
# (Question from Yonghyun: why turn the sequences back into text? The
# CountVectorizer used for the DTM in the following cell is fitted on this
# decoded text.)

# Lookup table from index to word. Each index returned by get_word_index() is
# shifted by 3 so that 0, 1 and 2 remain free for the special tokens.
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = {position + 3: word for word, position in word_index.items()}

# Index 0 is <pad>, index 1 is <sos>, index 2 is <unk>.
index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})

# Sanity check: decode the first training sample.
print(' '.join(index_to_word[token_id] for token_id in x_train[0]))

# Decode every sample; x_train / x_test are rebound to lists of strings.
decoded_train = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_train
]
x_train = decoded_train

decoded_test = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_test
]
x_test = decoded_test

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
<sos> <unk> <unk> said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3


In [8]:
# Vectorize the decoded training text produced above.

# Document-term matrix (raw term counts).
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)

# TF-IDF weighting on top of the counts. Note that tfidf_matrix is the
# transformer object; the weighted matrix itself is x_train_tfidf.
tfidf_matrix = TfidfTransformer()
x_train_tfidf = tfidf_matrix.fit_transform(x_train_dtm)

# Both matrices are expected to report the same (documents, terms) shape.
for matrix in (x_train_dtm, x_train_tfidf):
  print(matrix.shape)


(8982, 9670)
(8982, 9670)


(8982, 9670)

In [9]:
# Build TF-IDF features for both the training and the test data.
# The vectorizer and the TF-IDF transformer are fitted on the training text
# only; the test text is transformed with those fitted objects.

# Term counts: fit + transform for train, transform only for test.
x_train_dtm = dtmvector.fit_transform(x_train)
x_test_dtm = dtmvector.transform(x_test)

# TF-IDF weights: fit + transform for train, transform only for test.
x_train_tfidf = tfidf_matrix.fit_transform(x_train_dtm)
x_test_tfidf = tfidf_matrix.transform(x_test_dtm)

In [10]:
# Containers for the per-model results: result_acc collects accuracy scores,
# result_f1score collects F1 scores.
result_acc, result_f1score = [], []

In [18]:
# Candidate classifiers.

# Naive Bayes variants
model_nb = MultinomialNB()
model_cnb = ComplementNB()

# Linear models
model_lr = LogisticRegression(penalty='l2', C=10000, max_iter=3000)
model_lsvc = LinearSVC(penalty='l1', C=1000, dual=False, max_iter=3000)

# Tree-based models
model_tree = DecisionTreeClassifier(random_state=27, max_depth=10)
model_forest = RandomForestClassifier(random_state=27, n_estimators=5)
model_grbt = GradientBoostingClassifier(verbose=3, random_state=27)

# Soft-voting ensemble over its own, separately constructed estimators.
voting_estimators = [
    ('lr', LogisticRegression(penalty='l2', C=10000, max_iter=3000)),
    ('cb', ComplementNB()),
    ('grbt', GradientBoostingClassifier(random_state=0)),
]
voting_classifier = VotingClassifier(estimators=voting_estimators, voting='soft')

# NOTE(review): model_tree is defined above but is not part of model_list, so
# the decision tree is never trained or scored - confirm whether that is intended.
model_list = [
    model_nb,
    model_cnb,
    model_lr,
    model_lsvc,
    model_forest,
    model_grbt,
    voting_classifier,
]

In [None]:
# Fit every model in model_list on the training TF-IDF matrix and score it on
# the test split.

# Rebuild the score lists inside this cell: appending to lists created in
# another cell means a second run of this cell doubles their length, and the
# DataFrame below then fails because the scores no longer match model_list.
result_acc = []      # accuracy per model, in model_list order
result_f1score = []  # weighted F1 per model, in model_list order

for model in model_list:
  model.fit(x_train_tfidf, y_train)
  y_pred = model.predict(x_test_tfidf)

  acc = accuracy_score(y_test, y_pred)
  f_score = f1_score(y_test, y_pred, average='weighted')

  result_acc.append(acc)
  result_f1score.append(f_score)

# Label the rows with the estimator class names; using the estimator objects
# themselves as the index renders each row label as a full estimator repr.
model_names = [type(model).__name__ for model in model_list]
result_df = pd.DataFrame(
    {'accuracy': result_acc, 'f1_score': result_f1score},
    index=model_names,
)
result_df

# 조건 2 : 단어장 크기 5000 진행

In [None]:
# Condition 2: reload the data with the vocabulary limited via num_words=5000.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)

# Decode the index sequences back to text. index_to_word is the lookup table
# built in an earlier cell.
decoded_train = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_train
]
x_train = decoded_train

decoded_test = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_test
]
x_test = decoded_test

# TF-IDF features: the vectorizer and transformer are refitted on this
# condition's training text, then applied unchanged to the test text.
x_train_dtm = dtmvector.fit_transform(x_train)
x_train_tfidf = tfidf_matrix.fit_transform(x_train_dtm)

x_test_dtm = dtmvector.transform(x_test)
x_test_tfidf = tfidf_matrix.transform(x_test_dtm)

# Per-model results, in model_list order.
result_acc = []      # accuracy
result_f1score = []  # weighted F1

# Candidate models (fresh instances for this condition).
model_nb = MultinomialNB()
model_cnb = ComplementNB()
model_lr = LogisticRegression(C=10000, penalty='l2', max_iter=3000)
model_lsvc = LinearSVC(C=1000, penalty='l1', max_iter=3000, dual=False)
model_tree = DecisionTreeClassifier(max_depth=10, random_state=27)
model_forest = RandomForestClassifier(n_estimators=5, random_state=27)
model_grbt = GradientBoostingClassifier(random_state=27, verbose=3)
voting_classifier = VotingClassifier(estimators=[
    ('lr', LogisticRegression(C=10000, max_iter=3000, penalty='l2')),
    ('cb', ComplementNB()),
    ('grbt', GradientBoostingClassifier(random_state=0)),
], voting='soft')

# NOTE(review): model_tree is defined but not included in model_list, so the
# decision tree is never evaluated - confirm whether that is intended.
model_list = [model_nb, model_cnb, model_lr, model_lsvc, model_forest, model_grbt, voting_classifier]

for model in model_list:
  model.fit(x_train_tfidf, y_train)
  y_pred = model.predict(x_test_tfidf)

  acc = accuracy_score(y_test, y_pred)
  f_score = f1_score(y_test, y_pred, average='weighted')

  result_acc.append(acc)
  result_f1score.append(f_score)

# Label the rows with the estimator class names; using the estimator objects
# themselves as the index renders each row label as a full estimator repr.
model_names = [type(model).__name__ for model in model_list]
result_df = pd.DataFrame(
    {'accuracy': result_acc, 'f1_score': result_f1score},
    index=model_names,
)
result_df


# 조건 3 : 단어장 크기 5000, skip_top = 5 진행

In [None]:
# Condition 3: reload the data with num_words=5000 and skip_top=5.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, skip_top=5, test_split=0.2)

# Decode the index sequences back to text. index_to_word is the lookup table
# built in an earlier cell.
decoded_train = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_train
]
x_train = decoded_train

decoded_test = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_test
]
x_test = decoded_test

# TF-IDF features: the vectorizer and transformer are refitted on this
# condition's training text, then applied unchanged to the test text.
x_train_dtm = dtmvector.fit_transform(x_train)
x_train_tfidf = tfidf_matrix.fit_transform(x_train_dtm)

x_test_dtm = dtmvector.transform(x_test)
x_test_tfidf = tfidf_matrix.transform(x_test_dtm)

# Per-model results, in model_list order.
result_acc = []      # accuracy
result_f1score = []  # weighted F1

# Candidate models (fresh instances for this condition).
model_nb = MultinomialNB()
model_cnb = ComplementNB()
model_lr = LogisticRegression(C=10000, penalty='l2', max_iter=3000)
model_lsvc = LinearSVC(C=1000, penalty='l1', max_iter=3000, dual=False)
model_tree = DecisionTreeClassifier(max_depth=10, random_state=27)
model_forest = RandomForestClassifier(n_estimators=5, random_state=27)
model_grbt = GradientBoostingClassifier(random_state=27, verbose=3)
voting_classifier = VotingClassifier(estimators=[
    ('lr', LogisticRegression(C=10000, max_iter=3000, penalty='l2')),
    ('cb', ComplementNB()),
    ('grbt', GradientBoostingClassifier(random_state=0)),
], voting='soft')

# NOTE(review): model_tree is defined but not included in model_list, so the
# decision tree is never evaluated - confirm whether that is intended.
model_list = [model_nb, model_cnb, model_lr, model_lsvc, model_forest, model_grbt, voting_classifier]

for model in model_list:
  model.fit(x_train_tfidf, y_train)
  y_pred = model.predict(x_test_tfidf)

  acc = accuracy_score(y_test, y_pred)
  f_score = f1_score(y_test, y_pred, average='weighted')

  result_acc.append(acc)
  result_f1score.append(f_score)

# Label the rows with the estimator class names; using the estimator objects
# themselves as the index renders each row label as a full estimator repr.
model_names = [type(model).__name__ for model in model_list]
result_df = pd.DataFrame(
    {'accuracy': result_acc, 'f1_score': result_f1score},
    index=model_names,
)
result_df


# 조건 4 : 단어장 크기 10000, skip_top = 3 진행

In [None]:
# Condition 4: reload the data with num_words=10000 and skip_top=3.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, skip_top=3, test_split=0.2)

# Decode the index sequences back to text. index_to_word is the lookup table
# built in an earlier cell.
decoded_train = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_train
]
x_train = decoded_train

decoded_test = [
    ' '.join(index_to_word[token_id] for token_id in sequence)
    for sequence in x_test
]
x_test = decoded_test

# TF-IDF features: the vectorizer and transformer are refitted on this
# condition's training text, then applied unchanged to the test text.
x_train_dtm = dtmvector.fit_transform(x_train)
x_train_tfidf = tfidf_matrix.fit_transform(x_train_dtm)

x_test_dtm = dtmvector.transform(x_test)
x_test_tfidf = tfidf_matrix.transform(x_test_dtm)

# Per-model results, in model_list order.
result_acc = []      # accuracy
result_f1score = []  # weighted F1

# Candidate models (fresh instances for this condition).
model_nb = MultinomialNB()
model_cnb = ComplementNB()
model_lr = LogisticRegression(C=10000, penalty='l2', max_iter=3000)
model_lsvc = LinearSVC(C=1000, penalty='l1', max_iter=3000, dual=False)
model_tree = DecisionTreeClassifier(max_depth=10, random_state=27)
model_forest = RandomForestClassifier(n_estimators=5, random_state=27)
model_grbt = GradientBoostingClassifier(random_state=27, verbose=3)
voting_classifier = VotingClassifier(estimators=[
    ('lr', LogisticRegression(C=10000, max_iter=3000, penalty='l2')),
    ('cb', ComplementNB()),
    ('grbt', GradientBoostingClassifier(random_state=0)),
], voting='soft')

# NOTE(review): model_tree is defined but not included in model_list, so the
# decision tree is never evaluated - confirm whether that is intended.
model_list = [model_nb, model_cnb, model_lr, model_lsvc, model_forest, model_grbt, voting_classifier]

for model in model_list:
  model.fit(x_train_tfidf, y_train)
  y_pred = model.predict(x_test_tfidf)

  acc = accuracy_score(y_test, y_pred)
  f_score = f1_score(y_test, y_pred, average='weighted')

  result_acc.append(acc)
  result_f1score.append(f_score)

# Label the rows with the estimator class names; using the estimator objects
# themselves as the index renders each row label as a full estimator repr.
model_names = [type(model).__name__ for model in model_list]
result_df = pd.DataFrame(
    {'accuracy': result_acc, 'f1_score': result_f1score},
    index=model_names,
)
result_df


# 조건 5: RNN을 이용한 딥러닝 모델과 비교

In [None]:
import tensorflow as tf

# Size of the embedding table. Loading with num_words=vocab_size below keeps
# the word indices inside the Embedding layer's input range.
# NOTE(review): 10000 mirrors the largest vocabulary used in the earlier
# conditions; the original cell never defined vocab_size - adjust if needed.
vocab_size = 10000

# The previous cells rebind x_train / x_test to decoded text strings, which
# pad_sequences cannot pad. Reload the integer index sequences for the RNN.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=vocab_size, test_split=0.2)

# Padded sequence length, derived from the training sequence lengths
# (mean + 2 standard deviations).
# NOTE(review): the original cell never defined max_len; this heuristic is the
# reviewer's choice - tune it as needed.
sequence_lengths = [len(sequence) for sequence in x_train]
max_len = int(np.mean(sequence_lengths) + 2 * np.std(sequence_lengths))

# Pad (or truncate) every sample to max_len, inserting zeros at the front.
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                        value=0,
                                                        padding='pre',
                                                        maxlen=max_len)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                       value=0,
                                                       padding='pre',
                                                       maxlen=max_len)

# Dimensionality of each word embedding vector.
word_vector_dim = 200

# Embedding -> LSTM -> Dense classifier.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(128))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# There are 46 classes in total, so the final layer must produce 46 outputs.
model.add(tf.keras.layers.Dense(46, activation='softmax'))

model.summary()

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (the labels are passed as y_train / y_test without one-hot encoding), and
# accuracy as the reported metric.
model.compile(
    loss='SparseCategoricalCrossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 15 epochs with batches of 64 samples.
# NOTE(review): the test split is passed as validation data, so the reported
# validation scores are measured on the test set itself.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=15,
    verbose=1,
    validation_data=(x_test, y_test),
)