## 6. 뉴스 카테고리 다중분류-Project

In [1]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB #다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score #정확도 계산


from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

2023-06-28 18:03:19.185790: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = { index+3 : word for word, index in word_index.items() }

for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
    index_to_word[index]=token

In [3]:
dtmvector = CountVectorizer()
tfidf_transformer = TfidfTransformer()

def reuters_load_ml(num_words, index_to_word):
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=num_words, test_split=0.2)
    
    decoded = []
    for i in range(len(x_train)):
        t = ' '.join([index_to_word[index] for index in x_train[i]])
        decoded.append(t)

    x_train = decoded
    
    decoded = []
    for i in range(len(x_test)):
        t = ' '.join([index_to_word[index] for index in x_test[i]])
        decoded.append(t)

    x_test = decoded
    
    x_train_dtm = dtmvector.fit_transform(x_train)
    x_train = tfidf_transformer.fit_transform(x_train_dtm)
    x_test_dtm = dtmvector.transform(x_test) #테스트 데이터를 DTM으로 변환
    x_test = tfidf_transformer.transform(x_test_dtm) #DTM을 TF-IDF 행렬로 변환
    
    return x_train, y_train, x_test, y_test

In [4]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def graph_confusion_matrix(model, x_test, y_test):#, classes_name):
  df_cm = pd.DataFrame(confusion_matrix(y_test, model.predict(x_test)))#, index=classes_name, columns=classes_name)
  fig = plt.figure(figsize=(12,12))
  heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
  heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=12)
  heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=12)
  plt.ylabel('label')
  plt.xlabel('predicted value')

In [5]:
def fit_ml(x_train, y_train, x_test, y_test):
    nb = MultinomialNB()
    nb.fit(x_train, y_train)
    predicted = nb.predict(x_test) #테스트 데이터에 대한 예측
    print("NB 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교
    #graph_confusion_matrix(nb, x_test, y_test)
    
    cb = ComplementNB()
    cb.fit(x_train, y_train)
    predicted = cb.predict(x_test)
    print("CB 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교
    
    lr = LogisticRegression(C=10000, penalty='l2', max_iter=3000)
    lr.fit(x_train, y_train)
    predicted = lr.predict(x_test)
    print("LogisticRegression 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교
    
    lsvc = LinearSVC(C=1000, penalty='l1', max_iter=3000, dual=False)
    lsvc.fit(x_train, y_train)
    predicted = lsvc.predict(x_test)
    print("LinearSVC 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교
    
    tree = DecisionTreeClassifier(max_depth=10, random_state=0)
    tree.fit(x_train, y_train)
    predicted = tree.predict(x_test)
    print("DecisionTreeClassifier 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교
    
    forest = RandomForestClassifier(n_estimators=5, random_state=0)
    forest.fit(x_train, y_train)
    predicted = forest.predict(x_test)
    print("RandomForestClassifier 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교
    
    grbt = GradientBoostingClassifier(random_state=0) # verbose=3
    grbt.fit(x_train, y_train)
    predicted = grbt.predict(x_test)
    print("GradientBoostingClassifier 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교
    
    clf1 = LogisticRegression()
    clf2 = ComplementNB()
    clf3 = GradientBoostingClassifier(random_state=0)
    print("LogisticRegression 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

    voting_classifier = VotingClassifier(estimators=[('lr', clf1), ('gnb', clf2), ('dt', clf3)], voting='hard')
    voting_classifier.fit(x_train, y_train)
    predicted = voting_classifier.predict(x_test)
    print("VotingClassifier 정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교


In [6]:
x_train, y_train, x_test, y_test = reuters_load_ml(None, index_to_word)
fit_ml(x_train, y_train, x_test, y_test)

x_train, y_train, x_test, y_test = reuters_load_ml(10000, index_to_word)
fit_ml(x_train, y_train, x_test, y_test)

x_train, y_train, x_test, y_test = reuters_load_ml(50000, index_to_word)
fit_ml(x_train, y_train, x_test, y_test)

x_train, y_train, x_test, y_test = reuters_load_ml(5000, index_to_word)
fit_ml(x_train, y_train, x_test, y_test)

NB 정확도: 0.5997328584149599
CB 정확도: 0.7649154051647373
 LogisticRegression 정확도: 0.8161175422974176




 LinearSVC 정확도: 0.7943009795191451
 DecisionTreeClassifier 정확도: 0.6211041852181657
 RandomForestClassifier 정확도: 0.6544968833481746
 GradientBoostingClassifier 정확도: 0.7702582368655387
 LogisticRegression 정확도: 0.7702582368655387


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 VotingClassifier 정확도: 0.7960819234194123
NB 정확도: 0.5997328584149599
CB 정확도: 0.7649154051647373
 LogisticRegression 정확도: 0.8161175422974176




 LinearSVC 정확도: 0.786286731967943
 DecisionTreeClassifier 정확도: 0.6211041852181657
 RandomForestClassifier 정확도: 0.6544968833481746
 GradientBoostingClassifier 정확도: 0.7702582368655387
 LogisticRegression 정확도: 0.7702582368655387


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 VotingClassifier 정확도: 0.7960819234194123
NB 정확도: 0.6567230632235085
CB 정확도: 0.7707034728406055
 LogisticRegression 정확도: 0.8107747105966162




 LinearSVC 정확도: 0.7853962600178095
 DecisionTreeClassifier 정확도: 0.6202137132680321
 RandomForestClassifier 정확도: 0.674087266251113
 GradientBoostingClassifier 정확도: 0.7662511130899377
 LogisticRegression 정확도: 0.7662511130899377


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 VotingClassifier 정확도: 0.7991985752448798
NB 정확도: 0.6567230632235085
CB 정확도: 0.7707034728406055
 LogisticRegression 정확도: 0.8107747105966162




 LinearSVC 정확도: 0.7867319679430098
 DecisionTreeClassifier 정확도: 0.6202137132680321
 RandomForestClassifier 정확도: 0.674087266251113
 GradientBoostingClassifier 정확도: 0.7662511130899377
 LogisticRegression 정확도: 0.7662511130899377


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 VotingClassifier 정확도: 0.7991985752448798
NB 정확도: 0.5997328584149599
CB 정확도: 0.7649154051647373
 LogisticRegression 정확도: 0.8170080142475512




 LinearSVC 정확도: 0.7983081032947462
 DecisionTreeClassifier 정확도: 0.6219946571682992
 RandomForestClassifier 정확도: 0.6527159394479074
 GradientBoostingClassifier 정확도: 0.7707034728406055
 LogisticRegression 정확도: 0.7707034728406055


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 VotingClassifier 정확도: 0.7983081032947462
NB 정확도: 0.6731967943009796
CB 정확도: 0.7707034728406055
 LogisticRegression 정확도: 0.8032056990204809




 LinearSVC 정확도: 0.7711487088156723
 DecisionTreeClassifier 정확도: 0.6179875333926982
 RandomForestClassifier 정확도: 0.701246660730187
 GradientBoostingClassifier 정확도: 0.767586821015138
 LogisticRegression 정확도: 0.767586821015138


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 VotingClassifier 정확도: 0.8000890471950134


In [16]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

src_vocab = 10000
# Reuters 데이터셋 로드
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=src_vocab, test_split=0.2)

# 데이터 전처리
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
y_test = label_binarizer.fit_transform(y_test)

# 시퀀스 패딩
text_max_len = 150  # 시퀀스의 최대 길이 지정
x_train = pad_sequences(x_train, maxlen=max_sequence_length, padding = 'pre')
x_test = pad_sequences(x_test, maxlen=max_sequence_length)

embedding_dim = 128
hidden_size = 256

model = Sequential()
model.add(Embedding(src_vocab, embedding_dim))
model.add(LSTM(hidden_size,dropout = 0.4))
model.add(Dense(units=46, activation='softmax'))

# 모델 구성
#model = Sequential()
#model.add(Embedding(input_dim=10000, output_dim=100, input_length=max_sequence_length))
#model.add(LSTM(units=128))
#model.add(Dense(units=46, activation='softmax'))

# 모델 컴파일
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# 모델 학습
model.fit(x_train, y_train, batch_size=32, epochs=30, validation_data=(x_test, y_test))

# 테스트 데이터에 대한 예측
y_pred = model.predict(x_test)

# 분류 보고서 출력
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
report = classification_report(y_test_labels, y_pred_labels)
print(report)

Epoch 1/30


2023-06-28 21:50:49.068012: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-28 21:50:49.068976: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-28 21:50:49.069704: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-06-28 21:52:29.586129: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-28 21:52:29.587272: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-28 21:52:29.588248: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
 1/71 [..............................] - ETA: 28s

2023-06-28 22:45:22.545661: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-28 22:45:22.546755: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-28 22:45:22.547609: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

              precision    recall  f1-score   support

           0       0.31      0.42      0.36        12
           1       0.60      0.70      0.65       105
           2       0.44      0.55      0.49        20
           3       0.89      0.91      0.90       813
           4       0.84      0.77      0.80       474
           5       0.00      0.00      0.00         5
           6       0.75      0.43      0.55        14
           7       1.00      0.33      0.50         3
           8       0.52      0.63      0.57        38
           9       0.72      0.52      0.60        25
          10       0.77      0.67      0.71        30
          11       0.48      0.54      0.51        83
          12       0.25      0.31      0.28        13
          13       0.22      0.41      0.29        37
          14       0.17      0.50      0.25         2
          15       0.00      0.00      0.00         9
          16       0.53      0.53      0.53        99
          17       0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
