# 뉴스카테고리 다중분류[프로젝트]

![모델결과](./Reuters_results.png)

### 주어진 단어 수 수준에서 각 모델의 정확도와 F1 점수 
- ML모델인 선형 SVM이 모든 단어 수에서 일관되게 가장 높은 정확도와 F1 점수를 보임
- XGBoost와 KNN를 포함한 ML 모델이 상대적으로 우수한 성능을 보임 
- DL모델인 CNN+LSTM, LSTM, RNN과 같은 모델은 일반적으로 점수가 낮음 

In [43]:
import pandas as pd
df= pd.read_csv("./Reuters_classification_results_h.csv")
print (df)

    Word Count Linear SVM (Acc / F1) XGBoost (Acc / F1)   KNN (Acc / F1)  \
0  10000 words       0.8299 / 0.6808    0.7925 / 0.6387  0.7894 / 0.5903   
1   5000 words       0.8290 / 0.6813    0.7983 / 0.6396  0.7827 / 0.5818   
2    all words       0.8295 / 0.6887    0.7979 / 0.6546  0.7720 / 0.5769   

  Logistic Regression (Acc / F1) Decision Tree (Acc / F1)  \
0                0.7956 / 0.4721          0.6941 / 0.4580   
1                0.7979 / 0.4814          0.6901 / 0.4548   
2                0.7916 / 0.4514          0.7026 / 0.4577   

  Multinomial NB (Acc / F1) Random Forest (Acc / F1)   CNN (Acc / F1)  \
0           0.7711 / 0.4386          0.7560 / 0.4367  0.7217 / 0.2968   
1           0.7774 / 0.5096          0.7671 / 0.4561  0.7244 / 0.3085   
2           0.7226 / 0.2513          0.7400 / 0.4302  0.7177 / 0.3027   

  CNN+LSTM (Acc / F1)  LSTM (Acc / F1)   RNN (Acc / F1)  
0     0.6447 / 0.1153  0.6024 / 0.0734  0.4795 / 0.0311  
1     0.6901 / 0.1639  0.6051 / 0.0786  0

 text classification on the Reuters dataset using:
 - DTM, TF-IDF representations
 - Machine Learning models (Logistic Regression, SVM, etc.)
 - Word2Vec embedding
 - Deep Learning models (LSTM, CNN, CNN+LSTM, RNN)#
 - Evaluation with accuracy, F1, and confusion matrix visualizations

In [31]:
# 1. Imports and Setup

%pip install gensim
%pip install xgboost


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, SimpleRNN, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings('ignore')

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [32]:

# ## 2. Load Reuters Dataset

# Load data and word index
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=None)
#(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2) #5000
word_index = reuters.get_word_index()
index_word = {v + 3: k for k, v in word_index.items()}
for k, v in enumerate(("<pad>", "<sos>", "<unk>")): # Add special tokens at the beginning
  index_word[k] = v

# Decode to raw text for vectorization
X_train_text = [' '.join([index_word.get(i, '?') for i in seq]) for seq in X_train]
X_test_text = [' '.join([index_word.get(i, '?') for i in seq]) for seq in X_test]


In [33]:
# ## 3. ML: Vectorize Text 
# DTM Count Vectorizer
dtmvector = CountVectorizer() #Naive Bayes prefers DTM 
X_train_dtm = dtmvector.fit_transform(X_train_text)
X_test_dtm = dtmvector.transform(X_test_text)

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=None)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

In [34]:


# ## 4. ML Models Setup
ml_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "MultinomialNB": MultinomialNB(),
    "Linear SVM": LinearSVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    #"Extra Trees": ExtraTreesClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}


ml_results = {}

In [35]:
# ## 5. Confusion Matrix Plot Function

def plot_confusion_matrix(y_true, y_pred, model_name, labels=None):
    cm = confusion_matrix(y_true, y_pred)
    cm_norm = cm.astype('float') / cm.sum(axis=1, keepdims=True)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm_norm, cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()


In [36]:
# ## 6. Train and Evaluate ML Models

for name, model in ml_models.items():
    print(f"\nTraining {name}...")
    if name == "MultinomialNB":
        # MultinomialNB works better with DTM
        model.fit(X_train_dtm, y_train)
        y_pred = model.predict(X_test_dtm)
    else:
        # Other models use TF-IDF
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"{name} - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    print(classification_report(y_test, y_pred))
    #plot_confusion_matrix(y_test, y_pred, name)

    ml_results[name] = (acc, f1)


Training Logistic Regression...
Logistic Regression - Accuracy: 0.7916, F1 Score: 0.4514
              precision    recall  f1-score   support

           0       0.86      0.50      0.63        12
           1       0.65      0.83      0.73       105
           2       0.92      0.60      0.73        20
           3       0.92      0.94      0.93       813
           4       0.71      0.93      0.80       474
           5       0.00      0.00      0.00         5
           6       0.92      0.79      0.85        14
           7       1.00      0.33      0.50         3
           8       0.75      0.63      0.69        38
           9       0.96      0.88      0.92        25
          10       0.96      0.77      0.85        30
          11       0.61      0.81      0.69        83
          12       1.00      0.31      0.47        13
          13       0.68      0.62      0.65        37
          14       1.00      0.50      0.67         2
          15       0.00      0.00      0.00  

In [37]:
# ## 7. Prepare Data for DL

maxlen = 300
vocab_size = 100000
embedding_dim = 100

X_train_pad = pad_sequences(X_train, maxlen=maxlen)
X_test_pad = pad_sequences(X_test, maxlen=maxlen)
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

# ## 8. Train Word2Vec on Full Text

sentences = [text.split() for text in X_train_text + X_test_text]
w2v_model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

In [38]:
# ## 9. Define DL Models

def build_rnn():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False),
        SimpleRNN(64),
        Dropout(0.5),
        Dense(46, activation='softmax')
    ])
    return model

def build_lstm():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False),
        LSTM(64),
        Dropout(0.5),
        Dense(46, activation='softmax')
    ])
    return model

def build_cnn():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(46, activation='softmax')
    ])
    return model

def build_cnn_lstm():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(2),
        Dropout(0.5),
        LSTM(64),
        Dropout(0.5),
        Dense(46, activation='softmax')
    ])
    return model



In [39]:

# ## 10. Train and Evaluate DL Models

dl_models = {
    "LSTM": build_lstm,
    "CNN": build_cnn,
    "CNN + LSTM": build_cnn_lstm,
    "RNN": build_rnn
}

dl_results = {}

for name, builder in dl_models.items():
    print(f"\nTraining {name}...")
    model = builder()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    model.fit(X_train_pad, y_train_cat,
              validation_split=0.2,
              epochs=20,
              batch_size=64,
              callbacks=[early_stop],
              verbose=1)

    y_pred_prob = model.predict(X_test_pad)
    y_pred = np.argmax(y_pred_prob, axis=1)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"{name} - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    print(classification_report(y_test, y_pred))
    #plot_confusion_matrix(y_test, y_pred, name)

    dl_results[name] = (acc, f1)




Training LSTM...
Epoch 1/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 191ms/step - accuracy: 0.3422 - loss: 2.7957 - val_accuracy: 0.4819 - val_loss: 2.0674
Epoch 2/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 142ms/step - accuracy: 0.4864 - loss: 2.0942 - val_accuracy: 0.4864 - val_loss: 1.9835
Epoch 3/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 144ms/step - accuracy: 0.4953 - loss: 1.9982 - val_accuracy: 0.5309 - val_loss: 1.8083
Epoch 4/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 140ms/step - accuracy: 0.5351 - loss: 1.8700 - val_accuracy: 0.5609 - val_loss: 1.7613
Epoch 5/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 197ms/step - accuracy: 0.5530 - loss: 1.7913 - val_accuracy: 0.5008 - val_loss: 1.9916
Epoch 6/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 165ms/step - accuracy: 0.5208 - loss: 1.8767 - val_accuracy: 0.5665 - val_loss:

In [42]:
# ## 11. Combine and Display Results

all_results = {**ml_results, **dl_results}
sorted_results = sorted(all_results.items(), key=lambda x: x[1][1], reverse=True)

print("\n=== Combined Model Performance [all words]===")
print("{:<20} {:<10} {:<10}".format("Model", "Accuracy", "F1 Score"))
for name, (acc, f1) in sorted_results:
    print("{:<20} {:.4f}     {:.4f}".format(name, acc, f1))



=== Combined Model Performance [all words]===
Model                Accuracy   F1 Score  
Linear SVM           0.8295     0.6887
XGBoost              0.7979     0.6546
KNN                  0.7720     0.5769
Decision Tree        0.7026     0.4577
Logistic Regression  0.7916     0.4514
Random Forest        0.7400     0.4302
CNN                  0.7177     0.3027
MultinomialNB        0.7226     0.2513
LSTM                 0.6616     0.1533
CNN + LSTM           0.6336     0.1150
RNN                  0.4604     0.0262
