# 뉴스 카테고리 다중분류

### 데이터 로드 및 전처리

In [None]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

#### 훈련 데이터와 테스트 데이터 로드

In [None]:
# Load the Reuters newswire data limited to the 10,000 most frequent words,
# holding out 20% of the samples as the test split.
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=10000,
    test_split=0.2,
)

n_train_samples = len(x_train)
n_test_samples = len(x_test)
print('훈련 샘플의 수: {}'.format(n_train_samples))
print('테스트 샘플의 수: {}'.format(n_test_samples))

#### 데이터 출력

In [None]:
# Show the first encoded article of each split, followed by the two labels.
for first_item in (x_train[0], x_test[0], y_train[0], y_test[0]):
    print(first_item)

In [None]:
# Class ids start at 0, so the number of classes is the largest id plus one.
num_classes = np.max(y_train) + 1
print('클래스의 수 : {}'.format(num_classes))

#### 데이터 분포 확인

In [None]:
# Measure every training article once and reuse the lengths for the summary
# statistics and for the histogram.
train_lengths = [len(article) for article in x_train]
print('훈련용 뉴스의 최대 길이 :{}'.format(max(train_lengths)))
print('훈련용 뉴스의 평균 길이 :{}'.format(sum(train_lengths)/len(train_lengths)))

plt.hist(train_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Bar chart of the number of training articles per class.
fig, axe = plt.subplots(ncols=1, figsize=(11, 5))
sns.countplot(x=y_train, ax=axe)
plt.show()

In [None]:
# Row 0 holds the class ids, row 1 the number of training samples per class.
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
class_frequency = np.vstack((unique_elements, counts_elements))
print("각 클래스 빈도수:")
print(class_frequency)

#### 원본 뉴스 데이터로 복원
실습을 위해 정수 시퀀스로 변환된 데이터를 다시 텍스트로 복원   
단어를 key값으로, 고유한 정수를 value로 가지는 dictionary

In [None]:
# Dictionary that maps each word to its integer id.
word_index = reuters.get_word_index(path="reuters_word_index.json")
for probe_word in ('the', 'it'):
    print(word_index[probe_word])

In [None]:
# Invert the dictionary. Ids in the encoded articles are shifted by 3 because
# ids 0-2 are reserved for the special tokens registered in the next cell.
index_to_word = dict((rank + 3, token) for token, rank in word_index.items())
for probe_id in (4, 16):
    print(index_to_word[probe_id])

In [None]:
# Register the reserved ids, then decode the first training article as a check.
index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})
first_article_words = [index_to_word[token_id] for token_id in x_train[0]]
print(' '.join(first_article_words))

In [None]:
# Decode a short hand-written id sequence; ids missing from the lookup fall
# back to "<unk>".
seq = [4, 587, 23, 133, 6, 30, 515]
lookup = index_to_word.get
tokens = [lookup(token_id, "<unk>") for token_id in seq]
text = " ".join(tokens)
print(tokens)
print(text)

In [None]:
# Turn every training article from a list of word ids into space-joined text.
# Iterates the articles directly instead of indexing through range(len(...)).
decoded = [
    ' '.join(index_to_word[index] for index in article)
    for article in x_train
]

x_train = decoded
print(len(x_train))

In [None]:
# Turn every test article from a list of word ids into space-joined text.
# Iterates the articles directly instead of indexing through range(len(...)).
decoded = [
    ' '.join(index_to_word[index] for index in article)
    for article in x_test
]

x_test = decoded
print(len(x_test))

#### 벡터화 하기

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the vocabulary from the decoded training text and build the
# document-term count matrix; the fitted vectorizer is reused for the test set.
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)
print(x_train_dtm.shape)

In [None]:
# Re-weight the training counts with TF-IDF; the fitted transformer is reused
# for the test set so both splits share the same weights.
tfidf_transformer = TfidfTransformer()
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)
print(tfidfv.shape)

### 모델 학습 : 나이브 베이즈 분류기

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [None]:
# Fit multinomial Naive Bayes on the TF-IDF features of the training set.
model = MultinomialNB().fit(tfidfv, y_train)

# Vectorize the test text with the vocabulary and weights learned on training data.
x_test_dtm = dtmvector.transform(x_test)
tfidfv_test = tfidf_transformer.transform(x_test_dtm)

predicted = model.predict(tfidfv_test)
print("정확도:", accuracy_score(y_true=y_test, y_pred=predicted))

In [None]:
# Inspect the decoded text and the true label of test sample 3.
print(x_test[3])
print(y_test[3])

In [None]:
# Class-probability distribution predicted for test sample 3.
probability_3 = model.predict_proba(tfidfv_test[3])[0]

plt.rcParams["figure.figsize"] = (11,5)
plt.bar(model.classes_, probability_3)
# Derive the x-range from the fitted classes. The previous fixed upper limit
# of 21 cut every class id above 20 out of the plot.
plt.xlim(model.classes_.min() - 1, model.classes_.max() + 1)
plt.xticks(model.classes_)
plt.xlabel("Class")
plt.ylabel("Probability")
plt.show()

In [None]:
# Predicted class for test sample 3 (shown as the cell's output value).
model.predict(tfidfv_test[3])

#### F1-Score, Confusion Matrix

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Per-class precision/recall/F1; zero_division=0 reports 0 for undefined
# metrics instead of emitting a warning.
test_predictions = model.predict(tfidfv_test)
report_text = classification_report(y_test, test_predictions, zero_division=0)
print(report_text)

In [None]:
def graph_confusion_matrix(model, x_test, y_test, classes_name=None):
    """Draw the confusion matrix of ``model`` on a test set as an annotated heatmap.

    Args:
        model: Fitted classifier exposing ``predict``.
        x_test: Feature matrix passed to ``model.predict``.
        y_test: True labels for ``x_test``.
        classes_name: Optional sequence of display names used for both heatmap
            axes. When omitted, pandas' default integer positions are shown.

    Returns:
        None. The heatmap is drawn on a new 12x12 inch figure.
    """
    cm = confusion_matrix(y_test, model.predict(x_test))
    # index/columns of None fall back to the default RangeIndex.
    df_cm = pd.DataFrame(cm, index=classes_name, columns=classes_name)
    plt.figure(figsize=(12, 12))
    heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=12)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=12)
    plt.ylabel('label')
    plt.xlabel('predicted value')

graph_confusion_matrix(model, tfidfv_test, y_test)

---

### 다양한 머신러닝 모델 사용

#### Complement Naive Bayes Classifier(CNB)

In [None]:
# Complement Naive Bayes on the same TF-IDF features.
cb = ComplementNB()
predicted = cb.fit(tfidfv, y_train).predict(tfidfv_test)
print("정확도:", accuracy_score(y_true=y_test, y_pred=predicted))

#### 로지스틱 회귀(Logistic Regression)

In [None]:
# L2-penalised logistic regression; the very large C makes the penalty weak.
lr = LogisticRegression(penalty='l2', C=10000, max_iter=3000)
predicted = lr.fit(tfidfv, y_train).predict(tfidfv_test)
print("정확도:", accuracy_score(y_true=y_test, y_pred=predicted))

#### 선형 서포트 벡터 머신(Linear Support Vector Machine)

In [None]:
# Linear SVM with an L1 penalty, solved in the primal form (dual=False).
lsvc = LinearSVC(penalty='l1', dual=False, C=1000, max_iter=3000)
predicted = lsvc.fit(tfidfv, y_train).predict(tfidfv_test)
print("정확도:", accuracy_score(y_true=y_test, y_pred=predicted))

#### 결정 트리(Decision Tree)

In [None]:
# Single decision tree capped at depth 10, with a fixed seed.
tree = DecisionTreeClassifier(random_state=0, max_depth=10)
predicted = tree.fit(tfidfv, y_train).predict(tfidfv_test)
print("정확도:", accuracy_score(y_true=y_test, y_pred=predicted))

#### 랜덤 포레스트(Random Forest)

In [None]:
# Small random forest (5 trees) with a fixed seed.
forest = RandomForestClassifier(random_state=0, n_estimators=5)
predicted = forest.fit(tfidfv, y_train).predict(tfidfv_test)
print("정확도:", accuracy_score(y_true=y_test, y_pred=predicted))

#### 그래디언트 부스팅 트리(GradientBoostingClassifier)

In [None]:
# Gradient-boosted trees with default hyperparameters and a fixed seed.
grbt = GradientBoostingClassifier(random_state=0)
predicted = grbt.fit(tfidfv, y_train).predict(tfidfv_test)
print("정확도:", accuracy_score(y_true=y_test, y_pred=predicted))

#### 보팅(Voting)

In [None]:
# Soft-voting ensemble of logistic regression, Complement NB and gradient
# boosting. All three estimators accept scipy sparse input, so the TF-IDF
# matrix is fitted as-is rather than through a full dense copy. Training then
# uses the same sparse representation that predict receives below.
voting_classifier = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(penalty="l2", solver="liblinear", random_state=0)),
        ("cnb", ComplementNB()),
        ("gb", GradientBoostingClassifier(random_state=0)),
    ],
    voting="soft",
)
voting_classifier.fit(tfidfv, y_train)

predicted = voting_classifier.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

---

## Vocabulary Size X 다양한 머신러닝 모델
- Vocabulary Size : 5000, 3000, None
- 비교 모델
  - MultinomialNB, ComplementNB
  - LogisticRegression, LinearSVC
  - DecisionTreeClassifier, RandomForestClassifier
  - GradientBoostingClassifier
  - Voting(soft/hard)  



In [7]:
import numpy as np
import pandas as pd
import warnings, os, random
warnings.filterwarnings("ignore", category=UserWarning)

import tensorflow as tf
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, callbacks

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, confusion_matrix, classification_report

In [8]:
SEED = 0
np.random.seed(SEED)

# Rebuild the id -> word lookup. Word ids are shifted by 3 and ids 0-2 are
# then assigned to the special tokens.
word_index = reuters.get_word_index()
index_to_word = dict((idx + 3, w) for w, idx in word_index.items())
index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})
SPECIALS = {0, 1, 2}

# Convert integer sequences into whitespace-joined text.
def decode_sequences(seqs, drop_special=True):
    """Decode sequences of word ids into space-separated strings.

    Args:
        seqs: Iterable of sequences of integer word ids.
        drop_special: When true, ids listed in ``SPECIALS`` are skipped.

    Returns:
        A list with one decoded string per input sequence. Ids missing from
        ``index_to_word`` are rendered as ``"<unk>"``.
    """
    def _is_kept(token_id):
        return not (drop_special and token_id in SPECIALS)

    return [
        " ".join(index_to_word.get(t, "<unk>") for t in seq if _is_kept(t))
        for seq in seqs
    ]

- 학습 시간 문제로 코드 수정
  - GradientBoostingClassifier 제외
  - Voting - hard 제외, soft만 유지  
  - LogisticRegression - max_iter 축소, tol 추가
  - RandomForest - n_estimators 축소

In [9]:
# Model pipeline definitions.
# Every estimator used here accepts sparse input, so each pipeline starts with
# a pass-through step that leaves the feature matrix untouched.

def build_models(seed=SEED):
    """Create the classical classifiers and the soft-voting ensemble.

    Args:
        seed: ``random_state`` given to every estimator that accepts one.

    Returns:
        A tuple ``(models, voting_soft)``. ``models`` maps a display name to an
        unfitted pipeline; ``voting_soft`` is an unfitted soft
        ``VotingClassifier`` built from the LogReg, CNB and MultinomialNB
        pipelines.
    """
    def passthrough():
        # With func=None, FunctionTransformer is the identity transform. Unlike
        # a lambda, it stays picklable.
        return FunctionTransformer(accept_sparse=True)

    # multi_class is not passed: it is deprecated since scikit-learn 1.5, and
    # the default already selects the multinomial loss for the saga solver on
    # multiclass targets.
    log_reg = LogisticRegression(penalty="l2", solver="saga",
                                 max_iter=1000, tol=1e-3,
                                 random_state=seed, n_jobs=-1)
    random_forest = RandomForestClassifier(n_estimators=300,
                                           random_state=seed,
                                           n_jobs=-1)

    models = {"NB_Multinomial": make_pipeline(passthrough(), MultinomialNB()),
              "CNB": make_pipeline(passthrough(), ComplementNB()),
              "LogReg_L2": make_pipeline(passthrough(), log_reg),
              "LinearSVM": make_pipeline(passthrough(), LinearSVC(random_state=seed)),
              "DecisionTree": make_pipeline(passthrough(), DecisionTreeClassifier(random_state=seed)),
              "RandomForest": make_pipeline(passthrough(), random_forest)}

    # Voting(soft): restricted to the pipelines used here for probability
    # averaging (LR, CNB, NB). GradientBoosting and hard voting were dropped
    # to keep training time manageable.
    voting_soft = VotingClassifier(estimators=[("lr", models["LogReg_L2"]),
                                               ("cnb", models["CNB"]),
                                               ("nb", models["NB_Multinomial"])],
                                   voting="soft")
    return models, voting_soft

In [10]:
# Training / evaluation helpers
def eval_model(name, clf, Xtr, ytr, Xte, yte):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        name: Display name stored under the ``"model"`` key.
        clf: Estimator exposing ``fit`` and ``predict``.
        Xtr, ytr: Training features and labels.
        Xte, yte: Test features and labels.

    Returns:
        A dict with the keys ``model``, ``acc``, ``balanced_acc``,
        ``macro_f1`` and ``y_pred`` (the raw test predictions).
    """
    clf.fit(Xtr, ytr)
    y_pred = clf.predict(Xte)

    summary = {"model": name}
    summary["acc"] = accuracy_score(yte, y_pred)
    summary["balanced_acc"] = balanced_accuracy_score(yte, y_pred)
    summary["macro_f1"] = f1_score(yte, y_pred, average="macro")
    summary["y_pred"] = y_pred
    return summary

def confusion_summary(y_true, y_pred, topk=10, all_labels=None):
    """Build a confusion matrix and rank its largest off-diagonal rates.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        topk: Number of (row, column) index pairs to return.
        all_labels: Label order for the matrix. Defaults to the sorted union
            of the labels found in ``y_true`` and ``y_pred``.

    Returns:
        ``(cm, cm_norm, pairs, all_labels)``. ``cm`` is the count matrix and
        ``cm_norm`` is ``cm`` divided by its row totals (rows with a zero
        total stay 0). ``pairs`` is a ``(topk, 2)`` array of matrix indices
        ordered by descending off-diagonal rate.
    """
    if all_labels is None:
        all_labels = np.unique(np.concatenate([y_true, y_pred]))

    cm = confusion_matrix(y_true, y_pred, labels=all_labels)

    # Row-normalise, leaving rows without any true sample at zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.zeros(cm.shape, dtype=float)
    np.divide(cm, row_totals, out=cm_norm, where=row_totals != 0)

    # Zero the diagonal so only misclassifications are ranked.
    off_diagonal = cm_norm.copy()
    np.fill_diagonal(off_diagonal, 0.0)

    ranked = np.argsort(off_diagonal.ravel())[::-1][:topk]
    row_idx, col_idx = np.unravel_index(ranked, off_diagonal.shape)
    pairs = np.column_stack((row_idx, col_idx))
    return cm, cm_norm, pairs, all_labels

In [11]:
# Experiment driver (call it with a different num_words / vec_mode per run)
def run_once(num_words, vec_mode="tfidf", topk_pairs=10, seed=SEED):
    """Train and compare all classical models for one vocabulary size.

    Args:
        num_words: Vocabulary limit passed to ``reuters.load_data``.
        vec_mode: ``"tfidf"`` selects ``TfidfVectorizer``; any other value
            selects ``CountVectorizer``.
        topk_pairs: Number of most-confused class pairs to consider.
        seed: Random seed forwarded to ``build_models``.

    Returns:
        ``(df_res, results, (cm, cm_norm, labels))``: the result table sorted
        by macro-F1, the raw per-model result dicts, and the confusion-matrix
        data of the best model.
    """
    # Load the data
    train_split, test_split = reuters.load_data(num_words=num_words, test_split=0.2)
    xtr_ids, ytr = train_split
    xte_ids, yte = test_split

    # Restore text from the id sequences
    xtr_text = decode_sequences(xtr_ids, drop_special=True)
    xte_text = decode_sequences(xte_ids, drop_special=True)

    # Vectorize
    shared_opts = dict(ngram_range=(1,1), min_df=2, max_df=0.95)
    if vec_mode == "tfidf":
        vectorizer = TfidfVectorizer(sublinear_tf=True, **shared_opts)
    else:
        vectorizer = CountVectorizer(**shared_opts)

    Xtr = vectorizer.fit_transform(xtr_text)
    Xte = vectorizer.transform(xte_text)

    # Build the models; the voting ensemble is evaluated last
    models, voting_soft = build_models(seed=seed)
    candidates = list(models.items())
    candidates.append(("Voting_soft(LR+CNB+NB)", voting_soft))

    # Train & evaluate
    results = [eval_model(name, clf, Xtr, ytr, Xte, yte) for name, clf in candidates]

    # Print the summary table
    df_res = pd.DataFrame(results)
    df_res = df_res.sort_values("macro_f1", ascending=False)
    df_res = df_res.reset_index(drop=True)

    print(f"[num_words={num_words} | vec={vec_mode}] 결과 (Macro-F1 기준 내림차순)")
    print(df_res[["model", "acc", "balanced_acc", "macro_f1"]].to_string(index=False))

    # Confusion-matrix summary for the single best model
    best = max(results, key=lambda d: d["macro_f1"])
    cm, cm_norm, pairs, labels = confusion_summary(yte, best["y_pred"], topk=topk_pairs)

    print(f"\n[Best by macro-F1] {best['model']}의 혼동행렬 요약")
    print(f"- Confusion matrix shape: {cm.shape} (rows=true, cols=predicted)")
    print(f"\nTop-{topk_pairs} confused pairs (true -> pred: rate, count):")

    for i, j in pairs:
        count = cm[i, j]
        if count <= 0:
            # Skip pairs that never occurred.
            continue
        print(f"  {labels[i]} -> {labels[j]}: rate={cm_norm[i, j]:.3f}, count={count}")

    return df_res, results, (cm, cm_norm, labels)

In [12]:
# num_words=5000: compare all classical models on TF-IDF features.
df_5000, results_5000, cm_pack_5000 = run_once(num_words=5000, vec_mode="tfidf", topk_pairs=10, seed=SEED)



[num_words=5000 | vec=tfidf] 결과 (Macro-F1 기준 내림차순)
                 model      acc  balanced_acc  macro_f1
             LinearSVM 0.832146      0.634498  0.675920
          RandomForest 0.773375      0.418554  0.473705
          DecisionTree 0.703473      0.445195  0.447408
                   CNB 0.763580      0.404732  0.441942
             LogReg_L2 0.798753      0.394181  0.432609
Voting_soft(LR+CNB+GB) 0.732413      0.217712  0.247759
        NB_Multinomial 0.678094      0.122976  0.113883

[Best by macro-F1] LinearSVM의 혼동행렬 요약
- Confusion matrix shape: (46, 46) (rows=true, cols=predicted)

Top-10 confused pairs (true -> pred: rate, count):
  5 -> 1: rate=1.000, count=5
  40 -> 19: rate=0.700, count=7
  42 -> 25: rate=0.667, count=2
  37 -> 4: rate=0.500, count=1
  41 -> 4: rate=0.500, count=4
  14 -> 11: rate=0.500, count=1
  36 -> 11: rate=0.455, count=5
  15 -> 28: rate=0.444, count=4
  38 -> 1: rate=0.333, count=1
  35 -> 11: rate=0.333, count=2


In [13]:
# num_words=3000: compare all classical models on TF-IDF features.
df_3000, results_3000, cm_pack_3000 = run_once(num_words=3000, vec_mode="tfidf", topk_pairs=10, seed=SEED)



[num_words=3000 | vec=tfidf] 결과 (Macro-F1 기준 내림차순)
                 model      acc  balanced_acc  macro_f1
             LinearSVM 0.834372      0.637980  0.682639
          RandomForest 0.782725      0.442724  0.491461
             LogReg_L2 0.802760      0.423390  0.467768
          DecisionTree 0.695459      0.452602  0.448998
                   CNB 0.756901      0.404111  0.438829
Voting_soft(LR+CNB+GB) 0.750223      0.268574  0.298092
        NB_Multinomial 0.695013      0.157038  0.163511

[Best by macro-F1] LinearSVM의 혼동행렬 요약
- Confusion matrix shape: (46, 46) (rows=true, cols=predicted)

Top-10 confused pairs (true -> pred: rate, count):
  5 -> 1: rate=1.000, count=5
  40 -> 19: rate=0.700, count=7
  41 -> 4: rate=0.500, count=4
  14 -> 11: rate=0.500, count=1
  37 -> 4: rate=0.500, count=1
  36 -> 11: rate=0.455, count=5
  15 -> 28: rate=0.333, count=3
  38 -> 1: rate=0.333, count=1
  38 -> 16: rate=0.333, count=1
  35 -> 11: rate=0.333, count=2


In [14]:
# num_words=None: no vocabulary limit is passed to the loader.
df_None, results_None, cm_pack_None = run_once(num_words=None, vec_mode="tfidf", topk_pairs=10, seed=SEED)



[num_words=None | vec=tfidf] 결과 (Macro-F1 기준 내림차순)
                 model      acc  balanced_acc  macro_f1
             LinearSVM 0.831256      0.631417  0.667421
          DecisionTree 0.707035      0.471876  0.461522
                   CNB 0.758682      0.408033  0.454552
          RandomForest 0.761799      0.390794  0.447817
             LogReg_L2 0.788958      0.340861  0.373389
Voting_soft(LR+CNB+GB) 0.684773      0.137222  0.155720
        NB_Multinomial 0.632235      0.086281  0.084486

[Best by macro-F1] LinearSVM의 혼동행렬 요약
- Confusion matrix shape: (46, 46) (rows=true, cols=predicted)

Top-10 confused pairs (true -> pred: rate, count):
  5 -> 1: rate=1.000, count=5
  42 -> 25: rate=0.667, count=2
  40 -> 19: rate=0.600, count=6
  37 -> 4: rate=0.500, count=1
  41 -> 4: rate=0.500, count=4
  14 -> 11: rate=0.500, count=1
  36 -> 11: rate=0.455, count=5
  15 -> 28: rate=0.444, count=4
  17 -> 16: rate=0.417, count=5
  35 -> 11: rate=0.333, count=2


- 모델 비교
  - LinearSVM
    - 고차원 희소 벡터(BoW/TF-IDF)에 강한 선형 분류기
    - accuracy(0.83±) 대비 balanced_acc(0.63±)·macro-F1(0.67~0.68) 간 격차는 클래스 불균형이 원인으로 추정   
     다수 클래스는 잘 맞추지만 소수 클래스 리콜은 떨어진다는 신호
  - LogisticRegression
    - 정확도는 준수(0.79~0.80)하지만 macro-F1은 SVM보다 낮음
      - 동일한 선형 계열이라도 정규화/최적화 차이와 다중클래스 로스(softmax)의 특성상 소수 클래스에서 결정경계가 조금 더 보수적으로 형성되기 쉬움
      - 다중 클래스 예측은 별도의 임계값 없이 확률이 가장 큰 클래스를 고르는 argmax 규칙이므로, 기본 설정만으로는 소수 클래스 리콜을 끌어올리기 어려움 → class_weight='balanced', C 튜닝으로 개선 여지
  - Naive Bayes (Multinomial/Complement)
    - 빠르고 투박: 다수 클래스에 유리, 소수 클래스 리콜이 낮아 macro-F1이 낮음
    - 결과에서 볼 수 있듯이 ComplementNB가 MultinomialNB보다 일반적으로 불균형에 더 강함
  - RandomForest / DecisionTree
    - 희소·고차원에서 트리 분기는 정보이득이 분산되어 성능/균형 정확도 낮음(macro-F1 0.44~0.49)
    - RF가 DT보다 낫지만 여전히 선형 모델 대비 불리
  - Voting_soft
    - LR+CNB+NB라서 약한 NB 2개 + LR 1개 조합이 되어 LR 단독보다 못함
    - 확률교정/가중치 부재: LR만 강하고 NB는 약하니 평균하면 신호가 희석

- 단어 수(num_words)에 따른 경향
  - 3000 > 5000 ≥ None (macro-F1 기준)
    - 3000: 희소성과 노이즈의 균형이 가장 좋음 → 최고 macro-F1(0.6826)
    - 5000: 약간의 노이즈 증가로 소수 클래스 리콜이 아주 약간 희생
    - None(전체): 희귀 토큰 과다 → 분산 증가, 소수 클래스에서 오히려 성능 하락

- 혼동행렬 Top-10
  - 공통적으로 5→1, 40→19, 41→4, 36→11, 15→28 등이 자주 등장
    - 테스트 표본 수가 적거나, 어휘가 유사해 결정경계가 잘 안 갈라질 가능성이 높음

## Vocabulary Size X 딥러닝 모델
- Vocabulary Size : 3000, 5000, 10000
- 비교 모델 : cnn, bilstm
- 최대 시퀀스 길이 : 200, 300, 500

In [2]:
# Training hyperparameters shared by every deep-learning run
SEED = 42
EPOCHS = 12
BATCH_SIZE = 128
EMBED_DIM = 128

# Experiment grid
NUM_WORDS_LIST = [3000, 5000, 10000]
USE_MODELS = ["cnn", "bilstm"]
MAXLEN_LIST = [200, 300, 500]
PRINT_CONFUSION_PER_RUN = False  # whether to print the Top-10 confused pairs

# Seed every random source used by the notebook
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# DL models
def build_cnn(vocab_size, maxlen, embed_dim=128, filters=128, ksz=5, dp=0.3, classes=46):
    """Build and compile a 1-D convolutional text classifier.

    Architecture: Embedding -> Conv1D(relu, same padding) ->
    GlobalMaxPooling1D -> Dropout -> Dense(128, relu) -> Dense(softmax).

    Args:
        vocab_size: Input dimension of the embedding (largest token id + 1).
        maxlen: Length of the padded input sequences.
        embed_dim: Embedding dimension.
        filters: Number of convolution filters.
        ksz: Convolution kernel size.
        dp: Dropout rate applied after pooling.
        classes: Number of output classes.

    Returns:
        A Keras model compiled with Adam, sparse categorical cross-entropy
        and an accuracy metric.
    """
    inp = layers.Input(shape=(maxlen,), dtype="int32")
    # mask_zero is deliberately not set: Conv1D does not consume a mask, so
    # requesting one never changed the computation and only attached a mask
    # that the next layer discards.
    x = layers.Embedding(vocab_size, embed_dim)(inp)
    x = layers.Conv1D(filters, ksz, padding="same", activation="relu")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(dp)(x)
    x = layers.Dense(128, activation="relu")(x)
    out = layers.Dense(classes, activation="softmax")(x)
    model = models.Model(inp, out)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

def build_bilstm(vocab_size, maxlen, embed_dim=128, units=64, dp=0.3, classes=46):
    """Build and compile a bidirectional-LSTM text classifier.

    Architecture: Embedding(mask_zero) -> Bidirectional(LSTM, full sequence
    output) -> GlobalMaxPooling1D -> Dropout -> Dense(128, relu) ->
    Dense(softmax).

    Args:
        vocab_size: Input dimension of the embedding (largest token id + 1).
        maxlen: Length of the padded input sequences.
        embed_dim: Embedding dimension.
        units: Number of LSTM units per direction.
        dp: Dropout rate applied after pooling.
        classes: Number of output classes.

    Returns:
        A Keras model compiled with Adam, sparse categorical cross-entropy
        and an accuracy metric.
    """
    token_ids = layers.Input(shape=(maxlen,), dtype="int32")
    embedded = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(token_ids)

    recurrent = layers.LSTM(units, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
    sequence_states = layers.Bidirectional(recurrent)(embedded)

    pooled = layers.GlobalMaxPooling1D()(sequence_states)
    pooled = layers.Dropout(dp)(pooled)
    hidden = layers.Dense(128, activation="relu")(pooled)
    probabilities = layers.Dense(classes, activation="softmax")(hidden)

    network = models.Model(token_ids, probabilities)
    network.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return network

In [4]:
# Index -> word decode
# Word ids are shifted by 3; ids 0-2 are then assigned to the special tokens.
word_index = reuters.get_word_index()
index_to_word = dict((idx + 3, w) for w, idx in word_index.items())
index_to_word.update({0: "<pad>", 1: "<sos>", 2: "<unk>"})
SPECIALS = {0, 1, 2}

def decode_sequences(seqs, drop_special=True):
    """Decode sequences of word ids into space-separated strings.

    Args:
        seqs: Iterable of sequences of integer word ids.
        drop_special: When true, ids listed in ``SPECIALS`` are skipped.

    Returns:
        A list with one decoded string per input sequence. Ids missing from
        ``index_to_word`` are rendered as ``"<unk>"``.
    """
    decoded_texts = []
    for seq in seqs:
        if drop_special:
            kept_ids = (t for t in seq if t not in SPECIALS)
        else:
            kept_ids = seq
        words = [index_to_word.get(t, "<unk>") for t in kept_ids]
        decoded_texts.append(" ".join(words))
    return decoded_texts

In [5]:
# Confusion summary (Top-K) + formatter
def confusion_summary(y_true, y_pred, topk=10, all_labels=None):
    """Build a confusion matrix and rank its largest off-diagonal rates.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        topk: Number of (row, column) index pairs to return.
        all_labels: Label order for the matrix. Defaults to the sorted union
            of the labels found in ``y_true`` and ``y_pred``.

    Returns:
        ``(cm, cm_norm, pairs, all_labels)``. ``cm`` is the count matrix and
        ``cm_norm`` is ``cm`` divided by its row totals (rows with a zero
        total stay 0). ``pairs`` is a ``(topk, 2)`` array of matrix indices
        ordered by descending off-diagonal rate.
    """
    if all_labels is None:
        all_labels = np.unique(np.concatenate([y_true, y_pred]))

    cm = confusion_matrix(y_true, y_pred, labels=all_labels)

    # Row-normalise, leaving rows without any true sample at zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.zeros(cm.shape, dtype=float)
    np.divide(cm, row_totals, out=cm_norm, where=row_totals != 0)

    # Zero the diagonal so only misclassifications are ranked.
    off_diagonal = cm_norm.copy()
    np.fill_diagonal(off_diagonal, 0.0)

    ranked = np.argsort(off_diagonal.ravel())[::-1][:topk]
    row_idx, col_idx = np.unravel_index(ranked, off_diagonal.shape)
    pairs = np.column_stack((row_idx, col_idx))
    return cm, cm_norm, pairs, all_labels

def format_pairs(cm, cm_norm, pairs, labels):
    """Render confused class pairs as printable lines.

    Args:
        cm: Confusion matrix of counts, indexed as ``cm[i, j]``.
        cm_norm: Row-normalised confusion matrix with the same layout.
        pairs: Iterable of ``(i, j)`` matrix indices to report.
        labels: Sequence mapping a matrix index to its class label.

    Returns:
        One line per pair whose count is positive, or a single placeholder
        line when no such pair exists.
    """
    lines = [
        f"{labels[i]} -> {labels[j]}: rate={cm_norm[i, j]:.3f}, count={cm[i, j]}"
        for i, j in pairs
        if cm[i, j] > 0
    ]
    return lines or ["(no off-diagonal confusions)"]

In [6]:
# Experiment driver
# Cache of TF-IDF + LogisticRegression test predictions keyed by
# (num_words, SEED). The baseline depends on neither use_model nor maxlen, so
# refitting it for every grid cell would only repeat the same work.
_LR_BASELINE_CACHE = {}


def _tfidf_lr_baseline(num_words, xtr_ids, ytr, xte_ids):
    """Return test-set predictions of the TF-IDF + LogisticRegression baseline."""
    key = (num_words, SEED)
    if key not in _LR_BASELINE_CACHE:
        xtr_text = decode_sequences(xtr_ids, drop_special=True)
        xte_text = decode_sequences(xte_ids, drop_special=True)

        tfidf = TfidfVectorizer(ngram_range=(1,1), min_df=2, max_df=0.95, sublinear_tf=True)
        Xtr_tfidf = tfidf.fit_transform(xtr_text)
        Xte_tfidf = tfidf.transform(xte_text)

        lr = LogisticRegression(penalty="l2", solver="saga", max_iter=2000, random_state=SEED)
        lr.fit(Xtr_tfidf, ytr)
        _LR_BASELINE_CACHE[key] = lr.predict(Xte_tfidf)
    # Hand out a copy so callers cannot alter the cached predictions.
    return _LR_BASELINE_CACHE[key].copy()


def run_one(num_words, use_model, maxlen):
    """Train one deep model and compare it with the TF-IDF + LR baseline.

    Args:
        num_words: Vocabulary limit passed to ``reuters.load_data``.
        use_model: ``"cnn"`` or ``"bilstm"``.
        maxlen: Length every sequence is padded or truncated to.

    Returns:
        A dict with the run settings, accuracy / balanced accuracy / macro-F1
        for both models, their macro-F1 difference, and the raw labels and
        predictions under the ``_y_true``, ``_y_pred_dl`` and ``_y_pred_lr``
        keys.

    Raises:
        ValueError: If ``use_model`` is neither ``"cnn"`` nor ``"bilstm"``.
    """
    # Reject an unknown model name before doing any expensive work.
    if use_model not in ("cnn", "bilstm"):
        raise ValueError("USE_MODEL must be 'cnn' or 'bilstm'.")

    # Load data
    (xtr_ids, ytr), (xte_ids, yte) = reuters.load_data(num_words=num_words, test_split=0.2)

    # Pad for DL models
    Xtr_seq = pad_sequences(xtr_ids, maxlen=maxlen, padding="post", truncating="post")
    Xte_seq = pad_sequences(xte_ids, maxlen=maxlen, padding="post", truncating="post")

    # vocab_size (for the Embedding layer): largest token id in either split
    # plus one. The leading 0 keeps the result defined when every sequence is
    # empty.
    max_index = max([0] + [max(s) for ids in (xtr_ids, xte_ids) for s in ids if len(s)])
    vocab_size = max_index + 1
    n_classes = int(max(max(ytr), max(yte)) + 1)

    # Select and train the DL model
    if use_model == "cnn":
        dl_model = build_cnn(vocab_size, maxlen, EMBED_DIM, classes=n_classes)
    else:
        dl_model = build_bilstm(vocab_size, maxlen, EMBED_DIM, classes=n_classes)

    es = callbacks.EarlyStopping(monitor="val_accuracy", patience=2, restore_best_weights=True)
    rlr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-5)

    dl_model.fit(Xtr_seq, ytr, validation_split=0.1, epochs=EPOCHS,
                 batch_size=BATCH_SIZE, callbacks=[es, rlr], verbose=0)

    y_prob_dl = dl_model.predict(Xte_seq, verbose=0)
    y_pred_dl = np.argmax(y_prob_dl, axis=1)
    acc_dl = accuracy_score(yte, y_pred_dl)
    bacc_dl = balanced_accuracy_score(yte, y_pred_dl)
    f1m_dl = f1_score(yte, y_pred_dl, average="macro")

    # Classical baseline: TF-IDF + Logistic Regression on the text decoded
    # with the same num_words.
    y_pred_lr = _tfidf_lr_baseline(num_words, xtr_ids, ytr, xte_ids)

    acc_lr = accuracy_score(yte, y_pred_lr)
    bacc_lr = balanced_accuracy_score(yte, y_pred_lr)
    f1m_lr = f1_score(yte, y_pred_lr, average="macro")

    # Confusion Top-10 (DL / LR)
    cm_dl, cmn_dl, pairs_dl, labels_dl = confusion_summary(yte, y_pred_dl, topk=10)
    cm_lr, cmn_lr, pairs_lr, labels_lr = confusion_summary(yte, y_pred_lr, topk=10)
    lines_dl = format_pairs(cm_dl, cmn_dl, pairs_dl, labels_dl)
    lines_lr = format_pairs(cm_lr, cmn_lr, pairs_lr, labels_lr)

    # Compact per-run print
    print(f"\n=== num_words={num_words}, use_model={use_model}, maxlen={maxlen} ===")
    print(f"[DL-{use_model}] acc={acc_dl:.6f}  balanced_acc={bacc_dl:.6f}  macro_f1={f1m_dl:.6f}")
    if PRINT_CONFUSION_PER_RUN:
        print("Top-10 confused pairs (DL):")
        for ln in lines_dl:
            print(ln)

    print(f"[TFIDF+LR] acc={acc_lr:.6f}  balanced_acc={bacc_lr:.6f}  macro_f1={f1m_lr:.6f}")
    if PRINT_CONFUSION_PER_RUN:
        print("Top-10 confused pairs (LR):")
        for ln in lines_lr:
            print(ln)

    return {"num_words": num_words, "use_model": use_model, "maxlen": maxlen,
            "dl_acc": acc_dl, "dl_bal_acc": bacc_dl, "dl_macro_f1": f1m_dl,
            "lr_acc": acc_lr, "lr_bal_acc": bacc_lr, "lr_macro_f1": f1m_lr,
            "delta_macro_f1": f1m_dl - f1m_lr,
            "_y_true": yte, "_y_pred_dl": y_pred_dl, "_y_pred_lr": y_pred_lr}

In [7]:
# Run the full grid: vocabulary size (outer), model type, then sequence length.
rows = [
    run_one(nw, um, ml)
    for nw in NUM_WORDS_LIST
    for um in USE_MODELS
    for ml in MAXLEN_LIST
]

# Rank the runs by the deep model's macro-F1, breaking ties with the baseline's.
df = pd.DataFrame(rows)
df = df.sort_values(["dl_macro_f1","lr_macro_f1"], ascending=False)
df = df.reset_index(drop=True)

cols = ["num_words","use_model","maxlen", "dl_acc","dl_bal_acc","dl_macro_f1",
        "lr_acc","lr_bal_acc","lr_macro_f1", "delta_macro_f1"]

print("\n===== Summary (DL vs TFIDF+LR) — Macro-F1 기준 내림차순 (DL 우선) =====")
print(df[cols].to_string(index=False))


=== num_words=3000, use_model=cnn, maxlen=200 ===
[DL-cnn] acc=0.799644  balanced_acc=0.462937  macro_f1=0.481896
[TFIDF+LR] acc=0.801870  balanced_acc=0.416325  macro_f1=0.455547

=== num_words=3000, use_model=cnn, maxlen=300 ===
[DL-cnn] acc=0.805432  balanced_acc=0.488752  macro_f1=0.523166
[TFIDF+LR] acc=0.801870  balanced_acc=0.416325  macro_f1=0.455547

=== num_words=3000, use_model=cnn, maxlen=500 ===
[DL-cnn] acc=0.803206  balanced_acc=0.477127  macro_f1=0.507487
[TFIDF+LR] acc=0.801870  balanced_acc=0.416325  macro_f1=0.455547

=== num_words=3000, use_model=bilstm, maxlen=200 ===
[DL-bilstm] acc=0.754230  balanced_acc=0.247190  macro_f1=0.222489
[TFIDF+LR] acc=0.801870  balanced_acc=0.416325  macro_f1=0.455547

=== num_words=3000, use_model=bilstm, maxlen=300 ===
[DL-bilstm] acc=0.775601  balanced_acc=0.300459  macro_f1=0.280814
[TFIDF+LR] acc=0.801870  balanced_acc=0.416325  macro_f1=0.455547

=== num_words=3000, use_model=bilstm, maxlen=500 ===
[DL-bilstm] acc=0.763134  bal

- CNN vs BiLSTM
  - CNN 계열은 전체적으로 TFIDF+LR 대비 Macro-F1이 높고 안정적
    - CNN은 지역적인 패턴(ngram-like)을 잘 잡아주고 학습이 안정적
  - BiLSTM 계열은 모든 실험에서 TFIDF+LR보다 오히려 성능이 낮음
    - 특히 Macro-F1이 0.22~0.32 수준으로 크게 떨어짐
    - 학습 데이터 크기, 모델 구조, 하이퍼파라미터가 충분하지 않아 과적합 또는 학습 불안정이 발생했을 가능성
    - 데이터 양 부족 + 긴 시퀀스에서 gradient 문제로 학습이 불안정했을 가능성

- 단어 수(CNN 기준)
  - 3000 → 5000 단어로 늘렸을 때 Macro-F1이 개선됨
  - 10000 단어에서는 오히려 성능 하락 → 희소성 증가 + 데이터 부족 → 학습 불안정

- 문장 길이
  - CNN : 200 → 300 → 500 길이 확장 시 Macro-F1이 조금씩 증가하다가 500에서 최고치
  - BiLSTM : 길이에 따른 명확한 개선 없음, 오히려 길어질수록 성능이 더 떨어짐