In [None]:
import numpy as np
import pandas as pd

from collections import defaultdict

from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)

from naive_bayes import NaiveBayes

SEED = 42

In [2]:
categories = [
    'sci.space',
    'comp.graphics',
    'soc.religion.christian'
]

X, y = fetch_20newsgroups(
    categories=categories,
    remove=('headers', 'footers', 'quotes') ,
    return_X_y=True,
)

In [None]:
target_names = categories

vectorizer = CountVectorizer(stop_words="english")
X_vec = vectorizer.fit_transform(X) 

vocab = vectorizer.get_feature_names_out()
vocab_set = set(vocab)

docs_by_class = defaultdict(list)
lengths_by_class = defaultdict(list)

for doc, label in zip(X, y):
    words = doc.split()
    lengths_by_class[target_names[label]].append(len(words))
    docs_by_class[target_names[label]].append(doc)


word_sets_by_class = {}
for class_name, docs in docs_by_class.items():
    class_text = " ".join(docs)
    tokens = vectorizer.build_analyzer()(class_text) 
    word_sets_by_class[class_name] = set(tokens)

print(f"üìÑ –û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: {len(X)}\n")

print("üìä –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ –∫–ª–∞—Å—Å–∞–º:")
for class_name in target_names:
    print(f"- {class_name}: {len(docs_by_class[class_name])} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
print()

print("üìè –î–ª–∏–Ω–∞ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ (–≤ —Å–ª–æ–≤–∞—Ö) –ø–æ –∫–ª–∞—Å—Å–∞–º:")
for class_name in target_names:
    lengths = lengths_by_class[class_name]
    print(f"- {class_name}:")
    print(f"  ‚Ä¢ –°—Ä–µ–¥–Ω—è—è –¥–ª–∏–Ω–∞ : {np.mean(lengths):.1f}")
    print(f"  ‚Ä¢ –ú–µ–¥–∏–∞–Ω–Ω–∞—è     : {np.median(lengths):.1f}")
    print(f"  ‚Ä¢ –ú–∏–Ω           : {np.min(lengths)}")
    print(f"  ‚Ä¢ –ú–∞–∫—Å          : {np.max(lengths)}")

    min_idx = np.argmin(lengths)
    max_idx = np.argmax(lengths)
    print(f"  ‚Ä¢ üîΩ –°–∞–º—ã–π –∫–æ—Ä–æ—Ç–∫–∏–π –¥–æ–∫—É–º–µ–Ω—Ç: ¬´{docs_by_class[class_name][min_idx][:100]}...¬ª")
    print(f"  ‚Ä¢ üîº –°–∞–º—ã–π –¥–ª–∏–Ω–Ω—ã–π –¥–æ–∫—É–º–µ–Ω—Ç : ¬´{docs_by_class[class_name][max_idx][:100]}...¬ª\n")

print("üî° –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤ (–ø–æ—Å–ª–µ CountVectorizer):")
for class_name in target_names:
    print(f"- {class_name}: {len(word_sets_by_class[class_name])} —Å–ª–æ–≤")

print(f"\nüß† –í—Å–µ–≥–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤ –≤–æ –≤—Å–µ—Ö –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ö: {len(vocab_set)}")


üìÑ –û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: 1776

üìä –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ –∫–ª–∞—Å—Å–∞–º:
- sci.space: 584 –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤
- comp.graphics: 593 –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤
- soc.religion.christian: 599 –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤

üìè –î–ª–∏–Ω–∞ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ (–≤ —Å–ª–æ–≤–∞—Ö) –ø–æ –∫–ª–∞—Å—Å–∞–º:
- sci.space:
  ‚Ä¢ –°—Ä–µ–¥–Ω—è—è –¥–ª–∏–Ω–∞ : 157.8
  ‚Ä¢ –ú–µ–¥–∏–∞–Ω–Ω–∞—è     : 62.0
  ‚Ä¢ –ú–∏–Ω           : 0
  ‚Ä¢ –ú–∞–∫—Å          : 9109
  ‚Ä¢ üîΩ –°–∞–º—ã–π –∫–æ—Ä–æ—Ç–∫–∏–π –¥–æ–∫—É–º–µ–Ω—Ç: ¬´...¬ª
  ‚Ä¢ üîº –°–∞–º—ã–π –¥–ª–∏–Ω–Ω—ã–π –¥–æ–∫—É–º–µ–Ω—Ç : ¬´Archive-name: jpeg-faq
Last-modified: 18 April 1993

This FAQ article discusses JPEG image compressi...¬ª

- comp.graphics:
  ‚Ä¢ –°—Ä–µ–¥–Ω—è—è –¥–ª–∏–Ω–∞ : 202.3
  ‚Ä¢ –ú–µ–¥–∏–∞–Ω–Ω–∞—è     : 82.0
  ‚Ä¢ –ú–∏–Ω           : 0
  ‚Ä¢ –ú–∞–∫—Å          : 6109
  ‚Ä¢ üîΩ –°–∞–º—ã–π –∫–æ—Ä–æ—Ç–∫–∏–π –¥–æ–∫—É–º–µ–Ω—Ç: ¬´...¬ª
  ‚Ä¢ üîº –°–∞–º—ã–π –¥–ª–∏–Ω–Ω—ã–π –¥–æ–∫—É–º–µ–Ω—Ç : ¬´COMMERCIAL SPACE NEWS/S

In [4]:
pipeline = make_pipeline(
    CountVectorizer(stop_words="english"), 
    NaiveBayes(alpha=1.0)
)

sk_pipeline = make_pipeline(
    CountVectorizer(stop_words="english"), 
    MultinomialNB(alpha=1.0)
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

In [5]:
def evaluate_model_with_cv(model, X, y, cv=5, average='macro'):
    scoring = {
        'accuracy':  make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average=average, zero_division=0),
        'recall':    make_scorer(recall_score, average=average, zero_division=0),
        'f1':        make_scorer(f1_score, average=average, zero_division=0)
    }

    scores = cross_validate(model, X, y, scoring=scoring, cv=cv, return_train_score=False)

    metrics = ['accuracy', 'precision', 'recall', 'f1']
    result_df = pd.DataFrame({m: scores[f'test_{m}'] for m in metrics})
    result_df.loc['mean'] = result_df.mean()

    print(f"üìä –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏–∏ ({cv.n_splits}-fold, —É—Å—Ä–µ–¥–Ω–µ–Ω–∏–µ: '{average}'):\n")
    print(result_df.round(4))

In [6]:
evaluate_model_with_cv(pipeline, X, y, cv=cv, average='macro')

üìä –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏–∏ (5-fold, —É—Å—Ä–µ–¥–Ω–µ–Ω–∏–µ: 'macro'):

      accuracy  precision  recall      f1
0       0.8933     0.9043  0.8930  0.8933
1       0.8986     0.9017  0.8985  0.8982
2       0.9183     0.9215  0.9181  0.9182
3       0.9127     0.9164  0.9121  0.9121
4       0.9099     0.9139  0.9094  0.9095
mean    0.9065     0.9116  0.9062  0.9063


In [7]:
evaluate_model_with_cv(sk_pipeline, X, y, cv=cv, average='macro')

üìä –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏–∏ (5-fold, —É—Å—Ä–µ–¥–Ω–µ–Ω–∏–µ: 'macro'):

      accuracy  precision  recall      f1
0       0.8933     0.9043  0.8930  0.8933
1       0.8986     0.9017  0.8985  0.8982
2       0.9183     0.9215  0.9181  0.9182
3       0.9127     0.9164  0.9121  0.9121
4       0.9099     0.9139  0.9094  0.9095
mean    0.9065     0.9116  0.9062  0.9063


In [8]:
%timeit cross_validate(pipeline, X, y, scoring='accuracy', cv=cv, return_train_score=False)

696 ms ¬± 1.77 ms per loop (mean ¬± std. dev. of 7 runs, 1 loop each)


In [9]:
%timeit cross_validate(sk_pipeline, X, y, scoring='accuracy', cv=cv, return_train_score=False)

707 ms ¬± 5.66 ms per loop (mean ¬± std. dev. of 7 runs, 1 loop each)
