In [47]:
import numpy as np
import pandas as pd


In [48]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train',
                                        remove=('headers', 'footers', 'quotes'),
                                        categories=categories)
  
newsgroups_test = fetch_20newsgroups(subset='test',
                                       remove=('headers', 'footers', 'quotes'),
                                        categories=categories)

In [49]:
print(len(newsgroups_train.data), len(newsgroups_test.data))

2034 1353


In [50]:
X_train = newsgroups_train.data
y_train = newsgroups_train.target

X_test = newsgroups_test.data
y_test = newsgroups_test.target

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000, min_df=5, max_df=0.5)
# 5개 미만의 문서에서 나타나는 단어 특성 제외
# 문서의 50%를 초과해 나타나는 단어 제외

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [52]:
for word, count in zip(
    cv.get_feature_names_out()[:100], X_train_cv[0].toarray()[0, :100]
):
  print(word, ':', count, end=',')

00 : 0,000 : 0,01 : 0,04 : 0,05 : 0,10 : 0,100 : 0,1000 : 0,11 : 0,12 : 0,128 : 0,129 : 0,13 : 0,130 : 0,14 : 0,15 : 0,16 : 0,17 : 0,18 : 0,19 : 0,1987 : 0,1988 : 0,1989 : 0,1990 : 0,1991 : 0,1992 : 0,1993 : 0,20 : 0,200 : 0,202 : 0,21 : 0,22 : 0,23 : 0,24 : 0,25 : 0,256 : 0,26 : 0,27 : 0,28 : 0,2d : 0,30 : 0,300 : 0,31 : 0,32 : 0,33 : 0,34 : 0,35 : 0,39 : 0,3d : 0,40 : 0,400 : 0,42 : 0,45 : 0,50 : 0,500 : 0,60 : 0,600 : 0,65 : 0,70 : 0,75 : 0,80 : 0,800 : 0,90 : 0,900 : 0,91 : 0,92 : 0,93 : 0,95 : 0,_the : 0,ability : 0,able : 1,abortion : 0,about : 1,above : 0,absolute : 0,absolutely : 0,ac : 0,accept : 0,acceptable : 0,accepted : 0,access : 0,according : 0,account : 0,accurate : 0,across : 0,act : 0,action : 0,actions : 0,active : 0,activities : 0,activity : 0,acts : 0,actual : 0,actually : 0,ad : 0,add : 0,added : 0,addition : 0,additional : 0,address : 0,

# Naive bayes

In [53]:
from sklearn.naive_bayes import MultinomialNB

NB_clf = MultinomialNB()

NB_clf.fit(X_train_cv, y_train)

print('train set score : {:.3f}'.format(NB_clf.score(X_train_cv, y_train)))

print('test set score : {:.3f}'.format(NB_clf.score(X_test_cv, y_test)))

train set score : 0.824
test set score : 0.732


In [54]:
pred = NB_clf.predict(X_test_cv[:2])

print(pred)
print(newsgroups_train.target_names[pred[0]])
print(newsgroups_train.target_names[pred[1]])

[2 1]
sci.space
comp.graphics


개선점

1. max_features, min_df, max_df 등 countVectorizer의 매개변수 변경
2. 나이브베이즈 매개변수 조절. alpha 값을 늘리면 통계데이터가 완만해지고 복잡도가 낮아짐
3. 다른 머신러닝 알고리즘 시도

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.5)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

NB_clf.fit(X_train_tfidf, y_train)

print('train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train)))
print('test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test)))

train set score: 0.862
test set score: 0.741


In [56]:
# 네 카테고리별로 영향을 많이 미친 특성 혹은 단어 추출

import numpy as np

def top10_features(classifier, vectorizer, categories):
  feature_names = np.asarray(vectorizer.get_feature_names_out())

  for i, category in enumerate(categories):
    # 역순 정렬 위해 계수에 음수를 취하여 정렬
    top10 = np.argsort(-classifier.coef_[i])[:10]
    print("%s: %s" % (category, ", ".join(feature_names[top10])))

top10_features(NB_clf, tfidf, newsgroups_train.target_names)

alt.atheism: you, not, are, be, this, have, as, what, they, if
comp.graphics: you, on, graphics, this, have, any, can, or, with, thanks
sci.space: space, on, you, be, was, this, as, they, have, are
talk.religion.misc: you, not, he, are, as, this, be, god, was, they




# logistic regression

In [57]:
from sklearn.linear_model import LogisticRegression

LR_clf = LogisticRegression()

LR_clf.fit(X_train_tfidf, y_train)

print('train set score: {:.3f}'.format(LR_clf.score(X_train_tfidf, y_train)))
print('test set score: {:.3f}'.format(LR_clf.score(X_test_tfidf, y_test)))

train set score: 0.930
test set score: 0.734


과적합 방지
1. 특성의 수를 줄이는 방법 (텍스트 분석에서는 특성이 많음에도 좋은 성능을 보이는 경우가 많음)
2. 정규화(릿지, 라쏘 등)

In [58]:
from sklearn.linear_model import RidgeClassifier

ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train_tfidf, y_train)

print('train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

train set score: 0.960
test set score: 0.735


In [59]:
# 원초적인 그리드 서치

import numpy as np
from sklearn.model_selection import train_test_split

X_train_ridge, X_val_ridge, y_train_ridge, y_val_ridge = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

max_score = 0
max_alpha = 0

for alpha in np.arange(0.1, 10, 0.1):
  ridge_clf = RidgeClassifier(alpha=alpha)
  ridge_clf.fit(X_train_ridge, y_train_ridge)

  score = ridge_clf.score(X_val_ridge, y_val_ridge)

  if score > max_score:
    max_score = score
    max_alpha = alpha

print('max alpha {:.3f} at max validation score {:.3f}'.format(max_alpha, max_score))

max alpha 1.600 at max validation score 0.826


In [60]:
ridge_clf = RidgeClassifier(alpha=0.8)
ridge_clf.fit(X_train_tfidf, y_train)

print('train set score {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('test set score {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

train set score 0.962
test set score 0.729


In [61]:
top10_features(ridge_clf, tfidf, newsgroups_train.target_names)

alt.atheism: bobby, religion, motto, punishment, atheists, atheism, satan, liar, deletion, islamic
comp.graphics: graphics, computer, 42, 3d, file, hi, using, image, screen, looking
sci.space: space, orbit, sci, spacecraft, funding, 23, engineering, nick, sounds, star
talk.religion.misc: blood, christian, fbi, christians, order, hudson, objective, abortion, dead, children


In [62]:
# 라쏘

lasso_clf = LogisticRegression(penalty='l1', solver='liblinear', C=1)

lasso_clf.fit(X_train_tfidf, y_train)

print('train set score: {:.3f}'.format(lasso_clf.score(X_train_tfidf, y_train)))
print('test set score: {:.3f}'.format(lasso_clf.score(X_test_tfidf, y_test)))
print(np.sum(lasso_clf.coef_ != 0), X_train_tfidf.shape[1])
# 특성 선택. 2000개에서 377로 줄임

train set score: 0.819
test set score: 0.724
437 2000


# 결정트리를 이용한 분류

In [63]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

tree = DecisionTreeClassifier(random_state=7)
tree.fit(X_train_tfidf, y_train)

forest = RandomForestClassifier(random_state=7)
forest.fit(X_train_tfidf, y_train)

gb = GradientBoostingClassifier(random_state=7)
gb.fit(X_train_tfidf, y_train)

print('tree train set score: {:.3f}'.format(tree.score(X_train_tfidf, y_train)))
print('tree test set score: {:.3f}'.format(tree.score(X_test_tfidf, y_test)))

print('forest train set score: {:.3f}'.format(forest.score(X_train_tfidf, y_train)))
print('forest test set score: {:.3f}'.format(forest.score(X_test_tfidf, y_test)))

print('gb train set score: {:.3f}'.format(gb.score(X_train_tfidf, y_train)))
print('gb test set score: {:.3f}'.format(gb.score(X_test_tfidf, y_test)))


tree train set score: 0.977
tree test set score: 0.536
forest train set score: 0.977
forest test set score: 0.685
gb train set score: 0.933
gb test set score: 0.696


In [65]:
# coef_ 대신 feature_importances_를 제공

sorted_feature_importances = sorted(
    zip(tfidf.get_feature_names_out(), gb.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)

for feature, value in sorted_feature_importances[:40]:
  print('%s: %.3f' % (feature, value), end=', ')

space: 0.126, graphics: 0.080, atheism: 0.024, thanks: 0.023, file: 0.021, orbit: 0.020, jesus: 0.018, god: 0.018, hi: 0.017, nasa: 0.015, image: 0.015, files: 0.014, christ: 0.010, moon: 0.010, bobby: 0.010, launch: 0.010, looking: 0.010, christian: 0.010, atheists: 0.009, christians: 0.009, fbi: 0.009, 3d: 0.008, you: 0.008, not: 0.008, islamic: 0.007, religion: 0.007, spacecraft: 0.007, flight: 0.007, computer: 0.007, islam: 0.007, ftp: 0.006, color: 0.006, software: 0.005, atheist: 0.005, card: 0.005, people: 0.005, koresh: 0.005, his: 0.005, kent: 0.004, sphere: 0.004, 

In [68]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [70]:
# 성능 향상
# 정규표현식 토크나이저 사용
# nltk 불용어 제거
# 포터 스테머로 스테밍 사용
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

cachedStopWords = stopwords.words("english")

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re

RegTok = RegexpTokenizer("[\w']{3,}")
english_stops = set(stopwords.words('english'))

def tokenizer(text):
  tokens = RegTok.tokenize(text.lower())
  words = [word for word in tokens if (word not in english_stops) and len(word) > 2]
  features = (list(map(lambda token: PorterStemmer().stem(token), words)))
  return features


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [72]:
# 토크나이즈 적용
tfidf = TfidfVectorizer(tokenizer=tokenizer, max_features=2000, min_df=5, max_df=.5)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

LR_clf = LogisticRegression()
LR_clf.fit(X_train_tfidf, y_train)

print('train set score: {:.3f}'.format(LR_clf.score(X_train_tfidf, y_train)))
print('test set score: {:.3f}'.format(LR_clf.score(X_test_tfidf, y_test)))
print(len(LR_clf.coef_[0]))

train set score: 0.930
test set score: 0.751
2000


학습 데이터셋 샘플 수 : 2,034개

카테고리가 4개인 로지스틱 회귀분석이라면 8,000개의 계수를 2,034개의 샘플로 추정해야 하는 상황임

과적합이 일어나 일반화가 거의 안돼야 하지만 문서 분류는 그 특성상 별 문제 없이 가능하다

따라서 특성의 수는 많을수록 좋을 가능성이 높다.

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=tokenizer)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print('train set dimension', X_train_tfidf.shape)
print('test set dimension', X_test_tfidf.shape)

ridge_clf = RidgeClassifier(alpha=2.4)
ridge_clf.fit(X_train_tfidf, y_train)

print('ridge train score {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('ridge test score {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

NB_clf = MultinomialNB(alpha=0.01)
NB_clf.fit(X_train_tfidf, y_train)

print('NB_clf train score {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train)))
print('NB_clf test score {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test)))

train set dimension (2034, 20085)
test set dimension (1353, 20085)
ridge train score 0.968
ridge test score 0.768
NB_clf train score 0.971
NB_clf test score 0.793


추가적으로 다양한 시도를 해볼 필요가 있다.

- BOW(bag of words) 방법은 문맥 정보를 이용할 수 없다.
- 해결 방안으로는 시퀀스로 표현해서 처리하는 것

In [81]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

cachedStopWords = stopwords.words("english")
tfidf = TfidfVectorizer(token_pattern="[a-zA-Z']{3,}",
                        decode_error='ignore',
                        lowercase=True,
                        stop_words=stopwords.words('english'),
                        max_df=0.5,
                        min_df=2)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 11483)


릿지 회귀분석을 이용해 성능 비교

N-gram을 사용하면 변수가 늘어나고 과적합 우려가 있으므로 릿지 회귀분석 사용

In [83]:
from sklearn.linear_model import RidgeClassifier

ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train_tfidf, y_train)

print('train score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('test score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

train score: 0.976
test score: 0.766


In [85]:
tfidf = TfidfVectorizer(token_pattern="[a-zA-Z']{3,}",
                        decode_error='ignore',
                        lowercase=True,
                        stop_words=stopwords.words('english'),
                        ngram_range=(1, 2),
                        max_df=0.5,
                        min_df=2)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 26550)


In [87]:
bigram_features = [f for f in tfidf.get_feature_names_out() if len(f.split()) > 1]
print('bi-gram samples:', bigram_features[:10])

ridge_clf.fit(X_train_tfidf, y_train)
print('train score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('test score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

bi-gram samples: ["'cause can't", "'em better", "'expected errors'", "'karla' next", "'nodis' password", "'official doctrine", "'ok see", "'sci astro'", "'what's moonbase", 'aas american']
train score: 0.976
test score: 0.773
