## 텍스트 분류 실습 - 20 뉴스그룹 분류

In [2]:
from sklearn.datasets import fetch_20newsgroups

news_data = fetch_20newsgroups(subset="all", random_state=156)

In [3]:
# check key value
print(news_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [4]:
import pandas as pd

print("target class value and distribution:\n", pd.Series(news_data.target).value_counts().sort_index())

target class value and distribution:
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64


In [5]:
print("target class names:\n", news_data.target_names)

target class names:
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [6]:
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [7]:
from sklearn.datasets import fetch_20newsgroups

# subset = "train" : only train data
# remove = ("headers", "footers", "quotes") : only extract main
train_news = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"),
                   random_state=156)
X_train = train_news.data
y_train = train_news.target

test_news = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"), random_state=156)
X_test = test_news.data
y_test = test_news.target
print("train data size {0}, test data size {1}".format(len(train_news.data), len(test_news.data)))

train data size 11314, test data size 7532


### 피처 벡터화, 머신러닝 모델 학습/예측/평가

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# 피처 벡터화 변환
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)

# Using fitted CountVectorizer to transform test data
X_test_cnt_vect = cnt_vect.transform(X_test)

print("CountVectorizer Shape:", X_train_cnt_vect.shape)

CountVectorizer Shape: (11314, 101631)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, y_train)
prd = lr_clf.predict(X_test_cnt_vect)
print("accuracy: {0:.3f}".format(accuracy_score(y_test, prd)))

accuracy: 0.606


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression(solver="liblinear")
lr_clf.fit(X_train_tfidf_vect, y_train)
prd = lr_clf.predict(X_test_tfidf_vect)
print("accuracy: {0:.3f}".format(accuracy_score(y_test, prd)))

accuracy: 0.678


TF-IDF 가 일반적으로 성능 예측이 더 좋음

In [11]:
tfidf_vect = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_df=300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression(solver="liblinear")
lr_clf.fit(X_train_tfidf_vect, y_train)
prd = lr_clf.predict(X_test_tfidf_vect)
print("accuracy: {0:.3f}".format(accuracy_score(y_test, prd)))

accuracy: 0.690


In [12]:
from sklearn.model_selection import GridSearchCV

params = {'C': [0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring="accuracy", verbose=1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print("Logistic Regression best C parameter :", grid_cv_lr.best_params_)

prd = grid_cv_lr.predict(X_test_tfidf_vect)
print("accuracy: {0:.3f}".format(accuracy_score(y_test, prd)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Logistic Regression best C parameter : {'C': 10}
accuracy: 0.704


### ScikitLearn Pipeline and combining with GridSearchCV

ScikitLearn Pipeline 클래스: 피처 벡터화와 알고리즘 학습/예측 코드 작성 한 번에 진행
스케일링 또는 벡터 정규화, 변환 작업, 분류 혹은 회귀 결합

In [13]:
# Pipeline 선언
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("tfidf_vect", TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_df=300)),
    ("lr_clf", LogisticRegression(solver="liblinear", C=10))
])

In [14]:
pipeline.fit(X_train, y_train)
prd = pipeline.predict(X_test)
print("accuracy: {0:.3f}".format(accuracy_score(y_test, prd)))

accuracy: 0.704


GridSearchCV에서도 진행할 수 있도록 지원함
피처 벡터화를 위한 파라미터와 ML 알고리즘의 하이퍼 파라미터 모두 한 번에 최적화
이 때, 개별 객체명과 파라미터명/하이퍼 파라미터명을 결합해 Key 값으로 할당 (구분자 __)

In [15]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("tfidf_vect", TfidfVectorizer(stop_words="english")),
    ("lr_clf", LogisticRegression())
])

params = {
    "tfidf_vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "tfidf_vect__max_df": [100, 300, 700],
    "lr_clf__C": [1, 5, 10]
}

grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring="accuracy", verbose=1)
grid_cv_pipe.fit(X_train, y_train)
print(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)

prd = grid_cv_pipe.predict(X_test)
print("accuracy: {0:.3f}".format(accuracy_score(y_test, prd)))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
{'lr_clf__C': 10, 'tfidf_vect__max_df': 300, 'tfidf_vect__ngram_range': (1, 2)} 0.7537571852837964
accuracy: 0.701
