<h2>문서 분류(Document Classification)</h2>

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score

In [6]:
news = fetch_20newsgroups()

In [7]:
x = news.data
y = news.target

In [8]:
cv = CountVectorizer()
x = cv.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(7919, 130107) (7919,) (3395, 130107) (3395,)


In [12]:
print(x_train[0])

  (0, 56979)	2
  (0, 85354)	3
  (0, 111322)	1
  (0, 68532)	7
  (0, 114731)	2
  (0, 87620)	1
  (0, 95162)	1
  (0, 64095)	1
  (0, 90379)	1
  (0, 89362)	7
  (0, 76032)	1
  (0, 123292)	1
  (0, 65798)	1
  (0, 114579)	1
  (0, 89860)	4
  (0, 114455)	14
  (0, 68766)	3
  (0, 115475)	2
  (0, 32311)	2
  (0, 27436)	1
  (0, 73201)	2
  (0, 37565)	2
  (0, 90252)	1
  (0, 62221)	3
  (0, 35983)	2
  :	:
  (0, 90260)	7
  (0, 52299)	2
  (0, 34496)	1
  (0, 83506)	1
  (0, 100256)	1
  (0, 106981)	1
  (0, 118401)	1
  (0, 14074)	1
  (0, 33457)	2
  (0, 14543)	2
  (0, 105963)	1
  (0, 46838)	1
  (0, 108159)	1
  (0, 108156)	1
  (0, 13339)	1
  (0, 76086)	1
  (0, 31767)	1
  (0, 33552)	2
  (0, 90548)	1
  (0, 81560)	2
  (0, 68541)	3
  (0, 80887)	1
  (0, 53375)	1
  (0, 93491)	1
  (0, 124599)	1


In [11]:
from sklearn.metrics import accuracy_score

<h2>로지스틱 회귀(Logistic Regression)</h2>
<b>클레스가 2개인 이진 분류를 위한 모델, 다중 분류에는 부적합</b>

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
LR = LogisticRegression()
LR.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
pred = LR.predict(x_test)
acc = accuracy_score(pred, y_test)
print(acc)

0.8695139911634757


<h2>서포트 벡터 머신(Support Vector Machines)</h2>
<b>회귀, 분류, 이상치 탐지 등에 사용되는 지도 학습</b>
<b>클래스 사잉의 경계에 위치한 데이터 포인트를 서포트 벡터(Support Vector)라고 함</b>

In [17]:
from sklearn import svm

In [19]:
SVM = svm.SVC(kernel = 'linear')
SVM.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
pred = SVM.predict(x_test)
acc = accuracy_score(pred, y_test)
print(acc)

0.8223858615611193


<h2>나이브 베이스 분류기(Naive Bayes Classification)</h2>

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
NB = MultinomialNB()
NB.fit(x_train, y_train)
pred = NB.predict(x_test)
acc =accuracy_score(pred, y_test)
print(acc)

0.8120765832106038


<b>tf-idf를 이용한 정확도 향상</b>

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer

In [29]:
tfidf = TfidfTransformer()
x_train_tf = tfidf.fit_transform(x_train)
x_test_tf = tfidf.fit_transform(x_test)

NB.fit(x_train_tf, y_train)
pred = NB.predict(x_test_tf)
acc = accuracy_score(pred, y_test)
print(acc)

0.8153166421207658


<h2>결정 트리(Decision Tree)</h2>
<b>데이터 특성으로부터 추런된 결정 규칙을 통해 값을 예측</b>

In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
DT = DecisionTreeClassifier()
DT.fit(x_train, y_train)
pred = DT.predict(x_test)
acc = accuracy_score(pred,y_test)
print(acc)

0.6415316642120766


<h2>XGBoost</h2>
<b>트리 기반의 양상블 기법, 분류에는 좋은 예측 성능을 보여줌</b>

In [4]:
from xgboost import XGBClassifier

In [12]:
xgb = XGBClassifier(n_estimators = 30, learning_rate = 0.05, max_depth = 3)
xgb.fit(x_train, y_train)
pred = xgb.predict(x_test)
acc = accuracy_score(pred, y_test)
print(acc)

0.6960235640648011


<h2>교차 검증</h2>
<b>일반 검증보다 모델의 일반화가 잘 되어 있는지 평가 가능</b>

In [13]:
from sklearn.model_selection import cross_val_score

In [17]:
scores = cross_val_score(NB, x, y, cv=5)
print(scores, scores.mean())

[0.83870968 0.83826779 0.82368537 0.83031374 0.83642794] 0.833480903927519


<h2>정밀도와 재현률(precision & recall)</h2>
<b>정밀도: 양성 클래스(정답)으로 예측한 샘플이 양성 클래스일 확률</b><br>
<b>재현률: 양성 클래스로 예측한 샘플 비율을 의미, 모델이 얼마나 실제 상황을 재현하는지 나타냄</b><br>
<b>F1-score: 정밀도와 재현률의 가중조화평균, 정확도에 비해 더 효과적인 모델 분석 지표로 알려져 있음</b><br>
<b>None - 클래스간 지표를 합치지 말고 그대로 출력</b><br>
<b>micro - 정밀도와 재현률이 같음, f1-score도 정밀도, 재현률과 동일</b><br>
<b>macro - 클래스간 지표를 단순 평균한 값</b><br>
<b>weighted - 클래스간 지표를 가중 평균한 값</b>

In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [19]:
precision = precision_score(pred, y_test, average = 'micro')
recall = recall_score(pred, y_test, average = 'micro')
f1 = f1_score(pred, y_test, average = 'micro')
print(precision, recall, f1)

0.8120765832106038 0.8120765832106038 0.8120765832106038


In [20]:
precision = precision_score(pred, y_test, average = 'macro')
recall = recall_score(pred, y_test, average = 'macro')
f1 = f1_score(pred, y_test, average = 'macro')
print(precision, recall, f1)

0.8098834267019477 0.8397376072008065 0.7917435861770061


In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
GS = GridSearchCV(estimator = NB, param_grid = {'alpha' : [0.001, 0.01, 0.1, 1.]}, scoring = 'accuracy', cv=10)
GS.fit(x, y)

print(GS.best_score_)
print(GS.best_params_)

0.8897820965842167
{'alpha': 0.001}


In [23]:
GS = GridSearchCV(estimator = NB, param_grid = {'alpha' : [0.001, 0.002, 0.003, 0.004, 0.005]}, scoring = 'accuracy', cv=10)
GS.fit(x, y)

print(GS.best_score_)
print(GS.best_params_)

0.8897820965842167
{'alpha': 0.001}


In [24]:
GS = GridSearchCV(estimator = NB, param_grid = {'alpha' : [0.0006, 0.0008, 0.001]}, scoring = 'accuracy', cv=10)
GS.fit(x, y)

print(GS.best_score_)
print(GS.best_params_)

0.8897820965842167
{'alpha': 0.001}
