In [4]:
import tensorflow
import matplotlib
import seaborn
import numpy as np
import pandas as pd
import sklearn

print(tensorflow.__version__)
print(matplotlib.__version__)
print(seaborn.__version__)
print(np.__version__)
print(pd.__version__)
print(sklearn.__version__)

2.8.0
3.2.2
0.11.2
1.21.5
1.3.5
1.0.2


In [5]:
from tensorflow.keras.datasets import reuters 


사용할 모델 : 나이브 베이즈 분류기, CNB, 로지스틱 회귀, 서포트 벡터 머신, 결정 트리, 랜덤 포레스트, 그래디언트 부스팅 트리, 보팅

1. 모든 단어 사용

In [52]:
# 모든 단어 사용 
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

In [53]:
print("훈련 샘플의 수 : {}".format(len(x_train)))
print("테스트 샘플의 수 : {}".format(len(x_test)))

훈련 샘플의 수 : 8982
테스트 샘플의 수 : 2246


In [54]:
# 데이터 전처리 
word_index = reuters.get_word_index(path="reuters_word_index.json")

In [55]:
index_to_word = { index + 3 : word for word, index in word_index.items() }
print(index_to_word[4])

the


In [56]:
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
		index_to_word[index]=token

In [57]:
print(' '.join([index_to_word[index] for index in x_train[0]]))

<sos> mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3


In [58]:
decoded = []
for i in range(len(x_train)) :
		t = ' '.join([index_to_word[index] for index in x_train[i]])
		decoded.append(t)

x_train = decoded
print(len(x_train))

8982


In [59]:
decoded = []

for i in range(len(x_test)) :
		t = ' '.join([index_to_word[index] for index in x_test[i]])
		decoded.append(t)

x_test = decoded
print(len(x_test))

2246


In [60]:
x_train[:5]

['<sos> mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3',
 "<sos> generale de banque sa lt genb br and lt heller overseas corp of chicago have each taken 50 pct stakes in factoring company sa belgo factors generale de banque said in a statement it gave no financial details of the transaction sa belgo factors' turnover in 1986 was 17 5 billion belgian francs reuter 3",
 '<sos> shr 3 28 dlrs vs 22 cts shr diluted 2 99 dlrs vs 22 cts net 46 0 mln vs 3 328 000 avg shrs 14 0 mln vs 15 2 mln year shr 5 41 dlrs vs 1 56 dlrs shr diluted 4 94 dlrs vs 1 50 dlrs net 78 2 mln vs 25 9 mln avg shrs 14 5 mln vs 15 1 mln note earnings per share reflect th

In [61]:
# 데이터 전처리 (DTM, TD-IDF 행렬)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)

print(x_train_dtm.shape)

(8982, 26506)


In [62]:
tfidf_transformer = TfidfTransformer()
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)
print(tfidfv.shape)

(8982, 26506)


In [73]:
from sklearn.naive_bayes import MultinomialNB #다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score #정확도 계산

In [65]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [64]:
x_test_dtm = dtmvector.transform(x_test) #테스트 데이터를 DTM으로 변환
tfidfv_test = tfidf_transformer.transform(x_test_dtm) #DTM을 TF-IDF 행렬로 변환

MultinomialNB()

In [66]:
# 나이브 베이즈 분류기 
model = MultinomialNB()
model.fit(tfidfv, y_train)

print(classification_report(y_test,model.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.79      0.21      0.33       105
           2       0.00      0.00      0.00        20
           3       0.72      0.92      0.81       813
           4       0.45      0.96      0.61       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       0.00      0.00      0.00        25
          10       0.00      0.00      0.00        30
          11       0.80      0.29      0.42        83
          12       0.00      0.00      0.00        13
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.75      0.18      0.29        99
          17       0.00    

In [67]:
# CNB 

cb = ComplementNB()
cb.fit(tfidfv, y_train)

print(classification_report(y_test,cb.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.86      0.50      0.63        12
           1       0.63      0.88      0.73       105
           2       0.91      0.50      0.65        20
           3       0.87      0.91      0.89       813
           4       0.75      0.93      0.83       474
           5       0.00      0.00      0.00         5
           6       0.92      0.86      0.89        14
           7       1.00      0.67      0.80         3
           8       0.43      0.08      0.13        38
           9       0.81      0.88      0.85        25
          10       0.96      0.73      0.83        30
          11       0.55      0.67      0.61        83
          12       0.00      0.00      0.00        13
          13       0.62      0.54      0.58        37
          14       0.00      0.00      0.00         2
          15       0.50      0.11      0.18         9
          16       0.67      0.77      0.71        99
          17       0.00    

In [68]:
# 로지스틱 회귀

lr = LogisticRegression(C=10000, penalty='l2')
lr.fit(tfidfv, y_train)

print(classification_report(y_test,lr.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.75      0.80      0.77       105
           2       0.70      0.70      0.70        20
           3       0.93      0.93      0.93       813
           4       0.81      0.87      0.84       474
           5       1.00      0.20      0.33         5
           6       0.93      1.00      0.97        14
           7       1.00      0.67      0.80         3
           8       0.68      0.71      0.69        38
           9       0.81      0.88      0.85        25
          10       0.93      0.87      0.90        30
          11       0.66      0.73      0.70        83
          12       0.57      0.31      0.40        13
          13       0.61      0.62      0.61        37
          14       0.67      1.00      0.80         2
          15       0.71      0.56      0.63         9
          16       0.71      0.77      0.74        99
          17       0.67    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [69]:
# 서포트 벡터 머신 
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(tfidfv, y_train)

print(classification_report(y_test,lsvc.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.73      0.67      0.70        12
           1       0.73      0.70      0.71       105
           2       0.64      0.70      0.67        20
           3       0.90      0.90      0.90       813
           4       0.79      0.83      0.81       474
           5       0.50      0.20      0.29         5
           6       0.87      0.93      0.90        14
           7       1.00      0.33      0.50         3
           8       0.54      0.66      0.60        38
           9       0.88      0.84      0.86        25
          10       0.96      0.80      0.87        30
          11       0.57      0.66      0.61        83
          12       0.44      0.31      0.36        13
          13       0.49      0.51      0.50        37
          14       0.33      0.50      0.40         2
          15       0.50      0.22      0.31         9
          16       0.65      0.68      0.66        99
          17       0.67    



In [70]:
# 결정 트리
tree = DecisionTreeClassifier(max_depth=10, random_state = 0)
tree.fit(tfidfv, y_train)

print(classification_report(y_test,tree.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.69      0.43      0.53       105
           2       0.75      0.45      0.56        20
           3       0.94      0.85      0.89       813
           4       0.40      0.89      0.55       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       1.00      0.16      0.28        25
          10       0.89      0.80      0.84        30
          11       0.58      0.60      0.59        83
          12       0.00      0.00      0.00        13
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.61      0.83      0.70        99
          17       0.00    

In [71]:
# 랜덤 포레스트
forest = RandomForestClassifier(n_estimators = 5, random_state=0)
forest.fit(tfidfv, y_train)

print(classification_report(y_test,forest.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.25      0.58      0.35        12
           1       0.35      0.60      0.44       105
           2       0.32      0.40      0.36        20
           3       0.82      0.89      0.85       813
           4       0.62      0.84      0.71       474
           5       0.00      0.00      0.00         5
           6       0.67      0.43      0.52        14
           7       0.50      0.33      0.40         3
           8       0.51      0.47      0.49        38
           9       1.00      0.28      0.44        25
          10       0.46      0.20      0.28        30
          11       0.56      0.64      0.60        83
          12       0.40      0.15      0.22        13
          13       0.33      0.16      0.22        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.59      0.46      0.52        99
          17       0.00    

In [74]:
# 그래디언트 부스팅 트리
grbt = GradientBoostingClassifier(random_state=0)
grbt.fit(tfidfv, y_train)

print(classification_report(y_test,grbt.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.60      0.50      0.55        12
           1       0.81      0.71      0.76       105
           2       0.58      0.70      0.64        20
           3       0.87      0.91      0.89       813
           4       0.78      0.86      0.82       474
           5       1.00      0.20      0.33         5
           6       0.77      0.71      0.74        14
           7       1.00      0.33      0.50         3
           8       0.60      0.63      0.62        38
           9       0.91      0.80      0.85        25
          10       0.79      0.77      0.78        30
          11       0.61      0.65      0.63        83
          12       0.50      0.46      0.48        13
          13       0.48      0.32      0.39        37
          14       0.00      0.00      0.00         2
          15       0.25      0.11      0.15         9
          16       0.72      0.71      0.71        99
          17       0.83    

In [75]:
#보팅 

voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
], voting='soft', n_jobs=-1)
voting_classifier.fit(tfidfv, y_train)


print(classification_report(y_test,voting_classifier.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75        12
           1       0.80      0.77      0.79       105
           2       0.67      0.80      0.73        20
           3       0.93      0.94      0.93       813
           4       0.82      0.88      0.85       474
           5       1.00      0.20      0.33         5
           6       0.87      0.93      0.90        14
           7       1.00      0.33      0.50         3
           8       0.69      0.71      0.70        38
           9       0.80      0.80      0.80        25
          10       0.90      0.90      0.90        30
          11       0.67      0.71      0.69        83
          12       0.60      0.46      0.52        13
          13       0.69      0.65      0.67        37
          14       0.29      1.00      0.44         2
          15       0.40      0.22      0.29         9
          16       0.73      0.76      0.74        99
          17       0.75    

2. 빈도수 상위 5000개의 단어만 사용

In [80]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)

print("훈련 샘플의 수 : {}".format(len(x_train)))
print("테스트 샘플의 수 : {}".format(len(x_test)))

# 데이터 전처리 

decoded = []
for i in range(len(x_train)) :
		t = ' '.join([index_to_word[index] for index in x_train[i]])
		decoded.append(t)

x_train = decoded
print(len(x_train))

decoded = []

for i in range(len(x_test)) :
		t = ' '.join([index_to_word[index] for index in x_test[i]])
		decoded.append(t)

x_test = decoded
print(len(x_test))

훈련 샘플의 수 : 8982
테스트 샘플의 수 : 2246
8982
2246


In [81]:
x_train[:5]

['<sos> <unk> <unk> said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3',
 '<sos> generale de banque sa lt <unk> <unk> and lt heller overseas corp of chicago have each taken 50 pct stakes in <unk> company sa <unk> factors generale de banque said in a statement it gave no financial details of the transaction sa <unk> <unk> turnover in 1986 was 17 5 billion belgian francs reuter 3',
 '<sos> shr 3 28 dlrs vs 22 cts shr diluted 2 99 dlrs vs 22 cts net 46 0 mln vs 3 328 000 avg shrs 14 0 mln vs 15 2 mln year shr 5 41 dlrs vs 1 56 dlrs shr diluted 4 94 dlrs vs 1 50 dlrs net 78 2 mln vs 25 9 mln avg shrs 14 5 mln vs 15 1 mln note earnings per share reflect the two fo

In [82]:
dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)

print(x_train_dtm.shape)

tfidf_transformer = TfidfTransformer()
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)
print(tfidfv.shape)


(8982, 4867)
(8982, 4867)


In [84]:
x_test_dtm = dtmvector.transform(x_test) #테스트 데이터를 DTM으로 변환
tfidfv_test = tfidf_transformer.transform(x_test_dtm) #DTM을 TF-IDF 행렬로 변환

In [85]:
# 나이브 베이즈 분류기 
model = MultinomialNB()
model.fit(tfidfv, y_train)

print(classification_report(y_test,model.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.50      0.80      0.62       105
           2       0.00      0.00      0.00        20
           3       0.86      0.89      0.87       813
           4       0.59      0.95      0.73       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       1.00      0.28      0.44        25
          10       0.00      0.00      0.00        30
          11       0.48      0.73      0.58        83
          12       0.00      0.00      0.00        13
          13       1.00      0.14      0.24        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.60      0.66      0.62        99
          17       0.00    

In [86]:
# CNB 

cb = ComplementNB()
cb.fit(tfidfv, y_train)

print(classification_report(y_test,cb.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70        12
           1       0.63      0.86      0.73       105
           2       0.91      0.50      0.65        20
           3       0.91      0.89      0.90       813
           4       0.74      0.92      0.82       474
           5       0.00      0.00      0.00         5
           6       0.86      0.86      0.86        14
           7       1.00      0.67      0.80         3
           8       0.57      0.21      0.31        38
           9       0.82      0.92      0.87        25
          10       0.96      0.80      0.87        30
          11       0.54      0.76      0.63        83
          12       0.00      0.00      0.00        13
          13       0.69      0.59      0.64        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.67      0.79      0.72        99
          17       0.00    

In [87]:
# 로지스틱 회귀

lr = LogisticRegression(C=10000, penalty='l2')
lr.fit(tfidfv, y_train)

print(classification_report(y_test,lr.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.77      0.80      0.79       105
           2       0.74      0.85      0.79        20
           3       0.91      0.93      0.92       813
           4       0.81      0.87      0.84       474
           5       0.00      0.00      0.00         5
           6       0.92      0.86      0.89        14
           7       1.00      0.67      0.80         3
           8       0.64      0.74      0.68        38
           9       0.81      0.88      0.85        25
          10       0.93      0.87      0.90        30
          11       0.64      0.73      0.68        83
          12       0.57      0.31      0.40        13
          13       0.64      0.62      0.63        37
          14       0.50      0.50      0.50         2
          15       0.83      0.56      0.67         9
          16       0.67      0.73      0.70        99
          17       0.82    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [88]:
# 서포트 벡터 머신 
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(tfidfv, y_train)

print(classification_report(y_test,lsvc.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.80      0.67      0.73        12
           1       0.70      0.70      0.70       105
           2       0.67      0.70      0.68        20
           3       0.89      0.90      0.90       813
           4       0.80      0.84      0.82       474
           5       0.00      0.00      0.00         5
           6       0.86      0.86      0.86        14
           7       0.50      0.33      0.40         3
           8       0.73      0.71      0.72        38
           9       0.80      0.80      0.80        25
          10       0.92      0.80      0.86        30
          11       0.64      0.75      0.69        83
          12       0.18      0.23      0.20        13
          13       0.55      0.62      0.58        37
          14       1.00      0.50      0.67         2
          15       0.50      0.11      0.18         9
          16       0.61      0.68      0.64        99
          17       1.00    



In [89]:
# 결정 트리
tree = DecisionTreeClassifier(max_depth=10, random_state = 0)
tree.fit(tfidfv, y_train)

print(classification_report(y_test,tree.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.72      0.40      0.52       105
           2       0.60      0.45      0.51        20
           3       0.94      0.84      0.89       813
           4       0.39      0.91      0.55       474
           5       0.00      0.00      0.00         5
           6       1.00      0.57      0.73        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       0.88      0.88      0.88        25
          10       0.87      0.87      0.87        30
          11       0.62      0.48      0.54        83
          12       0.17      0.08      0.11        13
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.60      0.82      0.69        99
          17       0.00    

In [90]:
# 랜덤 포레스트
forest = RandomForestClassifier(n_estimators = 5, random_state=0)
forest.fit(tfidfv, y_train)

print(classification_report(y_test,forest.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.28      0.42      0.33        12
           1       0.42      0.78      0.55       105
           2       0.44      0.35      0.39        20
           3       0.84      0.90      0.87       813
           4       0.68      0.84      0.75       474
           5       0.00      0.00      0.00         5
           6       0.86      0.43      0.57        14
           7       1.00      0.33      0.50         3
           8       0.59      0.53      0.56        38
           9       0.71      0.40      0.51        25
          10       0.89      0.53      0.67        30
          11       0.57      0.69      0.62        83
          12       0.33      0.15      0.21        13
          13       0.46      0.32      0.38        37
          14       0.00      0.00      0.00         2
          15       1.00      0.11      0.20         9
          16       0.70      0.67      0.68        99
          17       0.00    

In [91]:
# 그래디언트 부스팅 트리
grbt = GradientBoostingClassifier(random_state=0)
grbt.fit(tfidfv, y_train)

print(classification_report(y_test,grbt.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.80      0.68      0.73       105
           2       0.70      0.70      0.70        20
           3       0.90      0.90      0.90       813
           4       0.76      0.83      0.79       474
           5       0.14      0.20      0.17         5
           6       0.93      0.93      0.93        14
           7       0.50      0.33      0.40         3
           8       0.64      0.66      0.65        38
           9       0.91      0.84      0.87        25
          10       0.87      0.87      0.87        30
          11       0.62      0.66      0.64        83
          12       0.46      0.46      0.46        13
          13       0.55      0.43      0.48        37
          14       0.08      0.50      0.14         2
          15       0.33      0.22      0.27         9
          16       0.72      0.77      0.75        99
          17       0.33    

In [92]:
#보팅 

voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
], voting='soft', n_jobs=-1)
voting_classifier.fit(tfidfv, y_train)


print(classification_report(y_test,voting_classifier.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.90      0.75      0.82        12
           1       0.80      0.77      0.79       105
           2       0.71      0.85      0.77        20
           3       0.92      0.94      0.93       813
           4       0.82      0.88      0.85       474
           5       0.33      0.20      0.25         5
           6       0.93      0.93      0.93        14
           7       0.67      0.67      0.67         3
           8       0.72      0.68      0.70        38
           9       0.81      0.84      0.82        25
          10       0.93      0.90      0.92        30
          11       0.67      0.70      0.68        83
          12       0.60      0.46      0.52        13
          13       0.68      0.62      0.65        37
          14       0.12      0.50      0.20         2
          15       0.67      0.44      0.53         9
          16       0.74      0.74      0.74        99
          17       0.57    

3. 직접 단어 개수를 설정해서 사용

In [93]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=1000, test_split=0.2)

print("훈련 샘플의 수 : {}".format(len(x_train)))
print("테스트 샘플의 수 : {}".format(len(x_test)))

# 데이터 전처리 

decoded = []
for i in range(len(x_train)) :
		t = ' '.join([index_to_word[index] for index in x_train[i]])
		decoded.append(t)

x_train = decoded
print(len(x_train))

decoded = []

for i in range(len(x_test)) :
		t = ' '.join([index_to_word[index] for index in x_test[i]])
		decoded.append(t)

x_test = decoded
print(len(x_test))

dtmvector = CountVectorizer()
x_train_dtm = dtmvector.fit_transform(x_train)

print(x_train_dtm.shape)

tfidf_transformer = TfidfTransformer()
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)
print(tfidfv.shape)

x_test_dtm = dtmvector.transform(x_test) #테스트 데이터를 DTM으로 변환
tfidfv_test = tfidf_transformer.transform(x_test_dtm) #DTM을 TF-IDF 행렬로 변환

훈련 샘플의 수 : 8982
테스트 샘플의 수 : 2246
8982
2246
(8982, 969)
(8982, 969)


In [94]:
# 나이브 베이즈 분류기 
model = MultinomialNB()
model.fit(tfidfv, y_train)

print(classification_report(y_test,model.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.25      0.40        12
           1       0.46      0.76      0.58       105
           2       0.00      0.00      0.00        20
           3       0.92      0.86      0.89       813
           4       0.60      0.95      0.73       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.78      0.37      0.50        38
           9       0.91      0.40      0.56        25
          10       1.00      0.20      0.33        30
          11       0.44      0.81      0.57        83
          12       0.00      0.00      0.00        13
          13       1.00      0.11      0.20        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.53      0.70      0.60        99
          17       0.00    

In [95]:
# CNB 

cb = ComplementNB()
cb.fit(tfidfv, y_train)

print(classification_report(y_test,cb.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.55      0.50      0.52        12
           1       0.53      0.78      0.63       105
           2       0.00      0.00      0.00        20
           3       0.93      0.88      0.90       813
           4       0.69      0.93      0.79       474
           5       0.00      0.00      0.00         5
           6       0.82      0.64      0.72        14
           7       1.00      0.33      0.50         3
           8       0.69      0.24      0.35        38
           9       0.83      0.96      0.89        25
          10       0.88      0.73      0.80        30
          11       0.50      0.78      0.61        83
          12       0.00      0.00      0.00        13
          13       0.64      0.38      0.47        37
          14       1.00      0.50      0.67         2
          15       0.00      0.00      0.00         9
          16       0.60      0.75      0.67        99
          17       0.00    

In [96]:
# 로지스틱 회귀

lr = LogisticRegression(C=10000, penalty='l2')
lr.fit(tfidfv, y_train)

print(classification_report(y_test,lr.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70        12
           1       0.74      0.75      0.75       105
           2       0.65      0.65      0.65        20
           3       0.90      0.93      0.91       813
           4       0.80      0.85      0.82       474
           5       1.00      0.20      0.33         5
           6       0.93      0.93      0.93        14
           7       1.00      0.33      0.50         3
           8       0.60      0.68      0.64        38
           9       0.80      0.80      0.80        25
          10       0.86      0.80      0.83        30
          11       0.64      0.69      0.66        83
          12       0.86      0.46      0.60        13
          13       0.50      0.62      0.55        37
          14       0.20      0.50      0.29         2
          15       0.80      0.44      0.57         9
          16       0.65      0.72      0.68        99
          17       0.64    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [97]:
# 서포트 벡터 머신 
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(tfidfv, y_train)

print(classification_report(y_test,lsvc.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.65      0.70      0.67       105
           2       0.39      0.45      0.42        20
           3       0.90      0.91      0.90       813
           4       0.75      0.79      0.77       474
           5       0.00      0.00      0.00         5
           6       0.81      0.93      0.87        14
           7       1.00      0.67      0.80         3
           8       0.57      0.66      0.61        38
           9       0.75      0.72      0.73        25
          10       0.84      0.70      0.76        30
          11       0.59      0.58      0.59        83
          12       0.45      0.38      0.42        13
          13       0.44      0.54      0.49        37
          14       0.50      0.50      0.50         2
          15       0.40      0.22      0.29         9
          16       0.62      0.62      0.62        99
          17       0.71    



In [98]:
# 결정 트리
tree = DecisionTreeClassifier(max_depth=10, random_state = 0)
tree.fit(tfidfv, y_train)

print(classification_report(y_test,tree.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.73      0.42      0.53       105
           2       0.67      0.30      0.41        20
           3       0.58      0.90      0.70       813
           4       0.69      0.81      0.75       474
           5       0.00      0.00      0.00         5
           6       1.00      0.64      0.78        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       0.00      0.00      0.00        25
          10       0.93      0.87      0.90        30
          11       0.57      0.59      0.58        83
          12       0.00      0.00      0.00        13
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.57      0.76      0.65        99
          17       0.00    

In [99]:
# 랜덤 포레스트
forest = RandomForestClassifier(n_estimators = 5, random_state=0)
forest.fit(tfidfv, y_train)

print(classification_report(y_test,forest.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.35      0.58      0.44        12
           1       0.47      0.79      0.59       105
           2       0.27      0.20      0.23        20
           3       0.85      0.91      0.88       813
           4       0.69      0.86      0.76       474
           5       0.00      0.00      0.00         5
           6       0.73      0.57      0.64        14
           7       0.20      0.33      0.25         3
           8       0.46      0.50      0.48        38
           9       0.82      0.72      0.77        25
          10       0.73      0.53      0.62        30
          11       0.62      0.63      0.62        83
          12       0.40      0.15      0.22        13
          13       0.48      0.35      0.41        37
          14       0.00      0.00      0.00         2
          15       0.50      0.11      0.18         9
          16       0.61      0.55      0.57        99
          17       1.00    

In [100]:
# 그래디언트 부스팅 트리
grbt = GradientBoostingClassifier(random_state=0)
grbt.fit(tfidfv, y_train)

print(classification_report(y_test,grbt.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70        12
           1       0.75      0.66      0.70       105
           2       0.73      0.55      0.63        20
           3       0.89      0.91      0.90       813
           4       0.73      0.85      0.78       474
           5       0.20      0.20      0.20         5
           6       0.93      0.93      0.93        14
           7       0.25      0.33      0.29         3
           8       0.51      0.58      0.54        38
           9       0.79      0.76      0.78        25
          10       0.87      0.87      0.87        30
          11       0.65      0.67      0.66        83
          12       0.36      0.38      0.37        13
          13       0.51      0.51      0.51        37
          14       0.17      0.50      0.25         2
          15       0.40      0.22      0.29         9
          16       0.65      0.72      0.68        99
          17       0.50    

In [101]:
#보팅 

voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
], voting='soft', n_jobs=-1)
voting_classifier.fit(tfidfv, y_train)


print(classification_report(y_test,voting_classifier.predict(tfidfv_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70        12
           1       0.77      0.76      0.77       105
           2       0.76      0.65      0.70        20
           3       0.91      0.94      0.93       813
           4       0.78      0.86      0.82       474
           5       0.33      0.20      0.25         5
           6       0.93      1.00      0.97        14
           7       0.33      0.33      0.33         3
           8       0.62      0.63      0.62        38
           9       0.80      0.80      0.80        25
          10       0.86      0.83      0.85        30
          11       0.63      0.66      0.65        83
          12       0.43      0.46      0.44        13
          13       0.55      0.59      0.57        37
          14       0.20      0.50      0.29         2
          15       0.50      0.33      0.40         9
          16       0.69      0.74      0.71        99
          17       0.64    