In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [2]:
news_data = fetch_20newsgroups(subset='all', random_state=156)

In [3]:
# 훈련 데이터 추출
train_news = fetch_20newsgroups(subset='train', random_state=156,
                                remove=('headers', 'footers', 'quotes'))

In [4]:
df_train = pd.DataFrame(train_news.data, columns=['data'])
df_train['target'] = train_news.target
df_train.isna().sum()

data      0
target    0
dtype: int64

In [5]:
df_train[df_train.data == ''].count()

data      218
target    218
dtype: int64

In [6]:
df_train = df_train.drop(df_train[df_train.data == ''].index)
df_train[df_train.data == ''].count()

data      0
target    0
dtype: int64

In [7]:
df_train.keys()

Index(['data', 'target'], dtype='object')

In [8]:
# 훈련/테스트용 데이터 추출
X_train = df_train.data.values
y_train = df_train.target.values

In [9]:
print(train_news.data[0])



What I did NOT get with my drive (CD300i) is the System Install CD you
listed as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120
from Direct Express in Chicago (no complaints at all -- good price & good
service).

BTW, I've heard that the System Install CD can be used to boot the mac;
however, my drive will NOT accept a CD caddy is the machine is off.  How can
you boot with it then?

--Dave



In [10]:
train_news.target[0]

4

In [11]:
# 테스트용 데이터 추출
test_news = fetch_20newsgroups(subset='test', random_state=156,
                                remove=('headers', 'footers', 'quotes'))

In [12]:
df_test = pd.DataFrame(test_news.data, columns=['data'])
df_test['target'] = test_news.target
df_test.isna().sum()

data      0
target    0
dtype: int64

In [13]:
df_test[df_test.data == ''].count()

data      162
target    162
dtype: int64

In [14]:
df_test = df_test.drop(df_test[df_test.data == ''].index)
df_test[df_test.data == ''].count()

data      0
target    0
dtype: int64

In [15]:
X_test = df_test.data.values
y_test = df_test.target.values

In [16]:
len(X_train), len(X_test)

(11096, 7370)

In [17]:
df_test.to_csv('../static/data/news/test.csv', index=False)

### 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [19]:
from sklearn.svm import SVC

- Case 1. Count Vectorizer

In [20]:
count_vect = CountVectorizer()
count_vect.fit(X_train) # fit_fransform()을 쓰면 훈련한게 사라지므로 나중에 사용 불가
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

In [21]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_count, y_train)

LogisticRegression()

In [22]:
lr_pred = lr_clf.predict(X_test_count) # 인자로 X_test_count 줌
lr_acc = accuracy_score(y_test, lr_pred)
print(f'Count Vectorizer : {lr_acc:.4f}')

Count Vectorizer : 0.6210


- Case 2. Tf - idf vectorizer

In [23]:
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [24]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf, y_train)

LogisticRegression()

In [25]:
tf_pred = lr_clf.predict(X_test_tfidf)
tf_acc = accuracy_score(y_test, tf_pred)
print(f'Count Vectorizer : {tf_acc:.4f}')

Count Vectorizer : 0.6872


- Case 3. Count Vectorizer + LogisticRegression

In [26]:
count_vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [27]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression())
])

In [28]:
params = {
    'count_vect__max_df': [200, 300],
    'lr_clf__C': [0.5, 1, 5]
}

In [29]:
from sklearn.model_selection import GridSearchCV

grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                        scoring='accuracy', verbose=1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 13.4min finished
{'count_vect__max_df': 200, 'lr_clf__C': 0.5} 0.7071022951988398


In [30]:
best_count_lr = grid_pipe.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred_count_lr = best_count_lr.predict(X_test)
acc_count_lr = accuracy_score(y_test, pred_count_lr)
print(f'Count Vectorizer + LogisticRegression 평균 정확도 : {acc_count_lr:.4f}')

Count Vectorizer + LogisticRegression 평균 정확도 : 0.6476


In [31]:
import joblib

In [32]:
joblib.dump(best_count_lr, '../static/model/20news_count_lr.pkl')

['../static/model/20news_count_lr.pkl']

- Case 4. Tf - idf vectorizer + LogisticRegression

In [33]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression())
])

In [34]:
params = {
    'tfidf_vect__max_df': [100, 300, 500],
    'lr_clf__C': [5, 10, 15]
}

In [35]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                        scoring='accuracy', verbose=1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 18.1min finished
{'lr_clf__C': 5, 'tfidf_vect__max_df': 500} 0.7621671193589465


In [36]:
best_tfid_lr = grid_pipe.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred_tfid_lr = best_tfid_lr.predict(X_test)
acc_tfid_lr = accuracy_score(y_test, pred_tfid_lr)
print(f'Tf - idf vectorizer + LogisticRegression 평균 정확도 : {acc_tfid_lr:.4f}')

Tf - idf vectorizer + LogisticRegression 평균 정확도 : 0.7045


In [37]:
joblib.dump(best_tfid_lr, '../static/model/20news_tfid_lr.pkl')

['../static/model/20news_tfid_lr.pkl']

- Case 4. Tf - idf vectorizer + SVM

In [38]:
sv_clf = SVC()
sv_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [39]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('sv_clf', SVC())
])

In [53]:
params = {
    'tfidf_vect__max_df': [100, 300, 500],
    'sv_clf__C': [1, 10, 20]
}

In [54]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                        scoring='accuracy', verbose=1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 21.4min finished
{'sv_clf__C': 10, 'tfidf_vect__max_df': 300} 0.7586518518323572


In [55]:
best_tfid_sv = grid_pipe.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred_tfid_sv = best_tfid_sv.predict(X_test)
acc_tfid_sv = accuracy_score(y_test, pred_tfid_sv)
print(f'Tf - idf vectorizer + SVM 평균 정확도 : {acc_tfid_sv:.4f}')

Tf - idf vectorizer + SVM 평균 정확도 : 0.6958


In [56]:
joblib.dump(best_tfid_sv, '../static/model/20news_tfid_sv.pkl')

['../static/model/20news_tfid_sv.pkl']

### test

In [57]:
index = 100

In [58]:
df = pd.read_csv('../static/data/news/test.csv')
df.tail(3)

Unnamed: 0,data,target
7367,\n\n\n-- That means that there cannot be any a...,13
7368,s:\n I have a 1991 Toyota Camry Deluxe for sa...,6
7369,"May 13, 1993 _Five Russian soldiers sentenced...",17


In [60]:
label = df.target[index]
label

11

### test data 만드는 방법 1

In [61]:
test_data = []
test_data.append(df.data[index])
test_data

onservative" position is that we should not\nsell these computers to the Soviets, because they could  use\nthem  in weapons systems.  The "liberal" position is that we\nshould sell them, in  the  interests  of  mutual  trade  and\ncooperation--and  anyway,  if  we don\'t make the sale, there\nwill certainly be some other nation willing to.\n\n     For my part, I\'m ready to suggest that the  Libertarian\nposition should be to give them to the Soviets for free, and\nif  necessary, make them take them . . . and if that doesn\'t\nwork load up an SR-71  Blackbird  and  air  drop  them  over\nMoscow in the middle of the night.  Paid for by private sub-\nscription, of course, not taxation . . . I confess that this\nis not a position that has gained much support among members\nof  the conventional left-right political spectrum, but, af-\nter all, in the words of one of Illuminatus\'s characters, we\nare political non-Euclideans:   The shortest distance  to  a\nparticular  goal may not look an

### test data 만드는 방법 2

In [62]:
test_data = df.iloc[index:index+1, :-1].values
test_data

is that we should not\nsell these computers to the Soviets, because they could  use\nthem  in weapons systems.  The "liberal" position is that we\nshould sell them, in  the  interests  of  mutual  trade  and\ncooperation--and  anyway,  if  we don\'t make the sale, there\nwill certainly be some other nation willing to.\n\n     For my part, I\'m ready to suggest that the  Libertarian\nposition should be to give them to the Soviets for free, and\nif  necessary, make them take them . . . and if that doesn\'t\nwork load up an SR-71  Blackbird  and  air  drop  them  over\nMoscow in the middle of the night.  Paid for by private sub-\nscription, of course, not taxation . . . I confess that this\nis not a position that has gained much support among members\nof  the conventional left-right political spectrum, but, af-\nter all, in the words of one of Illuminatus\'s characters, we\nare political non-Euclideans:   The shortest distance  to  a\nparticular  goal may not look anything like what most 

In [63]:
pred = best_tfid_sv.predict(test_data)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [None]:
pred[0], label

In [None]:
news_data.target_names

In [None]:
target_names = dict(zip(range(20),news_data.target_names))
target_names