In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import fetch_20newsgroups 
# Fetch the whole 20 Newsgroups corpus (subset='all') for exploration.
news = fetch_20newsgroups(subset = 'all', random_state = 2021)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


# 데이터 탐색 

In [None]:
# 'data' holds the documents and 'target' holds the labels (y).
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [None]:
from sklearn.datasets import load_iris
# Load the iris dataset for comparison with the newsgroups container.
iris = load_iris()
# 'feature_names' holds the names of the four attributes.
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [None]:
# load_iris() returns a scikit-learn Bunch: a dict-like container whose keys can
# also be read as attributes, so iris.data is equivalent to iris['data'].
# (This is a feature of Bunch, not something inherited from JavaScript.)
# NOTE(review): this displays the entire array; a slice such as iris.data[:5]
# would keep the output short.
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [None]:
# List the names of the newsgroup categories.
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Number of articles in each group, ordered by label id.
pd.Series(news.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

# 훈련/테스트 데이터 추출 

In [None]:
# Training split; headers, footers and quotes are removed so that only the
# body text of each post is kept.
train_news = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    random_state=2021,
)

In [None]:
# Number of documents in the training split.
len(train_news.data)

11314

In [None]:
# Test split, fetched with the same options as the training split: headers,
# footers and quotes are removed so that only the body text is kept.
test_news = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    random_state=2021,
)
# Number of documents in the test split.
len(test_news.data)

7532

# 텍스트 데이터에 대해서 전처리 

In [None]:
# Wrap the raw texts of each split in a single-column ('article') DataFrame.
df_train, df_test = (
    pd.DataFrame({'article': split.data}) for split in (train_news, test_news)
)

df_train.head()

Unnamed: 0,article
0,\nStop! Hold it! You have a few problems here....
1,"]Is it possible to do a ""wheelie"" on a motorcy..."
2,\n\nBBS number\n510-226-2365
3,: [first post I've seen from the ol' Bug-Zoo (...
4,Archive-name: rec-autos/part5\n\n[this article...


- train dataset 

In [None]:
# Remove special characters: every non-alphabetic character becomes a space.
# regex=True must be passed explicitly. Since pandas 2.0, Series.str.replace
# defaults to regex=False, which would look for the literal text "[^a-zA-Z]"
# and leave the articles unchanged.
df_train['article'] = df_train.article.str.replace("[^a-zA-Z]", " ", regex=True)
df_test['article'] = df_test.article.str.replace("[^a-zA-Z]", " ", regex=True)
df_train.head()

Unnamed: 0,article
0,Stop Hold it You have a few problems here ...
1,Is it possible to do a wheelie on a motorcy...
2,BBS number
3,first post I ve seen from the ol Bug Zoo ...
4,Archive name rec autos part this article i...


In [None]:
# Inspect a single training article (index label 1) after the replacement step.
df_train.article[1]

' Is it possible to do a  wheelie  on a motorcycle with shaft drive   yes  '

In [None]:
# Drop words of three characters or fewer, then lowercase what remains
# (training articles only).
df_train['article'] = df_train['article'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3).lower()
)
df_train.head()

Unnamed: 0,article
0,stop hold have problems here official history ...
1,possible wheelie motorcycle with shaft drive
2,number
3,first post seen from bgsu there means that pos...
4,archive name autos part this article pair arti...


In [None]:
def _lower_long_words(text):
    """Return the words of ``text`` longer than three characters, lowercased and space-joined."""
    return ' '.join(word.lower() for word in text.split() if len(word) > 3)


# Apply the same lowercase / short-word filter to both splits.
df_train['article'] = df_train['article'].map(_lower_long_words)
df_test['article'] = df_test['article'].map(_lower_long_words)
df_test.head()


Unnamed: 0,article
0,need diet diverticular disease ideas gastroint...
1,there chips which perform voice compression ex...
2,total baseball which also tries evaluate playe...
3,anyone would like their segacd software please...
4,what aluminium siding keep seeing references s...


# 텍스트 변환

In [None]:
# Alternative: from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit the TF-IDF vectorizer on the training articles only, with English stop
# words removed; fit() returns the vectorizer itself.
tvect = TfidfVectorizer(stop_words='english').fit(df_train.article)
tvect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# Vectorize both splits with the already-fitted vectorizer so that they share
# one vocabulary, then show the resulting matrix shapes.
X_train, X_test = (
    tvect.transform(frame.article) for frame in (df_train, df_test)
)
X_train.shape, X_test.shape

((11314, 64133), (7532, 64133))

In [None]:
# Labels for the train and test splits.
y_train, y_test = train_news.target, test_news.target

# 훈련/예측/평가

In [None]:
from sklearn.svm import SVC 
# Train a support vector classifier with its default settings on the TF-IDF
# features; fit() returns the estimator itself.
svc = SVC().fit(X_train, y_train)
svc

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Predict a label for every test document.
pred = svc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score 
# Fraction of test documents whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.6488316516197558