## 뉴스그룹 데이터 준비 및 특성 추출
http://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

fetch_20newsgroups

In [1]:
# Load the training split of the 20 Newsgroups corpus; no category filter is
# given, so every newsgroup is included.
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

from pprint import pprint
# Show the names of the newsgroups (topics) contained in the dataset.
print(list(newsgroups_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


데이터 가져오기

In [6]:
# Topics to keep, chosen from the twenty available newsgroups.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Options shared by both splits. Headers, footers and quoted replies are
# removed because they hint at the topic; classification should rely on the
# message body alone.
fetch_options = {
    'remove': ('headers', 'footers', 'quotes'),
    'categories': categories,
}

# Training split.
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
# Test split, used for evaluation.
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)

print('#Train set size:', len(newsgroups_train.data))
print('#Test set size:', len(newsgroups_test.data))
print('#Selected categories:', newsgroups_train.target_names)
print('#Train labels:', set(newsgroups_train.target))

#Train set size: 2034
#Test set size: 1353
#Selected categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
#Train labels: {0, 1, 2, 3}


In [7]:
# Show the first document and its integer label from each split.
# NOTE(review): 'smaples' in two of the labels below is a typo for 'samples';
# it is left unchanged here so the code still matches the recorded output.
print('#Train set text samples:', newsgroups_train.data[0])
print('#Train set label smaples:', newsgroups_train.target[0])
print('#Test set text samples:', newsgroups_test.data[0])
print('#Test set label smaples:', newsgroups_test.target[0])

#Train set text samples: Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
#Train set label smaples: 1
#Test set text samples: TRry the SKywatch project in  Arizona.
#Test set label smaples: 2


In [8]:
X_train = newsgroups_train.data   # training documents
y_train = newsgroups_train.target # training labels

X_test = newsgroups_test.data     # test (evaluation) documents
y_test = newsgroups_test.target   # test (evaluation) labels

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: keep at most 2000 terms, ignoring terms that appear in
# fewer than 5 documents or in more than 50% of the documents.
cv = CountVectorizer(max_features=2000, min_df=5, max_df=0.5)

X_train_cv = cv.fit_transform(X_train) # learn the vocabulary on the train set and transform it
print('Train set dimension:', X_train_cv.shape) 
X_test_cv = cv.transform(X_test) # transform the test set with the vocabulary learned from the train set
print('Test set dimension:', X_test_cv.shape)

Train set dimension: (2034, 2000)
Test set dimension: (1353, 2000)


In [12]:
import pandas as pd
# Convert the test-set count matrix to a dense array and display it as a table
# (one row per document, one column per vocabulary term).
pd.DataFrame(X_test_cv.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1349,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 문서 분류

### 나이브 베이즈 분류기(Naive Bayes Classifier)를 이용한 문서 분류
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [13]:
from sklearn.naive_bayes import MultinomialNB # use the MultinomialNB classifier provided by sklearn
NB_clf = MultinomialNB() # instantiate the classifier

NB_clf.fit(X_train_cv, y_train) # train the classifier on the count features of the train set

print('Train set score: {:.3f}'.format(NB_clf.score(X_train_cv, y_train))) # prediction accuracy on the train set
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_cv, y_test))) # prediction accuracy on the test set

Train set score: 0.824
Test set score: 0.732


In [14]:
# Prediction example: show the first two test documents with their true
# labels, then the labels the classifier predicts for them.

print('#First document and label in test data:', X_test[0], y_test[0])
print('#Second document and label in test data:', X_test[1], y_test[1])

pred = NB_clf.predict(X_test_cv[:2])

print('#Predicted labels:', pred)
# Map each predicted label index back to its newsgroup name.
print('#Predicted categories:', *(newsgroups_train.target_names[label] for label in pred))

#First document and label in test data: TRry the SKywatch project in  Arizona. 2
#Second document and label in test data: The Vatican library recently made a tour of the US.
 Can anyone help me in finding a FTP site where this collection is 
 available. 1
#Predicted labels: [2 1]
#Predicted categories: sci.space comp.graphics


In [15]:
# Use TF-IDF features instead of raw counts

from sklearn.feature_extraction.text import TfidfVectorizer

# Same arguments as the CountVectorizer
tfidf = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.5) 
X_train_tfidf = tfidf.fit_transform(X_train) # fit on the train set and transform it
X_test_tfidf = tfidf.transform(X_test) # transform the test set with the fitted vectorizer

NB_clf.fit(X_train_tfidf, y_train) # re-train the same classifier object on the TF-IDF train set
print('Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train))) # prediction accuracy on the train set
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test))) # prediction accuracy on the test set

Train set score: 0.862
Test set score: 0.741


In [17]:
#카테고리별로 영향을 많이 미친 특성(단어)

import numpy as np

def top10_features(classifier, vectorizer, categories):
    """Print the ten highest-weighted features (words) for each category.

    Works with both current and legacy scikit-learn APIs:
    ``vectorizer.get_feature_names_out()`` is preferred over the removed
    ``get_feature_names()``, and the classifier's ``feature_log_prob_``
    (naive Bayes) is preferred over ``coef_`` (removed from the naive Bayes
    estimators, still present on linear models).

    Args:
        classifier: Fitted classifier exposing per-class feature weights as
            ``feature_log_prob_`` or ``coef_``, one row per category.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            ``get_feature_names()``.
        categories: Category names, in the same order as the weight rows.

    Returns:
        None. One line per category is written to standard output.
    """
    # Prefer the current accessor; fall back to the legacy one on old releases.
    names_fn = getattr(vectorizer, "get_feature_names_out", None)
    if names_fn is None:
        names_fn = vectorizer.get_feature_names
    feature_names = np.asarray(names_fn())

    # Naive Bayes keeps its per-class weights in feature_log_prob_; other
    # models (e.g. linear classifiers) expose them as coef_.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        # Negate the weights so argsort yields descending order, then keep
        # the first ten indices.
        top10 = np.argsort(-weights[i])[:10]
        # Print the category followed by its ten most influential features.
        print("%s: %s" % (category, ", ".join(feature_names[top10])))

# Print the ten most influential words per category, using the Naive Bayes
# classifier together with the TF-IDF vectorizer.
top10_features(NB_clf, tfidf, newsgroups_train.target_names)

alt.atheism: you, not, are, be, this, have, as, what, they, if
comp.graphics: you, on, graphics, this, have, any, can, or, with, thanks
sci.space: space, on, you, be, was, this, as, they, have, are
talk.religion.misc: you, not, he, are, as, this, be, god, was, they




### 의사결정나무를 이용한 문서 분류

In [19]:
from sklearn.tree import DecisionTreeClassifier


# Decision tree with default growth settings (no pruning parameters);
# random_state is fixed so the run is reproducible.
tree = DecisionTreeClassifier(random_state=7)
tree.fit(X_train_tfidf, y_train)
# Compare train and test accuracy; a large gap between them indicates overfitting.
print('#Decision Tree train set score: {:.3f}'.format(tree.score(X_train_tfidf, y_train)))
print('#Decision Tree test set score: {:.3f}'.format(tree.score(X_test_tfidf, y_test)))

#Decision Tree train set score: 0.977
#Decision Tree test set score: 0.536


In [30]:
# Pruning: limit tree growth to reduce overfitting

# A node is split only if it holds at least 50 samples.
tree2 = DecisionTreeClassifier(min_samples_split=50,random_state=7)
# Alternative pruning strategy (limit the depth instead):
#tree2 = DecisionTreeClassifier(max_depth=2,random_state=7)

tree2.fit(X_train_tfidf, y_train)
print('#Decision Tree train set score: {:.3f}'.format(tree2.score(X_train_tfidf, y_train)))
print('#Decision Tree test set score: {:.3f}'.format(tree2.score(X_test_tfidf, y_test)))

#Decision Tree train set score: 0.807
#Decision Tree test set score: 0.551


### 불용어 제거하여 분류

In [31]:
# 필요한 library들을 import
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re

# Tokenizer defined by a regular expression: a token is a run of three or more
# word characters or apostrophes. The pattern is a raw string so that "\w"
# reaches the regex engine without triggering an invalid-escape warning.
RegTok = RegexpTokenizer(r"[\w']{3,}")
# English stop words, stored as a set for fast membership tests.
english_stops = set(stopwords.words('english'))

def tokenizer(text):
    """Tokenize *text* into lower-cased, stop-word-free, Porter-stemmed terms.

    Args:
        text: Raw document string.

    Returns:
        A list of stemmed tokens, in the order they appear in the document.
    """
    # NOTE(review): the original author flagged lower-casing before
    # tokenizing as something still to be confirmed.
    tokens = RegTok.tokenize(text.lower())
    # Build the stemmer once per call instead of once per token.
    stemmer = PorterStemmer()
    # Drop stop words and tokens shorter than three characters, then apply
    # the Porter stemmer to what remains.
    return [
        stemmer.stem(word)
        for word in tokens
        if word not in english_stops and len(word) > 2
    ]

# Rebuild the TF-IDF features with the custom tokenizer defined above; this
# rebinds tfidf, X_train_tfidf and X_test_tfidf, replacing any earlier values.
tfidf = TfidfVectorizer(tokenizer=tokenizer, max_features=2000, min_df=5, max_df=0.5) # use the newly defined tokenizer
X_train_tfidf = tfidf.fit_transform(X_train) # fit on the train set and transform it
X_test_tfidf = tfidf.transform(X_test) # transform the test set with the fitted vectorizer

In [35]:
# Naive Bayes classifier on the current X_train_tfidf / X_test_tfidf features

NB_clf = MultinomialNB() 

NB_clf.fit(X_train_tfidf, y_train) # train the classifier on the train set

print('Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train))) # prediction accuracy on the train set
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test))) # prediction accuracy on the test set

Train set score: 0.879
Test set score: 0.746


In [34]:
# Decision tree classifier on the current X_train_tfidf / X_test_tfidf features

# A node is split only if it holds at least 50 samples.
tree3 = DecisionTreeClassifier(min_samples_split=50,random_state=7)
# Alternative configurations (default growth, or depth-limited):
#tree3 = DecisionTreeClassifier(random_state=7)
#tree3 = DecisionTreeClassifier(max_depth=2,random_state=7)

tree3.fit(X_train_tfidf, y_train)
print('#Decision Tree train set score: {:.3f}'.format(tree3.score(X_train_tfidf, y_train)))
print('#Decision Tree test set score: {:.3f}'.format(tree3.score(X_test_tfidf, y_test)))

#Decision Tree train set score: 0.880
#Decision Tree test set score: 0.603
