In [2]:
from sklearn.datasets import fetch_20newsgroups

# Subset of the 20 newsgroup topics to work with.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Load the train split first, then the test split, with identical options.
# `remove` strips the parts of each post that would hint at its label, so
# the classification has to rely purely on the body text.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=subset,
        remove=('headers', 'footers', 'quotes'),
        categories=categories,
    )
    for subset in ('train', 'test')
)

# Quick sanity check of what was loaded.
print('Train set size:', len(newsgroups_train.data))
print('Test set size:', len(newsgroups_test.data))
print('Selected categories:', newsgroups_train.target_names)
print('Train labels:', set(newsgroups_train.target))

Train set size: 2034
Test set size: 1353
Selected categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
Train labels: {0, 1, 2, 3}


- #### .data는 텍스트의 내용을, .target은 숫자로 표시된 라벨(분류)을 의미한다.
- #### 라벨은 실행결과에서 나온(코드 작성 순서 아님) categories의 순서대로 번호를 붙인다.
- #### 예를 들어서 실행 결과 Selected categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'] 이거라면 
- #### 'alt.atheism' = 라벨 0, 'comp.graphics' = 라벨 1, 이런 순서대로 라벨이 정해진다.

In [4]:
# Show the first document and its integer label from each split,
# separated by divider lines (one item per output line).
print(
    f'Train set text samples: {newsgroups_train.data[0]}',
    '-----------------------------------------------------------',
    f'Train set label samples: {newsgroups_train.target[0]}',
    '-----------------------------------------------------------',
    f'Test set text samples: {newsgroups_test.data[0]}',
    '-----------------------------------------------------------',
    f'Test set label samples: {newsgroups_test.target[0]}',
    sep='\n',
)

Train set text samples: Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
-----------------------------------------------------------
Train set label samples: 1
-----------------------------------------------------------
Test set text samples: TRry the SKywatch project in  Arizona.
-----------------------------------------------------------
Test set label samples: 2


In [None]:
- #### train set[0]의 라벨이 1 이므로 이 텍스트의 내용은 'comp.graphics'의 내용이다.
- #### test set[0]의 라벨이 2 이므로 이 텍스트의 내용은 'sci.space'의 내용이다.

In [5]:
# Show the second document and its integer label from each split,
# separated by divider lines (one item per output line).
print(
    f'Train set text samples: {newsgroups_train.data[1]}',
    '-----------------------------------------------------------',
    f'Train set label samples: {newsgroups_train.target[1]}',
    '-----------------------------------------------------------',
    f'Test set text samples: {newsgroups_test.data[1]}',
    '-----------------------------------------------------------',
    f'Test set label samples: {newsgroups_test.target[1]}',
    sep='\n',
)

Train set text samples: 

Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.
-----------------------------------------------------------
Train set label samples: 3
-----------------------------------------------------------
Test set text samples: The Vatican library recently made a tour of the US.
 Can anyone help me in finding a FTP site where this collection is 
 available.
-----------------------------------------------------------
Test set label samples: 1


In [None]:
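- #### .data는 텍스트의 내용을, .target은 숫자로 표시된 라벨(분류)을 의미한다.
- #### 라벨은 코드에 작성한 categories의 순서가 아니라, 실행 결과의 target_names 순서(알파벳순)대로 번호를 붙인다.
- #### 예를 들어서 categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] 로 지정해도 target_names는 ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'] 가 되므로
- #### 'alt.atheism' = 라벨 0, 'comp.graphics' = 라벨 1, 'sci.space' = 라벨 2, 'talk.religion.misc' = 라벨 3 으로 라벨이 정해진다.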

In [None]:
# X_train, X_test, y_train, y_test를 추출한 후에 실제로 문서 분류를 수행하기

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw documents (X) and integer labels (y) for each split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Bag-of-words counts: keep at most 2000 terms, ignoring terms that occur
# in fewer than 5 documents or in more than half of the documents.
cv = CountVectorizer(max_features=2000, min_df=5, max_df=0.5)

# The vocabulary is learned from the training documents only; the test
# documents are then encoded with that same vocabulary.
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

print('Train set dimension:', X_train_cv.shape)
print('Test set dimension:', X_test_cv.shape)

Train set dimension: (2034, 2000)
Test set dimension: (1353, 2000)
