# 20 뉴스그룹 분류

## 텍스트 데이터 확인

In [2]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np

# subset='all' loads the training and test splits mixed together in one collection.
news_data = fetch_20newsgroups(
    subset='all',
    random_state=100,
)

In [3]:
# Report the loader's return type, then the shape of the documents viewed as a NumPy array.
print(type(news_data), np.array(news_data.data).shape, sep='\n')

<class 'sklearn.utils._bunch.Bunch'>
(18846,)


In [4]:
import pandas as pandasd

# Per-class document counts, keyed by newsgroup name.
# The names are taken from `target_names` in their stored order, which is the
# order the integer labels in `target` refer to; np.unique() would re-sort and
# de-duplicate them, which only lines up with np.bincount() when the list is
# already sorted and unique.
print('target 클래스')
# minlength keeps one slot per class even if the highest-numbered classes are absent.
class_counts = np.bincount(news_data.target, minlength=len(news_data.target_names))
# .tolist() yields plain Python ints so the dict prints the same on every NumPy version.
print(dict(zip(news_data.target_names, class_counts.tolist())))

target 클래스
{'alt.atheism': 799, 'comp.graphics': 973, 'comp.os.ms-windows.misc': 985, 'comp.sys.ibm.pc.hardware': 982, 'comp.sys.mac.hardware': 963, 'comp.windows.x': 988, 'misc.forsale': 975, 'rec.autos': 990, 'rec.motorcycles': 996, 'rec.sport.baseball': 994, 'rec.sport.hockey': 999, 'sci.crypt': 991, 'sci.electronics': 984, 'sci.med': 990, 'sci.space': 987, 'soc.religion.christian': 997, 'talk.politics.guns': 910, 'talk.politics.mideast': 940, 'talk.politics.misc': 775, 'talk.religion.misc': 628}


In [5]:
# Show the first post exactly as loaded (header lines and quoted text included).
print(news_data.data[0])

From: ggr@koonda.acci.com.au (Greg Rose)
Subject: Authentication and one-time-pads (was: Re: Advanced one time pad)
Summary: presents one-time-pad based MAC
Organization: Australian Computing and Communications Institute
Lines: 93

In article <1s1dbmINNehb@elang05.acslab.umbc.edu> olson@umbc.edu (Bryan Olson; CMSC (G)) writes:
>The one-time-pad yeilds ideal security, but has a well-known flaw in
>authentication.  Suppose you use a random bit stream as the pad, and
>exclusive-or as the encryption operation.  If an adversary knows the 
>plaintext of a message, he can change it into any other message.  
>Here's how it works.
>
>Alice is sending Bob a plaintext P, under a key stream S
>Alice computes the ciphertext C = S xor P,  and sends it to Bob.
>
>Eve knows the plainext P, but wants the message to appear as P'.
>Eve intercepts C, and computes  C' = C xor P xor P' = S xor P'.
>Eve sends C' to Bob.
>
>Bob decrypts C' by computing  C'xor S = P',  thus receiving the 
>false message which 

In [7]:
# Reload the full corpus with headers, footers and quoted replies stripped from every post.
news_data = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    random_state=100,
)

In [8]:
# Show the first post again, now from the corpus loaded with headers, footers and quotes removed.
print(news_data.data[0])


Firstly, an aside:

I agree that the weakness exists, but I have a lot of trouble
believing that it represents a difficulty in real life. Given:

1. the purpose of the one-time pad is to give unbreakable security,
and the expense of key distribution etc., imply that the clients
really do want that level of security

2. These same people want to keep P a secret

I find it hard to believe that Eve might happen to have a copy of P
lying around.

(I am aware that the same argument applies to Eve knowing even a small
part of the message, but Eve must know EXACTLY where (which bytes) in
C her known susequence starts, or the result will be garbled. I find
this at least as surprising.)

Back to the question:

If I had the resources to use a one-time-pad for such transmissions, I
would also append a Message Authentication Code to the message, using up
the next bits of the one-time-pad as the key perhaps. Your original
question basically asked whether there was any way to authenticate the
messa

## 학습 데이터/평가 데이터 분리

In [9]:
# Training split, with headers, footers and quoted replies removed.
train_news = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    random_state=100,
)
x_train, y_train = train_news.data, train_news.target

# Test split, loaded with the same cleaning options.
test_news = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    random_state=100,
)
x_test, y_test = test_news.data, test_news.target

# Report how many documents each split contains.
print(f'학습데이터 크기: {len(x_train)}, 평가데이터 크기:{len(x_test)}')

학습데이터 크기: 11314, 평가데이터 크기:7532


## 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

### countvectorizer 이용

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Bag-of-words term counts, with NLTK's English stop-word list filtered out.
cnt_vect = CountVectorizer(stop_words=stopwords.words('english'))

# Learn the vocabulary from the training documents only and vectorize them in the
# same pass; calling fit() and then transform() would tokenize the training set twice.
x_train_cnt_vect = cnt_vect.fit_transform(x_train)

# The test documents are only transformed (never fitted), so they are encoded with
# the vocabulary learned from the training data and share its column layout.
x_test_cnt_vect = cnt_vect.transform(x_test)

# Columns: one per distinct token found in the training data.
print(f'학습데이터 텍스트 cnt vect의 shape: {x_train_cnt_vect.shape}')

학습데이터 텍스트 cnt vect의 shape: (11314, 101487)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train logistic regression on the count features.
# The target takes integer values 0-19, one per newsgroup category.
lr = LogisticRegression(solver='liblinear').fit(x_train_cnt_vect, y_train)

# Predict the test split and report accuracy to three decimals.
y_hat = lr.predict(x_test_cnt_vect)
print(f'정확도: {accuracy_score(y_test, y_hat):.3f}')

정확도: 0.635


### tfidfvectorizer 이용 - 1

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# TF-IDF features, with NLTK's English stop-word list filtered out.
# Inverse document frequency: a word that recurs across many documents is penalized
# (down-weighted).
tfidf_vect = TfidfVectorizer(stop_words=stopwords.words('english'))

# Fit on the training documents and vectorize them in one pass; the test documents
# are only transformed so that they use the vocabulary and weights learned from training.
x_train_tfidf_vect = tfidf_vect.fit_transform(x_train)
x_test_tfidf_vect = tfidf_vect.transform(x_test)

# The label names the TF-IDF matrix (it previously read "cnt vect", copied from the
# CountVectorizer cell).
print(f'학습데이터 텍스트 tfidf vect의 shape: {x_train_tfidf_vect.shape}')

학습데이터 텍스트 cnt vect의 shape: (11314, 101487)


In [13]:
# Logistic regression (liblinear solver) trained on the TF-IDF features.
lr = LogisticRegression(solver='liblinear').fit(x_train_tfidf_vect, y_train)

# Predict the test split and report accuracy to three decimals.
y_hat = lr.predict(x_test_tfidf_vect)
print(f'정확도: {accuracy_score(y_test, y_hat):.3f}')

정확도: 0.688


### tfidfvectorizer 이용 - 2

In [None]:
# max_df excludes term features whose frequency across the whole corpus is too high.
# Such terms are likely to be repetitive, stop-word-like words that carry little meaning.
# max_df=300: only terms that appear in at most 300 documents are kept as features.
# ngram_range=(1, 2): both single words and two-word sequences become features.
tfidf_vec = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
    max_df=300,
)

# Fit on the training documents and vectorize them in one pass. The test documents
# are only transformed, because they must be encoded by the same criteria that were
# learned from the training data.
x_train_tfidf_vec = tfidf_vec.fit_transform(x_train)
x_test_tfidf_vec = tfidf_vec.transform(x_test)

print(f'학습 데이터 텍스트 tfidf vect shape: {x_train_tfidf_vec.shape}')

In [None]:
# Logistic regression (liblinear solver) trained on the TF-IDF features built
# with ngram_range=(1, 2) and max_df=300.
lr = LogisticRegression(solver='liblinear').fit(x_train_tfidf_vec, y_train)

# Predict the test split and report accuracy to three decimals.
y_hat = lr.predict(x_test_tfidf_vec)
print(f'정확도: {accuracy_score(y_test, y_hat):.3f}')