# Ch.8 텍스트 분석
### 8.2 텍스트 사전 준비 작업(텍스트 전처리) - 텍스트 정규화

In [1]:
from nltk import sent_tokenize
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
text = 'The Matrix is everywhere its all around us, here even in this room. \
                You can see it out your window or on your television. \
                You feel it when you go to work, or go to church or pay your taxes.'

sentences = sent_tokenize(text=text)

print(type(sentences),len(sentences))
print(sentences)

<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


In [5]:
from nltk import word_tokenize

sentence = "The Matrix is everywhere its all around us, here even in this room."
words = word_tokenize(sentence)
print(type(words), len(words))
print(words)

<class 'list'> 15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [8]:
from nltk import word_tokenize, sent_tokenize

def tokenize_text(text):
    
    sentences = sent_tokenize(text)
    
    word_tokens = [word_tokenize(sentence) for sentence in sentences]
    
    return word_tokens

word_tokens = tokenize_text(text)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


In [9]:
# 스톱 워드 제거

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
print('영어 stop words 개수 :', len(nltk.corpus.stopwords.words('english')))
print(nltk.corpus.stopwords.words('english')[:20])

영어 stop words 개수 : 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [11]:
stopwords = nltk.corpus.stopwords.words('english')
all_tokens=[]

In [12]:
for word in text.split():
    if word not in stopwords:
        print(word)

The
Matrix
everywhere
around
us,
even
room.
You
see
window
television.
You
feel
go
work,
go
church
pay
taxes.


In [13]:
import nltk

stopwords = nltk.corpus.stopwords.words('english')
all_tokens = []

for sentence in word_tokens:
    filtered_words=[]
    
    for word in sentence:
        word = word.lower()
        
        if word not in stopwords:
            filtered_words.append(word)
    all_tokens.append(filtered_words)
    
print(all_tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


- Stemming과 Lemmatization

In [14]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

print(stemmer.stem('working'), stemmer.stem('works'), stemmer.stem('worked'))
print(stemmer.stem('amusing'), stemmer.stem('amuses'), stemmer.stem('amused'))
print(stemmer.stem('happier'), stemmer.stem('happiest'))
print(stemmer.stem('fancier'), stemmer.stem('fanciest'))

work work work
amus amus amus
happy happiest
fant fanciest


In [15]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
lemma = WordNetLemmatizer()

In [17]:
print(lemma.lemmatize('amusing','v'), lemma.lemmatize('amuses','v'),lemma.lemmatize('amused','v'))
print(lemma.lemmatize('happier','a'),lemma.lemmatize('happiest','a'))
print(lemma.lemmatize('fancier','a'),lemma.lemmatize('fanciest','a'))

amuse amuse amuse
happy happy
fancy fancy


### 8.3 Bag of Words - BOW

### 8.4 텍스트 분류 실습 - 20 뉴스그룹 분류

In [18]:
from sklearn.datasets import fetch_20newsgroups

news_data = fetch_20newsgroups(subset='all', random_state = 156)

In [19]:
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [20]:
import pandas as pd

print('target 클래스의 값과 분포도\n',pd.Series(news_data.target).value_counts().sort_index())
print('target 클래스의 이름 ', news_data.target_names)

target 클래스의 값과 분포도
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
target 클래스의 이름  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [21]:
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [22]:
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers','quotes'), random_state=156)

X_train = train_news.data
Y_train = train_news.target

print(type(X_train),len(X_train))

<class 'list'> 11314


In [23]:
print(X_train[0])



What I did NOT get with my drive (CD300i) is the System Install CD you
listed as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120
from Direct Express in Chicago (no complaints at all -- good price & good
service).

BTW, I've heard that the System Install CD can be used to boot the mac;
however, my drive will NOT accept a CD caddy is the machine is off.  How can
you boot with it then?

--Dave



In [24]:
print(train_news.target_names[Y_train[0]])

comp.sys.mac.hardware


In [25]:
test_news = fetch_20newsgroups(subset='test', remove=('headers','footers','quotes'), random_state=156)

X_test = test_news.data
Y_test = test_news.target
print(type(X_test),len(X_test))

<class 'list'> 7532


In [26]:
Y_test[:5]

array([ 4, 11,  1,  7,  8])

### 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

- CASE-1 CountVectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)

X_test_cnt_vect = cnt_vect.transform(X_test)

In [28]:
X_train_cnt_vect.shape

(11314, 101631)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, Y_train)
pred = lr_clf.predict(X_test_cnt_vect)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [30]:
accuracy_score(Y_test,pred)

0.6066117896972916

In [31]:
pred[:5]

array([ 4, 11,  6,  7,  8])

- CACE-2 TfidfVectorizer

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, Y_train)
pred_lr = lr_clf.predict(X_test_tfidf_vect)


In [101]:
accuracy_score(Y_test,pred_lr)

0.6736590546999469

In [34]:
pred[:5]

array([5, 1, 1, 7, 8])

In [35]:
tfidf_vect # 하이퍼 파라미터

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [36]:
lr_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

- CASE-3 stop words 필터링을 추가하고 ngram을 기본(1,2)로 변경

In [37]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)

tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [98]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, Y_train)
pred_lr2 = lr_clf.predict(X_test_tfidf_vect)

In [99]:
accuracy_score(Y_test,pred_lr2)

0.6922464152947424

- CASE-4 case 3에서 LR 파라메터 조정

In [102]:
lr_clf = LogisticRegression(C=10)
lr_clf.fit(X_train_tfidf_vect, Y_train)
pred_lr3 = lr_clf.predict(X_test_tfidf_vect)

In [103]:
accuracy_score(Y_test, pred_lr3)

0.6845459373340415

In [42]:
pred[:5]

array([ 3, 11,  1,  7,  8])

- CASE-5 SVC

In [43]:
from sklearn.svm import SVC

In [114]:
svc_vect = SVC()
svc_vect

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [115]:
svc_vect.fit(X_train_cnt_vect, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [116]:
pred_svc = svc_vect.predict(X_test_cnt_vect)

In [117]:
pred_svc

array([ 6,  6,  6, ...,  6, 17,  6])

In [118]:
accuracy_score(Y_test,pred_svc)

0.097052575677111

- CASE-6 DecisionTree 

In [56]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [107]:
dt_clf =  LogisticRegression()
dt_clf.fit(X_train_cnt_vect, Y_train)
pred_dt = dt_clf.predict(X_test_cnt_vect)

In [108]:
accuracy_score(Y_test,pred_dt)

0.6066117896972916

- CASE-7 Pipeline

In [92]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('tfidf_vect', TfidfVectorizer(stop_words='english')),('lr_clf',LogisticRegression(random_state=156))])

In [93]:
pipeline = Pipeline([('tfidf_vect',TfidfVectorizer(stop_words='english', ngram_range=(1,2),max_df=300)),('lr_clf',LogisticRegression(C=10))])

In [94]:
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('tfidf_vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=300, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lr_clf',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, inter

In [109]:
pred_pi=pipeline.predict(X_test)

In [110]:
print('pipeline을 통한 Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(Y_test,pred_pi)))

pipeline을 통한 Logistic Regression의 예측 정확도는 0.701


In [113]:
df = pd.DataFrame({'LR':pred,'LR2':pred_lr2,'LR3':pred_lr3,'SVC':pred_svc,'DTC':pred_dt,'PIPE':pred_pi, '정답': Y_test})
df

Unnamed: 0,LR,LR2,LR3,SVC,DTC,PIPE,정답
0,3,3,3,6,4,3,4
1,11,11,11,6,11,11,11
2,1,1,1,6,6,1,1
3,7,7,7,6,7,7,7
4,8,8,8,6,8,8,8
...,...,...,...,...,...,...,...
7527,7,7,13,8,14,7,12
7528,0,0,0,6,18,0,13
7529,7,6,6,6,6,7,6
7530,17,17,17,17,17,17,17
