In [2]:
%matplotlib inline

from pathlib import Path

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize          
from nltk.stem.snowball import EnglishStemmer 
import matplotlib.pylab as plt
from dmba import printTermDocumentMatrix, classificationSummary, liftChart

# word_tokenize (used by the stemming tokenizer later in this notebook) needs
# NLTK's Punkt tokenizer data. Recent NLTK releases (3.9+) load it from the
# 'punkt_tab' package, while older releases read the pickled 'punkt' package,
# so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /Users/JAE111/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Table 20.1 Term-document matrix representation of words in sentences S1-S3

In [3]:
# Toy corpus: three short sentences (S1-S3).
text = [
    "this is the first sentence.",
    "this is a second sentence",
    "the third sentence is here.",
]

# Learn features based on text.
# CountVectorizer extracts how often each term occurs in each document.
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)

# Print the resulting term-document matrix.
printTermDocumentMatrix(count_vect, counts)

          S1  S2  S3
first      1   0   0
here       0   0   1
is         1   1   1
second     0   1   0
sentence   1   1   1
the        1   0   1
third      0   0   1
this       1   1   0


# Table 20.2 Term-document representation of words in sentences S1-S4 (Example 2)

In [4]:
# Toy corpus S1-S4, now with irregular spacing, punctuation, an emoticon
# and mixed case.
text = [
    "this is the first    sentence!!",
    "this is a second Sentence :)",
    "the third sentence, is here ",
    "forth of all sentences",
]

# Learn features based on text. CountVectorizer is used with its default
# settings here, so (as the printed matrix shows) punctuation, the emoticon
# and the one-letter word 'a' do not become terms, and 'Sentence' is counted
# together with 'sentence'.
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)

printTermDocumentMatrix(count_vect, counts)

           S1  S2  S3  S4
all         0   0   0   1
first       1   0   0   0
forth       0   0   0   1
here        0   0   1   0
is          1   1   1   0
of          0   0   0   1
second      0   1   0   0
sentence    1   1   1   0
sentences   0   0   0   1
the         1   0   1   0
third       0   0   1   0
this        1   1   0   0


# Table 20.3 Tokenization of S1-S4

In [5]:
# Toy corpus S1-S4 (irregular spacing, punctuation, emoticon, mixed case).
text = [
    "this is the first     sentence!!",
    "this is a second Sentence :)",
    "the third sentence, is here ",
    "forth of all sentences",
]

# Learn features based on text. The custom token_pattern treats every run of
# letters, '!', ':' and ')' as one token, so special characters attached to a
# word stay part of it ('sentence!!'), the emoticon ':)' becomes a term, and
# the one-letter word 'a' is kept.
count_vect = CountVectorizer(token_pattern="[a-zA-Z!:)]+")
counts = count_vect.fit_transform(text)

printTermDocumentMatrix(count_vect, counts)

            S1  S2  S3  S4
:)           0   1   0   0
a            0   1   0   0
all          0   0   0   1
first        1   0   0   0
forth        0   0   0   1
here         0   0   1   0
is           1   1   1   0
of           0   0   0   1
second       0   1   0   0
sentence     0   1   1   0
sentence!!   1   0   0   0
sentences    0   0   0   1
the          1   0   1   0
third        0   0   1   0
this         1   1   0   0


# Table 20.4 Stopwords in scikit-learn

In [6]:
# ENGLISH_STOP_WORDS is the stop-word collection stored in scikit-learn: terms
# that should be removed during preprocessing because they merely add volume
# and noise.
stopWords = sorted(ENGLISH_STOP_WORDS)
ncolumns = 6
nrows = 30

# Show at most ncolumns * nrows words, ncolumns per printed row, each word
# left-justified to a width of 13 characters.
print(f'First {ncolumns * nrows} of {len(stopWords)} stopwords')
for i in range(0, min(ncolumns * nrows, len(stopWords)), ncolumns):
    print(''.join(word.ljust(13) for word in stopWords[i:i + ncolumns]))

First 180 of 318 stopwords
a            about        above        across       after        afterwards   
again        against      all          almost       alone        along        
already      also         although     always       am           among        
amongst      amoungst     amount       an           and          another      
any          anyhow       anyone       anything     anyway       anywhere     
are          around       as           at           back         be           
became       because      become       becomes      becoming     been         
before       beforehand   behind       being        below        beside       
besides      between      beyond       bill         both         bottom       
but          by           call         can          cannot       cant         
co           con          could        couldnt      cry          de           
describe     detail       do           done         down         due          
during       each        

# Table 20.5 Text reduction of S1-S4 using stemming

In [12]:
# Toy corpus S1-S4 for the stemming example.
text = [
    "this is the first     sentence!! ",
    "this is a second Sentence :)",
    "the third sentence, is here ",
    "forth of all sentences",
]

# Custom tokenizer that uses NLTK for tokenizing and stemming
# (drops punctuation and stop words).
#
# Reader question (translated): "Why is there suddenly a class definition with
# __init__ and __call__ here?"
#   * __init__ runs once, when LemmaTokenizer() is created. It prepares the
#     stemmer and the stop-word set so they are not rebuilt for every document.
#   * __call__ makes an instance usable like a function: tokenizer(doc).
#     That is what lets the instance be handed to CountVectorizer through its
#     `tokenizer` argument.
class LemmaTokenizer:
    """Callable that turns one document string into a list of stemmed tokens.

    Despite the name, tokens are stemmed with NLTK's ``EnglishStemmer``;
    no lemmatization takes place.
    """

    def __init__(self):
        # Built once per instance and reused on every call.
        self.stemmer = EnglishStemmer()
        self.stopWords = set(ENGLISH_STOP_WORDS)

    def __call__(self, doc):
        """Tokenize ``doc`` and return the stems of the tokens that are kept.

        A token is kept only if it consists solely of alphabetic characters
        and is not contained in ``self.stopWords``. The stop-word test is
        applied to the token exactly as ``word_tokenize`` produced it; no case
        folding is done here.
        """
        stems = []
        for token in word_tokenize(doc):
            # Skip punctuation/numbers and stop words.
            if not token.isalpha() or token in self.stopWords:
                continue
            stems.append(self.stemmer.stem(token))
        return stems

# Tokenize with the LemmaTokenizer defined above. token_pattern=None states
# explicitly that the default regex is not used once a custom tokenizer is
# supplied, which avoids scikit-learn's "The parameter 'token_pattern' will
# not be used since 'tokenizer' is not None" UserWarning.
count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None)
counts = count_vect.fit_transform(text)

printTermDocumentMatrix(count_vect, counts)

         S1  S2  S3  S4
forth     0   0   0   1
second    0   1   0   0
sentenc   1   1   1   1


# Table 20.6 tf-idf matrix for S1-S4 example (after tokenization)

In [13]:
# Toy corpus S1-S4.
text = [
    "this is the first     sentence!!",
    "this is a second Sentence :)",
    "the third sentence, is here ",
    "forth of all sentences",
]

# Apply CountVectorizer and TfidfTransformer sequentially.
# Step 1: raw term counts.
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)

# Step 2: TfidfTransformer replaces each cell of the count matrix with its
# term frequency - inverse document frequency (tf-idf) value.
tfidfTransformer = TfidfTransformer(smooth_idf=False, norm=None)
tfidf = tfidfTransformer.fit_transform(counts)

printTermDocumentMatrix(count_vect, tfidf)

                 S1        S2        S3        S4
all        0.000000  0.000000  0.000000  2.386294
first      2.386294  0.000000  0.000000  0.000000
forth      0.000000  0.000000  0.000000  2.386294
here       0.000000  0.000000  2.386294  0.000000
is         1.287682  1.287682  1.287682  0.000000
of         0.000000  0.000000  0.000000  2.386294
second     0.000000  2.386294  0.000000  0.000000
sentence   1.287682  1.287682  1.287682  0.000000
sentences  0.000000  0.000000  0.000000  2.386294
the        1.693147  0.000000  1.693147  0.000000
third      0.000000  0.000000  2.386294  0.000000
this       1.693147  1.693147  0.000000  0.000000


In [None]:
# Table 20.6 tf-idf matrix for S1-S4 example (after tokenization)
# NOTE(review): this unexecuted cell repeats the tf-idf cell directly above.
# The title line is now a comment and the invisible zero-width-space
# characters on the blank lines were removed so the cell parses; consider
# deleting the cell if the repetition is unintended.
text = ['this is the first     sentence!!',
        'this is a second Sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']

# Apply CountVectorizer and TfidfTransformer sequentially
count_vect = CountVectorizer()
tfidfTransformer = TfidfTransformer(smooth_idf=False, norm=None)
counts = count_vect.fit_transform(text)
tfidf = tfidfTransformer.fit_transform(counts)

printTermDocumentMatrix(count_vect, tfidf)