### CNN 뉴스 문서의 모든 단어를 토큰화하고 영어 불용어(English stop words)를 제거

### 내 풀이

In [25]:
from nltk import word_tokenize, sent_tokenize

# Sample CNN news excerpt used as the input document for this exercise.
corpus = '''
A vacuum of knowledge about the origins of the new coronavirus ravaging the world has provided fertile ground for all manner of theories -- from the fantastic, to the dubious to the believable.
It was a bioweapon manufactured by the Chinese. The US Army brought the virus to Wuhan. It leaked -- like a genie out of a bottle -- from a lab in an accident. It took root at a wildlife market in Wuhan.
'''

# First split the document into sentences ...
sentences = sent_tokenize(corpus)

# ... then split every sentence into its word tokens (one list per sentence).
words = list(map(word_tokenize, sentences))
print(words)

[['A', 'vacuum', 'of', 'knowledge', 'about', 'the', 'origins', 'of', 'the', 'new', 'coronavirus', 'ravaging', 'the', 'world', 'has', 'provided', 'fertile', 'ground', 'for', 'all', 'manner', 'of', 'theories', '--', 'from', 'the', 'fantastic', ',', 'to', 'the', 'dubious', 'to', 'the', 'believable', '.'], ['It', 'was', 'a', 'bioweapon', 'manufactured', 'by', 'the', 'Chinese', '.'], ['The', 'US', 'Army', 'brought', 'the', 'virus', 'to', 'Wuhan', '.'], ['It', 'leaked', '--', 'like', 'a', 'genie', 'out', 'of', 'a', 'bottle', '--', 'from', 'a', 'lab', 'in', 'an', 'accident', '.'], ['It', 'took', 'root', 'at', 'a', 'wildlife', 'market', 'in', 'Wuhan', '.']]


In [7]:
import nltk
# Download NLTK's 'stopwords' corpus; a later cell reads it through
# nltk.corpus.stopwords.words('english').
# NOTE(review): sent_tokenize/word_tokenize presumably also need a tokenizer
# resource (e.g. 'punkt') that is not downloaded here -- confirm it is installed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# English stop-word list from NLTK (kept as a list under its original name).
stop_w = nltk.corpus.stopwords.words('english')
# Set copy for membership tests: the list would be scanned again for every
# single token, the set is a constant-time lookup.
stop_w_lookup = set(stop_w)

# `words` holds one token list per sentence. Lowercase every token first,
# because the stop-word list is lowercase, then keep the non-stop-words in a
# single flat list.
lowered_tokens = (
    token.lower()
    for sentence_tokens in words
    for token in sentence_tokens
)
filtered_tokens = [
    token for token in lowered_tokens if token not in stop_w_lookup
]
print(filtered_tokens)

['vacuum', 'knowledge', 'origins', 'new', 'coronavirus', 'ravaging', 'world', 'provided', 'fertile', 'ground', 'manner', 'theories', '--', 'fantastic', ',', 'dubious', 'believable', '.', 'bioweapon', 'manufactured', 'chinese', '.', 'us', 'army', 'brought', 'virus', 'wuhan', '.', 'leaked', '--', 'like', 'genie', 'bottle', '--', 'lab', 'accident', '.', 'took', 'root', 'wildlife', 'market', 'wuhan', '.']


### 해답

In [52]:
# 정답

import nltk

def tokenize_text(text):
    """Split *text* into sentences and tokenize each sentence into words.

    The text is first cut into sentences with ``sent_tokenize``; every
    sentence is then passed to ``word_tokenize``.

    Returns a list with one entry per sentence, each entry being the list
    of tokens produced for that sentence.
    """
    return [word_tokenize(sentence) for sentence in sent_tokenize(text)]

# Sample CNN news excerpt used as the input document.
corpus = '''
A vacuum of knowledge about the origins of the new coronavirus ravaging the world has provided fertile ground for all manner of theories -- from the fantastic, to the dubious to the believable.
It was a bioweapon manufactured by the Chinese. The US Army brought the virus to Wuhan. It leaked -- like a genie out of a bottle -- from a lab in an accident. It took root at a wildlife market in Wuhan.
'''

# One list of word tokens per sentence.
word_tokens = tokenize_text(corpus)

# English stop-word list from NLTK (kept as a list under its original name),
# plus a set copy so each membership test is a constant-time lookup instead
# of a scan over the whole list.
stopwords = nltk.corpus.stopwords.words('english')
stopword_lookup = set(stopwords)

# Lowercase each token (the stop-word list is lowercase) and drop the stop
# words, preserving the sentence-level grouping.
all_tokens = []
for sentence in word_tokens:
    lowered = (word.lower() for word in sentence)
    all_tokens.append(
        [word for word in lowered if word not in stopword_lookup]
    )

print(all_tokens)
print()
print('type: ', type(all_tokens), ', len: ', len(all_tokens))

[['vacuum', 'knowledge', 'origins', 'new', 'coronavirus', 'ravaging', 'world', 'provided', 'fertile', 'ground', 'manner', 'theories', '--', 'fantastic', ',', 'dubious', 'believable', '.'], ['bioweapon', 'manufactured', 'chinese', '.'], ['us', 'army', 'brought', 'virus', 'wuhan', '.'], ['leaked', '--', 'like', 'genie', 'bottle', '--', 'lab', 'accident', '.'], ['took', 'root', 'wildlife', 'market', 'wuhan', '.']]

type:  <class 'list'> , len:  5


### 동일한 문서에 대한 벡터화(CountVectorizer) english stop word 제거 -> 사전
### 사전을 이용 'This is the second document' 벡터 표현 

### 내 풀이

In [63]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Sample CNN news excerpt used as the input document.
corpus = '''
A vacuum of knowledge about the origins of the new coronavirus ravaging the world has provided fertile ground for all manner of theories -- from the fantastic, to the dubious to the believable.
It was a bioweapon manufactured by the Chinese. The US Army brought the virus to Wuhan. It leaked -- like a genie out of a bottle -- from a lab in an accident. It took root at a wildlife market in Wuhan.
'''
# The vectorizer is fitted on a collection of documents, so the single
# excerpt is wrapped in a one-element list.
li = [corpus]

countvect = CountVectorizer()
countvect.fit(li)

# Feature names in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so the names are derived from the
# fitted vocabulary (term -> column index) instead, which works on every
# version and still yields a plain list of strings.
print(sorted(countvect.vocabulary_, key=countvect.vocabulary_.get))
print()
print(countvect.vocabulary_)


['about', 'accident', 'all', 'an', 'army', 'at', 'believable', 'bioweapon', 'bottle', 'brought', 'by', 'chinese', 'coronavirus', 'dubious', 'fantastic', 'fertile', 'for', 'from', 'genie', 'ground', 'has', 'in', 'it', 'knowledge', 'lab', 'leaked', 'like', 'manner', 'manufactured', 'market', 'new', 'of', 'origins', 'out', 'provided', 'ravaging', 'root', 'the', 'theories', 'to', 'took', 'us', 'vacuum', 'virus', 'was', 'wildlife', 'world', 'wuhan']

{'vacuum': 42, 'of': 31, 'knowledge': 23, 'about': 0, 'the': 37, 'origins': 32, 'new': 30, 'coronavirus': 12, 'ravaging': 35, 'world': 46, 'has': 20, 'provided': 34, 'fertile': 15, 'ground': 19, 'for': 16, 'all': 2, 'manner': 27, 'theories': 38, 'from': 17, 'fantastic': 14, 'to': 39, 'dubious': 13, 'believable': 6, 'it': 22, 'was': 44, 'bioweapon': 7, 'manufactured': 28, 'by': 10, 'chinese': 11, 'us': 41, 'army': 4, 'brought': 9, 'virus': 43, 'wuhan': 47, 'leaked': 25, 'like': 26, 'genie': 18, 'out': 33, 'bottle': 8, 'lab': 24, 'in': 21, 'an': 3, 'accident': 1, 'took': 40, 'root': 36, 'at': 5, 'wildlife': 45, 'market': 29}

In [65]:
# Refit on the same document list, this time with scikit-learn's built-in
# English stop-word list so those words are left out of the vocabulary.
countvect = CountVectorizer(stop_words='english').fit(li)

# Feature names in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so the names are derived from the
# fitted vocabulary (term -> column index) instead.
print(sorted(countvect.vocabulary_, key=countvect.vocabulary_.get))
print()
print(countvect.vocabulary_)

['accident', 'army', 'believable', 'bioweapon', 'bottle', 'brought', 'chinese', 'coronavirus', 'dubious', 'fantastic', 'fertile', 'genie', 'ground', 'knowledge', 'lab', 'leaked', 'like', 'manner', 'manufactured', 'market', 'new', 'origins', 'provided', 'ravaging', 'root', 'theories', 'took', 'vacuum', 'virus', 'wildlife', 'world', 'wuhan']

{'vacuum': 27, 'knowledge': 13, 'origins': 21, 'new': 20, 'coronavirus': 7, 'ravaging': 23, 'world': 30, 'provided': 22, 'fertile': 10, 'ground': 12, 'manner': 17, 'theories': 25, 'fantastic': 9, 'dubious': 8, 'believable': 2, 'bioweapon': 3, 'manufactured': 18, 'chinese': 6, 'army': 1, 'brought': 5, 'virus': 28, 'wuhan': 31, 'leaked': 15, 'like': 16, 'genie': 11, 'bottle': 4, 'lab': 14, 'accident': 0, 'took': 26, 'root': 24, 'wildlife': 29, 'market': 19}


### 해답

In [70]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# The vectorizer is fitted on a collection of documents, so the single news
# excerpt is stored as a one-element list.
corpus = ['''
A vacuum of knowledge about the origins of the new coronavirus ravaging the world has provided fertile ground for all manner of theories -- from the fantastic, to the dubious to the believable.
It was a bioweapon manufactured by the Chinese. The US Army brought the virus to Wuhan. It leaked -- like a genie out of a bottle -- from a lab in an accident. It took root at a wildlife market in Wuhan.
''']

# Build the vocabulary with scikit-learn's built-in English stop-word list.
countvect = CountVectorizer(stop_words='english').fit(corpus)

# Feature names in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so the names are derived from the
# fitted vocabulary (term -> column index) instead.
print(sorted(countvect.vocabulary_, key=countvect.vocabulary_.get))
print()
print(countvect.vocabulary_)

['accident', 'army', 'believable', 'bioweapon', 'bottle', 'brought', 'chinese', 'coronavirus', 'dubious', 'fantastic', 'fertile', 'genie', 'ground', 'knowledge', 'lab', 'leaked', 'like', 'manner', 'manufactured', 'market', 'new', 'origins', 'provided', 'ravaging', 'root', 'theories', 'took', 'vacuum', 'virus', 'wildlife', 'world', 'wuhan']

{'vacuum': 27, 'knowledge': 13, 'origins': 21, 'new': 20, 'coronavirus': 7, 'ravaging': 23, 'world': 30, 'provided': 22, 'fertile': 10, 'ground': 12, 'manner': 17, 'theories': 25, 'fantastic': 9, 'dubious': 8, 'believable': 2, 'bioweapon': 3, 'manufactured': 18, 'chinese': 6, 'army': 1, 'brought': 5, 'virus': 28, 'wuhan': 31, 'leaked': 15, 'like': 16, 'genie': 11, 'bottle': 4, 'lab': 14, 'accident': 0, 'took': 26, 'root': 24, 'wildlife': 29, 'market': 19}


In [71]:
# Encode the training document itself as term counts over the fitted
# vocabulary and show the result as a dense array.
corpus_counts = countvect.transform(corpus)
print(corpus_counts.toarray())

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2]]


In [72]:
# Encode the query sentence from the task statement with the fitted
# vocabulary ('documnet' was a typo for 'document').
print(countvect.transform(['This is the second document']).toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


### TfidfVectorizer를 이용하여 동일 작업 수행

### 내 풀이

In [73]:
#TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample CNN news excerpt used as the input document.
corpus = '''
A vacuum of knowledge about the origins of the new coronavirus ravaging the world has provided fertile ground for all manner of theories -- from the fantastic, to the dubious to the believable.
It was a bioweapon manufactured by the Chinese. The US Army brought the virus to Wuhan. It leaked -- like a genie out of a bottle -- from a lab in an accident. It took root at a wildlife market in Wuhan.
'''
# The vectorizer is fitted on a collection of documents, so the single
# excerpt is wrapped in a one-element list.
li = [corpus]

# Fit once with the built-in English stop-word list. The earlier bare
# TfidfVectorizer() was overwritten immediately and has been dropped.
tfidfv = TfidfVectorizer(stop_words='english').fit(li)

# Feature names in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so the names are derived from the
# fitted vocabulary (term -> column index) instead.
print(sorted(tfidfv.vocabulary_, key=tfidfv.vocabulary_.get))
print()
print(tfidfv.vocabulary_)

['accident', 'army', 'believable', 'bioweapon', 'bottle', 'brought', 'chinese', 'coronavirus', 'dubious', 'fantastic', 'fertile', 'genie', 'ground', 'knowledge', 'lab', 'leaked', 'like', 'manner', 'manufactured', 'market', 'new', 'origins', 'provided', 'ravaging', 'root', 'theories', 'took', 'vacuum', 'virus', 'wildlife', 'world', 'wuhan']

{'vacuum': 27, 'knowledge': 13, 'origins': 21, 'new': 20, 'coronavirus': 7, 'ravaging': 23, 'world': 30, 'provided': 22, 'fertile': 10, 'ground': 12, 'manner': 17, 'theories': 25, 'fantastic': 9, 'dubious': 8, 'believable': 2, 'bioweapon': 3, 'manufactured': 18, 'chinese': 6, 'army': 1, 'brought': 5, 'virus': 28, 'wuhan': 31, 'leaked': 15, 'like': 16, 'genie': 11, 'bottle': 4, 'lab': 14, 'accident': 0, 'took': 26, 'root': 24, 'wildlife': 29, 'market': 19}


### 해답

In [66]:
#TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on a collection of documents, so the single news
# excerpt is stored as a one-element list.
corpus = ['''
A vacuum of knowledge about the origins of the new coronavirus ravaging the world has provided fertile ground for all manner of theories -- from the fantastic, to the dubious to the believable.
It was a bioweapon manufactured by the Chinese. The US Army brought the virus to Wuhan. It leaked -- like a genie out of a bottle -- from a lab in an accident. It took root at a wildlife market in Wuhan.
''']

# Build the vocabulary with scikit-learn's built-in English stop-word list.
tfidfv = TfidfVectorizer(stop_words='english').fit(corpus)

# Feature names in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so the names are derived from the
# fitted vocabulary (term -> column index) instead.
print(sorted(tfidfv.vocabulary_, key=tfidfv.vocabulary_.get))
print()
print(tfidfv.vocabulary_)

['accident', 'army', 'believable', 'bioweapon', 'bottle', 'brought', 'chinese', 'coronavirus', 'dubious', 'fantastic', 'fertile', 'genie', 'ground', 'knowledge', 'lab', 'leaked', 'like', 'manner', 'manufactured', 'market', 'new', 'origins', 'provided', 'ravaging', 'root', 'theories', 'took', 'vacuum', 'virus', 'wildlife', 'world', 'wuhan']

{'vacuum': 27, 'knowledge': 13, 'origins': 21, 'new': 20, 'coronavirus': 7, 'ravaging': 23, 'world': 30, 'provided': 22, 'fertile': 10, 'ground': 12, 'manner': 17, 'theories': 25, 'fantastic': 9, 'dubious': 8, 'believable': 2, 'bioweapon': 3, 'manufactured': 18, 'chinese': 6, 'army': 1, 'brought': 5, 'virus': 28, 'wuhan': 31, 'leaked': 15, 'like': 16, 'genie': 11, 'bottle': 4, 'lab': 14, 'accident': 0, 'took': 26, 'root': 24, 'wildlife': 29, 'market': 19}


In [68]:
# Encode the training document itself as TF-IDF weights over the fitted
# vocabulary and show the result as a dense array.
corpus_weights = tfidfv.transform(corpus)
print(corpus_weights.toarray())

[[0.16903085 0.16903085 0.16903085 0.16903085 0.16903085 0.16903085
  0.16903085 0.16903085 0.16903085 0.16903085 0.16903085 0.16903085
  0.16903085 0.16903085 0.16903085 0.16903085 0.16903085 0.16903085
  0.16903085 0.16903085 0.16903085 0.16903085 0.16903085 0.16903085
  0.16903085 0.16903085 0.16903085 0.16903085 0.16903085 0.16903085
  0.16903085 0.3380617 ]]


In [67]:
# Encode the query sentence from the task statement with the fitted
# vocabulary ('documnet' was a typo for 'document').
print(tfidfv.transform(['This is the second document']).toarray())

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]]
