In [1]:
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool"
]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(bards_words)

CountVectorizer()

In [4]:
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content:\n {}".format(vect.vocabulary_))

Vocabulary size: 13
Vocabulary content:
 {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


In [5]:
bag_of_words = vect.transform(bards_words)
print("bag_of_words: {}".format(repr(bag_of_words)))

bag_of_words: <2x13 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>


In [6]:
print("Dense representation of bag_of_words: \n{}".format(
        bag_of_words.toarray()
))

Dense representation of bag_of_words: 
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


### ストップワード
**あまりに頻出するため**役に立たない単語を捨てる方法がある

- 言語固有のストップワードを作る
  - scikit-learn はこの
- 頻度の高い単語を捨てる

In [8]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
print("Every 10th stopword: \n{}".format(list(ENGLISH_STOP_WORDS)[::10]))

Number of stop words: 318
Every 10th stopword: 
['already', 'fire', 'hence', 'last', 'also', 'de', 'see', 'first', 'me', 'their', 'one', 'you', 'not', 'elsewhere', 'bill', 'under', 'whereafter', 'else', 'sometime', 'there', 'give', 'can', 'until', 'whole', 'describe', 'three', 'becomes', 'seems', 're', 'thin', 'eg', 'before']


### tf-idf
[wiki](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)

### n-グラム
BoW 表現の問題は単語の順番が完全に失われること。

そこで、コンテクストを捉える手法の一つとして、2つもしくは3つの連続するトークンの列を考えることがある。

In [9]:
cv = CountVectorizer(ngram_range=(2, 2)).fit(bards_words)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary: \n{}".format(cv.get_feature_names()))

Vocabulary size: 14
Vocabulary: 
['be fool', 'but the', 'doth think', 'fool doth', 'he is', 'himself to', 'is wise', 'knows himself', 'man knows', 'the fool', 'the wise', 'think he', 'to be', 'wise man']


In [10]:
cv = CountVectorizer(ngram_range=(1, 3)).fit(bards_words)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary: \n{}".format(cv.get_feature_names()))

Vocabulary size: 39
Vocabulary: 
['be', 'be fool', 'but', 'but the', 'but the wise', 'doth', 'doth think', 'doth think he', 'fool', 'fool doth', 'fool doth think', 'he', 'he is', 'he is wise', 'himself', 'himself to', 'himself to be', 'is', 'is wise', 'knows', 'knows himself', 'knows himself to', 'man', 'man knows', 'man knows himself', 'the', 'the fool', 'the fool doth', 'the wise', 'the wise man', 'think', 'think he', 'think he is', 'to', 'to be', 'to be fool', 'wise', 'wise man', 'wise man knows']


In [None]:
print("hi")
