In [1]:
from konlpy.tag import Okt

In [2]:
# Notebook-wide Okt instance; build_bagofwords uses its morphs() method to tokenize.
okt = Okt()
def build_bagofwords(document, tokenizer=None):
    """Build a vocabulary and a bag-of-words count vector for one document.

    Periods are removed from ``document`` before it is tokenized. Each distinct
    token receives an integer index in order of first appearance, and the
    returned count list is aligned with those indices.

    Args:
        document: The text to process.
        tokenizer: Optional callable taking the cleaned string and returning an
            iterable of tokens. When omitted, the notebook-level ``okt.morphs``
            is used, which keeps existing one-argument calls unchanged.

    Returns:
        A ``(word_to_index, bow)`` tuple where ``word_to_index`` maps each token
        to its index and ``bow[i]`` is the number of occurrences of the token
        whose index is ``i``.
    """
    if tokenizer is None:
        # Resolved at call time so the function can be defined (and tested with
        # an explicit tokenizer) without the global Okt instance.
        tokenizer = okt.morphs

    document = document.replace('.', '')
    word_to_index = {}
    bow = []
    for word in tokenizer(document):
        if word not in word_to_index:
            # A new token always takes the next free index, which is also the
            # current end of ``bow`` -- so a plain append keeps them aligned.
            word_to_index[word] = len(word_to_index)
            bow.append(1)
        else:
            bow[word_to_index[word]] += 1
    return word_to_index, bow

## 함수 설명
- for문 전에 문서에서 마침표(.)를 제거한 뒤 형태소 단위로 토큰화를 함
- word_to_index, bow를 각각 빈 딕셔너리, 리스트 형태로 선언
- word가 word_to_index에 없는 경우(처음 나온 단어인 경우)
  - word_to_index에 해당 단어를 키로, 현재 word_to_index의 길이(= 새로 부여되는 인덱스)를 값으로 저장함
  - bow에 기본값 1을 넣음
- 처음 나온 단어가 아니라면
  - bow에서 재등장한 단어의 인덱스 위치에 있는 빈도수에 1을 더함

In [3]:
# First sample sentence; build_bagofwords strips the trailing period before tokenizing.
doc1 = "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."
vocab, bow = build_bagofwords(doc1)
# Show the token -> index mapping, then the per-index occurrence counts.
print('vocabulary : ', vocab)
print('bag of words : ', bow)

vocabulary :  {'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}
bag of words :  [1, 2, 1, 1, 2, 1, 1, 1, 1, 1]


In [4]:
# Second sample sentence, processed on its own: each call starts from an empty
# vocabulary, so vocab2's indices begin at 0 again.
doc2 = '소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.'
vocab2, bow2 = build_bagofwords(doc2)
print('vocabulary : ', vocab2)
print('bag of words : ', bow2)


vocabulary :  {'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}
bag of words :  [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]


In [5]:
# Join both sentences with a space so a single vocabulary and a single count
# vector cover the two documents together.
doc3 = doc1 + ' ' + doc2
vocab3, bow3 = build_bagofwords(doc3)
print('vocabulary : ', vocab3)
print('bag of words : ', bow3)

vocabulary :  {'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9, '는': 10, '주로': 11, '소비': 12, '상품': 13, '을': 14, '기준': 15, '으로': 16, '느낀다': 17}
bag of words :  [1, 2, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]


# CountVectorizer 클래스로 BoW 만들기

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# One-document corpus. The recorded vocabulary has no entry for the one-letter
# word "I": CountVectorizer's default token pattern only keeps tokens of two or
# more characters.
corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()

# Fit first and keep the sparse result, so the fitting step that populates
# vector.vocabulary_ is not hidden inside a print() argument.
bow_matrix = vector.fit_transform(corpus)

print('bag of words vector : ', bow_matrix.toarray())
print('vocabulary : ', vector.vocabulary_)

bag of words vector :  [[1 1 2 1 2 1]]
vocabulary :  {'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


# 불용어를 제거한 BoW 만들기

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

## 사용자가 직접 정의한 불용어

In [8]:
# Stop words supplied by hand: only the words in this list are dropped from the vocabulary.
text = ["Family is not an important thing. It's everything."]
custom_stop_words = ["the", "a", "an", "is", "not"]
vector = CountVectorizer(stop_words=custom_stop_words)

# Fit before printing so vector.vocabulary_ is populated explicitly.
counts = vector.fit_transform(text)

print('bag of words vector : ', counts.toarray())
print('vocabulary : ', vector.vocabulary_)

bag of words vector :  [[1 1 1 1 1]]
vocabulary :  {'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [9]:
# Same sentence, this time filtered with scikit-learn's built-in "english" stop-word list.
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words="english")

# Fit before printing so vect.vocabulary_ is populated explicitly.
counts = vect.fit_transform(text)

print('bag of words vector :', counts.toarray())
print('vocabulary :', vect.vocabulary_)

bag of words vector : [[1 1 1]]
vocabulary : {'family': 0, 'important': 1, 'thing': 2}


In [10]:
import nltk

# Same sentence, filtered with NLTK's English stop-word list.
text = ["Family is not an important thing. It's everything."]

# NLTK distributes its corpora separately from the package; on a fresh
# environment stopwords.words() raises LookupError until the "stopwords"
# corpus has been downloaded. Fetch it only when it is actually missing so the
# notebook survives Restart & Run All without re-downloading every time.
try:
    stop_words = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stop_words = stopwords.words("english")

vect = CountVectorizer(stop_words=stop_words)
print('bag of words vector :', vect.fit_transform(text).toarray())
print('vocabulary :', vect.vocabulary_)

bag of words vector : [[1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 3, 'everything': 0}
