# Bag of Words

단어들의 순서는 전혀 고려하지 않고, 단어들의 출현 빈도에만 집중하는 텍스트 데이터의 수치화 표현 방법이다.

In [19]:
from konlpy.tag import Okt

# Module-level Okt instance, created once and reused by build_bag_of_words
# (which calls okt.morphs to tokenize its input).
okt = Okt()

def build_bag_of_words(document):
    """Build a vocabulary and a bag-of-words count vector for one document.

    Every "." is stripped from the text before it is tokenized with
    ``okt.morphs``. Each distinct token receives the next free integer index
    in order of first appearance, and the count vector records how many
    times each token occurred. Token order is otherwise ignored.

    Args:
        document: Text to convert.

    Returns:
        A ``(word_to_idx, bow)`` tuple. ``word_to_idx`` maps each token to
        its index, and ``bow[i]`` is the number of occurrences of the token
        whose index is ``i``.
    """
    tokens = okt.morphs(document.replace(".", ""))

    word_to_idx = {}
    bow = []
    for token in tokens:
        # setdefault returns the token's existing index, or registers the
        # token at the next free index (the vocabulary size before insertion).
        position = word_to_idx.setdefault(token, len(word_to_idx))
        if position == len(bow):
            # First sighting: the new index is one past the end of the counts.
            bow.append(1)
        else:
            bow[position] += 1

    return word_to_idx, bow

In [20]:
# Run the helper on a sample Korean sentence.
doc_1 = "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."
vocab, bow = build_bag_of_words(doc_1)

# One value per line: the token-to-index vocabulary, then the count vector.
print(vocab, bow, sep="\n")

{'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}
[1, 2, 1, 1, 2, 1, 1, 1, 1, 1]


## CountVectorizer 사용

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit CountVectorizer on a one-document English corpus and show the resulting
# count matrix together with the learned token-to-column vocabulary.
corpus = ["you know I want your love. because I love you"]
vector = CountVectorizer()
bow_matrix = vector.fit_transform(corpus).toarray()

print(f"BoW vector: {bow_matrix}")
print(f"vocab: {vector.vocabulary_}")

BoW vector: [[1 1 2 1 2 1]]
vocab: {'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


## 불용어 제거를 사용한 전처리

## 사용자 정의 불용어 사용

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Vectorize the sentence while dropping a hand-picked list of stop words.
custom_stop_words = ["the", "a", "an", "is", "not"]

corpus = ["Family is not and important thing. It's everything."]
vect = CountVectorizer(stop_words=custom_stop_words)
bow_matrix = vect.fit_transform(corpus).toarray()

print(f"BoW vector: {bow_matrix}")
print(f"vocabulary: {vect.vocabulary_}")

BoW vector: [[1 1 1 1 1 1]]
vocabulary: {'family': 2, 'and': 0, 'important': 3, 'thing': 5, 'it': 4, 'everything': 1}


## CountVectorizer에서 제공하는 불용어 사용

In [31]:
# Same sentence, this time filtered with the stop-word list that
# CountVectorizer selects when given stop_words="english".
corpus = ["Family is not and important thing. It's everything."]
vect = CountVectorizer(stop_words="english")
bow_matrix = vect.fit_transform(corpus).toarray()

print(f"BoW vector: {bow_matrix}")
print(f"vocabulary: {vect.vocabulary_}")

BoW vector: [[1 1 1]]
vocabulary: {'family': 0, 'important': 1, 'thing': 2}


## nltk에서 제공하는 불용어 사용

In [32]:
# Same sentence, this time filtered with NLTK's English stop-word list.
stop_words = stopwords.words("english")

corpus = ["Family is not and important thing. It's everything."]
vect = CountVectorizer(stop_words=stop_words)
bow_matrix = vect.fit_transform(corpus).toarray()

print(f"BoW vector: {bow_matrix}")
print(f"vocabulary: {vect.vocabulary_}")

BoW vector: [[1 1 1 1]]
vocabulary: {'family': 1, 'important': 2, 'thing': 3, 'everything': 0}
