In [1]:
from konlpy.tag import Okt
# Single shared Okt tagger instance; build_bag_of_words() reads it as a global.
okt = Okt()

def build_bag_of_words(document):
    """Build a vocabulary and a bag-of-words count vector for one document.

    Periods are removed from the text, the remainder is split into tokens by
    the module-level ``okt`` tagger's ``morphs`` method, and every distinct
    token is given an integer index in order of first appearance.

    Args:
        document: The text to tokenize.

    Returns:
        A ``(word_to_index, bow)`` tuple. ``word_to_index`` maps each token
        to its index, and ``bow[i]`` is the number of occurrences of the
        token whose index is ``i``.
    """
    morphemes = okt.morphs(document.replace('.', ''))

    word_to_index = {}
    bow = []
    for morpheme in morphemes:
        position = word_to_index.get(morpheme)
        if position is None:
            # First sighting: the next free index equals the current size,
            # which is also where the new count lands in ``bow``.
            word_to_index[morpheme] = len(word_to_index)
            bow.append(1)
        else:
            bow[position] += 1
    return word_to_index, bow

In [2]:
# First sample sentence; build its vocabulary and bag-of-words counts.
doc1 = "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."
vocab, bow = build_bag_of_words(doc1)

In [3]:
# Token-to-index mapping returned for doc1.
vocab

{'정부': 0,
 '가': 1,
 '발표': 2,
 '하는': 3,
 '물가상승률': 4,
 '과': 5,
 '소비자': 6,
 '느끼는': 7,
 '은': 8,
 '다르다': 9}

In [4]:
# Occurrence counts for doc1, positioned by vocabulary index.
bow

[1, 2, 1, 1, 2, 1, 1, 1, 1, 1]

In [5]:
# Second sample sentence. Note that this rebinds vocab and bow, replacing
# the values computed for doc1.
doc2 = '소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.'
vocab, bow = build_bag_of_words(doc2)

In [6]:
# Token-to-index mapping returned for doc2.
vocab

{'소비자': 0,
 '는': 1,
 '주로': 2,
 '소비': 3,
 '하는': 4,
 '상품': 5,
 '을': 6,
 '기준': 7,
 '으로': 8,
 '물가상승률': 9,
 '느낀다': 10}

In [7]:
# Occurrence counts for doc2, positioned by vocabulary index.
bow

[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]

In [8]:
# Join both sentences into one document and rebuild vocab/bow over the
# combined text (again rebinding vocab and bow).
doc3 = doc1 + ' ' + doc2
vocab, bow = build_bag_of_words(doc3)

In [9]:
# The combined text of doc1 and doc2.
doc3

'정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다. 소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.'

In [10]:
# Token-to-index mapping returned for the combined document.
vocab

{'정부': 0,
 '가': 1,
 '발표': 2,
 '하는': 3,
 '물가상승률': 4,
 '과': 5,
 '소비자': 6,
 '느끼는': 7,
 '은': 8,
 '다르다': 9,
 '는': 10,
 '주로': 11,
 '소비': 12,
 '상품': 13,
 '을': 14,
 '기준': 15,
 '으로': 16,
 '느낀다': 17}

In [11]:
# Occurrence counts for the combined document, positioned by vocabulary index.
bow

[1, 2, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# One-document English corpus and a CountVectorizer with default settings.
corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()

In [14]:
# Fit the vectorizer on corpus and show the resulting count matrix as a
# dense array.
vector.fit_transform(corpus).toarray()

array([[1, 1, 2, 1, 2, 1]], dtype=int64)

In [15]:
# Mapping from each learned token to its column index in the count matrix.
vector.vocabulary_

{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}

In [16]:
from nltk.corpus import stopwords

In [17]:
# Count tokens while dropping a hand-picked list of stop words.
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words=["the", "a", "an", "is", "not"])

In [18]:
# Fit on text with the custom stop-word list and show the dense counts.
vect.fit_transform(text).toarray()

array([[1, 1, 1, 1, 1]], dtype=int64)

In [19]:
# Tokens that survived the custom stop-word list, with their column indices.
vect.vocabulary_

{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}

In [20]:
# Same sentence, this time passing the string "english" as stop_words to
# select scikit-learn's built-in English stop-word list.
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words="english")

In [21]:
# Fit on text with stop_words="english" and show the dense counts.
vect.fit_transform(text).toarray()

array([[1, 1, 1]], dtype=int64)

In [22]:
# Tokens that survived stop_words="english", with their column indices.
vect.vocabulary_

{'family': 0, 'important': 1, 'thing': 2}

In [23]:
# Same sentence, this time filtering with NLTK's English stop-word list.
text = ["Family is not an important thing. It's everything."]
stop_words = stopwords.words("english")
vect = CountVectorizer(stop_words=stop_words)

In [24]:
# Fit on text with the NLTK stop-word list and show the dense counts.
vect.fit_transform(text).toarray()

array([[1, 1, 1, 1]], dtype=int64)

In [25]:
# Tokens that survived the NLTK stop-word list, with their column indices.
vect.vocabulary_

{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}

In [26]:
import pandas as pd
from math import log

In [27]:
# Toy corpus of four whitespace-separated Korean documents; idf() reads this
# list as a global.
docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요'
]


In [28]:
# Collect every distinct whitespace-separated token across all documents.
vocab = list({token for text in docs for token in text.split()})

In [29]:
# Vocabulary as it came out of the set, before sorting.
vocab

['저는', '길고', '먹고', '노란', '과일이', '사과', '좋아요', '바나나', '싶은']

In [30]:
# Sort in place so the term order (and hence the table columns built from
# vocab) no longer depends on set iteration order.
vocab.sort()

In [31]:
# Vocabulary after sorting.
vocab

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [32]:
# Total number of documents; idf() reads this as a global.
N = len(docs)

In [33]:
# Number of documents in the corpus.
N

4

In [34]:
def tf(t, d):
    """Return how many times term ``t`` occurs as a whole token in ``d``.

    Args:
        t: The term to count.
        d: The document. A string is split on whitespace first, so ``t`` is
            matched against whole tokens rather than substrings; any other
            sequence is treated as already tokenized.

    Returns:
        The number of tokens in ``d`` that are equal to ``t``.
    """
    # str.count() counts substring hits (e.g. '사과' inside '사과나무'), which
    # over-counts; compare against whole tokens instead.
    tokens = d.split() if isinstance(d, str) else d
    return tokens.count(t)

def idf(t):
    """Return the smoothed inverse document frequency of term ``t``.

    Reads the module-level ``docs`` (the corpus) and ``N`` (its size). The
    document frequency ``df`` is the number of documents that contain ``t``
    as a whole whitespace-separated token, and the result is
    ``log(N / (df + 1))`` using the natural logarithm. The ``+ 1`` keeps the
    denominator non-zero for terms that occur in no document.

    Args:
        t: The term to score.

    Returns:
        The IDF value as a float.
    """
    df = 0
    for doc in docs:
        # Membership on the raw string would also match substrings
        # (e.g. '사과' inside '사과나무'), inflating the document frequency.
        tokens = doc.split() if isinstance(doc, str) else doc
        if t in tokens:
            df += 1
    return log(N / (df + 1))

def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    Args:
        t: The term to score.
        d: The document, passed through to ``tf``.

    Returns:
        The product ``tf(t, d) * idf(t)``.
    """
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [35]:
# Build the term-frequency table: one row per document, one column per
# vocabulary term, each entry computed by tf().
result = []
for i in range(N):
    d = docs[i]
    row = []
    for j, t in enumerate(vocab):
        row.append(tf(t, d))
    result.append(row)
tf_ = pd.DataFrame(result, columns=vocab)

In [37]:
# Raw nested-list term counts, one inner list per document.
result

[[0, 0, 0, 1, 0, 1, 1, 0, 0],
 [0, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 1, 0, 2, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 1, 1]]

In [38]:
# Scratch list used by the next cells to check what append([]) produces.
a = []

In [39]:
# Appending [] adds one empty inner list, the same step the table loops use
# to start a new row.
a.append([])

In [40]:
# Show the scratch list after the append.
a

[[]]

In [36]:
# Term-frequency table as a DataFrame with vocabulary terms as columns.
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [41]:
# Score every vocabulary term with idf() and tabulate the values, indexed
# by term. Note that this rebinds result.
result = []
for j, t in enumerate(vocab):
    result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=['IDF'])
idf_

False
False
False
True
False
False
True
False
False
False
True
False
True
True
False
False
False
True
True
False
True
False
False
False
True
True
False
False
False
False
False
True
False
False
False
True


Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [42]:
# Leftover loop variable: the document most recently bound to d by a table loop.
d

'저는 과일이 좋아요'

In [43]:
# Leftover loop variable: the vocabulary term most recently bound to t by a loop.
t

'좋아요'

In [45]:
# Build the TF-IDF table: one row per document, one column per vocabulary
# term, each entry computed by tfidf(). Note that this rebinds result.
result = []
for i in range(N):
    d = docs[i]
    row = []
    for j, t in enumerate(vocab):
        row.append(tfidf(t, d))
    result.append(row)
tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

False
False
False
True
False
False
True
False
False
False
True
False
True
True
False
False
False
True
True
False
True
False
False
False
True
True
False
False
False
False
False
True
False
False
False
True
False
False
False
True
False
False
True
False
False
False
True
False
True
True
False
False
False
True
True
False
True
False
False
False
True
True
False
False
False
False
False
True
False
False
False
True
False
False
False
True
False
False
True
False
False
False
True
False
True
True
False
False
False
True
True
False
True
False
False
False
True
True
False
False
False
False
False
True
False
False
False
True
False
False
False
True
False
False
True
False
False
False
True
False
True
True
False
False
False
True
True
False
True
False
False
False
True
True
False
False
False
False
False
True
False
False
False
True


Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [46]:
# Three-sentence English corpus and a fresh CountVectorizer with default
# settings (rebinding corpus and vector from the earlier example).
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

vector = CountVectorizer()


In [47]:
# Fit on the three-sentence corpus and show the count matrix as a dense
# array, one row per document.
vector.fit_transform(corpus).toarray()

array([[0, 1, 0, 1, 0, 1, 0, 1, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [48]:
# Mapping from each learned token to its column index in the count matrix.
vector.vocabulary_

{'you': 7,
 'know': 1,
 'want': 5,
 'your': 8,
 'love': 3,
 'like': 2,
 'what': 6,
 'should': 4,
 'do': 0}

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
# Three-sentence English corpus used for the TfidfVectorizer example.
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]


In [51]:
# Fit a TfidfVectorizer with default settings on the corpus.
tfidfv = TfidfVectorizer().fit(corpus)

In [52]:
# Transform the corpus with the fitted vectorizer and show the weights as a
# dense array, one row per document.
tfidfv.transform(corpus).toarray()

array([[0.        , 0.46735098, 0.        , 0.46735098, 0.        ,
        0.46735098, 0.        , 0.35543247, 0.46735098],
       [0.        , 0.        , 0.79596054, 0.        , 0.        ,
        0.        , 0.        , 0.60534851, 0.        ],
       [0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.57735027, 0.        , 0.        ]])

In [53]:
# Mapping from each learned token to its column index in the TF-IDF matrix.
tfidfv.vocabulary_

{'you': 7,
 'know': 1,
 'want': 5,
 'your': 8,
 'love': 3,
 'like': 2,
 'what': 6,
 'should': 4,
 'do': 0}