<a href="https://colab.research.google.com/github/hoon2hooni/code_practice/blob/master/nlpinaction_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from nltk.tokenize import TreebankWordTokenizer
sentence = """The faster Harry got to the store, the faster Harry, the faster, would get home."""
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(sentence.lower())

from collections import Counter
bag_of_words = Counter(tokens)
bag_of_words

Counter({',': 3,
         '.': 1,
         'faster': 3,
         'get': 1,
         'got': 1,
         'harry': 2,
         'home': 1,
         'store': 1,
         'the': 4,
         'to': 1,
         'would': 1})

In [4]:
bag_of_words.most_common(4)

[('the', 4), ('faster', 3), (',', 3), ('harry', 2)]

In [8]:
import nltk
nltk.download('stopwords', quiet = True)
stopwords = nltk.corpus.stopwords.words('english')

In [9]:
tokens = [x for x in tokens if x not in stopwords]

In [11]:
kite_counts = Counter(tokens)

In [12]:
kite_counts

Counter({',': 3,
         '.': 1,
         'faster': 3,
         'get': 1,
         'got': 1,
         'harry': 2,
         'home': 1,
         'store': 1,
         'would': 1})

In [13]:
document_vector = []
doc_length = len(tokens)
for key, value in kite_counts.most_common():
    document_vector.append(value/doc_length)
    

In [14]:
document_vector

[0.21428571428571427,
 0.21428571428571427,
 0.14285714285714285,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142,
 0.07142857142857142]

In [20]:
docs = ["The faster Harry got to the store, the faster and faster \
Harry would get home."]
docs.append("Harry is hairy and faster than Jill")
docs.append("Jill is not as hairy as Harry.")

In [21]:
doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]

In [22]:
doc_tokens

[[',',
  '.',
  'and',
  'faster',
  'faster',
  'faster',
  'get',
  'got',
  'harry',
  'harry',
  'home',
  'store',
  'the',
  'the',
  'the',
  'to',
  'would'],
 ['and', 'faster', 'hairy', 'harry', 'is', 'jill', 'than'],
 ['.', 'as', 'as', 'hairy', 'harry', 'is', 'jill', 'not']]

In [25]:
len(doc_tokens[0])
all_doc_tokens = sum(doc_tokens, [])#배열 다 합치는방법 메모

In [27]:
len(all_doc_tokens)


32

In [28]:
lexicon = sorted(set(all_doc_tokens))

In [29]:
len(lexicon)

18

In [30]:
lexicon

[',',
 '.',
 'and',
 'as',
 'faster',
 'get',
 'got',
 'hairy',
 'harry',
 'home',
 'is',
 'jill',
 'not',
 'store',
 'than',
 'the',
 'to',
 'would']

In [32]:
from collections import OrderedDict
zero_vector = OrderedDict((token,0) for token in lexicon)

In [33]:
zero_vector

OrderedDict([(',', 0),
             ('.', 0),
             ('and', 0),
             ('as', 0),
             ('faster', 0),
             ('get', 0),
             ('got', 0),
             ('hairy', 0),
             ('harry', 0),
             ('home', 0),
             ('is', 0),
             ('jill', 0),
             ('not', 0),
             ('store', 0),
             ('than', 0),
             ('the', 0),
             ('to', 0),
             ('would', 0)])

In [36]:
import copy
doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector) #메모리 복사가 아닌 값만 복사
    tokens = tokenizer.tokenize(doc.lower()) #nltk tokenizer활용
    token_counts = Counter(tokens) #token화 된거 Counter로 각 토큰당 빈도수 형식으로 만들어줌
    for key, value in token_counts.items(): #key 단어 value 값
        vec[key] =value/len(lexicon) #vec dict에 단어를 value/총 단어
    doc_vectors.append(vec)

list

In [48]:
#코사인 유사도 구하는 함수

import math

def cosine_sim(vec1, vec2):

    vec1 = [val for val in vec1.values()]
    vec2 = [val for val in vec2.values()]
    dot_prod = 0
    for i, v_1 in enumerate(vec1):
        dot_prod += v_1 * vec2[i]
    mag_1 = math.sqrt(sum([x**2 for x in vec1]))
    mag_2 = math.sqrt(sum([x**2 for x in vec2]))

    return dot_prod/(mag_1 * mag_2) #-1 에서 1 범위

In [49]:
#tf idf
document_tfidf_vectors =[]
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)

    for key, value in token_counts.items():
        docs_containing_key = 0 
        for _doc in docs:
            if key in _doc:
                docs_containing_key += 1
        tf = value / len(lexicon)
        if docs_containing_key:
            idf = len(docs) / docs_containing_key
        else:
            idf = 0 
        vec[key] = tf * idf
    document_tfidf_vectors.append(vec)

query = "How long does it take to get to the store?"
query_vec = copy.copy(zero_vector)
tokens = tokenizer.tokenize(query.lower())
token_counts = Counter(tokens)

for key, value in token_counts.items():
    docs_containing_key = 0 
    for _doc in docs:
        if key in _doc.lower():
            docs_containing_key +=1
    if docs_containing_key == 0:
        continue
    tf = value/len(tokens)
    idf = len(docs) / docs_containing_key
    query_vec[key] = tf * idf

cosine_sim(query_vec, document_tfidf_vectors[0])



0.6110100926607784

[OrderedDict([(',', 0.16666666666666666),
              ('.', 0.08333333333333333),
              ('and', 0.08333333333333333),
              ('as', 0),
              ('faster', 0.25),
              ('get', 0.16666666666666666),
              ('got', 0.16666666666666666),
              ('hairy', 0),
              ('harry', 0.0),
              ('home', 0.16666666666666666),
              ('is', 0),
              ('jill', 0),
              ('not', 0),
              ('store', 0.16666666666666666),
              ('than', 0),
              ('the', 0.5),
              ('to', 0.16666666666666666),
              ('would', 0.16666666666666666)]),
 OrderedDict([(',', 0),
              ('.', 0),
              ('and', 0.08333333333333333),
              ('as', 0),
              ('faster', 0.08333333333333333),
              ('get', 0),
              ('got', 0),
              ('hairy', 0.08333333333333333),
              ('harry', 0.0),
              ('home', 0),
              ('is', 0.083333333333

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = docs
vectorizer = TfidfVectorizer(min_df = 1)
model = vectorizer.fit_transform(corpus)

In [53]:
print(model.todense().round(2))

[[0.16 0.   0.48 0.21 0.21 0.   0.25 0.21 0.   0.   0.   0.21 0.   0.64
  0.21 0.21]
 [0.37 0.   0.37 0.   0.   0.37 0.29 0.   0.37 0.37 0.   0.   0.49 0.
  0.   0.  ]
 [0.   0.75 0.   0.   0.   0.29 0.22 0.   0.29 0.29 0.38 0.   0.   0.
  0.   0.  ]]
