## Math with Words (TF-IDF Vectors)

In [1]:
from nltk.tokenize import TreebankWordTokenizer

# Build the tokenizer once; the same instance is used again for the kite text.
tokenizer = TreebankWordTokenizer()
sentence = "Harry ran faster to the store, the faster he ran, the faster he reached."

# Punctuation comes back as separate tokens (',' and '.').
tokens = tokenizer.tokenize(sentence)
tokens

['Harry',
 'ran',
 'faster',
 'to',
 'the',
 'store',
 ',',
 'the',
 'faster',
 'he',
 'ran',
 ',',
 'the',
 'faster',
 'he',
 'reached',
 '.']

In [2]:
from collections import Counter
# Bag of words: map every distinct token (punctuation included) to the
# number of times it occurs in the tokenized sentence.
bag_of_words = Counter(tokens)
bag_of_words

Counter({'Harry': 1,
         'ran': 2,
         'faster': 3,
         'to': 1,
         'the': 3,
         'store': 1,
         ',': 2,
         'he': 2,
         'reached': 1,
         '.': 1})

In [3]:
# Four most frequent tokens, highest count first.
bag_of_words.most_common(n=4)

[('faster', 3), ('the', 3), ('ran', 2), (',', 2)]

In [4]:
# Normalized term frequency of 'faster': occurrences of the term divided by
# the document length in tokens. Dividing by the number of *unique* tokens
# (the vocabulary size) overstates the frequency and disagrees with the
# document-vector cell, which normalizes by len(tokens).
times_faster_appears = bag_of_words['faster']
num_unique_words = len(bag_of_words)  # vocabulary size; no longer the denominator
# Summing the counts gives the token total without relying on `tokens`,
# which is reassigned by later cells.
num_tokens = sum(bag_of_words.values())
tf = times_faster_appears / num_tokens
round(tf, 4)


0.1765

In [5]:
from nlpia.data.loaders import kite_text

# Lower-case the text before tokenizing so that counts ignore capitalization.
kite_text_lower = kite_text.lower()
tokens = tokenizer.tokenize(kite_text_lower)

# Occurrences of each token in the kite text.
token_counts = Counter(tokens)
token_counts

Counter({'a': 20,
         'kite': 16,
         'is': 7,
         'traditionally': 1,
         'tethered': 2,
         'heavier-than-air': 1,
         'craft': 2,
         'with': 2,
         'wing': 5,
         'surfaces': 1,
         'that': 2,
         'react': 1,
         'against': 1,
         'the': 26,
         'air': 2,
         'to': 5,
         'create': 1,
         'lift': 4,
         'and': 10,
         'drag.': 1,
         'consists': 2,
         'of': 10,
         'wings': 1,
         ',': 15,
         'tethers': 2,
         'anchors.': 2,
         'kites': 8,
         'often': 2,
         'have': 4,
         'bridle': 2,
         'guide': 1,
         'face': 1,
         'at': 3,
         'correct': 1,
         'angle': 1,
         'so': 3,
         'wind': 2,
         'can': 3,
         'it.': 1,
         "'s": 2,
         'also': 3,
         'may': 4,
         'be': 5,
         'designed': 2,
         'not': 1,
         'needed': 1,
         ';': 2,
         'when': 2,


In [6]:
# Twenty most frequent tokens in the kite text, highest count first.
token_counts.most_common(n=20)

[('the', 26),
 ('a', 20),
 ('kite', 16),
 (',', 15),
 ('and', 10),
 ('of', 10),
 ('kites', 8),
 ('is', 7),
 ('in', 7),
 ('or', 6),
 ('wing', 5),
 ('to', 5),
 ('be', 5),
 ('as', 5),
 ('lift', 4),
 ('have', 4),
 ('may', 4),
 ('at', 3),
 ('so', 3),
 ('can', 3)]

In [11]:
import nltk
# English stop-word list from NLTK; kept as a list under its original name.
stopwords =  nltk.corpus.stopwords.words('english')
# Test membership against a set: checking each token against the list would
# rescan the whole stop-word list for every token.
stopword_set = set(stopwords)
# `tokens` is deliberately overwritten: the vectorizing cell uses len(tokens)
# as the document length after stop-word removal.
tokens = [token for token in tokens if token not in stopword_set]
kite_counts = Counter(tokens)
kite_counts

Counter({'kite': 16,
         'traditionally': 1,
         'tethered': 2,
         'heavier-than-air': 1,
         'craft': 2,
         'wing': 5,
         'surfaces': 1,
         'react': 1,
         'air': 2,
         'create': 1,
         'lift': 4,
         'drag.': 1,
         'consists': 2,
         'wings': 1,
         ',': 15,
         'tethers': 2,
         'anchors.': 2,
         'kites': 8,
         'often': 2,
         'bridle': 2,
         'guide': 1,
         'face': 1,
         'correct': 1,
         'angle': 1,
         'wind': 2,
         'it.': 1,
         "'s": 2,
         'also': 3,
         'may': 4,
         'designed': 2,
         'needed': 1,
         ';': 2,
         'kiting': 3,
         'sailplane': 1,
         'launch': 1,
         'tether': 1,
         'meets': 1,
         'single': 1,
         'point.': 1,
         'fixed': 1,
         'moving': 2,
         'untraditionally': 1,
         'technical': 2,
         'tether-set-coupled': 1,
         'sets': 1,

In [13]:
# vectorizing
# Term-frequency vector for the kite text: each term's count divided by the
# number of tokens remaining in `tokens`, ordered from most to least common.
# NOTE(review): `doc_lenght` is misspelled; the name is kept unchanged in case
# later cells reference it.
doc_lenght = len(tokens)
document_vector = [count / doc_lenght for _, count in kite_counts.most_common()]
document_vector

[0.07207207207207207,
 0.06756756756756757,
 0.036036036036036036,
 0.02252252252252252,
 0.018018018018018018,
 0.018018018018018018,
 0.013513513513513514,
 0.013513513513513514,
 0.013513513513513514,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.