# Bag of Words

<pre>
This technique is used to convert text data into vectors.

In this technique
1) We take all the text data (all strings or all documents) and build a vocabulary, i.e. the set of all unique words.
   In the given example, the text data is all of r1, r2, r3 and r4.
   
2) Then we compare every unique word in the vocabulary with each string (r1, r2, etc.).
   If the word occurs in the string/document, assign 1 at its position in the vector; otherwise assign 0.
   That's it. That's how we get a vector of 0s and 1s for each string.
   (The makeVector function below stores how many times each word occurs rather than just 0/1.)
 
Note: We write our own code here only to understand the basic working of "bag of words".
      Python libraries implement it differently.
</pre>

In [1]:
# These are 4 strings or documents of food reviews. We will apply bag of words on these.

r1 = "This pasta is very tasty and affordable."
r2 = "This pasta is not tasty and is affordable."
r3 = "This pasta. is delicious, and cheap"
r4 = "pasta is tasty, pasta tastes good."

In [2]:
r3.split() # To make a dictionary we need strings converted into single words. But split() function also considers symbols.

['This', 'pasta.', 'is', 'delicious,', 'and', 'cheap']

In [3]:
import re  # importing regex lib.

print(re.sub(r'[^\w]', ' ', r3))   # from string replace all symbols with space.
print(re.sub(r'[^\w]', '_', r3))   # from string replace all symbols with underscore.

# Now we can replace all symbols with space. We can apply split() that will split the string by space.
print(re.sub(r'[^\w]', ' ', r3).split())  # Now this can be used in making dict.

This pasta  is delicious  and cheap
This_pasta__is_delicious__and_cheap
['This', 'pasta', 'is', 'delicious', 'and', 'cheap']


In [4]:
rev_list = [r1, r2, r3, r4]

# We have to loop through all reviews/strings to build the dictionary (bag of words), so we need them inside a list.
# This rev_list can be seen as an array of all strings/documents.
#
#     [r1
# X =  r2
#      r3
#      r4]

In [5]:
rev_list

['This pasta is very tasty and affordable.',
 'This pasta is not tasty and is affordable.',
 'This pasta. is delicious, and cheap',
 'pasta is tasty, pasta tastes good.']

In [6]:
# Here we are making bag of words(dictionary of unique values).
def vocabulary(r_list):
    """Build the bag-of-words vocabulary for a collection of documents.

    In each document every character that is not a regex word character is
    replaced by a space, the result is split on whitespace, and each token
    becomes a key of the returned dict. Tokens are case-sensitive.

    Args:
        r_list: Iterable of document strings.

    Returns:
        dict mapping every distinct token to 0, ordered by first appearance.
    """
    vocab = {}
    for document in r_list:
        tokens = re.sub(r'[^\w]', ' ', document).split()
        # update() leaves an already-known key at its original position,
        # so the vocabulary keeps first-seen order.
        vocab.update(dict.fromkeys(tokens, 0))
    return vocab

In [7]:
def makeVector(strng, dct):
    """Turn one document into a word-count vector over a vocabulary.

    The document is tokenised by replacing every non-word character with a
    space and splitting on whitespace. For each token that is a key of
    ``dct`` the corresponding count is incremented, starting from the value
    already stored in ``dct`` (0 for a fresh vocabulary).

    Args:
        strng: Document string to vectorise.
        dct: Vocabulary mapping token -> starting count. It is not modified,
            so passing a copy is no longer required (but still works).

    Returns:
        list of counts, one per vocabulary entry, in ``dct``'s key order.
        Tokens that are not in the vocabulary are ignored.
    """
    # Work on a private copy so the caller's vocabulary stays untouched.
    counts = dict(dct)
    for word in re.sub(r'[^\w]', ' ', strng).split():
        # Skip out-of-vocabulary words instead of raising KeyError.
        if word in counts:
            counts[word] += 1
    return list(counts.values())

In [8]:
d = vocabulary(rev_list)

In [31]:
d

{'This': 0,
 'pasta': 0,
 'is': 0,
 'very': 0,
 'tasty': 0,
 'and': 0,
 'affordable': 0,
 'not': 0,
 'delicious': 0,
 'cheap': 0,
 'tastes': 0,
 'good': 0}

In [9]:
makeVector(r2, d.copy())

[1, 1, 2, 0, 1, 1, 1, 1, 0, 0, 0, 0]

In [10]:
makeVector(r4, d.copy())

[0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1]

In [11]:
d

{'This': 0,
 'pasta': 0,
 'is': 0,
 'very': 0,
 'tasty': 0,
 'and': 0,
 'affordable': 0,
 'not': 0,
 'delicious': 0,
 'cheap': 0,
 'tastes': 0,
 'good': 0}

In [12]:
for rev in rev_list:
    print(makeVector(rev, d.copy()))

[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[1, 1, 2, 0, 1, 1, 1, 1, 0, 0, 0, 0]
[1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0]
[0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1]


In [13]:
r1 = "This pasta is very tasty and affordable."
r2 = "This pasta is not tasty and is affordable."
r3 = "This pasta. is delicious, and cheap"
r4 = "pasta is tasty, pasta tastes good."

In [14]:
bv1 = makeVector(r1, d.copy())
bv2 = makeVector(r2, d.copy())
bv3 = makeVector(r3, d.copy())
bv4 = makeVector(r4, d.copy())

In [15]:
print(bv1, bv2, bv3, bv4)

[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] [1, 1, 2, 0, 1, 1, 1, 1, 0, 0, 0, 0] [1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0] [0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1]


# TF - IDF

In [16]:
d1 = "cat dog lion cow cat"
d2 = "cat cat cat dog"
d3 = "dog cat"
d4 = "ox cow rat dog"

In [17]:
Dc = [d1, d2, d3, d4]
for doc in Dc:
    print(doc)

cat dog lion cow cat
cat cat cat dog
dog cat
ox cow rat dog


In [18]:
voc = vocabulary(Dc)
print(voc)

{'cat': 0, 'dog': 0, 'lion': 0, 'cow': 0, 'ox': 0, 'rat': 0}


In [19]:
v1 = makeVector(d1, voc.copy())
v2 = makeVector(d2, voc.copy())
v3 = makeVector(d3, voc.copy())
v4 = makeVector(d4, voc.copy())

In [20]:
bow_vectors = [v1, v2, v3, v4]
for vec in bow_vectors:
    print(vec)

[2, 1, 1, 1, 0, 0]
[3, 1, 0, 0, 0, 0]
[1, 1, 0, 0, 0, 0]
[0, 1, 0, 1, 1, 1]


In [21]:
# BoW to Binary BoW: any non-zero count becomes 1, a zero count stays 0.
bin_vec = []
for vec in bow_vectors:
    bin_vec.append([1 if value != 0 else 0 for value in vec])

bin_vec

[[1, 1, 1, 1, 0, 0],
 [1, 1, 0, 0, 0, 0],
 [1, 1, 0, 0, 0, 0],
 [0, 1, 0, 1, 1, 1]]

In [22]:
import numpy as np

In [23]:
bow_arr = np.array(bin_vec)
bow_arr

array([[1, 1, 1, 1, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1]])

In [24]:
ni = bow_arr.sum(axis=0)
ni

array([3, 4, 1, 2, 1, 1])

In [25]:
len(bow_arr)

4

In [26]:
import math

# Inverse document frequency per vocabulary term: log(N / n_i), where N is the
# number of rows in bow_arr and n_i is that term's entry in ni.
idf = [math.log(len(bow_arr) / val) for val in ni]

idf

[0.28768207245178085,
 0.0,
 1.3862943611198906,
 0.6931471805599453,
 1.3862943611198906,
 1.3862943611198906]

In [27]:
def tf_idf(bow_vectors, idf):
    """Compute TF-IDF vectors from bag-of-words count vectors.

    For every count ``val`` at position ``i`` of a vector the result is
    ``(val / length) * idf[i]``, where ``length`` is the number of entries in
    that vector. Each input vector and its weighted result are printed as a
    diagnostic, as before.

    NOTE(review): dividing by the vector length (the vocabulary size) keeps
    the normalisation the original code used with its constant 6. The usual
    definition of term frequency divides by the number of tokens in the
    document instead; confirm which one is intended.

    Args:
        bow_vectors: Iterable of count vectors, one per document.
        idf: Sequence of inverse-document-frequency weights, indexable by
            vocabulary position; must be at least as long as each vector.

    Returns:
        list of TF-IDF vectors (lists of floats), one per input vector.

    Raises:
        IndexError: If ``idf`` is shorter than one of the vectors.
    """
    tfidf = []
    for b_vec in bow_vectors:
        print(b_vec)
        # Derive the normaliser from the vector rather than hard-coding 6.
        length = len(b_vec)
        temp = [val / length * idf[i] for i, val in enumerate(b_vec)]
        print("temp", temp)
        tfidf.append(temp)
    return tfidf

In [28]:
tf_idf(bow_vectors, idf)

[2, 1, 1, 1, 0, 0]
temp [0.09589402415059362, 0.0, 0.23104906018664842, 0.11552453009332421, 0.0, 0.0]
[3, 1, 0, 0, 0, 0]
temp [0.14384103622589042, 0.0, 0.0, 0.0, 0.0, 0.0]
[1, 1, 0, 0, 0, 0]
temp [0.04794701207529681, 0.0, 0.0, 0.0, 0.0, 0.0]
[0, 1, 0, 1, 1, 1]
temp [0.0, 0.0, 0.0, 0.11552453009332421, 0.23104906018664842, 0.23104906018664842]


In [29]:
len(v1)

6

In [30]:
math.log(4/3)

0.28768207245178085