# Bag-of-Words / k-Grams Models in Python

#### Example from page 66 of the book by Jeff M. Phillips

In [17]:
# The four example documents; `corpus` lists them in order D1..D4.
D1, D2 = "I am Sam", "Sam I am"
D3, D4 = "I do not like jelly and ham", "I do not do not like them Sam I am"
corpus = [D1, D2, D3, D4]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the bag-of-words model: fit the vectorizer on the corpus and
# transform the documents into per-document term counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# Show the learned vocabulary. In the output recorded for this cell the
# terms are lowercased and the single-letter word "I" is absent.
vectorizer.get_feature_names_out()

array(['am', 'and', 'do', 'ham', 'jelly', 'like', 'not', 'sam', 'them'],
      dtype=object)

In [6]:
# Convert the vectorizer output to a dense array and print it; the printed
# matrix has one row per document and one column per vocabulary term.
V = X.toarray()
print(V)

[[1 0 0 0 0 0 0 1 0]
 [1 0 0 0 0 0 0 1 0]
 [0 1 1 1 1 1 1 0 0]
 [1 0 2 0 0 1 2 1 1]]


In [20]:
import numpy as np
from numpy import linalg as LA
import scipy as sp
from scipy import stats

# Count vectors of documents D1..D4 (rows 0..3 of V).
v1, v2, v3, v4 = V[0], V[1], V[2], V[3]

In [21]:
# Euclidean distance for the six unordered document pairs, in the order
# (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
pairs = [(v1, v2), (v1, v3), (v1, v4), (v2, v3), (v2, v4), (v3, v4)]
print([LA.norm(a - b) for a, b in pairs])

[0.0, 2.8284271247461903, 3.1622776601683795, 2.8284271247461903, 3.1622776601683795, 2.8284271247461903]


In [22]:
# Scale every count vector to unit Euclidean length.
v1n, v2n, v3n, v4n = (v / LA.norm(v) for v in (v1, v2, v3, v4))
# Cosine distance = 1 - dot product of the unit vectors, for the pairs
# (1,2), (1,3), (1,4), (2,3), (2,4), (3,4) in that order.
unit_pairs = [(v1n, v2n), (v1n, v3n), (v1n, v4n),
              (v2n, v3n), (v2n, v4n), (v3n, v4n)]
print([1 - a.dot(b) for a, b in unit_pairs])

[2.220446049250313e-16, 1.0, 0.591751709536137, 1.0, 0.591751709536137, 0.41074434901121026]


In [26]:
# KL divergence cannot handle zero terms, so add a small regularizer to
# every count before comparing the vectors.
reg = 0.01
v1r, v2r, v3r, v4r = (v + reg for v in (v1, v2, v3, v4))

# KL-divergence of every vector against every vector, one printed row per
# first argument (the entropy function L1-normalizes vectors internally).
regularized = (v1r, v2r, v3r, v4r)
for pk in regularized:
    print([stats.entropy(pk, qk) for qk in regularized])

[0.0, 0.0, 5.397539620055548, 1.2585515031329115]
[0.0, 0.0, 5.397539620055548, 1.2585515031329115]
[3.507745528051499, 3.507745528051499, 0.0, 2.329174258587312]
[2.4341532798280534, 2.4341532798280534, 1.769401650184088, 0.0]


#  k-Grams

##### Example from page 70 of the book by Jeff M. Phillips

In [19]:
from nltk import ngrams
n = 2
# For each document, the set of distinct n-grams over its
# whitespace-separated words (n = 2 here).
G = [set(ngrams(doc.split(), n)) for doc in corpus]

print(G)

[{('am', 'Sam'), ('I', 'am')}, {('Sam', 'I'), ('I', 'am')}, {('jelly', 'and'), ('not', 'like'), ('and', 'ham'), ('I', 'do'), ('do', 'not'), ('like', 'jelly')}, {('not', 'like'), ('Sam', 'I'), ('I', 'am'), ('I', 'do'), ('do', 'not'), ('them', 'Sam'), ('like', 'them'), ('not', 'do')}]


In [20]:
# k-gram sets of documents D1..D4 (entries 0..3 of G).
G1, G2, G3, G4 = G[0], G[1], G[2], G[3]

In [21]:
# Define the Jaccard distance
def jacard(x, y):
    """Return the Jaccard distance between the sets ``x`` and ``y``.

    The distance is ``1 - |x & y| / |x | y|``: 0 for identical sets and
    1 for disjoint non-empty sets.

    Args:
        x: A set (anything providing ``union`` and ``intersection``).
        y: The set or iterable to compare ``x`` against.

    Returns:
        The Jaccard distance as a float in ``[0, 1]``. Two empty sets are
        treated as identical and give ``0.0``.
    """
    union = x.union(y)
    if not union:
        # Both inputs are empty; the ratio would be 0/0, so return the
        # distance of identical sets instead of raising ZeroDivisionError.
        return 0.0
    return 1 - len(x.intersection(y)) / len(union)

In [77]:
import numpy as np

# Pairwise Jaccard distances between all k-gram sets in G, rounded to three
# decimals. The matrix is sized from G rather than a hard-coded 4 so the
# cell keeps working if documents are added to or removed from the corpus.
num_sets = len(G)
d_J = np.zeros((num_sets, num_sets))
for i, Gi in enumerate(G):
    for j, Gj in enumerate(G):
        d_J[i, j] = round(jacard(Gi, Gj), 3)
print(d_J)

[[0.    0.667 1.    0.889]
 [0.667 0.    1.    0.75 ]
 [1.    1.    0.    0.727]
 [0.889 0.75  0.727 0.   ]]
