In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
# Toy corpus: the sentences that will be count-vectorized
sentences = [
    'I hate eggs',
    'I hate beef',
    'I like chicken and hate pork',
    'Hate Pork and like squid',
]

In [3]:
# Vocabulary = every distinct whitespace-separated token across the corpus.
# Tokens are compared case-sensitively, so 'Hate' and 'hate' are separate entries.
vocabulary = [*set(' '.join(sentences).split())]
print(vocabulary, 'size : ', len(vocabulary))

['chicken', 'and', 'hate', 'like', 'beef', 'eggs', 'squid', 'pork', 'Hate', 'I', 'Pork'] size :  11


In [4]:
# Basic CountVectorizer
# Map each vocabulary word to its list of per-sentence occurrence counts
# (one entry per sentence, in the order the sentences appear).
vector_dict = {word: [] for word in vocabulary}

# populate the dictionary
for sentence in sentences:
    # Tokenize on any whitespace, the same way the vocabulary was built, so that
    # repeated spaces or tabs cannot yield tokens that are missing from the vocabulary.
    tokens = sentence.split()
    for word in vocabulary:
        vector_dict[word].append(tokens.count(word))
vector_dict

{'chicken': [0, 0, 1, 0],
 'and': [0, 0, 1, 1],
 'hate': [1, 1, 1, 0],
 'like': [0, 0, 1, 1],
 'beef': [0, 1, 0, 0],
 'eggs': [1, 0, 0, 0],
 'squid': [0, 0, 0, 1],
 'pork': [0, 0, 1, 0],
 'Hate': [0, 0, 0, 1],
 'I': [1, 1, 1, 0],
 'Pork': [0, 0, 0, 1]}

In [5]:
# CountVectorized sentences
CountVectorized_sentences = np.array(list(vector_dict.values())).T
CountVectorized_sentences

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]])

In [6]:
# Scale the first sentence vector to unit length by dividing by its Euclidean norm
normalized_vector = np.divide(CountVectorized_sentences[0], np.linalg.norm(CountVectorized_sentences[0]))
normalized_vector

array([0.        , 0.        , 0.57735027, 0.        , 0.        ,
       0.57735027, 0.        , 0.        , 0.        , 0.57735027,
       0.        ])

In [7]:
# using sklearn's normalize()
from sklearn.preprocessing import normalize

In [8]:
# Same normalization through sklearn: pass the first sentence vector as a
# single-row 2-D array instead of wrapping it in a list.
normalize(CountVectorized_sentences[0].reshape(1, -1))

array([[0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.        ]])

# NOTE(review): `angle_between_vectors` is not defined as a module-level name anywhere
# in the visible notebook; it only appears as a method of the `Cosine_Similarity` class
# in a later cell. As written this call would presumably raise NameError on a fresh
# kernel — TODO confirm whether this line is leftover scratch and should be removed.
print(angle_between_vectors(CountVectorized_sentences[1],CountVectorized_sentences[1]))

In [9]:
# define a class Cosine_Similarity

class Cosine_Similarity():
    """Cosine similarity and angle helpers for a pair of 1-D numeric vectors."""

    @staticmethod
    def cosine_similarity(v1,v2):
        """Return the cosine of the angle between ``v1`` and ``v2``.

        Parameters
        ----------
        v1, v2 : array-like
            1-D numeric vectors of equal length (NumPy arrays or plain sequences).

        Returns
        -------
        numpy floating scalar
            Dot product of the two unit vectors, clipped to ``[-1.0, 1.0]``.

        Raises
        ------
        ValueError
            If either vector has zero magnitude, because the similarity is
            undefined (the division would otherwise silently yield NaN).
        """
        v1 = np.asarray(v1)
        v2 = np.asarray(v2)

        # magnitudes of both vectors
        norm1 = np.linalg.norm(v1)
        norm2 = np.linalg.norm(v2)
        if norm1 == 0 or norm2 == 0:
            raise ValueError('Cosine similarity is undefined for zero-magnitude vectors')

        # scale both vectors to unit length
        u1 = v1/norm1
        u2 = v2/norm2

        # The dot product of the unit vectors is the cosine of the angle
        cos_sim = np.dot(u1,u2)
        # floating-point rounding can push the value marginally outside [-1, 1],
        # which would make arccos return NaN
        cos_sim = np.clip(cos_sim,-1.0,1.0)

        return cos_sim

    def angle_between_vectors(self,v1,v2):
        """Return the angle between ``v1`` and ``v2``.

        Parameters
        ----------
        v1, v2 : array-like
            1-D numeric vectors of equal length.

        Returns
        -------
        tuple
            ``(angle_in_radians, angle_in_degrees)``.

        Raises
        ------
        ValueError
            Propagated from ``cosine_similarity`` when either vector has zero
            magnitude.
        """
        # already clipped to [-1, 1], so arccos is always defined here
        cos_sim = self.cosine_similarity(v1,v2)

        # The angle between them
        angle_in_radians = np.arccos(cos_sim)
        angle_in_degrees = np.degrees(angle_in_radians)

        return angle_in_radians, angle_in_degrees
        

In [10]:
# Enumerate every unordered pair of sentence vectors
from itertools import combinations

# materialize the pairs into a list so they can be displayed and reused
vec_combi = [pair for pair in combinations(CountVectorized_sentences, 2)]
vec_combi

[(array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]),
  array([0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0])),
 (array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]),
  array([1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0])),
 (array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]),
  array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1])),
 (array([0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]),
  array([1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0])),
 (array([0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]),
  array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1])),
 (array([1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]),
  array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]))]

In [32]:
# creating a dataframe with the combination of vectors
vec_combi_df = pd.DataFrame(vec_combi, columns = ['vector_1', 'vector_2'])

# creating an instance of class Cosine similarity
cosine_sim = Cosine_Similarity()

# finding out the angle between the vectors: one (radians, degrees) tuple per pair.
# Iterating the pairs directly avoids the row-wise apply + zip(*...) unpacking,
# which fails to unpack when there are no pairs at all.
pair_angles = [cosine_sim.angle_between_vectors(v1, v2) for v1, v2 in vec_combi]
vec_combi_df['angle_in_radians'] = np.array([angles[0] for angles in pair_angles], dtype = float)
vec_combi_df['angle_in_degrees'] = np.array([angles[1] for angles in pair_angles], dtype = float)

vec_combi_df['Cosine_Similarity'] = np.array([cosine_sim.cosine_similarity(v1, v2) for v1, v2 in vec_combi], dtype = float)

# finding the rank for the combination (dense: tied similarities share a rank,
# highest similarity gets rank 1)
vec_combi_df['rank'] = vec_combi_df['Cosine_Similarity'].rank(method = 'dense', ascending = False)

# sorting the values based on the ranks
vec_combi_df.sort_values(by = 'rank')


Unnamed: 0,vector_1,vector_2,angle_in_radians,angle_in_degrees,Cosine_Similarity,rank
0,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]",0.841069,48.189685,0.666667,1.0
1,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]",1.079914,61.874494,0.471405,2.0
3,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]","[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]",1.079914,61.874494,0.471405,2.0
5,"[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]",1.197004,68.583286,0.365148,3.0
2,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]",1.570796,90.0,0.0,4.0
4,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]",1.570796,90.0,0.0,4.0


In [34]:
# creating tuple columns so we can map (arrays are unhashable, tuples can be dict keys).
# DataFrame.insert raises ValueError if the column already exists, so guard each insert
# to keep this cell safe to re-run.
if 'tuple_vector1' not in vec_combi_df.columns:
    vec_combi_df.insert(2,'tuple_vector1',vec_combi_df['vector_1'].apply(tuple))
if 'tuple_vector2' not in vec_combi_df.columns:
    vec_combi_df.insert(3,'tuple_vector2',vec_combi_df['vector_2'].apply(tuple))

In [35]:
# display the full pair table (bare last expression -> rich notebook output)
vec_combi_df

Unnamed: 0,vector_1,vector_2,tuple_vector1,tuple_vector2,angle_in_radians,angle_in_degrees,Cosine_Similarity,rank
0,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]","(0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0)","(0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0)",0.841069,48.189685,0.666667,1.0
1,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]","(0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0)","(1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0)",1.079914,61.874494,0.471405,2.0
2,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]","(0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0)","(0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1)",1.570796,90.0,0.0,4.0
3,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]","[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]","(0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0)","(1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0)",1.079914,61.874494,0.471405,2.0
4,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]","(0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0)","(0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1)",1.570796,90.0,0.0,4.0
5,"[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]","(1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0)","(0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1)",1.197004,68.583286,0.365148,3.0


In [36]:
# Look-up table: sentence text -> its count vector (matrix rows, paired in order)
map_dic = {text: vector for text, vector in zip(sentences, CountVectorized_sentences)}
map_dic

{'I hate eggs': array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]),
 'I hate beef': array([0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]),
 'I like chicken and hate pork': array([1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]),
 'Hate Pork and like squid': array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1])}

In [37]:
# Reverse look-up: hashable tuple form of each count vector -> sentence text.
# Sentences that share an identical vector collapse onto one key (the last one wins).
map_dic_tuple = dict((tuple(vector), text) for text, vector in map_dic.items())
map_dic_tuple

{(0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0): 'I hate eggs',
 (0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0): 'I hate beef',
 (1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0): 'I like chicken and hate pork',
 (0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1): 'Hate Pork and like squid'}

In [38]:
# Attach the original sentence text for each vector of the pair by looking up
# its tuple form in the reverse mapping.
vec_combi_df['sentences_vector1'], vec_combi_df['sentences_vector2'] = (
    vec_combi_df['tuple_vector1'].map(map_dic_tuple),
    vec_combi_df['tuple_vector2'].map(map_dic_tuple),
)

In [39]:
# Show the pairs ordered from most to least similar (rank 1 first).
vec_combi_df.sort_values('rank')

Unnamed: 0,vector_1,vector_2,tuple_vector1,tuple_vector2,angle_in_radians,angle_in_degrees,Cosine_Similarity,rank,sentences_vector1,sentences_vector2
0,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]","(0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0)","(0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0)",0.841069,48.189685,0.666667,1.0,I hate eggs,I hate beef
1,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]","(0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0)","(1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0)",1.079914,61.874494,0.471405,2.0,I hate eggs,I like chicken and hate pork
3,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]","[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]","(0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0)","(1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0)",1.079914,61.874494,0.471405,2.0,I hate beef,I like chicken and hate pork
5,"[1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]","(1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0)","(0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1)",1.197004,68.583286,0.365148,3.0,I like chicken and hate pork,Hate Pork and like squid
2,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]","(0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0)","(0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1)",1.570796,90.0,0.0,4.0,I hate eggs,Hate Pork and like squid
4,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0]","[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]","(0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0)","(0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1)",1.570796,90.0,0.0,4.0,I hate beef,Hate Pork and like squid
