In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
# Sample corpus
documents = ['Machine learning is the study of computer algorithms that improve automatically through experience.\
Machine learning algorithms build a mathematical model based on sample data, known as training data.\
The discipline of machine learning employs various approaches to teach computers to accomplish tasks \
where no fully satisfactory algorithm is available.',
'Machine learning is closely related to computational statistics, which focuses on making predictions using computers.\
The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.',
'Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. \
It involves computers learning from data provided so that they carry out certain tasks.',
'Machine learning approaches are traditionally divided into three broad categories, depending on the nature of the "signal"\
or "feedback" available to the learning system: Supervised, Unsupervised and Reinforcement',
'Software engineering is the systematic application of engineering approaches to the development of software.\
Software engineering is a computing discipline.',
'A software engineer creates programs based on logic for the computer to execute. A software engineer has to be more concerned\
about the correctness of the program in all the cases. Meanwhile, a data scientist is comfortable with uncertainty and variability.\
Developing a machine learning application is more iterative and explorative process than software engineering.'
]

In [3]:
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_columns', 0)

In [4]:
documents_df=pd.DataFrame(documents,columns=['documents'])

### Corpus Dataframe

In [5]:
documents_df

Unnamed: 0,documents
0,"Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available."
1,"Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning."
2,Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
3,"Machine learning approaches are traditionally divided into three broad categories, depending on the nature of the ""signal""or ""feedback"" available to the learning system: Supervised, Unsupervised and Reinforcement"
4,Software engineering is the systematic application of engineering approaches to the development of software.Software engineering is a computing discipline.
5,"A software engineer creates programs based on logic for the computer to execute. A software engineer has to be more concernedabout the correctness of the program in all the cases. Meanwhile, a data scientist is comfortable with uncertainty and variability.Developing a machine learning application is more iterative and explorative process than software engineering."


In [6]:
# removing special characters and stop words from the text
stop_words_l=stopwords.words('english')
documents_df['documents_cleaned']=documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() 
                                                                                  if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )


#### Tf-idf vectors

In [7]:
tfidfvectoriser=TfidfVectorizer(max_features=64)
tfidfvectoriser.fit(documents_df.documents_cleaned)
tfidf_vectors=tfidfvectoriser.transform(documents_df.documents_cleaned)

In [8]:
tfidf_vectors.shape

(6, 64)

In [17]:
# Every vector is already normalised to have unit L2 norm
#np.linalg.norm(tfidf_vectors[0],ord=2)

In [14]:
tfidf_vectors=tfidf_vectors.toarray()
print (tfidf_vectors[0])

[0.20860612 0.41721224 0.         0.14442061 0.17106    0.17106
 0.         0.         0.         0.         0.         0.
 0.17106    0.14442061 0.         0.         0.         0.
 0.28884121 0.17106    0.         0.         0.         0.
 0.32062347 0.32062347 0.         0.17106    0.         0.20860612
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.20860612 0.20860612 0.         0.         0.         0.
 0.         0.17106    0.         0.         0.         0.17106
 0.20860612 0.17106    0.         0.         0.         0.20860612
 0.         0.         0.         0.        ]


##### Every document has been converted into a 64 dimensional vector. As we set the max_features=64

### Pairwise similarity

##### Pairwise cosine similarity would just be the dot product of the vectors becasue tf-idf vectors from sklearn are already normalized and L2 norm of these vectors is 1. So denominator of cosine similiarity formula is 1 in this case.

In [15]:
pairwise_similarities=np.dot(tfidf_vectors,tfidf_vectors.T)
pairwise_differences=euclidean_distances(tfidf_vectors)

In [16]:
print (tfidf_vectors[0])
print (pairwise_similarities.shape)
print (pairwise_similarities[0][:])

[0.20860612 0.41721224 0.         0.14442061 0.17106    0.17106
 0.         0.         0.         0.         0.         0.
 0.17106    0.14442061 0.         0.         0.         0.
 0.28884121 0.17106    0.         0.         0.         0.
 0.32062347 0.32062347 0.         0.17106    0.         0.20860612
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.20860612 0.20860612 0.         0.         0.         0.
 0.         0.17106    0.         0.         0.         0.17106
 0.20860612 0.17106    0.         0.         0.         0.20860612
 0.         0.         0.         0.        ]
(6, 6)
[1.         0.30335642 0.29899126 0.20763548 0.06056832 0.16004863]


##### similarity is highest, 1 at index 0 becasue they are the same documents

In [18]:
def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Document: {documents_df.iloc[doc_id]["documents"]}')
    print ('\n')
    print (f'Similar Documents using {matrix}:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Document: {documents_df.iloc[ix]["documents"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

In [19]:
most_similar(0,pairwise_similarities,'Cosine Similarity')

Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents using Cosine Similarity:


Document: Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.
Cosine Similarity : 0.30335642341823865


Document: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
Cosine Similarity : 0.29899125782686603


Document: Machine learning approaches a

In [20]:
most_similar(0,pairwise_differences,'Euclidean Distance')

Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents using Euclidean Distance:


Document: Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.
Euclidean Distance : 1.180375852499331


Document: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
Euclidean Distance : 1.184068192439214


Document: Machine learning approaches ar

#### word2vec embeddings

In [None]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.8.0-cp38-cp38-manylinux2010_x86_64.whl (497.6 MB)
[K     |████████████████████████████████| 497.6 MB 20.5 MB/s eta 0:00:01

In [25]:
from keras.preprocessing.text import Tokenizer
import gensim
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# tokenize and pad every document to make them of the same size
tokenizer=Tokenizer()
tokenizer.fit_on_texts(documents_df.documents_cleaned)
tokenized_documents=tokenizer.texts_to_sequences(documents_df.documents_cleaned)
tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=64,padding='post')
vocab_size=len(tokenizer.word_index)+1

print (tokenized_paded_documents[0])

In [None]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
import gensim
W2V_PATH="/Users/varunchaudhary/Documents/Varun Docs/Medium/GoogleNews-vectors-negative300.bin.gz"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [None]:
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
embedding_matrix=np.zeros((vocab_size,300))
for word,i in tokenizer.word_index.items():
    if word in model_w2v:
        embedding_matrix[i]=model_w2v[word]

In [None]:
embedding_matrix[0]

In [None]:
embedding_matrix[tokenizer.word_index['machine']]

In [None]:
# creating document-word embeddings
document_word_embeddings=np.zeros((len(tokenized_paded_documents),64,300))

for i in range(len(tokenized_paded_documents)):
    for j in range(len(tokenized_paded_documents[0])):
        document_word_embeddings[i][j]=embedding_matrix[tokenized_paded_documents[i][j]]

In [None]:
document_word_embeddings.shape

In [None]:
# tf-idf vectors do not keep the original sequence of words, converting them into actual word sequences from the documents

document_embeddings=np.zeros((len(tokenized_paded_documents),300))
words=tfidfvectoriser.get_feature_names()

for i in range(len(document_word_embeddings)):
    for j in range(len(words)):
        document_embeddings[i]+=embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
        
document_embeddings=document_embeddings/np.sum(tfidf_vectors,axis=1).reshape(-1,1)
    

In [None]:
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

In [None]:
most_similar(0,pairwise_similarities,'Cosine Similarity')

In [None]:
most_similar(0,pairwise_differences,'Euclidean Distance')

#### Glove embeddings

In [None]:
# reading Glove word embeddings into a dictionary with "word" as key and values as word vectors

embeddings_index = dict()

with open('glove.6B.100d.txt') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    

In [None]:
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 

embedding_matrix=np.zeros((vocab_size,100))

for word,i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# tf-idf vectors do not keep the original sequence of words, converting them into actual word sequences from the documents

document_embeddings=np.zeros((len(tokenized_paded_documents),100))
words=tfidfvectoriser.get_feature_names()

for i in range(documents_df.shape[0]):
    for j in range(len(words)):
        document_embeddings[i]+=embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
        
document_embeddings=document_embeddings/np.sum(tfidf_vectors,axis=1).reshape(-1,1)

In [None]:
document_embeddings.shape

In [None]:
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

In [None]:
most_similar(0,pairwise_similarities,'Cosine Similarity')

In [None]:
most_similar(0,pairwise_differences,'Euclidean Distance')

#### Doc2vec model

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(documents_df.documents_cleaned)]

In [None]:
model_d2v = Doc2Vec(vector_size=100,alpha=0.025, min_count=1)
  
model_d2v.build_vocab(tagged_data)

for epoch in range(100):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)

In [None]:
document_embeddings=np.zeros((documents_df.shape[0],100))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]

In [None]:
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

In [None]:
most_similar(0,pairwise_similarities,'Cosine Similarity')

In [None]:
most_similar(0,pairwise_differences,'Euclidean Distance')

#### BERT model

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
document_embeddings = sbert_model.encode(documents_df['documents_cleaned'])

In [None]:
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

In [None]:
most_similar(0,pairwise_similarities,'Cosine Similarity')

In [None]:
most_similar(0,pairwise_differences,'Euclidean Distance')