In [9]:
from textacy import preprocessing
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Word2Vec
import gensim
import pandas as pd

In [51]:
#Word2Vec
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_word2vec_cosine_similarity(sentence1, sentence2, min_count=1):
    # Tokenize the sentences into words
    tokenized_sentence1 = sentence1.split()
    tokenized_sentence2 = sentence2.split()

    # Train a Word2Vec model on the tokenized sentences
    sentences = [tokenized_sentence1, tokenized_sentence2]
    model = Word2Vec(sentences, min_count=min_count)

    # Get the Word2Vec embeddings for each word in the sentences
    word_vectors = []
    for tokenized_sentence in sentences:
        sentence_vectors = []
        for word in tokenized_sentence:
            # Skip words not in the vocabulary
            if word in model.wv:
                sentence_vectors.append(model.wv[word])
        # Calculate the mean vector for the sentence
        if sentence_vectors:
            sentence_vector = np.mean(sentence_vectors, axis=0)
            word_vectors.append(sentence_vector)

    # Calculate cosine similarity between the Word2Vec embeddings of the sentences
    cosine_similarity_score = cosine_similarity([word_vectors[0]], [word_vectors[1]])

    return cosine_similarity_score[0][0]

# Example usage:
sentence1 = "Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages. As such, NLP is related to the area of human–computer interaction."
sentence2 = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human (natural) languages. NLP is used to apply algorithms to human language to enable computers to understand and interpret human language and extract meaning from text."
similarity_score = calculate_word2vec_cosine_similarity(sentence1, sentence2)
print("Cosine Similarity between sentence 1 and sentence 2:", similarity_score)


Cosine Similarity between sentence 1 and sentence 2: 0.7470758


In [64]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_tfidf_cosine_similarity(sentence1, sentence2):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([sentence1, sentence2])
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]

# Example usage:
sentence1 = "Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages. As such, NLP is related to the area of human–computer interaction.."
sentence2 = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human (natural) languages. NLP is used to apply algorithms to human language to enable computers to understand and interpret human language and extract meaning from text"
similarity_score = calculate_tfidf_cosine_similarity(sentence1, sentence2)
print("TF-IDF Cosine Similarity between sentence 1 and sentence 2:", similarity_score)


TF-IDF Cosine Similarity between sentence 1 and sentence 2: 0.6625278058650176


In [60]:
#Doc2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

def calculate_doc2vec_cosine_similarity(sentence1, sentence2):
    tagged_data = [TaggedDocument(words=sentence.split(), tags=[str(i)]) for i, sentence in enumerate([sentence1, sentence2])]
    model = Doc2Vec(vector_size=100, min_count=1, epochs=50)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    vector1 = model.infer_vector(sentence1.split())
    vector2 = model.infer_vector(sentence2.split())
    cosine_sim = cosine_similarity([vector1], [vector2])
    return cosine_sim[0][0]

# Example usage:
sentence1 = "Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages. As such, NLP is related to the area of human–computer interaction."
sentence2 = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human (natural) languages. NLP is used to apply algorithms to human language to enable computers to understand and interpret human language and extract meaning from text"
similarity_score = calculate_doc2vec_cosine_similarity(sentence1, sentence2)
print("Doc2Vec Cosine Similarity between sentence 1 and sentence 2:", similarity_score)


Doc2Vec Cosine Similarity between sentence 1 and sentence 2: 0.96662915


In [61]:
#Sentence Transformer
#!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_sentence_transformers_cosine_similarity(sentence1, sentence2):
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings = model.encode([sentence1, sentence2], convert_to_tensor=True)
    cosine_sim = cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0))
    return cosine_sim[0][0]

# Example usage:
sentence1 = "Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages. As such, NLP is related to the area of human–computer interaction.."
sentence2 = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human (natural) languages. NLP is used to apply algorithms to human language to enable computers to understand and interpret human language and extract meaning from text"
similarity_score = calculate_sentence_transformers_cosine_similarity(sentence1, sentence2)
print("Sentence Transformers Cosine Similarity between sentence 1 and sentence 2:", similarity_score)


Sentence Transformers Cosine Similarity between sentence 1 and sentence 2: 0.95967066
