## Text Similarity

https://medium.com/@Intellica.AI/comparison-of-different-word-embeddings-on-text-similarity-a-use-case-in-nlp-e83e08469c1c

In [3]:
!pip install nltk



In [None]:
# Download all NLTK files
# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()

## Pre-processing

In [99]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from unidecode import unidecode
import string
def pre_process(corpus):
    # convert input corpus to lower case.
    corpus = corpus.lower()
    # collecting a list of stop words from nltk and punctuation form
    # string class and create single array.
    stopset = stopwords.words('english') + list(string.punctuation)
    # remove stop words and punctuations from string.
    # word_tokenize is used to tokenize the input corpus in word tokens.
    corpus = " ".join([i for i in word_tokenize(corpus) if i not in stopset])
    # remove non-ascii characters
    corpus = unidecode(corpus)
    

    return corpus
pre_process("Sample of non ASCII: Ceñía. How to remove stopwords and punctuations?")

'sample non ascii cenia remove stopwords punctuations'

## Lemmatization

In [100]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
sentence = "The striped bats are hanging on their feet for best"
words = word_tokenize(sentence)
# for w in words:
#     print(w, " : ", lemmatizer.lemmatize(w))

    
# word_tokenize is used to tokenize the input corpus in word tokens.
new_sentence = " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(sentence)])


sentence, new_sentence

('The striped bats are hanging on their feet for best',
 'The striped bat are hanging on their foot for best')

## Feature Extraction

In [8]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
# sentence pair
corpus = ["A girl is styling her hair.", "A girl is brushing her hair."]
for c in range(len(corpus)):
    corpus[c] = pre_process(corpus[c])
# creating vocabulary using uni-gram and bi-gram
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_vectorizer.fit(corpus)
feature_vectors = tfidf_vectorizer.transform(corpus)

In [53]:
# Gensim tutorial
# https://radimrehurek.com/gensim/models/word2vec.html

# Download trained word2vec model from Google here
# https://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
import numpy as np


path = get_tmpfile("word2vec.model")
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

# give a path of model to load function
word_emb_model = Word2Vec.load("word2vec.model")

# Load Google's pre-trained Word2Vec model.
word_emb_model = gensim.models.KeyedVectors.load_word2vec_format('./browser-qa-app/browser-qa-api/model/GoogleNews-vectors-negative300.bin', binary=True) 

In [74]:
# SIF
from collections import Counter
import itertools

def map_word_frequency(document):
    return Counter(itertools.chain(*document))
    
def get_sif_feature_vectors(sentence1, sentence2, word_emb_model=word_emb_model):
    sentence1 = [token for token in sentence1.split() if token in word_emb_model.vocab]
    sentence2 = [token for token in sentence2.split() if token in word_emb_model.vocab]
    
    word_counts = map_word_frequency((sentence1 + sentence2))
    embedding_size = 300 # size of vectore in word embeddings
    a = 0.001
    sentence_set=[]
    for sentence in [sentence1, sentence2]:
        vs = np.zeros(embedding_size)
        sentence_length = len(sentence)
        for word in sentence:
            a_value = a / (a + word_counts[word]) # smooth inverse frequency, SIF
            vs = np.add(vs, np.multiply(a_value, word_emb_model.wv[word])) # vs += sif * word_vector
        
        
        # TODO: Fix me
#         print(vs)
#         print(sentence_length)
        vs = np.divide(vs, sentence_length) # weighted average
        sentence_set.append(vs)
    return sentence_set


## Vector Similarity

In [89]:
# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
def get_cosine_similarity(feature_vec_1, feature_vec_2):    
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]

## Results

In [93]:
sentence1 = "A girl is styling her hair."
sentence2 = "A girl is brushing her hair."
sentence3 = "A boy is playing football"

In [94]:
processed_sentence1 = pre_process(sentence1)
processed_sentence2 = pre_process(sentence2)
processed_sentence3 = pre_process(sentence3)

processed_sentence1, processed_sentence2, processed_sentence3

('girl styling hair', 'girl brushing hair', 'boy playing football')

In [95]:
sentence_set1 = get_sif_feature_vectors(processed_sentence1, processed_sentence2)
sentence_set2 = get_sif_feature_vectors(processed_sentence1, processed_sentence3)

  vs = np.add(vs, np.multiply(a_value, word_emb_model.wv[word])) # vs += sif * word_vector


In [87]:
get_cosine_similarity(sentence_set1[0], sentence_set1[1])

0.7876443533464236

In [96]:
get_cosine_similarity(sentence_set2[0], sentence_set2[1])

0.33389045197895445