# Inter-document similarity measures


## 1. Word Embeddings 
### We want to project words (word/token vectors) onto numeric vectors such that we can apply mathematical algorithms and methods on them

#### 1.1 Prerequisites

In [12]:
import scipy
import numpy as np
import gensim
import time

from ast import literal_eval
from preprocess_data import processed_data
from numpy.linalg import norm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from gensim import corpora
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors


#### 1.2 BoW, Tf-Idf, Word2Vec using Skipgram and CBOW. 

In [104]:
#convert token list to bow-vector from scratch
def text2bow2(tk1,tk2):
    dictionary = corpora.Dictionary()
    vec = []
    vec1 = []
    vec2 = []
    vocab = []
    for t1,t2 in zip(tk1,tk2):
        vocab.append(t1)
        vocab.append(t2)    
    for v in vocab:
        vec1.append(tk1.count(v))
        vec2.append(tk2.count(v))
    vec.append(vec1)
    vec.append(vec2)
    return vec

def identity_tokenizer(text):
    return text

#convert token list to tf-idf-vector using sklearn
def text2tfidf(token_lists):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)
    tfidf_vecs = tfidf_vectorizer.fit_transform(token_lists)
    return tfidf_vecs.todense()

#convert token list to word2vec using gensim
#training algorithm is defined in w2v, sg=0 meaning skip-gram, 1 meaning CBOW
def text2w2v_sg(tk1,tk2):
    token_list=[]
    for t1,t2 in zip(tk1,tk2):
        token_list.append(t1)
        token_list.append(t2)
    model = Word2Vec(token_list, window=5, min_count=1, workers=4, sg=1)
    #model.train(token_lists,total_examples=len(token_lists),epochs=10)
    return model

def text2w2v_cb(tk1,tk2):
    token_list=[]
    for t1,t2 in zip(tk1,tk2):
        token_list.append(t1)
        token_list.append(t2)
    model = Word2Vec(token_list, window=5, min_count=1, workers=4, sg=1)
    return model

def loadGlv():
    glove2word2vec(glove_input_file="./data/glove.6B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")
    glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)
    return glove_model


## 2. Cosine Similarity

### We want to compute the cosine similarity between documents as a measure for inter-document similarity.
### We will compute the cossim first using the BoW embedding, word2vec with SkipGram and CBoW, tf-idf and finally with the pre-trained GloVe 

In [110]:
#we set the similarity threshold to 0.8 to avoid false positives
def cos_similarities_bow(processed_data):
    similarities = []
    #compute pairwise similarities between bow vectors
    for (str1,str2) in zip(processed_data[1:,3],processed_data[1:,4]):
        bows = text2bow2(str1,str2)
        cosine = 1 - scipy.spatial.distance.cosine(np.array(bows[0]), np.array(bows[1]))
        if cosine >= 0.8:
            similarities.append('1')
        else:
            similarities.append('0')
    return np.array(similarities)

#threshold here is higher, as numeric vectors seem to be very close to each other -> not suitable!
def cos_similarities_w2v(processed_data, algorithm_str):
    similarities = []
    if algorithm_str=="skipgram":
        model= text2w2v_sg(processed_data[1:,3],processed_data[1:,4])
    else:
        model= text2w2v_cb(processed_data[1:,3],processed_data[1:,4])
    #compute pairwise similarities between skipgram vectors
    for (str1,str2) in zip(processed_data[1:,3],processed_data[1:,4]):
        vec1 = np.mean([model[word] for word in str1 if word in model],axis=0)
        vec2 = np.mean([model[word] for word in str2 if word in model],axis=0)
        cosine = scipy.spatial.distance.cosine(vec1, vec2)
        if 1-cosine > 0.994:
            similarities.append('1')
        else:
            similarities.append('0')
    return np.array(similarities)


def cos_similarities_tfidf(processed_data):
    similarities = []
    #compute pairwise similarities between tf-idf vectors
    for (str1,str2) in zip(processed_data[1:,3],processed_data[1:,4]):
        pair = []
        pair.append(str1)
        pair.append(str2)
        tfidf_matrix = text2tfidf(pair)
        similarity = tfidf_matrix * tfidf_matrix.T
        if similarity[0,1] >= 0.8:
            similarities.append('1')
        else:
            similarities.append('0')
    return np.array(similarities)

#using the pre-trained vectors from Wikipedia database from GloVe
#train the gloVe model with the vectors and applying model on each token of a sentence 
#to generate numeric vectors
def cos_similarities_glove(processed_data):
    similarities = []
    glove_model = loadGlv()
    for (str1,str2) in zip(processed_data[1:,3],processed_data[1:,4]):
        vec1 = np.mean([glove_model[word] for word in str1 if word in glove_model],axis=0)
        vec2 = np.mean([glove_model[word] for word in str2 if word in glove_model],axis=0)
        cosine = scipy.spatial.distance.cosine(vec1, vec2)
        if 1-cosine >= 0.8:
            similarities.append('1')
        else:
            similarities.append('0')
    return np.array(similarities)

#calculating normalized mutual info score from sklearn to measure the accuracy of the results
print("NMI score, Cossim, BoW embedding:",normalized_mutual_info_score(cos_similarities_bow(processed_data), processed_data[1:,0]))   
print("NMI score, Cossim, W2V with SkipGram:",normalized_mutual_info_score(cos_similarities_w2v(processed_data,"skipgram"), processed_data[1:,0])) 
print("NMI score, Cossim, W2V with CBoW:",normalized_mutual_info_score(cos_similarities_w2v(processed_data,"cbow"), processed_data[1:,0])) 
print("NMI score, Cossim, Tf-Idf embedding:",normalized_mutual_info_score(cos_similarities_tfidf(processed_data), processed_data[1:,0])) 
print("NMI score, Cossim, pre-trained Glove embedding:",normalized_mutual_info_score(cos_similarities_glove(processed_data), processed_data[1:,0])) 

NMI score, Cossim, BoW embedding: 0.06392774474703444


  vec1 = np.mean([model[word] for word in str1 if word in model],axis=0)
  vec1 = np.mean([model[word] for word in str1 if word in model],axis=0)
  vec2 = np.mean([model[word] for word in str2 if word in model],axis=0)
  vec2 = np.mean([model[word] for word in str2 if word in model],axis=0)


NMI score, Cossim, W2V with SkipGram: 0.012826149818436385


  vec1 = np.mean([model[word] for word in str1 if word in model],axis=0)
  vec1 = np.mean([model[word] for word in str1 if word in model],axis=0)
  vec2 = np.mean([model[word] for word in str2 if word in model],axis=0)
  vec2 = np.mean([model[word] for word in str2 if word in model],axis=0)


NMI score, Cossim, W2V with CBoW: 0.014549726680304647
NMI score, Cossim, Tf-Idf embedding: 0.0129055029389371
NMI score, Cossim, pre-trained Glove embedding: 0.04162622040210457
