These text snippets are randomly sampled from a raw dataset. Each sentence pair may or may not be semantically related. 

### Importing the required packages.

In [1]:
#import nltk
#'nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')')
import numpy as np 
import pandas as pd
import re
import scipy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import FastText

In [2]:
data = pd.read_csv('../input/textsimilarity/Text_Similarity_Dataset.csv')
print(data.shape)

X = data.iloc[:,1:].values
s1 = X[:,0]
s2 = X[:,1]

(4023, 3)


### Preprocessing of Data

In [3]:
wnl = WordNetLemmatizer() #Lemmatization usually refers to doing things properly with the use of a vocabulary 
#and morphological analysis of words, normally aiming to remove inflectional endings only and to return the 
#base or dictionary form of a word. (lemma)
stop_words = set(stopwords.words('english'))

In [4]:
def preprocess(s):

    tokens = [word_tokenize(str(sentence)) for sentence in s] #Splitting strings into tokens (nominally words). 
    #It splits tokens based on white space and punctuation. 
    #For example, commas and periods are taken as separate tokens. 
    #Contractions are split apart

    rm = []
    for w in tokens:
        sm = re.sub('[^A-Za-z]',' ', str(w)) #removing non-alphabetical characters. Failed case -> one string has "2", another has "two".
        x = re.split("\s", sm) 
        rm.append(x)

    for sent in rm:
        while "" in sent:
            sent.remove('') #removing the empty elems

    low = []
    for i in rm:
        i = [x.lower() for x in i] #converts all the chars to lowercase.
        low.append(i)
    
    lemmatized = []
    for sent in low:
        tok = [wnl.lemmatize(w) for w in sent]
        lemmatized.append(tok)
    
    filtered_sent = []
    for sent in lemmatized:
        toks = [w for w in sent if w not in stop_words]
        filtered_sent.append(toks)
        
    return filtered_sent

In [5]:
filter_words1 = preprocess(s1)
filter_words2 = preprocess(s2)
# 139.517s on kaggle kernel.

### FastText
FastText is an extension to Word2Vec proposed by Facebook in 2016. FastText divides words into several n-grams instead of feeding individual words into the Neural Network. For example, app, ppl, and ple are trigrams for the word apple. The sum of these n-grams will form the word embedding vector for apple. We will have word embeddings for all of the n-grams given the training dataset after training the Neural Network. Because some of their n-grams are likely to appear in other words, rare words can now be properly represented.

In [12]:
model1 = FastText('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz')
model2 = FastText('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')
word_vector1 = model1.wv
word_vector2 = model2.wv
print(type(word_vector1))
#vocabulary = word_vectors.vocab.items() -> obsolete in gensim 4x
similarity = [word_vector1.similarity('woman', 'man'), word_vector2.similarity('woman','man')]
print(similarity)
# [0.3194794, 0.32284385] -> dissimilar value of similarity for different word vectors.

<class 'gensim.models.keyedvectors.FastTextKeyedVectors'>
[0.3194794, 0.32284385]


In [16]:
print(word_vector1['media'])
print(len(word_vector1['saw'])) #100-dimension vector.

[ 1.8601252e-03  3.6149721e-03 -1.2410122e-03  3.5201798e-03
  1.9627514e-03  1.5549501e-04 -3.0650198e-04 -2.4973419e-03
  1.0811526e-03  1.0770498e-03 -1.7207910e-03  3.5857665e-04
 -1.6502569e-03  1.7301711e-03  3.3062345e-03  1.2433920e-03
 -7.1278092e-04 -3.2974439e-04 -8.4022281e-04  2.8064770e-03
 -1.5423523e-03 -2.3654152e-03 -9.9924277e-04  2.6055963e-03
  1.6111322e-04  1.2526757e-03 -8.9503551e-04  7.4827758e-04
 -6.8006373e-04  1.4815993e-03  1.0589353e-03 -9.3595829e-04
  1.3139204e-03  1.0518164e-03  6.4095814e-04  1.5005955e-03
  1.1427484e-03 -1.4020366e-03 -8.3798834e-04  5.7458162e-04
  5.3651596e-04  1.8680288e-03  1.4767486e-04 -3.2695434e-03
 -2.6735817e-03 -1.6171483e-03  3.7804416e-03 -6.1163031e-05
  1.5303438e-03  2.2725803e-03  7.6203687e-05  2.5516321e-04
 -9.6527458e-04  3.5743082e-03 -3.1296138e-03  3.4323537e-03
 -1.5139274e-03 -1.4549088e-03  2.9783570e-03 -1.3296019e-03
 -1.0197319e-03  3.0602850e-03 -1.8883805e-03 -2.1588353e-03
 -8.6682744e-04  1.06070

## Cosine Similarity
*Cosine similarity* calculates the similarity of two vectors by taking the cosine of the angle formed by the two vectors in their dot product space. If the angle is zero, their similarity is one; and as the angle goes up, the similarity goes down. Because the measure is independent of vector length, it is a popular measure for high-dimensional spaces.

Other measures include *Jacard Similarity* and *Word mover distance* (independent of the words used in the dataset.)

In [7]:
result1, result2 = [], []

for sent1, sent2 in zip(filter_words1,filter_words2):
    vector11 = np.mean([word_vector1[word] for word in sent1], axis = 0)
    vector12 = np.mean([word_vector1[word] for word in sent2], axis = 0)
    vector21 = np.mean([word_vector2[word] for word in sent1], axis = 0)
    vector22 = np.mean([word_vector2[word] for word in sent2], axis = 0)
    cosine1 = scipy.spatial.distance.cosine(vector11, vector12)
    cosine2 = scipy.spatial.distance.cosine(vector21, vector22)
    result1.append((1-cosine1))
    result2.append((1-cosine2))
    
data['Result 1'], data['Result 2'] = result1, result2

In [8]:
ans  = data[['Unique_ID', 'Result 1', 'Result 2']]
print(ans[:12])
ans.to_csv('file1.csv', index=0)
# Unique_ID  Result 1  Result 2  -> dissimlar values for different word vectors.
#0           0  0.315760  0.418325
#1           1  0.385438  0.354219
#2           2  0.451542  0.454217
#3           3  0.436756  0.479157
#4           4  0.528687  0.514801
#5           5  0.379276  0.512622
#6           6  0.232499  0.261513
#7           7  0.510037  0.460600
#8           8  0.539121  0.498320
#9           9  0.396184  0.256629
#10         10  0.408196  0.441658
#11         11  0.252241  0.214241

    Unique_ID  Result 1  Result 2
0           0  0.315760  0.418325
1           1  0.385438  0.354219
2           2  0.451542  0.454217
3           3  0.436756  0.479157
4           4  0.528687  0.514801
5           5  0.379276  0.512622
6           6  0.232499  0.261513
7           7  0.510037  0.460600
8           8  0.539121  0.498320
9           9  0.396184  0.256629
10         10  0.408196  0.441658
11         11  0.252241  0.214241
