#### OM NAMO NARAYANA

**Reference implementation** : https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/ 

In [None]:
# NLTK provides word_tokenize; the 'punkt' tokenizer data is downloaded here
# (the cell output shows it being unpacked under tokenizers/punkt).
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import pandas as pd
import random
import numpy as np
import math
from tqdm import tqdm

## Dataset preparation

We use NLTK's word tokenizer to split each sentence into tokens.

#### Desired format

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset file
# referenced below can be read from 'My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the Thirukkural meanings JSON on the mounted Drive.
# NOTE: despite the name, this is the path of a file, not of a directory.
dataset_dir = '/content/drive/My Drive/thirukkural_meanings.json'


In [None]:
# dataset_dir = 'H:/sem8/nlp/proof-of-concept/Motivational Quotes Database.csv'


# df = pd.read_csv(dataset_dir)
# df.head()



# Three hand-written rows illustrating the desired layout: one 'text' column
# holding the sentence and one 'id' column holding its number.
sample_rows = [
    ('If one is easy of access to all, it will be easy for one to obtain the virtue called goodness', 991),
    ('Affectionateness and birth in a good family, these two constitute what is called a proper behaviour to all', 992),
    ('Resemblance of bodies is no resemblance of souls; true resemblance is the resemblance of qualities that attract', 993),
]
df = pd.DataFrame(sample_rows, columns=['text', 'id'])

df.head()

Unnamed: 0,text,id
0,"If one is easy of access to all, it will be ea...",991
1,"Affectionateness and birth in a good family, t...",992
2,Resemblance of bodies is no resemblance of sou...,993


In [None]:
# Load the JSON, transpose it so that each entry becomes a row, expose the
# 'explanation' field under the name 'text' (appended as the last column),
# and order the rows by index.
raw_frame = pd.read_json(dataset_dir)
df = (
    raw_frame.T
    .assign(text=lambda frame: frame['explanation'])
    .drop(columns='explanation')
    .sort_index()
)
df.head(1330)

Unnamed: 0,id,text
1,1,As all letters have the letter A for their fi...
2,2,"What Profit have those derived from learning,..."
3,3,They who are united to the glorious feet of H...
4,4,To those who meditate the feet of Him who is ...
5,5,The two-fold deeds that spring from darkness ...
...,...,...
1326,1326,To digest what has been eaten is more delight...
1327,1327,Those are conquerors whose dislike has been d...
1328,1328,"Will I enjoy once more through her dislike, t..."
1329,1329,"May the bright-jewelled one feign dislike, an..."


In [None]:
# Plain Python list of the 'text' column, in DataFrame row order;
# position i in this list is the document index used by the models below.
sentences = df['text'].tolist()
sentences[:2]

[' As all letters have the letter A for their first, so the world has the eternalGod for its first. ',
 ' What Profit have those derived from learning, who worship not the good feet of Him who is possessed of pure knowledge ?. ']

In [None]:
# Lower-case every sentence and split it into word tokens; tqdm shows progress.
tokenized_sent = [word_tokenize(sentence.lower()) for sentence in tqdm(sentences)]
print(*tokenized_sent[1:3])

100%|██████████| 1328/1328 [00:00<00:00, 4882.27it/s]

['what', 'profit', 'have', 'those', 'derived', 'from', 'learning', ',', 'who', 'worship', 'not', 'the', 'good', 'feet', 'of', 'him', 'who', 'is', 'possessed', 'of', 'pure', 'knowledge', '?', '.'] ['they', 'who', 'are', 'united', 'to', 'the', 'glorious', 'feet', 'of', 'him', 'who', 'occupies', 'swiftly', 'the', 'flower', 'of', 'the', 'mind', ',', 'shall', 'flourish', 'in', 'the', 'highest', 'of', 'worlds', '(', 'heaven', ')', '.']





In [None]:
from scipy.spatial import distance

## Doc2Vec

**documentation** : https://radimrehurek.com/gensim/models/doc2vec.html

**demonstration** : https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py <br/>

- We train the model from scratch here
- Unsupervised algorithm
- Two training modes: PV-DM (analogous to continuous bag-of-words) and PV-DBOW (analogous to skip-gram)

Each tokenized sentence is wrapped in a `TaggedDocument` and tagged with its index in the corpus.

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Wrap each token list in a TaggedDocument whose single tag is the sentence's
# position in `tokenized_sent` (and therefore in `sentences`).
tagged_data = []
for position, tokens in enumerate(tokenized_sent):
    tagged_data.append(TaggedDocument(words=tokens, tags=[position]))
tagged_data[:2]

[TaggedDocument(words=['as', 'all', 'letters', 'have', 'the', 'letter', 'a', 'for', 'their', 'first', ',', 'so', 'the', 'world', 'has', 'the', 'eternalgod', 'for', 'its', 'first', '.'], tags=[0]),
 TaggedDocument(words=['what', 'profit', 'have', 'those', 'derived', 'from', 'learning', ',', 'who', 'worship', 'not', 'the', 'good', 'feet', 'of', 'him', 'who', 'is', 'possessed', 'of', 'pure', 'knowledge', '?', '.'], tags=[1])]

In [None]:
# workers=2 works fastest
model = Doc2Vec(vector_size = 20, window = 2, min_count = 2, epochs = 10, workers=2)

model.build_vocab(tagged_data)

%time model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

'''
vector_size = Dimensionality of the feature vectors.
window = The maximum distance between the current and predicted word within a sentence.
min_count = Ignores all words with total frequency lower than this.
alpha = The initial learning rate.
'''

## Print model vocabulary
words = list(model.wv.vectors)
print(len(words))
print(words[:2])


CPU times: user 1.52 s, sys: 185 ms, total: 1.71 s
Wall time: 1.25 s
1747
[array([ 0.26011738,  0.20686513,  1.6946808 , -0.41928676,  1.1649321 ,
        0.52856475, -0.07325538,  0.5509414 ,  0.8189825 , -1.0708112 ,
        0.8434144 ,  0.79471856,  0.04162694, -0.1935131 , -0.8573786 ,
       -0.648625  , -1.1369631 ,  1.0232544 ,  0.7841552 , -0.46445104],
      dtype=float32), array([ 0.28801158,  0.28036454,  1.7793783 , -0.45514157,  1.1687104 ,
        0.5580035 , -0.09314895,  0.567301  ,  0.8334145 , -1.1959307 ,
        0.8051916 ,  0.76798785,  0.010695  , -0.20312254, -0.9081747 ,
       -0.6467503 , -1.180803  ,  1.1208392 ,  0.87361956, -0.42745265],
      dtype=float32)]


In [None]:
# Infer a vector for an unseen query and look up the closest training documents.
test_doc = word_tokenize("Education is the need of the hour".lower())
test_doc_vector = model.infer_vector(test_doc)
print(test_doc_vector)
# The document vectors are exposed as `model.dv` on gensim >= 4 and as
# `model.docvecs` on older releases. Pick whichever exists instead of reading
# the instance __dict__, which raises KeyError when 'docvecs' is not stored there.
doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
# x is a list of (document tag, similarity) pairs; the tag is the sentence index.
x = doc_vectors.most_similar(positive = [test_doc_vector], topn=10)
print(x)

[ 0.01837045  0.02522366  0.09475654 -0.04790021  0.09466945  0.04616991
  0.0054189   0.063945    0.03282956 -0.04896021  0.06326944  0.05096693
 -0.008582    0.00538691 -0.07771384 -0.03333202 -0.08294827  0.08072831
  0.05460158 -0.02376709]
[(1245, 0.9775989055633545), (231, 0.977020800113678), (1047, 0.9761792421340942), (963, 0.9758896827697754), (1185, 0.9758815169334412), (957, 0.9751823544502258), (251, 0.9748131036758423), (1021, 0.9734592437744141), (185, 0.9731235504150391), (434, 0.9729969501495361)]


In [None]:
# Map each (document tag, score) pair in `x` back to its sentence and print
# the sentences in the order they were returned.
index = []
for doc_id, _score in x:
    index.append(sentences[doc_id])
for similar_sentence in index:
    print(similar_sentence)

 You are a fool, O my soul! to go after my departed one, while you mourn that he is not kind enough to favour you. 
 Whatsoever is spoken in the world will abide as praise upon that man who gives alms to the poor. 
 The destitute poor, who do not renounce their bodies, only consume their neighbour's salt and water. 
 Of what good is it (for the high-born) to go and stand in vain before those who revile him ? it only brings him loss of honour and exclusion from heaven. 
 Besides those who say "she has turned sallow" there are none who say "he has forsaken her". 
 He who desires a good name must desire modesty; and he who desires (the continuance of) a family greatness must be submissive to all. 
 As those possess no property who do not take care of it, so those possess no kindness who feed on flesh. 
 Those who are prompt in their efforts (to better their family) need no deliberation, such efforts will of themselves succeed. 
 The character of the faults of that man who publishes abroad

In [None]:
# Installs the SentenceBERT dependencies into the runtime. Versions are not pinned;
# the recorded run resolved sentence-transformers 2.2.0 and transformers 4.16.2.
!pip install sentence_transformers transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 2.9 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 9.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 41.0 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 26.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.4 MB/s 


## SentenceBERT

**Reference**: https://www.sbert.net/

- We import and use the pre-trained model
- Here we don't train the model

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pre-trained 'bert-base-nli-mean-tokens' checkpoint; the first call
# downloads the model files (see the progress bars in the output). %time reports the cost.
%time sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

CPU times: user 12.3 s, sys: 4.32 s, total: 16.6 s
Wall time: 39.7 s


In [None]:
# Embed the corpus once (capped at the first 2000 sentences);
# sentence_embeddings[i] corresponds to sentences[i].
%time sentence_embeddings = sbert_model.encode(sentences[:2000])

CPU times: user 2min 35s, sys: 1.3 s, total: 2min 36s
Wall time: 2min 40s


In [None]:
# Free-text query to search for. encode() is given a one-element list,
# so [0] selects the single resulting vector.
query = "God is Great"
query_vec = sbert_model.encode([query])[0]

In [None]:
# Retrieve the `topn` sentences closest to the query among (at most) `n`
# randomly chosen candidates.
topn = min(5, len(sentences))
n = min(2000, len(sentences))

# Sample *indices* rather than sentences so that every score can be mapped
# back to the sentence it belongs to. sentence_embeddings[i] is the vector of
# sentences[i], so the embeddings computed earlier are reused instead of
# re-encoding each sentence one at a time.
n_candidates = min(n, len(sentence_embeddings))
candidate_ids = random.sample(range(len(sentence_embeddings)), n_candidates)

# scipy's distance.cosine is a *distance* (1 - cosine similarity): smaller means closer.
sim = [distance.cosine(query_vec, sentence_embeddings[i]) for i in tqdm(candidate_ids)]
#   print("Sentence = ", sent, "; similarity = ", sim)

# Positions of the smallest distances, most similar first, translated back
# into indices of `sentences`.
closest = np.argsort(sim)[:topn]
ind = np.array(candidate_ids, dtype=int)[closest]
print(np.array(sentences)[ind])

100%|██████████| 1328/1328 [03:52<00:00,  5.70it/s]

[' It is difficult to obtain another good equal to benevolence either in this world or in that of the gods. '
 ' Friendship is to be practiced not for the purpose of laughing but for that of being beforehand in giving one another sharp rebukes in case of transgression. '
 ' That ignorance which considers those things to be stable which are not so, is dishonorable (to the wise). '
 ' As the body is the abode of the spirit, so the excellence of modesty is the abode of perfection. '
 ' Those who are swallowed by the goddess called "gambling" will never have their hunger satisfied, but suffer the pangs of hell in the next world. ']





## InferSent

In [None]:
import os
# Show the working directory; the InferSent/GloVe downloads below are placed relative to it.
os.getcwd()

'/content'

In [None]:
import os

# encoder_dir = 'H:/sem8/nlp/proof-of-concept/'
encoder_dir = os.getcwd() + '/'
base_dir = os.getcwd() + '/'

os.chdir(encoder_dir)

if(os.path.isdir(encoder_dir + 'encoder')  == False):
    ! mkdir encoder
else:
    print('A subdirectory already exists.')
    
! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl

if(os.path.isdir(encoder_dir + 'Glove')  == False):
    ! mkdir GloVe
else:
    print('A subdirectory already exists.')
! curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
! unzip GloVe/glove.840B.300d.zip -d GloVe/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  16.1M      0  0:00:09  0:00:09 --:--:-- 21.9M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0   315    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   352    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100 2075M  100 2075M    0     0  5090k      0  0:06:57  0:06:57 --:--:-- 5385k
Archive:  GloVe/glove.840B.300d.zip
  inflating: GloVe/glove.840B.300d.txt  


In [None]:
!pip install models==0.9.3

In [None]:
import os

from models import InferSent
import torch

# Build the InferSent (version 2) encoder, load its pre-trained weights and
# point it at the GloVe vectors downloaded earlier.
# NOTE: this rebinds `model`, which previously held the Doc2Vec model.
V = 2
# base_path = 'H:/sem8/nlp/proof-of-concept/'
base_path = base_dir
# os.path.join avoids the doubled '/' that plain concatenation produced
# (base_dir already ends with a separator).
MODEL_PATH = os.path.join(base_path, 'encoder', 'infersent%s.pkl' % V)
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
model = InferSent(params_model)
# map_location='cpu' lets the checkpoint load on a CPU-only runtime as well.
model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))

W2V_PATH = os.path.join(base_path, 'GloVe', 'glove.840B.300d.txt')
model.set_w2v_path(W2V_PATH)

In [None]:
# Build the InferSent vocabulary from the corpus sentences (tokenization enabled); %time reports the cost.
%time model.build_vocab(sentences, tokenize=True)

In [None]:
query = "Life is very short nanba, always be happy. Design Design a problems will come and go don't worry"
# encode() is called with a *list* of sentences everywhere else in this notebook.
# Passing the bare string would make [0] something other than the embedding of
# the whole query, so wrap it in a one-element list.
query_vec = model.encode([query])[0]
query_vec

In [None]:
# Score `n` randomly chosen sentences against the query and keep the `topn` closest.
topn = min(len(sentences), 10)
n = min(len(sentences), 100)

similarity = []

# Sample *indices* so every score stays attached to its sentence. `sim` is
# aligned with `sentences`: sim[i] is the cosine distance of sentences[i] to
# the query, or NaN when sentence i was not sampled.
sampled_ids = random.sample(range(len(sentences)), n)
sim = np.full(len(sentences), np.nan)

for i in tqdm(sampled_ids):
  sim[i] = distance.cosine(query_vec, model.encode([sentences[i]])[0])

# distance.cosine is a distance (1 - cosine similarity), so the best matches
# are the *smallest* values. argsort places NaN (unsampled) entries last, and
# topn <= n guarantees only scored sentences are selected.
ind = np.argsort(sim)[:topn]
print(np.array(sentences)[ind])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Horizontal bar chart of the retrieved sentences and their scores.
fig, ax = plt.subplots(figsize =(16, 9))
ax.xaxis.set_tick_params(pad = 5)
ax.yaxis.set_tick_params(pad = 10)

labels = [text[:30] + '...' for text in np.array(sentences)[ind]]
scores = np.array(sim)[ind]

# Plot against numeric positions and attach the labels afterwards: using the
# truncated strings directly as categories would merge sentences that share
# the same first 30 characters into a single bar.
positions = np.arange(len(labels))
ax.barh(positions, scores)
ax.set_yticks(positions)
ax.set_yticklabels(labels)

ax.set_xlabel('Cosine distance to query (scipy.spatial.distance.cosine)')
ax.set_title('InferSent: retrieved sentences')
ax.invert_yaxis()
plt.show()

## Universal Sentence Encoder

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np