In [68]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint

import numpy as np
import pandas as pd
import torch
import pickle
from textblob import TextBlob

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load model

In [69]:
# Load model
from models import InferSent
model_version = 1
MODEL_PATH = "../encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

In [70]:
# Keep it on CPU or put it on GPU
use_cuda = False
model = model.cuda() if use_cuda else model

In [71]:
# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = '../dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [72]:
# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000


## Load sentences

In [29]:
def load_pickle(filename):
    with open("../../data/" + filename + ".pickle", "rb") as pickling_on:
        obj = pickle.load(pickling_on)
    return obj

In [31]:
train = load_pickle("train_context")

In [41]:
train.context.values.tolist()[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [73]:
train = pd.read_json("../../data/training.json")

In [74]:
valid = pd.read_json("../../data/development.json")

In [75]:
train.shape

(442, 1)

In [76]:
valid.shape

(442, 1)

In [77]:
valid.head()

Unnamed: 0,data
0,"{'title': 'Beyoncé', 'paragraphs': [{'context'..."
1,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,{'title': 'Sino-Tibetan_relations_during_the_M...
3,"{'title': 'IPod', 'paragraphs': [{'context': '..."
4,{'title': 'The_Legend_of_Zelda:_Twilight_Princ...


In [78]:
train.iloc[4,0]['paragraphs'][0]

{'context': 'The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]',
 'qas': [{'question': 'What year was the Wii version of Legend of Zelda: Twilight Princess released?',
   'id': '56d1159e17492d1400aab8cf',
   'answers': [{'text': '2006', 'answer_start': 578}],
   'is_impossible': False},
  {'plausible_answers': [{'text': 'GameCube and Wii', 'a

In [79]:
contexts = []
questions = []
answers_text = []
answers_start = []
is_impossible = []
for i in range(train.shape[0]):
    topic = train.iloc[i,0]['paragraphs']
    for sub_para in topic:
        for q_a in sub_para['qas']:
            questions.append(q_a['question'])
            answers_start.append(q_a['answers'][0]['answer_start'] if len(q_a['answers']) > 0 else None)
            answers_text.append(q_a['answers'][0]['text'] if len(q_a['answers']) > 0 else None)
            is_impossible.append(q_a['is_impossible'])
            contexts.append(sub_para['context'])   
df = pd.DataFrame({"context":contexts, "question": questions, "answer_start": answers_start, "text": answers_text})

In [80]:
df.shape

(69596, 4)

In [81]:
paras = list(df["context"].drop_duplicates().reset_index(drop= True))

In [82]:
len(paras)

17766

In [83]:
blob = TextBlob(" ".join(paras))
sentences = [item.raw for item in blob.sentences]

In [84]:
len(sentences)

87723

In [85]:
model.build_vocab(sentences, tokenize=True)

Found 86905(/107041) words with w2v vectors
Vocab size : 86905


In [None]:
dict_embeddings = {}
for i in range(len(sentences)):
    dict_embeddings[sentences[i]] = model.encode([sentences[i]], tokenize=True)

In [None]:
questions = list(df["question"])

In [None]:
len(questions)

In [None]:
for i in range(len(questions)):
    print(i)
    dict_embeddings[questions[i]] = infersent.encode([questions[i]], tokenize=True)

In [None]:
d1 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 0}
d2 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 1}

In [None]:
with open('data/dict_embeddings1.pickle', 'wb') as handle:
    pickle.dump(d1, handle)

In [None]:
with open('data/dict_embeddings2.pickle', 'wb') as handle:
    pickle.dump(d2, handle)

In [None]:
# Load some sentences
sentences = []
with open('samples.txt') as f:
    for line in f:
        sentences.append(line.strip())
print(len(sentences))

In [None]:
sentences[:5]

In [6]:
df_stack = pd.read_csv("../../Stanford_politeness_corpus/stack-exchange.annotated.csv", index_col=1)
df_wiki = pd.read_csv("../../Stanford_politeness_corpus/wikipedia.annotated.csv", index_col=1)

In [7]:
df = pd.concat([df_stack, df_wiki])

In [8]:
df.head()

Unnamed: 0_level_0,Community,Request,Score1,Score2,Score3,Score4,Score5,TurkId1,TurkId2,TurkId3,TurkId4,TurkId5,Normalized Score
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3210318,092011 Stack Overflow,"Can you explain more in detail, what should I ...",12,16,15,16,13,A33SMNMTMIOJ6T,A2OXXHGAM7B0Y,A28TXBSZPWMEU9,A3EJ5TT2ZGBIDA,A3OY0OL2M0HUTT,0.217326
1314703,092011 Stack Overflow,Will expressions always be unambiguously paren...,13,13,18,14,12,A1UIH2IMG9DV95,A23FB7HE970AZJ,A17G2LOYX7WQZ,A2BKPNKU3EG41Z,A3L459M9ME6WKI,0.063302
4740262,092011 Stack Overflow,how are you resolving function pointers? I am ...,13,15,17,13,17,A1BS64O3JY0YJ4,A2AE4MZVUX9JPX,ARJ44YPGA2FPK,A14TK8NCD4CUN1,A17G2LOYX7WQZ,0.128902
6972808,092011 Stack Overflow,What is the definition of `buffer`? Is it a lo...,13,13,13,13,19,A3VJDU2VRMN05L,A1ZHP80O13CEUI,A28TXBSZPWMEU9,A3MMLCBV2W3BP9,A2BKPNKU3EG41Z,0.240188
5269106,092011 Stack Overflow,Is `A` a global variable? What is x?,17,18,16,16,13,A3OW54MEVDKXJL,A2RDZ580VXUO1X,A872FSFU7WV6W,A1HBDQ0BJQBA4Q,A34M93NJC830DP,0.508284


### get sentence list

In [9]:
requests = [(k, v["Request"]) for k, v in df[["Request"]].to_dict("index").items()]

In [10]:
len(requests)

10755

## Encode sentences

In [None]:
# gpu mode : >> 1000 sentences/s
# cpu mode : ~100 sentences/s

In [None]:
embeddings = model.encode([i[1] for i in requests], bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

---

In [11]:
id_embs = [(k, model.encode([v], bsize=128, tokenize=False, verbose=True)[0]) for k, v in requests]

Nb words kept : 11/16 (68.8%)
Speed : 2.0 sentences/s (cpu mode, bsize=128)
Nb words kept : 19/26 (73.1%)
Speed : 2.2 sentences/s (cpu mode, bsize=128)
Nb words kept : 23/25 (92.0%)
Speed : 1.8 sentences/s (cpu mode, bsize=128)
Nb words kept : 11/13 (84.6%)
Speed : 3.5 sentences/s (cpu mode, bsize=128)
Nb words kept : 7/10 (70.0%)
Speed : 6.0 sentences/s (cpu mode, bsize=128)
Nb words kept : 13/15 (86.7%)
Speed : 3.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 8/11 (72.7%)
Speed : 5.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 24/28 (85.7%)
Speed : 1.7 sentences/s (cpu mode, bsize=128)
Nb words kept : 14/16 (87.5%)
Speed : 3.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 14/16 (87.5%)
Speed : 1.8 sentences/s (cpu mode, bsize=128)
Nb words kept : 10/12 (83.3%)
Speed : 3.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 20/22 (90.9%)
Speed : 2.0 sentences/s (cpu mode, bsize=128)
Nb words kept : 24/28 (85.7%)
Speed : 1.7 sentences/s (cpu mode, bsize=128)
Nb words kept 

Speed : 4.3 sentences/s (cpu mode, bsize=128)
Nb words kept : 19/21 (90.5%)
Speed : 2.3 sentences/s (cpu mode, bsize=128)
Nb words kept : 9/13 (69.2%)
Speed : 4.8 sentences/s (cpu mode, bsize=128)
Nb words kept : 18/21 (85.7%)
Speed : 2.4 sentences/s (cpu mode, bsize=128)
Nb words kept : 17/21 (81.0%)
Speed : 2.7 sentences/s (cpu mode, bsize=128)
Nb words kept : 14/16 (87.5%)
Speed : 3.3 sentences/s (cpu mode, bsize=128)
Nb words kept : 10/12 (83.3%)
Speed : 4.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 8/12 (66.7%)
Speed : 5.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 14/19 (73.7%)
Speed : 2.8 sentences/s (cpu mode, bsize=128)
Nb words kept : 14/18 (77.8%)
Speed : 3.1 sentences/s (cpu mode, bsize=128)
Nb words kept : 10/13 (76.9%)
Speed : 4.5 sentences/s (cpu mode, bsize=128)
Nb words kept : 31/35 (88.6%)
Speed : 1.4 sentences/s (cpu mode, bsize=128)
Nb words kept : 13/15 (86.7%)
Speed : 3.2 sentences/s (cpu mode, bsize=128)
Nb words kept : 22/28 (78.6%)
Speed : 1.9 se

KeyboardInterrupt: 

In [None]:
new_req = "..."
new_emb = model.encode([new_req])[0]

In [None]:
def cosine(id_emb):
    return np.dot(new_emb, id_emb[1]) / (np.linalg.norm(new_emb) * np.linalg.norm(id_emb[1]))

In [None]:
ret = min(id_embs, key=lambda x : cosine)

In [None]:
rid, remb = ret

In [None]:
df.loc[rid]

## Visualization

In [None]:
np.linalg.norm(model.encode(['the cat eats.']))

In [None]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [None]:
cosine(model.encode(['the cat eats.'])[0], model.encode(['the cat drinks.'])[0])

In [None]:
idx = randint(0, len(sentences))
_, _ = model.visualize(sentences[idx])

In [None]:
my_sent = 'The cat is drinking milk.'
_, _ = model.visualize(my_sent)

In [None]:
model.build_vocab_k_words(500000) # getting 500K words vocab
my_sent = 'barack-obama is the former president of the United-States.'
_, _ = model.visualize(my_sent)