# Import

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing
import json
import numpy as np
from scipy.spatial.distance import cdist
import spacy
nlp = spacy.load("en_core_web_sm")

# Data

In [2]:
f = open('dev-v2.0.json')
data = json.load(f)

In [3]:
# compiles contexts & build a guide to trace context back to paragraphs & questions
contexts = []
contexts_to_data = {}
questions=[]
questions_to_data = {}
context_new_i = 0
question_new_i = 0
for p_i, paragraph_sets in enumerate(data['data']):
    for c_i, context_sets in enumerate(data['data'][p_i]['paragraphs']):
        contexts.append(data['data'][p_i]['paragraphs'][c_i]['context'])
        contexts_to_data[context_new_i] = (p_i, c_i)
        for q_i, question_sets in enumerate(data['data'][p_i]['paragraphs'][c_i]['qas']):
            questions.append(data['data'][p_i]['paragraphs'][c_i]['qas'][q_i]['question'])
            if data['data'][p_i]['paragraphs'][c_i]['qas'][q_i]['is_impossible'] == False:
                questions_to_data[question_new_i] = (context_new_i, p_i, c_i)
            else:
                questions_to_data[question_new_i] = ('impossible', p_i, c_i)
            question_new_i +=1
        context_new_i +=1

# Train with Doc2Vec

Why am I using Doc2Vec for document similiarity?   
The newer techniques such as BERT can only take in a maximum of 512 tokens
for this step, i wanted a model that could handle any length of text.

Parameter research: dm vs dbow  
dm (distributed memory): randomly samples consecutive words from a paragraph, randomly samples sets of words, using those words as input, predicts a center word. (kind of like CBOW, continuous bag of words) Doc vectors and word vectors are average together. co trains word vectors
dbow (distributed bag of words): ignores context words & has to predict words randomly sampled from the paragraph. 
dbow_words = 1, skip-gram word vector will also be trained (in addition to doc-vectors over the whole text), word-vectors over each sliding context window will be trained.
  
If you only need doc-vectors, use Doc2Vec in a mode that doesn't create or word-vectors (pure PV-DBOW, dm=0, dbow_words=0) or a Doc2Vec mode that also happens to create word-vectors but just choose to ignore them. 

In [4]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(contexts)]

In [5]:
cores = multiprocessing.cpu_count()
model = Doc2Vec(dm=0, dbow_words=0, vector_size=200, min_count=0, seed=42, workers=cores)

In [6]:
%%time 
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=200)

CPU times: user 1min 10s, sys: 3.73 s, total: 1min 14s
Wall time: 42.1 s


In [7]:
model.save("Doc2Vec_squad2.model")

# Pick a question 

In [23]:
q_i = 6000
question = questions[q_i]
print(question)
context_new_i, p_i, c_i = questions_to_data[q_i]
print(context_new_i)

What are other major fatality causes?
594


In [25]:
context = contexts[context_new_i]
print(context)

Construction is one of the most dangerous occupations in the world, incurring more occupational fatalities than any other sector in both the United States and in the European Union. In 2009, the fatal occupational injury rate among construction workers in the United States was nearly three times that for all workers. Falls are one of the most common causes of fatal and non-fatal injuries among construction workers. Proper safety equipment such as harnesses and guardrails and procedures such as securing ladders and inspecting scaffolding can curtail the risk of occupational injuries in the construction industry. Other major causes of fatalities in the construction industry include electrocution, transportation accidents, and trench cave-ins.


# Predict

In [26]:
text = nlp(question)
tokens = [token.lemma_ for token in text]
model.random.seed(42)
vector = model.infer_vector(tokens)

# Evaluate

In [27]:
scores = [(1- cdist(vector.reshape(1,-1)
                    , model.dv[i].reshape(1,-1)
                    , 'cosine'))[0][0] 
          for i in range(len(model.dv))]
ranks = np.argsort(scores)[::-1]

In [28]:
print('Top contexts: ', ranks[:5])
print('Top contexts scores: ', [scores[i] for i in ranks[:5]])

Top contexts:  [ 672  558   49 1028  553]
Top contexts scores:  [0.5827660880269205, 0.5549569234538123, 0.5488332389591334, 0.5390468547532865, 0.5276334579594801]


In [29]:
print('Right answers score: ', scores[context_new_i])
print('Right answers rank: ', list(ranks).index(context_new_i))

Right answers score:  0.1865200198304846
Right answers rank:  877


# Next steps
Shuffle NLP processing & different parameters. Calculate sum of errors.