# Extract similarity scores between questions and their candidate answer sentences

---

In [1]:
import numpy as np

In [2]:
ls ../data/glove_embeddings/

glove.6B.100d.txt         glove.6B.50d.txt          glove.840B.300d.zip.cpgz
glove.6B.200d.txt         glove.6B.zip              train-less-than-40.xml
glove.6B.300d.txt         glove.840B.300d.zip


In [13]:
# Build a lookup from every vocabulary word to the number of the line
# it occupies in the GloVe text file (first whitespace-separated field
# of each line is the word itself).
ndim = 300
glove_path = "../data/glove_embeddings/glove.6B.300d.txt"
w2idx = {}
with open(glove_path, "rb") as lines:
    for row, raw in enumerate(lines):
        # The file is read as bytes, so decode the word explicitly.
        w2idx[raw.split()[0].decode("utf-8")] = row

In [14]:
# Load the embedding matrix: row i holds the vector found on line i of the
# GloVe file, so it lines up with the indices stored in w2idx.
# np.float64 is used instead of the removed np.float alias (same dtype).
vectors = np.empty((len(w2idx), ndim), dtype=np.float64)
with open(glove_path, "rb") as lines:
    for i, line in enumerate(lines):
        # Materialise the numbers in a list: on Python 3 map() is lazy and
        # np.asarray(map(...)) would wrap the map object instead of
        # producing a numeric row.
        vectors[i] = np.asarray([float(x) for x in line.split()[1:]],
                                dtype=np.float64)

In [5]:
# First few words of the GloVe vocabulary. list(w2idx) works on both
# Python 2 and 3, whereas dict.keys() cannot be sliced on Python 3.
list(w2idx)[:5]

[u'biennials', u'verplank', u'soestdijk', u'woode', u'mdbo']

---

## Extract similarity score for question answer pair

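For each question, get its group of candidate answers, then compute a score for each answer. The score is the cosine distance (1 − cosine similarity) between the averaged GloVe vectors of the question and of the answer, so lower values mean more similar sentences.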

In [6]:
import string
from nltk.corpus import stopwords

# Tokens that are ignored when averaging word vectors: every single
# punctuation character, the two ellipsis spellings, and NLTK's English
# stopword list.
words_to_exclude = frozenset(string.punctuation).union(
    ["..", "..."],
    stopwords.words("english"),
)

In [7]:
# Peek at ten of the excluded tokens. words_to_exclude is a frozenset, so
# which ten appear (and in what order) is arbitrary.
list(words_to_exclude)[:10]

[u'all',
 u'just',
 u'being',
 u'when',
 u'over',
 u'through',
 u'during',
 u'its',
 u'before',
 '$']

In [8]:
def get_QA_group(infile):
    """Read the next ``<QApairs ...> ... </QApairs>`` group from ``infile``.

    Inside a group, a line starting with ``<question``, ``<positive`` or
    ``<negative`` (case-insensitive) announces that the *following* line
    holds the sentence. Tabs in that sentence are replaced by single
    spaces and surrounding whitespace is stripped.

    Args:
        infile: An open text file (anything with a ``readline()`` method
            that returns ``""`` at end of file).

    Returns:
        ``None`` when the end of the file is reached before a new group
        starts; otherwise a dict with
        ``"question"`` - list of question sentences, and
        ``"answers"`` - list of ``(label, sentence)`` tuples where label is
        ``"positive"`` or ``"negative"``, in file order.

    Raises:
        Exception: If the first non-blank line does not start with
            ``<QApairs``.
        ValueError: If the file ends before the closing ``</QApairs`` line.
    """
    def read_line():
        # readline() returns "" only at EOF; without this check a truncated
        # file would make the loop below spin forever.
        raw = infile.readline()
        if raw == "":
            raise ValueError(
                "Invalid data format: reached end of file inside a <QApairs> group")
        return raw.replace('\t', ' ')

    # Skip blank lines between groups. Only a truly empty read means EOF;
    # a whitespace-only line must not end the whole file early.
    line = infile.readline()
    while line != "" and line.strip() == "":
        line = infile.readline()
    if line == "":
        return None

    line = line.strip()
    if not line.startswith("<QApairs"):
        raise Exception("Invalid data format: {}<-----".format(line))

    question = []
    answers = []
    while not line.strip().startswith("</QApairs"):
        line = read_line()
        tag = line.strip().lower()
        if tag.startswith("<question"):
            line = read_line()
            question.append(line.strip())
        elif tag.startswith("<positive"):
            line = read_line()
            answers.append(("positive", line.strip()))
        elif tag.startswith("<negative"):
            line = read_line()
            answers.append(("negative", line.strip()))

    return {"question": question, "answers": answers}
                

In [9]:
from nltk.tokenize import WhitespaceTokenizer

In [10]:
def extract_vector(sentence, exclude, w2idx, wordvectors):
    """Return the average word vector of ``sentence``.

    The sentence is split on whitespace, every token is lower-cased, and
    tokens contained in ``exclude`` are dropped. The vectors of the
    remaining words that appear in ``w2idx`` are averaged.

    Args:
        sentence: Text to embed.
        exclude: Container of lower-cased tokens to ignore (stopwords,
            punctuation, ...).
        w2idx: Mapping from word to row index in ``wordvectors``.
        wordvectors: 2-D array; row ``w2idx[w]`` is the vector of word ``w``.

    Returns:
        1-D array with ``wordvectors.shape[1]`` entries.

    Raises:
        AssertionError: If no token survives filtering, or if the averaged
            vector is all zeros (e.g. none of the words is in ``w2idx``).
            An all-zero vector would make the cosine computation divide
            by zero downstream.
    """
    # Plain str.split() splits on runs of whitespace, which is what NLTK's
    # WhitespaceTokenizer does (its documentation recommends split()).
    lowered = [t.lower() for t in sentence.split()]
    # Remove stopwords and punctuation
    words = [w for w in lowered if w not in exclude]

    # If we cannot find any words, we can consider returning a vector of 0
    # and set the resulting cosine similarity to 0 otherwise will result in nan
    # because cosine similarity will divide by 0.
    assert len(words) > 0, "No usable tokens in sentence: {!r}".format(sentence)

    # Average words in sentence that are in word matrix
    try:
        known = [wordvectors[w2idx[w]] for w in words if w in w2idx]
        # Fall back to a single zero vector so np.mean has something to
        # average; the check below then reports the problem.
        avg_vec = np.mean(known or [np.zeros(wordvectors.shape[1])], axis=0)
        if not np.any(avg_vec):
            print("Tokens cannot be found: {}".format(words))
        assert np.any(avg_vec), "Averaged word vector is all zeros"
        return avg_vec
    except UnicodeDecodeError:
        # Report the offending sentence. The original printed `line`, which
        # is not defined in this function.
        print(sentence.strip())
        raise
    

In [17]:
import scipy as sp
# `import scipy` alone does not guarantee that the `spatial` subpackage is
# loaded on every SciPy version, so import the function explicitly.
from scipy.spatial.distance import cosine

# For every question group in the training file, write one
# "<label>,<cosine distance>" line per candidate answer.
with open("../data/answerSelectionExperiments/data/train-less-than-40.xml") as infile, \
        open("../data/features/glove_embedding_sentence_similarities_train.txt", "w") as outfile:
    num_questions = 0
    while True:
        scores = []
        group = get_QA_group(infile)

        # Check for EOF
        if group is None:
            break

        question = group["question"]
        qvec = extract_vector(question[0], words_to_exclude, w2idx, vectors)

        print("Question {}".format(num_questions))
        scores_by_label = {"positive": list(), "negative": list()}
        for (label, sentence) in group["answers"]:
            vec = extract_vector(sentence, words_to_exclude, w2idx, vectors)
            # Cosine distance (1 - cosine similarity) between question and answer
            cosine_distance = cosine(qvec, vec)
            print("Label: {} Cosine distance: {}".format(label, cosine_distance))
            scores.append(cosine_distance)
            scores_by_label[label].append(cosine_distance)

            outfile.write("{},{}\n".format(label, cosine_distance))

        # np.std([]) is nan but also emits a RuntimeWarning; a group may
        # have no positive (or no negative) answers, so guard empty lists.
        nan = float("nan")
        print("std: {}".format(np.std(scores) if scores else nan))
        print("std pos: {}".format(
            np.std(scores_by_label["positive"]) if scores_by_label["positive"] else nan))
        print("std neg: {}".format(
            np.std(scores_by_label["negative"]) if scores_by_label["negative"] else nan))

        num_questions += 1

Question 0
Label: positive Cosine distance: 0.307530843772
Label: positive Cosine distance: 0.400054167038
Label: positive Cosine distance: 0.225623698306
Label: positive Cosine distance: 0.204485827907
Label: negative Cosine distance: 0.333107006727
Label: negative Cosine distance: 0.527497039815
Label: negative Cosine distance: 0.368540450717
Label: negative Cosine distance: 0.431930026897
Label: negative Cosine distance: 0.357919126921
Label: negative Cosine distance: 0.286774938903
Label: negative Cosine distance: 0.404980974908
Label: negative Cosine distance: 0.38868424424
Label: negative Cosine distance: 0.382799792902
Label: negative Cosine distance: 0.35066226568
Label: negative Cosine distance: 0.328359462335
Label: negative Cosine distance: 0.321998980457
Label: negative Cosine distance: 0.391966491489
Label: negative Cosine distance: 0.511492385671
Label: negative Cosine distance: 0.284787930234
Label: negative Cosine distance: 0.300057187777
Label: negative Cosine distance

In [18]:
import scipy as sp
# `import scipy` alone does not guarantee that the `spatial` subpackage is
# loaded on every SciPy version, so import the function explicitly.
from scipy.spatial.distance import cosine

# For every question group in the dev file, write one
# "<label>,<cosine distance>" line per candidate answer.
with open("../data/answerSelectionExperiments/data/dev-less-than-40.xml") as infile, \
        open("../data/features/glove_embedding_sentence_similarities_dev.txt", "w") as outfile:
    num_questions = 0
    while True:
        scores = []
        group = get_QA_group(infile)

        # Check for EOF
        if group is None:
            break

        question = group["question"]
        qvec = extract_vector(question[0], words_to_exclude, w2idx, vectors)

        print("Question {}".format(num_questions))
        scores_by_label = {"positive": list(), "negative": list()}
        for (label, sentence) in group["answers"]:
            vec = extract_vector(sentence, words_to_exclude, w2idx, vectors)
            # Cosine distance (1 - cosine similarity) between question and answer
            cosine_distance = cosine(qvec, vec)
            print("Label: {} Cosine distance: {}".format(label, cosine_distance))
            scores.append(cosine_distance)
            scores_by_label[label].append(cosine_distance)

            outfile.write("{},{}\n".format(label, cosine_distance))

        # np.std([]) is nan but also emits a RuntimeWarning; a group may
        # have no positive (or no negative) answers, so guard empty lists.
        nan = float("nan")
        print("std: {}".format(np.std(scores) if scores else nan))
        print("std pos: {}".format(
            np.std(scores_by_label["positive"]) if scores_by_label["positive"] else nan))
        print("std neg: {}".format(
            np.std(scores_by_label["negative"]) if scores_by_label["negative"] else nan))

        num_questions += 1

Question 0
Label: negative Cosine distance: 0.404847033372
Label: negative Cosine distance: 0.361319169162
Label: negative Cosine distance: 0.357262425965
Label: negative Cosine distance: 0.344851415224
Label: negative Cosine distance: 0.307754112551
Label: negative Cosine distance: 0.50350357202
Label: negative Cosine distance: 0.39133158553
Label: negative Cosine distance: 0.435430157228
std: 0.0568361944825
std pos: nan
std neg: 0.0568361944825
Question 1
Label: positive Cosine distance: 0.437965546501
Label: negative Cosine distance: 0.696477275108
Label: negative Cosine distance: 0.466853281142
Label: negative Cosine distance: 0.609482938777
Label: negative Cosine distance: 0.620044212533
Label: negative Cosine distance: 0.45802074677
Label: negative Cosine distance: 0.714234633149
Label: negative Cosine distance: 0.660924829807
Label: negative Cosine distance: 0.61235445271
Label: negative Cosine distance: 0.718799673215
Label: negative Cosine distance: 0.695751460999
Label: nega