# Extract similarity scores between questions and candidate answer sentences

---

In [1]:
import numpy as np

In [2]:
ls ../data/glove_embeddings/

glove.6B.100d.txt         glove.6B.50d.txt          glove.840B.300d.zip.cpgz
glove.6B.200d.txt         glove.6B.zip              train-less-than-40.xml
glove.6B.300d.txt         glove.840B.300d.zip


## Load the GloVe word embedding vectors

In [66]:
# Assign every token in the embedding file its (0-based) line number
ndim = 100
glove_path = "../data/glove_embeddings/glove.6B.100d.txt"
with open(glove_path, "rb") as lines:
    # The first whitespace-separated field on each line is the token itself.
    w2idx = dict(
        (line.split(None, 1)[0].decode("utf-8"), i)
        for i, line in enumerate(lines)
    )

In [67]:
# Load the embedding matrix: row i holds the vector found on line i of the file,
# i.e. the same numbering that w2idx uses.
# `float` replaces the `np.float` alias (removed in NumPy 1.24).
vectors = np.empty((len(w2idx), ndim), dtype=float)
with open(glove_path, "rb") as lines:
    for i, line in enumerate(lines):
        # Build a real list: on Python 3 `map` returns an iterator, which
        # np.asarray would wrap as a 0-d object array instead of ndim floats.
        vectors[i] = np.asarray([float(value) for value in line.split()[1:]])

In [None]:
list(w2idx)[:5]  # First few words in the GloVe vocabulary (dict views cannot be sliced on Python 3)

---

## Extract similarity scores for question–answer pairs

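For each question, get its group of candidate answers, then score each candidate against the question. The score is the cosine distance between the averaged word vectors of the two sentences, so a smaller value means the sentences are more similar.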

In [9]:
import string
from nltk.corpus import stopwords

# Tokens ignored when averaging word vectors: single punctuation characters,
# the two ellipsis spellings, and NLTK's English stop-word list.
words_to_exclude = frozenset(string.punctuation).union(
    ["..", "..."],
    stopwords.words("english"),
)

In [54]:
# Get a question and its candidate answers
def get_QA_group(infile):
    """Read the next ``<QApairs ...> ... </QApairs>`` group from ``infile``.

    The parser is line oriented: the line that follows a ``<question``,
    ``<positive`` or ``<negative`` tag line (matched case-insensitively) is
    taken as that element's text, with tabs replaced by spaces and surrounding
    whitespace stripped.

    Parameters
    ----------
    infile : file-like object
        Open text stream positioned at the start of a group (blank lines
        before the group are skipped).

    Returns
    -------
    dict or None
        ``{"question": [str, ...], "answers": [(label, str), ...]}`` where
        ``label`` is ``"positive"`` or ``"negative"``; ``None`` once the end
        of the stream has been reached.

    Raises
    ------
    Exception
        If the first non-blank line does not start with ``<QApairs``, or if
        the stream ends before the closing ``</QApairs`` line.
    """
    def read_line():
        # readline() returns an empty string only at EOF; without this check a
        # truncated file would make the loop below spin forever.
        raw = infile.readline()
        if not raw:
            raise Exception(
                "Invalid data format: unexpected end of file inside <QApairs> group")
        return raw.replace('\t', ' ')

    # Skip blank separator lines so they are not mistaken for end of file.
    header = infile.readline()
    while header and not header.strip():
        header = infile.readline()
    if not header:
        return None

    line = header.strip()
    if not line.startswith("<QApairs"):
        raise Exception("Invalid data format: {}<-----".format(line))

    question = []
    answers = []
    while not line.strip().startswith("</QApairs"):
        line = read_line()
        tag = line.strip().lower()
        if tag.startswith("<question"):
            line = read_line()
            question.append(line.strip())
        elif tag.startswith("<positive"):
            line = read_line()
            answers.append(("positive", line.strip()))
        elif tag.startswith("<negative"):
            line = read_line()
            answers.append(("negative", line.strip()))

    return {"question": question, "answers": answers}
                

In [6]:
from nltk.tokenize import WhitespaceTokenizer

# Compute the vector for a sentence by averaging the embeddings of its in-vocabulary words
def extract_vector(sentence, exclude, w2idx, wordvectors):
    """Return the mean embedding of the non-excluded words in ``sentence``.

    Parameters
    ----------
    sentence : str
        Raw sentence; it is split on whitespace and every token is lower-cased.
    exclude : container of str
        Lower-cased tokens to drop before averaging (stop words, punctuation).
    w2idx : dict
        Maps a word to its row index in ``wordvectors``.
    wordvectors : numpy.ndarray
        2-D array holding one embedding per row.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wordvectors.shape[1]``.

    Raises
    ------
    AssertionError
        If every token is excluded, or if the averaged vector is all zeros
        (for example when none of the remaining words is in ``w2idx``).
        A zero vector would make the cosine computation divide by zero.
    UnicodeDecodeError
        Re-raised after printing the offending sentence.
    """
    # Tokenize, lower-case once, then remove stopwords and punctuation
    tokens = WhitespaceTokenizer().tokenize(sentence)
    lowered = (token.lower() for token in tokens)
    words = [word for word in lowered if word not in exclude]

    # If we cannot find any words, we could consider returning a zero vector
    # and defining the resulting cosine similarity as 0; for now fail loudly.
    assert len(words) > 0, "No usable tokens in sentence: {!r}".format(sentence)

    try:
        # Average the embeddings of the words present in the vocabulary; fall
        # back to a single zero vector so np.mean never sees an empty list.
        known = [wordvectors[w2idx[word]] for word in words if word in w2idx]
        avg_vec = np.mean(known or [np.zeros(wordvectors.shape[1])], axis=0)
        if not np.any(avg_vec):
            print("Tokens cannot be found: {}".format(words))
        assert np.any(avg_vec)
        return avg_vec
    except UnicodeDecodeError:
        # Show the sentence that triggered the error (the original printed
        # `line`, which is not defined inside this function).
        print(sentence.strip())
        raise
    

In [70]:
import scipy as sp

# Compute the scores/features for a dataset
def generate_similarity_scores(input_file, output_file):
    """Score every candidate answer in ``input_file`` against its question.

    For each group returned by ``get_QA_group`` the first question sentence
    and each candidate answer are turned into averaged word vectors with
    ``extract_vector`` (using the module-level ``words_to_exclude``, ``w2idx``
    and ``vectors``), and the cosine distance between the two vectors is
    computed.

    Parameters
    ----------
    input_file : str
        Path of the QA-pairs file to read.
    output_file : str
        Path of the text file to write; it receives one
        ``label,cosine_distance`` line per candidate answer, in input order.

    Returns
    -------
    None
        Progress is printed and results are written to ``output_file``.
    """
    # Import the function explicitly: `import scipy as sp` alone does not
    # guarantee that the `spatial` subpackage has been loaded.
    from scipy.spatial.distance import cosine

    with open(input_file) as infile, \
         open(output_file, 'w') as outfile:
        num_questions = 0
        while True:
            group = get_QA_group(infile)

            # Check for EOF before announcing a question that does not exist
            if group is None:
                break
            print("Question {}".format(num_questions))

            # Extract question vector
            qvec = extract_vector(group["question"][0], words_to_exclude, w2idx, vectors)

            for label, sentence in group["answers"]:
                # Compute distance between the answer and question vectors
                vec = extract_vector(sentence, words_to_exclude, w2idx, vectors)
                cosine_distance = cosine(qvec, vec)
                print("Label: {} Cosine distance: {}".format(label, cosine_distance))
                outfile.write("{},{}\n".format(label, cosine_distance))

            num_questions += 1

## Generate similarity scores for training data

In [71]:
# Score every question/candidate pair in the training file.
# generate_similarity_scores writes one "label,cosine_distance" line per
# candidate answer to output_file and prints the same values as it goes.
input_file = "../data/answerSelectionExperiments/data/train-less-than-40.xml"
output_file = "../data/features/glove_embedding_sentence_similarities_train_100.txt"
generate_similarity_scores(input_file, output_file)

Question 0
Label: positive Cosine distance: 0.195214402347
Label: positive Cosine distance: 0.202330703095
Label: positive Cosine distance: 0.13883810479
Label: positive Cosine distance: 0.118467421872
Label: negative Cosine distance: 0.195553795948
Label: negative Cosine distance: 0.317933601625
Label: negative Cosine distance: 0.222631034828
Label: negative Cosine distance: 0.257786314301
Label: negative Cosine distance: 0.226387749827
Label: negative Cosine distance: 0.185870551432
Label: negative Cosine distance: 0.264814087906
Label: negative Cosine distance: 0.292887505113
Label: negative Cosine distance: 0.212908651604
Label: negative Cosine distance: 0.192920231288
Label: negative Cosine distance: 0.201984060402
Label: negative Cosine distance: 0.175320667381
Label: negative Cosine distance: 0.221423231866
Label: negative Cosine distance: 0.358690378091
Label: negative Cosine distance: 0.142140072996
Label: negative Cosine distance: 0.183787367612
Label: negative Cosine distanc

## Generate similarity scores for test data

In [72]:
# Score every question/candidate pair in the test file.
# generate_similarity_scores writes one "label,cosine_distance" line per
# candidate answer to output_file and prints the same values as it goes.
# Note: this rebinds the module-level input_file / output_file names.
input_file = "../data/answerSelectionExperiments/data/test-less-than-40.xml"
output_file = "../data/features/glove_embedding_sentence_similarities_test_100.txt"
generate_similarity_scores(input_file, output_file)

Question 0
Label: positive Cosine distance: 0.271479329713
Label: positive Cosine distance: 0.530470934027
Label: negative Cosine distance: 0.638305460271
Label: negative Cosine distance: 0.60406325203
Label: negative Cosine distance: 0.600676980733
Label: negative Cosine distance: 0.605801869735
Label: negative Cosine distance: 0.571699703262
Label: negative Cosine distance: 0.572383550212
Label: negative Cosine distance: 0.622484530054
Label: negative Cosine distance: 0.616276352558
Question 1
Label: negative Cosine distance: 0.167946004937
Label: negative Cosine distance: 0.186839182078
Question 2
Label: positive Cosine distance: 0.449372366176
Label: positive Cosine distance: 0.156316569882
Label: positive Cosine distance: 0.404600341334
Label: positive Cosine distance: 0.330274164091
Label: negative Cosine distance: 0.529109514165
Label: negative Cosine distance: 0.527353982358
Label: negative Cosine distance: 0.491428231598
Question 3
Label: positive Cosine distance: 0.1305836997

In [16]:
# NOTE(review): neither `num_questions` nor `num_examples` is assigned at module
# level in the cells shown here (`num_questions` only exists as a local inside
# generate_similarity_scores). This cell presumably relies on state from an
# earlier kernel session and may fail after Restart & Run All — TODO confirm.
num_questions, num_examples

(100, 1517)