# CS1671 Machine Comprehension Project
### Jacob Emmerson
11/30/23

In [92]:
import pandas as pd
import numpy as np
import re # regular expression
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import editdistance as ed


In [2]:
# Column names applied to the raw story TSV: three leading fields, then for each
# of the four questions the question text followed by its four answer options.
columns = ['story_id', 'author', 'story'] + [
    name
    for q in ('q1', 'q2', 'q3', 'q4')
    for name in (q, *(f'{q}_a{i}' for i in range(1, 5)))
]

In [3]:
def answer_to_dict(row, question):
    """Gather the four answer options of one question into a letter-keyed dict.

    `row` is indexed with the keys ``f'{question}_a1'`` through
    ``f'{question}_a4'``; the values found there are returned under the keys
    'A', 'B', 'C' and 'D' respectively.
    """
    return {
        letter: row[f'{question}_a{position}']
        for position, letter in enumerate('ABCD', start=1)
    }

In [4]:
def load_answers(path):
    """Read a tab-separated answer file that has no header row.

    Returns the parsed DataFrame; because no header is read, pandas labels the
    columns with integers starting at 0.
    """
    read_options = {'sep': '\t', 'header': None}
    return pd.read_csv(path, **read_options)

In [5]:
def load_stories(path):
    """Load a tab-separated story file and fold each question's options into a dict.

    The file is read without a header using the module-level `columns` names and
    the 'author' column is discarded. For every question q1..q4 a new column
    ``<q>_a`` is added holding the dict built by `answer_to_dict`, and the four
    individual option columns are removed.

    Returns the resulting frame TRANSPOSED, so each story is a column; callers
    that want one row per story must transpose it back.
    """
    frame = pd.read_csv(path, sep = '\t', names = columns, header=None).drop(columns = ['author'])

    for q in ('q1', 'q2', 'q3', 'q4'):
        option_columns = [f'{q}_a{i}' for i in range(1, 5)]
        # Build the combined column first, then drop the originals it was built from.
        frame[f'{q}_a'] = frame.apply(answer_to_dict, axis = 1, question = q)
        frame = frame.drop(columns = option_columns)

    return frame.T

In [6]:
# Load the training stories. load_stories returns the frame transposed
# (one column per story), which is how the rest of the notebook iterates it.
train_stories = load_stories('data/mc500.train.tsv')
# Transpose back to one row per story just to preview the first two stories.
train_stories.T.head(2)

Unnamed: 0,story_id,story,q1,q2,q3,q4,q1_a,q2_a,q3_a,q4_a
0,mc500.train.0,Alyssa got to the beach after a long trip. She...,one: What city is Alyssa in?,one: Why did Alyssa go to Miami?,multiple: How many friends does Alyssa have?,multiple: What did Alyssa eat at the restaurant?,"{'A': 'trip', 'B': 'Miami', 'C': 'Atlanta', 'D...","{'A': 'swim', 'B': 'travel', 'C': 'visit frien...","{'A': '1', 'B': '2', 'C': '3', 'D': '4'}","{'A': 'steak', 'B': 'soup', 'C': 'salad', 'D':..."
1,mc500.train.1,"One morning, Elena woke up, much like she did ...",multiple: What is the very first thing Elena d...,one: At what time of day does this story take ...,multiple: What happened to Mr. Fish in the end?,multiple: What was missing from the fish bowl?,"{'A': 'she says hello to the tree', 'B': 'she ...","{'A': 'Before the tree', 'B': 'At the end of t...","{'A': 'He got put back in the bowl, but he was...","{'A': 'the rocks', 'B': 'Mr. Fish', 'C': 'the ..."


In [7]:
# Load the answer key for the training stories (headerless TSV, integer column labels).
train_answers = load_answers('data/mc500.train.ans')
# Preview the first rows of the key.
train_answers.head()

Unnamed: 0,0,1,2,3
0,B,C,C,D
1,B,D,D,B
2,D,D,D,B
3,A,B,C,C
4,C,C,D,A


In [8]:
def create_td_mat(story):
    """Build a term-by-sentence count matrix for a single story.

    Sentences are obtained by splitting `story` on '.', '!' or '?' followed by a
    space; terms are the lower-cased pieces obtained by additionally splitting
    on single spaces. The vocabulary is built per story to keep the matrix small
    and dense.

    Returns a tuple ``(td_mat, vocab)`` where `td_mat` has one row per vocabulary
    term and one column per sentence, and `vocab` is the set of terms. Row
    indices follow the iteration order of `vocab`, so the same set object must
    be used to map terms back to rows.
    """
    sentences = re.split('[.!?] ', story)
    vocab = set(re.split('[.!?] | ', story.lower()))
    row_of = {term: row for row, term in enumerate(vocab)}  # term -> row index
    td_mat = np.zeros(shape = (len(vocab), len(sentences)))

    for col, sentence in enumerate(sentences):
        for term in sentence.lower().split(' '):
            td_mat[row_of[term], col] += 1

    return td_mat, vocab

In [9]:
def convert_question(question, vocab):
    """Turn a piece of text into a term-count vector over `vocab`.

    `question` is lower-cased and split on single spaces. Each piece that is in
    `vocab` increments the matching entry; pieces not in `vocab` are ignored.

    Term indices follow the iteration order of `vocab`. To line up with the rows
    of a matrix from `create_td_mat`, pass the very same `vocab` object that
    function returned.

    Returns a 1-D float array of length ``len(vocab)``.
    """
    vtoi = {term: index for index, term in enumerate(vocab)} # vocab to index
    q = np.zeros(shape = len(vocab))

    for w in question.lower().split(' '):
        # Explicit lookup instead of a bare `except:`; only out-of-vocabulary
        # words are skipped, every other error is allowed to surface.
        index = vtoi.get(w)
        if index is not None:
            q[index] += 1

    return q

In [10]:
def cosine_sim(vector1, vector2):
    """Return the cosine similarity of two 1-D vectors.

    If either vector has zero length the result is 0 instead of dividing by
    zero (this happens for all-zero vectors, e.g. text with no in-vocabulary
    terms).
    """
    norm1 = np.sqrt(vector1.dot(vector1))
    norm2 = np.sqrt(vector2.dot(vector2))

    # Prevent division by zero for all-zero vectors.
    if not norm1 or not norm2:
        return 0

    inner = vector1.dot(vector2)
    return inner / (norm1 * norm2)

In [11]:
def sort_by_column(matrix, col_index, ascending = False):
    """Return the rows of `matrix` reordered by the values in column `col_index`.

    Rows are ordered from largest to smallest by default; pass
    ``ascending=True`` for smallest to largest. The input array is not modified.
    """
    # quicksort is not stable; not a concern for continuous scores.
    order = matrix[:, col_index].argsort(kind = "quicksort")
    if not ascending:
        order = order[::-1]
    return matrix[order]

In [12]:
def rank_sentences(matrix, question):
    """Rank the columns of `matrix` by cosine similarity to `question`.

    `matrix` holds one sentence per column and `question` is a vector over the
    same rows. Each column is scored with `cosine_sim` and the scores are
    ordered from most to least similar using `sort_by_column`.

    Returns a tuple ``(indices, scores)``: the column indices as int32 in ranked
    order, and the matching similarity scores. Both are empty when `matrix` has
    no columns.
    """
    n_sentences = matrix.shape[1]

    # Nothing to rank; previously this path hit an unbound local variable.
    if n_sentences == 0:
        return np.array([], dtype = np.int32), np.array([])

    # Build all (index, score) pairs in one go instead of growing the array
    # with np.append, which copies the whole array on every iteration.
    sims = np.array([
        [col, cosine_sim(question, matrix[:, col])]
        for col in range(n_sentences)
    ])

    sims = sort_by_column(sims, 1)

    return sims[:,0].astype(np.int32), sims[:,1]

In [114]:
def preprocess_text(text):
    """Normalise text by dropping stop words and stemming what remains.

    Only the part of `text` after its last ': ' is kept (questions carry a
    prefix before ': '). That part is tokenized with `word_tokenize`; tokens
    whose lower-cased form is an English stop word or one of '!', '?', '.',
    "'s" are discarded, and the rest are Porter-stemmed.

    Returns the stemmed tokens joined by single spaces.
    """
    body = text.split(': ')[-1] # for questions
    ignored = set(stopwords.words('english')) | set(['!','?','.',"'s"])
    porter = PorterStemmer()

    stems = []
    for token in word_tokenize(body):
        if token.lower() in ignored:
            continue
        stems.append(porter.stem(token))

    return ' '.join(stems)

def create_hypothesis(question, candidate_answers):
    """Pair the preprocessed question with each preprocessed candidate answer.

    `candidate_answers` maps an answer label to its text. The result maps each
    label to the string "<processed question> <processed answer>", where both
    parts come from `preprocess_text`.
    """
    question_part = preprocess_text(question)

    return {
        label: f'{question_part} {preprocess_text(option)}'
        for label, option in candidate_answers.items()
    }

---
## Baseline

In [115]:
def top_hypothesis_match(story_data, correct_answers = None):
    """Baseline answer picker based on bag-of-words cosine similarity.

    `story_data` is iterated by key and each entry is indexed with 'story',
    'q1'..'q4' and 'q1_a'..'q4_a' (the transposed frame from `load_stories`
    has this layout). For each question:

    1. the sentence (matrix column) most similar to the question is located;
    2. that column and up to two neighbours on each side are blended with
       weights 1:2:5:2:1 (normalised to sum to 1), clamping at the matrix edges;
    3. the answer whose preprocessed text is most similar to the blend wins.
       Ties go to the later answer because the comparison is ``>=``.

    `correct_answers` is accepted but not used.

    Returns a DataFrame with one row per story and one column per question.

    NOTE(review): the passage goes through `preprocess_text` before
    `create_td_mat`. That step appears to drop '.', '!' and '?' tokens, so the
    sentence split may yield very few columns. Confirm this is intended.
    """
    # Neighbour weights are the same for every question, so build them once.
    weights = np.array([1,2,5,2,1], dtype = np.float64)
    weights /= weights.sum()

    predictions = {}

    for story_id in story_data:
        record = story_data[story_id]
        td_mat, vocab = create_td_mat(preprocess_text(record['story']))
        last_col = td_mat.shape[1] - 1

        picks = []
        for q_key in ('q1', 'q2', 'q3', 'q4'):
            q_text = record[q_key].split(': ')[1] # question without prefix

            # Index of the sentence that best matches the question.
            order, _ = rank_sentences(td_mat, convert_question(q_text, vocab))
            center = order[0]

            # Five-column window around the best sentence, clamped to valid columns.
            window = [min(max(center + offset, 0), last_col) for offset in range(-2, 3)]
            blended = (td_mat[:, window] * weights).sum(axis = 1)

            choice, top_sim = 0, 0
            for label, option in record[f'{q_key}_a'].items():
                sim = cosine_sim(blended, convert_question(preprocess_text(option), vocab))
                if sim >= top_sim:
                    choice, top_sim = label, sim

            picks.append(choice)

        predictions[story_id] = picks

    return pd.DataFrame(predictions).T

In [116]:
def get_accuracy(pred_df, answer_df):
    """Return the fraction of predictions that equal the answer key.

    `pred_df` and `answer_df` are compared element-wise with ``==``, so they
    must carry identical row and column labels. The number of matching cells is
    divided by the total number of cells in `answer_df`.
    """
    n_questions = answer_df.shape[0] * answer_df.shape[1]

    # Boolean frame of per-cell matches. Summing it counts the True cells
    # directly, so there is no need to overwrite True/False with 1/0 through
    # masked assignment (which writes ints into a boolean frame).
    matches = answer_df == pred_df

    return matches.to_numpy().sum() / n_questions

In [117]:
# Run the baseline on the training stories (the second argument is not used by
# top_hypothesis_match) and score the predictions against the answer key.
train_preds = top_hypothesis_match(train_stories, train_answers)
get_accuracy(train_preds, train_answers)

0.41

In [16]:
# Load the test stories (returned transposed: one column per story) and predict.
test_stories = load_stories('data/mc500.test.tsv')
test_preds = top_hypothesis_match(test_stories)
# Attach each story's id; transpose back so story_id is a column aligned by row index.
test_preds['story_id'] = test_stories.T['story_id']
test_preds

Unnamed: 0,0,1,2,3,story_id
0,D,D,D,C,mc500.test.0
1,D,B,A,D,mc500.test.1
2,B,C,D,D,mc500.test.2
3,D,D,A,C,mc500.test.3
4,A,C,D,A,mc500.test.4
...,...,...,...,...,...
145,C,D,D,D,mc500.test.145
146,B,D,C,B,mc500.test.146
147,C,D,D,D,mc500.test.147
148,D,A,B,B,mc500.test.148


In [17]:
# Reshape wide predictions (one column per question) into one row per (story, question).
temp = test_preds.melt(id_vars = ['story_id'])
# Question columns are labelled 0..3; shift to 1..4 for the id suffix.
temp['variable'] += 1
# Build the per-question id "<story_id>.<question number>".
temp['story_id'] = temp['story_id'] + '.' + temp['variable'].astype(str)
# Sort key: the first ".<digits>" group in the id, i.e. the story number.
temp['sort'] = temp['story_id'].str.extract(r'\.(\d+)').astype(int)
# Stable sort keeps the melt order (question 1..4) within each story; then tidy the columns.
temp = temp.sort_values('sort', kind = 'stable').drop(columns=['sort','variable']).rename(columns={'story_id' : 'id', 'value' : 'answer'})
temp

Unnamed: 0,id,answer
0,mc500.test.0.1,D
150,mc500.test.0.2,D
300,mc500.test.0.3,D
450,mc500.test.0.4,C
1,mc500.test.1.1,D
...,...,...
598,mc500.test.148.4,B
149,mc500.test.149.1,C
299,mc500.test.149.2,D
449,mc500.test.149.3,C


In [74]:
#temp.to_csv('./test_answers.csv', sep = ',', header = True, index = False)

---
## Hierarchical Entailment Model

In [111]:
# Edit-distance model over the training stories: for every question, each
# "question + answer" hypothesis is compared with every sentence of the story,
# and the answer with the smallest summed distance to its three closest
# sentences is chosen.
story_answers = {}
for story_id in train_stories:
    story = train_stories[story_id]
    passage = story['story'] # text

    # Preprocess each sentence once per story. The result does not depend on the
    # question or hypothesis, so recomputing it inside the inner loops (16 times
    # per sentence) was redundant work.
    processed_sentences = [preprocess_text(sentence) for sentence in passage.split('. ')]

    ans = []
    for question in ['q1','q2','q3','q4']:
        hypotheses = create_hypothesis(story[question],story[f'{question}_a'])

        best = (0,np.inf)
        # check each hypothesis against the text
        for num, hyp in hypotheses.items():
            dists = [ed.eval(processed_sentence, hyp) for processed_sentence in processed_sentences]

            # Sum of the three smallest distances (fewer if the story has fewer sentences).
            score = np.sort(dists)[:3].sum()
            # Strict '<' keeps the earliest answer on ties.
            if score < best[1]:
                best = (num, score)

        ans.append(best[0])

    story_answers[story_id] = ans

pred_df = pd.DataFrame(story_answers).T

In [112]:
# Score the edit-distance model's training predictions against the answer key.
get_accuracy(pred_df, train_answers)

0.25916666666666666