# CS1671 Machine Comprehension Project
### Jacob Emmerson
11/30/23

In [1]:
import pandas as pd
import numpy as np
import re # regular expression
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import editdistance as ed
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
columns = [
    'story_id',
    'author',
    'story',
    'q1',
    'q1_a1',
    'q1_a2',
    'q1_a3',
    'q1_a4',
    'q2',
    'q2_a1',
    'q2_a2',
    'q2_a3',
    'q2_a4',
    'q3',
    'q3_a1',
    'q3_a2',
    'q3_a3',
    'q3_a4',
    'q4',
    'q4_a1',
    'q4_a2',
    'q4_a3',
    'q4_a4',
]

In [3]:
def standardize(df):
    stand = df.copy()
    for col in stand:
        if col.lower() == 'y': continue
        
        mu = stand[col].mean()
        std = stand[col].std()
        stand[col] = (stand[col]-mu)/std
    return stand

def answer_to_dict(row, question):
    return {'A': row[f'{question}_a1'], 'B': row[f'{question}_a2'], 'C': row[f'{question}_a3'], 'D': row[f'{question}_a4']}

def load_answers(path):
    return pd.read_csv(path, sep = '\t',  header = None)

def load_stories(path):
    stories = pd.read_csv(path, sep = '\t', names = columns, header=None)
    stories = stories.drop(columns = ['author'])
    for q in ['q1', 'q2', 'q3', 'q4']:
        stories[f'{q}_a'] = stories.apply(answer_to_dict, axis = 1, question = q)
        stories = stories.drop(columns=[f'{q}_a1',f'{q}_a2',f'{q}_a3',f'{q}_a4'])

    return stories.T

In [4]:
train_stories = load_stories('data/mc500.train.tsv')
train_stories.T.head(2)

Unnamed: 0,story_id,story,q1,q2,q3,q4,q1_a,q2_a,q3_a,q4_a
0,mc500.train.0,Alyssa got to the beach after a long trip. She...,one: What city is Alyssa in?,one: Why did Alyssa go to Miami?,multiple: How many friends does Alyssa have?,multiple: What did Alyssa eat at the restaurant?,"{'A': 'trip', 'B': 'Miami', 'C': 'Atlanta', 'D...","{'A': 'swim', 'B': 'travel', 'C': 'visit frien...","{'A': '1', 'B': '2', 'C': '3', 'D': '4'}","{'A': 'steak', 'B': 'soup', 'C': 'salad', 'D':..."
1,mc500.train.1,"One morning, Elena woke up, much like she did ...",multiple: What is the very first thing Elena d...,one: At what time of day does this story take ...,multiple: What happened to Mr. Fish in the end?,multiple: What was missing from the fish bowl?,"{'A': 'she says hello to the tree', 'B': 'she ...","{'A': 'Before the tree', 'B': 'At the end of t...","{'A': 'He got put back in the bowl, but he was...","{'A': 'the rocks', 'B': 'Mr. Fish', 'C': 'the ..."


In [5]:
train_answers = load_answers('data/mc500.train.ans')
train_answers.head()

Unnamed: 0,0,1,2,3
0,B,C,C,D
1,B,D,D,B
2,D,D,D,B
3,A,B,C,C
4,C,C,D,A


In [6]:
test_stories = load_stories('data/mc500.test.tsv')
test_stories.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
story_id,mc500.test.0,mc500.test.1,mc500.test.2,mc500.test.3,mc500.test.4,mc500.test.5,mc500.test.6,mc500.test.7,mc500.test.8,mc500.test.9,...,mc500.test.140,mc500.test.141,mc500.test.142,mc500.test.143,mc500.test.144,mc500.test.145,mc500.test.146,mc500.test.147,mc500.test.148,mc500.test.149
story,It was Sally's birthday. She was very excited....,On the farm there was a little piggy named And...,A little boy named Justin lived on a farm. Hi...,Jamie was on the playground. As he was running...,Once there was a beautiful fish named Asta. As...,Derek was sad. He was playing in the school ya...,I have a friend who is a princess of the piggi...,There was once a group of kittens who all like...,The sound of the women washing laundry down at...,Billy and Sally are brother and sister. Billy ...,...,The dinosaur wants to have a pet goldfish. He ...,Sue was going to make her auntie Wendy toast. ...,John was very hungry. So he chose to make a s...,A man named Albert had the choice to build a t...,A man got fired from his job. He was in such ...,Anna's parents told her they were going to hav...,One day a man named John was walking down the ...,All of Larry's friends have a favorite animal....,Greg and his mother were building a racing car...,Joey went to a baseball game during the winter...
q1,one: What time did the party start?,multiple: What did the piggies do when Andy go...,multiple: Why was Justin's father proud of him?,one: What else happened to Jamie along with br...,multiple: How did Asta and Sharkie become frie...,multiple: Who was being mean to Derek?,one: What does my friends want to be?,one: Which kitten was the leader?,one: What woke Lizzie up?,multiple: Billy has blonde hair like his mothe...,...,multiple: Which fish did the pet store have fo...,multiple: What did Sue put on the tray?,multiple: What did Susan put down?,one: How long did it take Albert to build the ...,one: What did the man put on his pancakes?,multiple: Why did Anna buy a rattle?,multiple: Why did John pick up the telephone?,one: What is Larry's favorite animal?,one: What day was the race?,multiple: Who went to the baseball game and wi...
q2,multiple: Who got hurt at the party?,multiple: What did Andy see on his walk?,one: What was Justin's favorite animal?,one: Where did Jamie get his watch from?,multiple: Why did they take the note to Asta's...,one: Who called the police?,multiple: What is my friend's favorite kind of...,multiple: What did Fluffy want to do the first...,multiple: Who does Lizzie live with?,multiple: When Billy and Sally answer in a lou...,...,multiple: Why did the worker put the fish in a...,multiple: Where did Sue look for Auntie Wendy?,one: What did Tim put down?,multiple: What did Albert make the tree house ...,multiple: Why was the baby crying?,one: Where did the crib come from?,multiple: What did John do on Otonga?,one: What is a panda's favorite food?,one: What number did Greg put on his racing car?,one: what kind of store did Joey turn into?
q3,one: Whose birthday is it?,one: What did Andy love to roll around in?,one: What was Justin's father's name?,one: What was the name of the child who had th...,one: What did the bottle look like?,multiple: Where was Derek when the stranger wa...,multiple: What doesn't my piggy princess frien...,multiple: What did Larry think was a bad idea?,multiple: Where does Lizzie write her letter?,one: What is the name of the mother?,...,multiple: Why is the dinosaur reading his book?,multiple: What did Sue put on the toast?,multiple: What did John put down?,multiple: What did Albert choose to build?,multiple: What made the man forget about being...,one: What did Anna's dad say he would do to a ...,one: What did Jill do?,multiple: Which of the following did Larry not...,multiple: How did Greg's father help with the ...,multiple: Which team won the game Joey went to...


In [7]:
def create_td_mat(story):
    sentences = re.split('[.!?] ', story)
    vocab = set(re.split('[.!?] | ', story.lower())) # different vocab for each story to prevent matrix from getting too sparse (and save memory)
    vtoi = dict(zip(vocab, range(len(vocab)))) # vocab to index
    td_mat = np.zeros(shape = (len(vocab), len(sentences))) # initialize term-document matrix

    for s in range(len(sentences)):
        words = sentences[s].lower().split(' ')
        s_index = s

        for w in words:
            w_index = vtoi[w]
            td_mat[w_index, s_index] += 1

    return td_mat, vocab

def convert_question(question, vocab):
    vtoi = dict(zip(vocab, range(len(vocab)))) # vocab to index
    q = np.zeros(shape = len(vocab))

    for w in question.lower().split(' '):
        try:
            q[vtoi[w]] += 1
        except:
            continue
        
    return q

In [8]:
def cosine_sim(vector1, vector2):
    len1 = np.sqrt(vector1.dot(vector1))
    len2 = np.sqrt(vector2.dot(vector2))

    if len1 == 0 or len2 == 0: # prevent divison by zero; caused by frequent terms using tf-idf or infrequent words in tc matrix
        return 0

    return vector1.dot(vector2)/(len1 * len2)

def unigram_match(bag1, bag2):
    # normalized by bag1
    matches = 0
    for gram in bag1:
        if gram in bag2: matches += 1

    return matches / len(bag1)

def lev_similarity(passage, hypothesis):
    scores = np.array([ed.eval(x, hypothesis)/max(len(x),len(hypothesis)) for x in passage.split(' ')])
    return np.median(scores)

def jaccard_similarity(set1, set2):
    cap = set1.intersection(set2)
    cup = set1.union(set2)
    return len(cap)/len(cup)

In [9]:
def sort_by_column(matrix, col_index, ascending = False):
    mat = matrix
    if ascending:
        a = 1
    else:
        a = -1
    # uses quicksort (not stable), non issue with continuous data
    return mat[mat[:,col_index].argsort(kind = "quicksort")[::a]]

def rank_sentences(matrix, question):
    q = question
    mat = matrix
    ncols = mat.shape[1] # match sentences to the question

    for r in range(ncols):
        s = cosine_sim(q, mat[:,r])

        if r == 0:
            sims = np.array([[r,s]])

        else:
            sims = np.append(sims, [[r,s]], axis = 0)

    sims = sort_by_column(sims, 1)

    return sims[:,0].astype(np.int32), sims[:,1]

In [88]:
def preprocess_text(text):
    #stop_words = set(stopwords.words('english'))
    num_to_word = {
        '1' : 'one',
        '2' : 'two',
        '3' : 'three',
        '4' : 'four',
        '5' : 'five',
        '6' : 'six',
        '7' : 'seven',
        '8' : 'eight',
        '9' : 'nine',
        '0' : 'zero',
    }

    questions = [
        'who', 'what', 'why', 'when', 'did', 'how many', 'where', '...', '?'
    ]

    sentence = text.split(': ')[-1] # for questions
    stemmer = PorterStemmer()

    # tokeize
    tokens = word_tokenize(sentence)
    qtype = tokens[0]
    tokens = [token for token in tokens if token.lower() not in questions]
    tokens = [stemmer.stem(token) if token not in num_to_word else num_to_word[token] for token in tokens] #if not in stop_words]

    # return string
    return ' '.join(tokens), qtype

def create_hypothesis(question, candidate_answers):
    # Preprocess question and candidate answer
    processed_question, qtype = preprocess_text(question)

    hyp = {}
    for num,answer in candidate_answers.items():
        processed_answer = preprocess_text(answer)[0]
        if qtype == 'who':
            hyp[num] = f'{processed_answer} {processed_answer}'
        else:
            hyp[num] = f'{processed_question} {processed_answer}'
    
    # Formulate hypothesis

    return hyp

In [11]:
def get_accuracy(pred_df, answer_df):

    N_questions = answer_df.shape[0] * answer_df.shape[1]
    pred_df = answer_df == pred_df
    pred_df[pred_df == False] = 0
    pred_df[pred_df == True] = 1
    accuracy = pred_df.values.sum() / N_questions

    return accuracy

---
## Baseline

In [12]:
class THM_classifier():
    def __init__(self, story_data, weights = [1,1,1,1,1]):
        self.data = story_data
        self.lambdas = weights

    def predict(self):
        story_answers = {}
        story_data = self.data

        # for every story
        for story_id in story_data:

            # extract column
            story = story_data[story_id]

            # convert text into a term-document matrix and vocabulary set
            # each 'document' is a sentence
            passage = preprocess_text(story['story'])[0]
            td_mat, vocab = create_td_mat(passage)

            # len(ans) = 4 where i-th index corresponds to i+1 question
            ans = []
            for question in ['q1','q2','q3','q4']:

                # weights
                lambdas = np.array(self.lambdas, dtype = np.float64)
                lambdas /= lambdas.sum()
                
                query = story[question]
                q = preprocess_text(query)[0]

                # rank index and scores
                r_index, _ = rank_sentences(td_mat, convert_question(q, vocab))

                # create a combination of the window +/- 2 around the top sentence
                top_index = r_index[0]
                comb = td_mat[:,[
                    max(0, top_index-2),
                    max(0, top_index-1),
                    top_index,
                    min(td_mat.shape[1]-1, top_index + 1),
                    min(td_mat.shape[1]-1, top_index + 2)
                ]]
                comb = (comb * lambdas).sum(axis = 1)

                candidate_answers = story[f'{question}_a']
                best = (0,0)
                for num,answer in candidate_answers.items():
                    sim = cosine_sim(comb, convert_question(preprocess_text(answer)[0], vocab))
                    if sim >= best[1]:
                        best = (num, sim)

                # use the answer with the highest similarity to the combined window
                ans.append(best[0])

            story_answers[story_id] = ans

        return pd.DataFrame(story_answers).T

In [13]:
thm_baseline = THM_classifier(train_stories, weights = [1,2,5,2,1])
train_preds = thm_baseline.predict()
print(f"Baseline Accuracy on Training Data: {get_accuracy(train_preds, train_answers)}")

Baseline Accuracy on Training Data: 0.455


In [14]:
thm_baseline_test = THM_classifier(test_stories, weights = [1,2,5,2,1])
baseline_test_preds = thm_baseline_test.predict()
baseline_test_preds['story_id'] = test_stories.T['story_id']
baseline_test_preds = baseline_test_preds.melt(id_vars = ['story_id'])

baseline_test_preds['variable'] += 1
baseline_test_preds['story_id'] = baseline_test_preds['story_id'] + '.' + baseline_test_preds['variable'].astype(str)
baseline_test_preds['sort'] = baseline_test_preds['story_id'].str.extract(r'\.(\d+)').astype(int)
baseline_test_preds = baseline_test_preds.sort_values('sort', kind = 'stable').drop(columns=['sort','variable']).rename(columns={'story_id' : 'id', 'value' : 'answer'})
print(baseline_test_preds.head())

#temp.to_csv('./test_answers.csv', sep = ',', header = True, index = False)

                 id answer
0    mc500.test.0.1      D
150  mc500.test.0.2      B
300  mc500.test.0.3      C
450  mc500.test.0.4      C
1    mc500.test.1.1      D


---
## Entailment Model

In [89]:
class RTE_classifier():
    def __init__(self, training_answers, window_size = 3):
        self.win_size = window_size
        self.answers = training_answers

    def create_features(self, text_data):
        cos = []
        jac = []
        tthmatch = []
        httmatch = []
        lev = []
        dot = []
        qtype = []
        neg = []
        
        train_answers = self.answers
        story_data = text_data
        
        for story_id in story_data:
            story = story_data[story_id]
            passage = preprocess_text(story['story'])[0] # text
            td_mat, vocab = create_td_mat(passage)

            # convert each question 
            for question in ['q1','q2','q3','q4']:
                hypotheses = create_hypothesis(story[question],story[f'{question}_a'])

                # check each hypothesis against the text
                for num, hyp in hypotheses.items():
                    hyp_vocab = set(hyp.split(' '))
                    _, r_score = rank_sentences(td_mat, convert_question(hyp, vocab))

                    # BoW
                    cos.append(np.mean(r_score[:self.win_size]))
                    dot.append(convert_question(hyp, vocab).dot(td_mat.sum(axis = 1)))

                    # Sets
                    jac.append(jaccard_similarity(vocab, hyp_vocab))
                    tthmatch.append(unigram_match(vocab, hyp_vocab))
                    httmatch.append(unigram_match(hyp_vocab, vocab))

                    # Edit
                    lev.append(lev_similarity(passage, hyp))
            
                    if 'not' in hyp:
                        neg.append(1)
                    else:
                        neg.append(0)

                    if train_answers.T[story_id][int(question[-1])-1] == num:
                        qtype.append(1)
                    else:
                        qtype.append(0)

        # creating dataframe
        D_mat = pd.DataFrame(
            {
                'x1' : cos,
                'x2' : jac,
                'x3' : lev,
                'x4' : tthmatch,
                'x5' : httmatch,
                'x6' : dot,
                'x7' : neg,
                'y' : qtype
            }
        )

        return standardize(D_mat)
    
    def train(self, dataframe):
        # fits a support vector machine with a linear kernel on the feature set of similarity metrics
        X_mat = dataframe.drop(columns = ['y']).values
        Y_mat = dataframe['y'].values

        param_grid = [
            {
                'C' : np.logspace(-9, 0, num = 10),
                'penalty' : ['l2'],
                'loss' : ['squared_hinge'],
                'dual' : [False],
                'max_iter' : [10000],
                'tol' : [1e-5]
            }
        ]

        lsvc = GridSearchCV(
            LinearSVC(),
            param_grid,
            cv = 10,
            scoring = 'accuracy',
            verbose = 0
        ).fit(
            X_mat,
            Y_mat
        )

        print(lsvc.best_params_)
        self.model = lsvc

    def predict(self, X_matrix):
        lsvc = self.model

        scores = pd.DataFrame( 
            lsvc.decision_function(X_matrix.values).reshape(-4,4),
            columns = ['A', 'B', 'C', 'D']
        )
        scores['best'] = scores.idxmax(axis = 1)
        return scores

In [90]:
rte_model = RTE_classifier(train_answers)
train_df = rte_model.create_features(train_stories)
train_df.head()

TypeError: expected string or bytes-like object, got 'tuple'

In [None]:
rte_model.train(train_df)
train_preds = rte_model.predict(train_df.drop(columns = 'y'))
print(f"RTE Training Accuracy on MC500: {get_accuracy(train_preds['best'].values.reshape(-4,4), train_answers)}")

{'C': 0.0001, 'dual': False, 'loss': 'squared_hinge', 'max_iter': 10000, 'penalty': 'l2', 'tol': 1e-05}
RTE Training Accuracy on MC500: 0.5533333333333333


In [51]:
test_df = rte_model.create_features(test_stories).drop(columns = 'y')
test_preds = rte_model.predict(test_df)
test_preds.head()

Unnamed: 0,A,B,C,D,best
0,-0.371555,-0.375864,-0.376393,-0.36106,D
1,-0.246816,-0.251479,-0.242097,-0.242457,C
2,-0.342345,-0.34161,-0.317052,-0.340935,C
3,-0.295638,-0.306337,-0.298249,-0.306597,A
4,-0.229835,-0.158044,-0.204798,-0.163174,B


In [53]:
pd.DataFrame(
    {
        'id' : baseline_test_preds['id'].values,
        'answer' : test_preds['best'].values
    }
)#.to_csv('./test_answers.csv', sep = ',', header = True, index = False)

Unnamed: 0,id,answer
0,mc500.test.0.1,D
1,mc500.test.0.2,C
2,mc500.test.0.3,C
3,mc500.test.0.4,A
4,mc500.test.1.1,B
...,...,...
595,mc500.test.148.4,B
596,mc500.test.149.1,D
597,mc500.test.149.2,D
598,mc500.test.149.3,D
