In [None]:
#A Study on Efficiency, Accuracy and Document Structure for Answer Sentence Selection
import numpy as np
import pandas as pd
QUESTION_LEN = 23
CANDIDATE_LEN = 503

In [None]:
# Mount Google Drive at /content/drive, then make the mount point the working directory.
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/"

Mounted at /content/drive
/content/drive


In [None]:
# Switch to the project data directory (later cells open files relative to it) and list it.
%cd "/content/drive/MyDrive/Colab Notebooks/NLP-Project/data"
%ls

/content/drive/MyDrive/Colab Notebooks/NLP-Project/data
23-50-test_maps.pkl   qc_embeddings_dev_paper1.pkl    test_mrrs.pkl
23-50-test_mrrs.pkl   qc_embeddings_paper1.pkl        train_maps.pkl
23-50-train_maps.pkl  qc_embeddings_test_paper1.pkl   train_mrrs.pkl
23-50-train_mrrs.pkl  qc_embeddings_train_paper1.pkl  [0m[01;34mWikiQACorpus[0m/
numberbatch-en.txt    test_maps.pkl


## ConceptNet Numberbatch embeddings

In [None]:
"""
This Python module provides just the code from the 'conceptnet5' module that
you need to represent terms, possibly with multiple words, as ConceptNet URIs.

It depends on 'wordfreq', a Python 3 library, so it can tokenize multilingual
text consistently: https://pypi.org/project/wordfreq/

Example:

>>> standardized_uri('es', 'ayudar')
'/c/es/ayudar'
>>> standardized_uri('en', 'a test phrase')
'/c/en/test_phrase'
>>> standardized_uri('en', '24 hours')
'/c/en/##_hours'
"""
!pip install wordfreq
import wordfreq
import re


# English-specific stopword handling
# Tokens removed from any position of a tokenized term by english_filter.
STOPWORDS = ['the', 'a', 'an']
# Tokens removed only while they sit at the front of the term (english_filter).
DROP_FIRST = ['to']
# Matches two consecutive ASCII digits; replace_numbers uses it to decide whether to mask digits.
DOUBLE_DIGIT_RE = re.compile(r'[0-9][0-9]')
# Matches one ASCII digit; replace_numbers substitutes '#' for every match.
DIGIT_RE = re.compile(r'[0-9]')


def standardized_uri(language, term):
    """
    Build the row label used for `term` in the embedding vector space.

    'language' is a BCP 47 language code such as 'en'.

    A term that already looks like a ConceptNet URI (starts with '/' and
    contains at least two '/') is kept as is; any other term is first turned
    into a ConceptNet URI for the given language. In both cases the result is
    passed through replace_numbers before being returned.
    """
    already_uri = term.startswith('/') and term.count('/') >= 2
    uri = term if already_uri else _standardized_concept_uri(language, term)
    return replace_numbers(uri)


def english_filter(tokens):
    """
    Strip a small set of English stopwords from a token list.

    Tokens listed in STOPWORDS are removed everywhere; tokens listed in
    DROP_FIRST are removed only while they are at the front of what remains.
    If nothing would be left, the original token list is returned unchanged,
    so e.g. 'an apple' standardizes to 'apple' but 'the' stays 'the'.
    """
    kept = [tok for tok in tokens if tok not in STOPWORDS]
    start = 0
    # Skip every leading DROP_FIRST token.
    while start < len(kept) and kept[start] in DROP_FIRST:
        start += 1
    kept = kept[start:]
    return kept if kept else tokens


def replace_numbers(s):
    """
    Mask digits with '#' in terms that contain at least two consecutive digits.

    Terms without a two-digit run are returned untouched. This mirrors the
    normalization applied to text that passes through word2vec.
    """
    if DOUBLE_DIGIT_RE.search(s) is None:
        return s
    return DIGIT_RE.sub('#', s)


def _standardized_concept_uri(language, term):
    """
    Turn free text into a '/c/<language>/<normalized_text>' URI.

    The English stopword filter is applied only when `language` is exactly
    'en'; the language code is lower-cased before it is placed in the URI.
    """
    token_filter = english_filter if language == 'en' else None
    language_code = language.lower()
    normalized = _standardized_text(term, token_filter)
    return '/c/{}/{}'.format(language_code, normalized)


def _standardized_text(text, token_filter):
    """
    Tokenize `text` (underscores are treated as spaces), optionally filter the
    tokens with `token_filter`, and join the result with underscores.
    """
    words = simple_tokenize(text.replace('_', ' '))
    selected = words if token_filter is None else token_filter(words)
    return '_'.join(selected)


def simple_tokenize(text):
    """
    Split `text` into tokens with wordfreq's tokenizer, using the 'xx'
    language code.
    """
    tokens = wordfreq.tokenize(text, 'xx')
    return tokens



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
# Maps a Numberbatch term (first field of each line) to its float32 vector.
embedding_dict = {}
# NOTE(review): word_dict and limit are not used in the visible notebook; kept in case other cells rely on them.
word_dict={}
limit = 0
with open("numberbatch-en.txt", 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f):
        values = line.split()
        if not values:
            # Blank line: nothing to parse.
            continue
        if line_number == 0 and len(values) == 2:
            # word2vec-style text files start with a "<vocab_size> <dim>" header;
            # without this guard it would be stored as a bogus one-dimensional "word".
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embedding_dict[word] = vector

In [None]:
#given a word, standardise it and return the embedding
def get_embedding(word):
    """
    Look up the embedding of `word`.

    The word is standardized as an English ConceptNet URI and the first six
    characters of that URI are dropped to obtain the key into embedding_dict.
    Words missing from the dictionary get a 300-dimensional float32 vector of
    ones.
    """
    uri = standardized_uri('en', word)
    key = uri[6:]
    try:
        return embedding_dict[key]
    except KeyError:
        return np.ones(300, dtype=np.float32)

## DATA

In [None]:
import csv

# The WikiQA .tsv files are plain tab-separated text, and sentences may contain
# literal double quotes. With pandas' default quote handling a stray '"' makes
# the parser swallow the following rows into a single field, silently dropping
# examples. QUOTE_NONE treats every '"' as an ordinary character.
WIKIQA_READ_OPTIONS = {'sep': '\t', 'quoting': csv.QUOTE_NONE}

df_train = pd.read_csv('WikiQACorpus/WikiQA-train.tsv', **WIKIQA_READ_OPTIONS)
df_test = pd.read_csv('WikiQACorpus/WikiQA-test.tsv', **WIKIQA_READ_OPTIONS)
df_dev = pd.read_csv('WikiQACorpus/WikiQA-dev.tsv', **WIKIQA_READ_OPTIONS)

print("train size : ",df_train.shape)
print("test size : ",df_test.shape)
print("dev size : ",df_dev.shape)


train size :  (20347, 7)
test size :  (6116, 7)
dev size :  (2733, 7)


In [None]:
# for each Question, sentence, remove special characters

def clean_text(text):
    """
    Lower-case `text`, replace every character that is not an ASCII letter,
    digit or whitespace with a space, and collapse whitespace runs into a
    single space. Leading/trailing spaces are not stripped.
    """
    lowered = text.lower()
    without_specials = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', without_specials)

# Normalize the question and candidate-sentence text of every split in place.
for frame in (df_train, df_dev, df_test):
    for column in ('Question', 'Sentence'):
        frame[column] = frame[column].apply(clean_text)



In [None]:
#for each question, find the number of the words in the question
def get_question_length(row):
    """Return the number of whitespace-separated words in row['Question']."""
    words = row['Question'].split()
    return len(words)

def get_sentence_length(row):
    """Return the number of whitespace-separated words in row['Sentence']."""
    words = row['Sentence'].split()
    return len(words)

# Add word-count columns to every split, then discard the identifier columns
# that later cells do not read.
for frame in (df_train, df_test, df_dev):
    frame['Question_length'] = frame.apply(get_question_length, axis=1)
    frame['Candidate_length'] = frame.apply(get_sentence_length, axis=1)

df_train = df_train.drop(columns=['DocumentID', 'DocumentTitle', 'SentenceID'])
df_test = df_test.drop(columns=['DocumentID', 'DocumentTitle', 'SentenceID'])
df_dev = df_dev.drop(columns=['DocumentID', 'DocumentTitle', 'SentenceID'])


In [None]:
# Summary statistics of the training split: longest question / candidate
# (in words), number of distinct questions and number of candidate rows.
unique_questions = df_train['QuestionID'].unique()
total_candidates = df_train.shape[0]
max_question_length = df_train['Question_length'].max()
max_candidate_length = df_train['Candidate_length'].max()

print("Max question length in train data set: ", max_question_length)
print("Max candidate length in train data set: ", max_candidate_length)
print("Number of unique questions in train data set: ", len(unique_questions))
print("Total number of candidates in train data set: ", total_candidates)

Max question length in train data set:  23
Max candidate length in train data set:  305
Number of unique questions in train data set:  2117
Total number of candidates in train data set:  20347


In [None]:
# Drop negative rows (Label == 0) whose candidate is longer than CANDIDATE_LEN words.
train_long_negative = (df_train['Label'] == 0) & (df_train['Candidate_length'] > CANDIDATE_LEN)
df_train = df_train.drop(df_train[train_long_negative].index)

test_long_negative = (df_test['Label'] == 0) & (df_test['Candidate_length'] > CANDIDATE_LEN)
df_test = df_test.drop(df_test[test_long_negative].index)

dev_long_negative = (df_dev['Label'] == 0) & (df_dev['Candidate_length'] > CANDIDATE_LEN)
df_dev = df_dev.drop(df_dev[dev_long_negative].index)

print("train size : ",df_train.shape)
print("test size : ",df_test.shape)
print("dev size : ",df_dev.shape)


train size :  (20347, 6)
test size :  (6115, 6)
dev size :  (2733, 6)


In [None]:
def drop_questions_with_long_candidates(df, max_len=None):
    """
    Remove every row of any question that still has a candidate longer than
    `max_len` words (defaults to CANDIDATE_LEN).

    Returns a filtered DataFrame; `df` itself is not modified.
    """
    if max_len is None:
        max_len = CANDIDATE_LEN
    # Vectorized replacement for the row-by-row iterrows() scan.
    too_long_ids = df.loc[df['Candidate_length'] > max_len, 'QuestionID']
    return df[~df['QuestionID'].isin(too_long_ids)]


df_train = drop_questions_with_long_candidates(df_train)
unique_questions = df_train['QuestionID'].unique()
print("Number of unique questions in train data set: ", len(unique_questions))

df_test = drop_questions_with_long_candidates(df_test)
unique_questions = df_test['QuestionID'].unique()
print("Number of unique questions in test data set: ", len(unique_questions))

df_dev = drop_questions_with_long_candidates(df_dev)
unique_questions = df_dev['QuestionID'].unique()
print("Number of unique questions in dev data set: ", len(unique_questions))




Number of unique questions in train data set:  2117
Number of unique questions in test data set:  630
Number of unique questions in dev data set:  296


In [None]:
# Report the longest question and candidate sentence (in words) of each split.
# max_question_length / max_candidate_length end up holding the dev values.
for split_df, question_message, candidate_message in (
    (df_train, "Max question length in train dataset: ", "Max candidate length in train dataset: "),
    (df_test, "Max question length in test dataset: ", "Max candidate length in test dataset: "),
    (df_dev, "Max question length in dev dataset: ", "Max candidate length in dev dataset: "),
):
    max_question_length = split_df['Question_length'].max()
    max_candidate_length = split_df['Candidate_length'].max()
    print(question_message, max_question_length)
    print(candidate_message, max_candidate_length)

Max question length in train dataset:  23
Max candidate length in train dataset:  305
Max question length in test dataset:  19
Max candidate length in test dataset:  132
Max question length in dev dataset:  21
Max candidate length in dev dataset:  120


In [None]:
# reformat the data where each question has a list of candidate sentences and a list of labels
# 1 if the sentence is the answer, 0 otherwise
def reformat_data(df):
    """
    Group candidate rows by question text.

    Parameters
    ----------
    df : DataFrame with 'Question', 'Sentence' and 'Label' columns.

    Returns
    -------
    (questions, candidates, labels) where questions[i] is the i-th distinct
    question text (in order of first appearance), candidates[i] is the list of
    its sentences and labels[i] the matching list of labels.
    """
    questions = []
    candidates = []
    labels = []
    # question text -> position in the three output lists; replaces the
    # quadratic `in` / `.index` scans over the `questions` list.
    position_of = {}
    for question, sentence, label in zip(df['Question'], df['Sentence'], df['Label']):
        position = position_of.get(question)
        if position is None:
            position_of[question] = len(questions)
            questions.append(question)
            candidates.append([sentence])
            labels.append([label])
        else:
            candidates[position].append(sentence)
            labels[position].append(label)
    return questions, candidates, labels
    


In [None]:
# Group every split by question, then fix the padding lengths to the
# notebook-level constants.
text_questions_train, text_candidates_train, labels_train = reformat_data(df=df_train)
text_questions_test, text_candidates_test, labels_test = reformat_data(df=df_test)
text_questions_dev, text_candidates_dev, labels_dev = reformat_data(df=df_dev)

max_question_length, max_candidate_length = QUESTION_LEN, CANDIDATE_LEN
print(max_question_length, max_candidate_length, sep='\n')

23
503


In [None]:
def _embed_words(text):
    """Return the list of get_embedding vectors for the whitespace-separated words of `text`."""
    return [get_embedding(word) for word in text.split()]


# One list of word vectors per question; one list of sentences (each a list of
# word vectors) per question's candidate group.
emb_questions_train = [_embed_words(question) for question in text_questions_train]
emb_candidates_train = [[_embed_words(sentence) for sentence in group] for group in text_candidates_train]

emb_questions_test = [_embed_words(question) for question in text_questions_test]
emb_candidates_test = [[_embed_words(sentence) for sentence in group] for group in text_candidates_test]

emb_questions_dev = [_embed_words(question) for question in text_questions_dev]
emb_candidates_dev = [[_embed_words(sentence) for sentence in group] for group in text_candidates_dev]


In [None]:
# Total candidates (summed over all questions) and number of questions in train.
total_candidates_train = sum(map(len, emb_candidates_train))
print("Total number of candidates train dataset: ", total_candidates_train)

print("Total number of questions train dataset: ", len(emb_questions_train))

Total number of candidates train dataset:  20347
Total number of questions train dataset:  2117


In [None]:
# if all the labels are zero, remove the question and candidate sentence
def remove_all_zeros(labels,emb_questions,emb_candidates):
    """
    Drop questions that have no positive candidate.

    A question is kept only if the sum of its labels is non-zero and both its
    question embedding and its candidate list are non-empty. One keep-list is
    applied to all three inputs, so the returned lists stay index-aligned.
    The inputs are not modified.

    Returns
    -------
    (emb_questions, emb_candidates, labels) restricted to the kept questions.
    """
    keep = [
        i for i in range(len(labels))
        if sum(labels[i]) != 0 and len(emb_questions[i]) > 0 and len(emb_candidates[i]) > 0
    ]
    return (
        [emb_questions[i] for i in keep],
        [emb_candidates[i] for i in keep],
        [labels[i] for i in keep],
    )

# Keep only questions with at least one positive candidate, for every split.
emb_questions_train, emb_candidates_train, labels_train = remove_all_zeros(
    labels=labels_train, emb_questions=emb_questions_train, emb_candidates=emb_candidates_train)
emb_questions_test, emb_candidates_test, labels_test = remove_all_zeros(
    labels=labels_test, emb_questions=emb_questions_test, emb_candidates=emb_candidates_test)
emb_questions_dev, emb_candidates_dev, labels_dev = remove_all_zeros(
    labels=labels_dev, emb_questions=emb_questions_dev, emb_candidates=emb_candidates_dev)

# Candidate and question counts that remain in the training split.
total_candidates = sum(map(len, emb_candidates_train))
print("Total number of candidates in train dataset: ", total_candidates)

print("Total number of questions in train dataset: ", len(emb_questions_train))

Total number of candidates in train dataset:  8666
Total number of questions in train dataset:  872


In [None]:
# for each (question, candidate sentence) pair, compute the cosine similarity between every question word embedding and every candidate word embedding to identify the most similar word

def get_cosine_similarity(emb_question, emb_candidate):
    """
    Best-match cosine similarity between the words of a question and a candidate.

    Parameters
    ----------
    emb_question, emb_candidate : non-empty sequences of equal-length 1-D vectors.

    Returns
    -------
    (question_cosines, candidate_cosines): for every question word the largest
    cosine similarity to any candidate word, and vice versa. Both are lists of
    numpy scalars, one per word.

    Raises
    ------
    ValueError if either sequence is empty.
    """
    if len(emb_question) == 0 or len(emb_candidate) == 0:
        raise ValueError("get_cosine_similarity needs at least one word on each side")

    def _unit_rows(vectors):
        matrix = np.asarray(vectors)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # A zero vector has no direction: give it similarity 0 instead of NaN.
        norms[norms == 0] = 1
        return matrix / norms

    # similarity[i, j] = cos(question word i, candidate word j); computed once
    # and reduced along each axis instead of two nested Python loops.
    similarity = _unit_rows(emb_question) @ _unit_rows(emb_candidate).T
    question_cosines = list(similarity.max(axis=1))
    candidate_cosines = list(similarity.max(axis=0))
    return question_cosines, candidate_cosines

In [None]:
import copy

def embeddings_q_c_l(emb_questions,emb_candidates,labels):
    """
    Build, for every question, the list of [question, candidate, label] triples
    with a relatedness feature appended to each word vector.

    For each (question, candidate) pair the best-match cosine similarities from
    get_cosine_similarity are appended as one extra component to every question
    word vector and every candidate word vector. np.append returns new arrays,
    so the input embeddings are left untouched without copying them first.

    Returns
    -------
    A list with one entry per question; each entry is a list of
    [augmented_question_vectors, augmented_candidate_vectors, label].
    """
    r_emb_question_candidate_pairs = []
    for i, question in enumerate(emb_questions):
        pairs = []
        for j, candidate in enumerate(emb_candidates[i]):
            question_cosines, candidate_cosines = get_cosine_similarity(question, candidate)
            augmented_question = [
                np.append(vector, cosine) for vector, cosine in zip(question, question_cosines)
            ]
            augmented_candidate = [
                np.append(vector, cosine) for vector, cosine in zip(candidate, candidate_cosines)
            ]
            pairs.append([augmented_question, augmented_candidate, labels[i][j]])
        r_emb_question_candidate_pairs.append(pairs)
    return r_emb_question_candidate_pairs

# Attach the relatedness feature to every (question, candidate) pair of each split.
r_emb_question_candidate_pairs_train = embeddings_q_c_l(
    emb_questions=emb_questions_train, emb_candidates=emb_candidates_train, labels=labels_train)
r_emb_question_candidate_pairs_test = embeddings_q_c_l(
    emb_questions=emb_questions_test, emb_candidates=emb_candidates_test, labels=labels_test)
r_emb_question_candidate_pairs_dev = embeddings_q_c_l(
    emb_questions=emb_questions_dev, emb_candidates=emb_candidates_dev, labels=labels_dev)




        

In [None]:
#pad the sentence based on given required length from the r_emb_question_candidate_pairs
def pad_sentence_embedding(sentence_embedding, required_length):
    """
    Pad `sentence_embedding` (a list of word vectors) in place with zero vectors
    until it holds `required_length` entries, and return the same list.

    Lists that are already long enough are returned unchanged (no truncation).
    Each padding vector has the length of the first word vector.
    """
    missing = required_length - len(sentence_embedding)
    if missing > 0:
        width = len(sentence_embedding[0])
        sentence_embedding.extend(np.zeros(width) for _ in range(missing))
    return sentence_embedding


# pad the question and candidate sentence embedding based on the max length of the question and candidate sentence
def get_pr_emb_question_candidate_pairs(r_emb_question_candidate_pairs):
    """
    Pad every question to max_question_length and every candidate to
    max_candidate_length (module-level values), keeping the per-question
    grouping and the labels.
    """
    return [
        [
            [
                pad_sentence_embedding(question, max_question_length),
                pad_sentence_embedding(candidate, max_candidate_length),
                label,
            ]
            for question, candidate, label in entry
        ]
        for entry in r_emb_question_candidate_pairs
    ]

# Pad the relatedness-augmented pairs of every split.
pr_emb_question_candidate_pairs_train = get_pr_emb_question_candidate_pairs(
    r_emb_question_candidate_pairs=r_emb_question_candidate_pairs_train)
pr_emb_question_candidate_pairs_test = get_pr_emb_question_candidate_pairs(
    r_emb_question_candidate_pairs=r_emb_question_candidate_pairs_test)
pr_emb_question_candidate_pairs_dev = get_pr_emb_question_candidate_pairs(
    r_emb_question_candidate_pairs=r_emb_question_candidate_pairs_dev)

In [None]:
#find the number of candidates for each question
# num_candidates = []
# for i in range(len(r_emb_question_candidate_pairs)):
#     num_candidates.append(len(r_emb_question_candidate_pairs[i]))


# # max number of candidates for a question
# max_num_candidates = max(num_candidates)

# # generate a dummy candidate with padding = max_candidate_length by using <PAD> tokens
# dummy_candidate = []
# for i in range(max_candidate_length):
#     dummy_candidate.append(np.zeros(len(sample_candidate[0])))

# # pad the candidate sentence embedding with dummy candidate
# for i in range(len(pr_emb_question_candidate_pairs)):
#     if len(pr_emb_question_candidate_pairs[i]) < max_num_candidates:
#         pr_emb_question_candidate_pairs[i].extend([[pr_emb_question_candidate_pairs[i][0][0], dummy_candidate, 0]]*(max_num_candidates - len(pr_emb_question_candidate_pairs[i])))


In [None]:
# Replace, in place, the question and candidate word-vector lists of every
# pair with a single float32 numpy array.
for dataset in (
    pr_emb_question_candidate_pairs_train,
    pr_emb_question_candidate_pairs_test,
    pr_emb_question_candidate_pairs_dev,
):
    for question_pairs in dataset:
        for pair in question_pairs:
            pair[0] = np.asarray(pair[0], dtype=np.float32)
            pair[1] = np.asarray(pair[1], dtype=np.float32)

In [None]:
# Sanity check: shape of one padded question / candidate array, number of
# questions, and total number of candidates across all questions.
print("Shape of question embedding: ", pr_emb_question_candidate_pairs_train[0][0][0].shape)
print("Shape of candidate embedding: ", pr_emb_question_candidate_pairs_train[0][0][1].shape)
print("Number of questions: ", len(pr_emb_question_candidate_pairs_train))
# The original printed the question count twice; report the candidate count instead.
print("Number of candidates: ", sum(len(pairs) for pairs in pr_emb_question_candidate_pairs_train))

In [None]:
# save list as a pickle file and load
import pickle

def save_pickle(obj, path):
    """Serialize `obj` with pickle into the file at `path`, opened in binary write mode."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    """Read the file at `path` in binary mode and return the unpickled object.

    NOTE(review): unpickling can execute arbitrary code; only load files this
    notebook produced.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


In [None]:
# Persist the padded pairs of each split so later cells can start from disk.
for split_pairs, target_path in (
    (pr_emb_question_candidate_pairs_train, './qc_embeddings_train_paper1.pkl'),
    (pr_emb_question_candidate_pairs_test, './qc_embeddings_test_paper1.pkl'),
    (pr_emb_question_candidate_pairs_dev, './qc_embeddings_dev_paper1.pkl'),
):
    save_pickle(split_pairs, target_path)

In [None]:
# Mount Google Drive again; this repeats the mount from the first cells.
# NOTE(review): presumably a restart point for a fresh runtime — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch to the project data directory; the pickle files below are opened relative to it.
%cd /content/drive/MyDrive/Colab Notebooks/NLP-Project/data

In [None]:
# Reload the padded question/candidate pairs saved earlier.
import pickle


def load_pickle(filename):
    """Read `filename` in binary mode and return the unpickled object.

    NOTE(review): unpickling can execute arbitrary code; only load files this
    notebook produced.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)


pr_emb_question_candidate_pairs_train = load_pickle("qc_embeddings_train_paper1.pkl")
pr_emb_question_candidate_pairs_test = load_pickle("qc_embeddings_test_paper1.pkl")
pr_emb_question_candidate_pairs_dev = load_pickle("qc_embeddings_dev_paper1.pkl")

In [None]:
import numpy as np

In [None]:
# convert pr_emb_question_candidate_pairs to list of question_embedding, candidate_embedding, label

# list of question_embedding, candidate_embedding, label
def convert_to_qcl(pr_emb_question_candidate_pairs):
    """
    Flatten the per-question grouping into one list of
    [question_array, candidate_array, label_array] triples, copying each
    element into a numpy array.
    """
    return [
        [np.array(pair[0]), np.array(pair[1]), np.array(pair[2])]
        for question_pairs in pr_emb_question_candidate_pairs
        for pair in question_pairs
    ]

# Flat (question, candidate, label) triples for every split.
qcl_train = convert_to_qcl(pr_emb_question_candidate_pairs=pr_emb_question_candidate_pairs_train)
qcl_test = convert_to_qcl(pr_emb_question_candidate_pairs=pr_emb_question_candidate_pairs_test)
qcl_dev = convert_to_qcl(pr_emb_question_candidate_pairs=pr_emb_question_candidate_pairs_dev)

def seperate_qcl(qcl):
    """
    Split a list of [question, candidate, label] triples into three stacked
    numpy arrays: (questions_emb, candidates_emb, labels).
    """
    questions_emb = np.array([triple[0] for triple in qcl])
    candidates_emb = np.array([triple[1] for triple in qcl])
    labels = np.array([triple[2] for triple in qcl])
    return questions_emb, candidates_emb, labels

# Stacked arrays per split; note that labels_* now hold flat numpy label arrays.
questions_emb_train, candidates_emb_train, labels_train = seperate_qcl(qcl=qcl_train)
questions_emb_test, candidates_emb_test, labels_test = seperate_qcl(qcl=qcl_test)
questions_emb_dev, candidates_emb_dev, labels_dev = seperate_qcl(qcl=qcl_dev)


In [None]:
# Inspect the shape of the stacked training label array.
labels_train.shape

In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as data_utils
import torch.nn.utils.rnn as rnn_utils
import random

class CNN_RNN(nn.Module):
    """
    Scores a (question, candidate) pair as two logits.

    Each input is a padded sequence of EMBEDDING_DIM-dimensional word vectors
    laid out as (batch, seq_len, EMBEDDING_DIM). Each side is turned into an
    (EMBEDDING_DIM, seq_len) single-channel image, passed through a 5x5
    convolution + ReLU and max-pooled over the whole sequence axis, giving a
    vector of EMBEDDING_DIM - KERNEL_SIZE + 1 features per side. The
    element-wise product and difference of the two vectors are concatenated,
    fed through a single-step RNN and a linear layer with two outputs.

    Parameters
    ----------
    question_len, candidate_len : padded sequence lengths. They default to the
        notebook-level QUESTION_LEN / CANDIDATE_LEN, resolved at construction.
    """

    EMBEDDING_DIM = 301  # 300-d word vector + 1 relatedness feature
    KERNEL_SIZE = 5

    def __init__(self, question_len=None, candidate_len=None):
        super(CNN_RNN, self).__init__()
        self.question_len = QUESTION_LEN if question_len is None else question_len
        self.candidate_len = CANDIDATE_LEN if candidate_len is None else candidate_len

        kernel = self.KERNEL_SIZE
        # Height left after a valid convolution over the embedding axis (297 for 301 / 5).
        self.conv_dim = self.EMBEDDING_DIM - kernel + 1
        # Product and difference vectors are concatenated (594 for 301 / 5).
        self.feature_dim = 2 * self.conv_dim

        # Separate single-channel convolutions for the question and the candidate.
        self.conv1 = nn.Conv2d(1, 1, kernel)
        self.conv2 = nn.Conv2d(1, 1, kernel)

        # Max pooling over the full remaining sequence axis -> (conv_dim, 1) per side.
        self.pool1 = nn.MaxPool2d((1, self.question_len - kernel + 1))
        self.pool2 = nn.MaxPool2d((1, self.candidate_len - kernel + 1))

        # RNN over the concatenated features (used with a sequence of length 1).
        self.rnn = nn.RNN(self.feature_dim, self.feature_dim, 1, batch_first=True)

        # Two-class output.
        self.fc = nn.Linear(self.feature_dim, 2)

    def _as_image(self, x, seq_len):
        """Reshape to (batch, seq_len, EMBEDDING_DIM) and return it as (batch, 1, EMBEDDING_DIM, seq_len)."""
        # A plain view(-1, 1, EMBEDDING_DIM, seq_len) would reinterpret the
        # row-major (seq_len, EMBEDDING_DIM) memory instead of transposing it,
        # mixing words and embedding dimensions; transpose keeps one word per column.
        return x.reshape(-1, seq_len, self.EMBEDDING_DIM).transpose(1, 2).unsqueeze(1)

    def forward(self,q,c):
        """Return logits of shape (batch, 2) for a batch of question / candidate tensors."""
        q = self._as_image(q, self.question_len)
        c = self._as_image(c, self.candidate_len)
        q = self.pool1(F.relu(self.conv1(q)))
        c = self.pool2(F.relu(self.conv2(c)))
        q = q.reshape(-1, self.conv_dim)
        c = c.reshape(-1, self.conv_dim)
        mul = torch.mul(q,c)
        sub = torch.sub(q,c)
        x = torch.cat((mul,sub),1)
        # One time step per example: (batch, 1, feature_dim).
        x = x.view(-1, 1, self.feature_dim)
        x, _ = self.rnn(x)
        x = x.reshape(-1, self.feature_dim)
        x = self.fc(x)
        return x




In [None]:
# train the model with a batch size of 64
from tqdm import trange
def train(qcl,questions_emb,candidates_emb,labels,epochs=3,batch_size=64):
    """
    Train a fresh CNN_RNN with Adam (lr=0.001) and cross-entropy loss.

    Parameters
    ----------
    qcl : flat list of (question, candidate, label) triples; only its length is
        used, to bound the mini-batch loop.
    questions_emb, candidates_emb : numpy arrays of padded embeddings, sliced
        along the first axis into mini-batches.
    labels : numpy array of class indices (0 or 1), aligned with the embeddings.
    epochs : number of passes over the data.
    batch_size : number of examples per mini-batch (default 64).

    Returns
    -------
    The trained model. Examples are visited in their stored order (no shuffling).
    """
    model = CNN_RNN()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in trange(epochs,desc='Epochs'):
        running_loss = []
        for start in range(0, len(qcl), batch_size):
            stop = start + batch_size

            # (batch, seq_len, 301) layout expected by the model.
            questions = questions_emb[start:stop].reshape(-1,QUESTION_LEN,301)
            candidates = candidates_emb[start:stop].reshape(-1,CANDIDATE_LEN,301)

            questions = torch.from_numpy(questions)
            candidates = torch.from_numpy(candidates)
            # CrossEntropyLoss needs int64 class indices whatever the numpy integer dtype.
            targets = torch.from_numpy(labels[start:stop]).long()

            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(questions, candidates)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss.append(loss.item())

        print(f"Epoch : {epoch} , loss : {np.mean(running_loss)}")
    return model

In [None]:
def metric(model,pr_emb_question_candidate_pairs):
    """
    Compute Mean Average Precision and Mean Reciprocal Rank of `model`.

    Parameters
    ----------
    model : module called as model(question, candidate) on tensors with a
        leading batch axis of 1; it must return logits of shape (1, 2).
    pr_emb_question_candidate_pairs : list with one entry per question, each a
        list of [question_array, candidate_array, label] triples.

    Candidates of a question are ranked by P(class 1) - P(class 0). Average
    precision averages precision@k over the ranks of the positive candidates;
    the reciprocal rank uses the FIRST positive candidate in the ranking.
    Questions without any positive candidate are skipped.

    Returns
    -------
    (MAP, MRR). The model is switched to eval mode for scoring and back to
    train mode before returning.
    """
    model.eval()

    precisions = []
    ranks = []

    # Scoring only: no gradients are needed.
    with torch.no_grad():
        for question_pairs in pr_emb_question_candidate_pairs:
            real_labels = np.array([pair[2] for pair in question_pairs])

            scores = []
            for pair in question_pairs:
                # Add the batch axis the model expects.
                q_emb = torch.from_numpy(np.array(pair[0])[np.newaxis, ...])
                candidate_embedding = torch.from_numpy(np.array(pair[1])[np.newaxis, ...])
                probabilities = torch.nn.functional.softmax(model(q_emb, candidate_embedding), dim=1)
                scores.append((probabilities[0, 1] - probabilities[0, 0]).item())

            # Candidate indices from best to worst score.
            ranking = np.argsort(np.array(scores))[::-1]
            hit_positions = np.flatnonzero(real_labels[ranking] == 1)
            if hit_positions.size == 0:
                # No relevant candidate: AP and RR are undefined for this question.
                continue

            hit_ranks = hit_positions + 1  # 1-based ranks of the positives
            precision_at_hits = np.arange(1, hit_ranks.size + 1) / hit_ranks
            precisions.append(float(precision_at_hits.mean()))
            # Reciprocal rank of the first (best-ranked) positive candidate.
            ranks.append(1.0 / hit_ranks[0])

    model.train()
    return np.mean(precisions), np.mean(ranks)

In [None]:
# Train a model for three epochs on the training split.
model_3 = train(qcl_train, questions_emb_train, candidates_emb_train, labels_train, epochs=3)

In [None]:
# MAP / MRR of the three-epoch model on the training questions.
train_scores = metric(model_3, pr_emb_question_candidate_pairs_train)
train_MAP, train_MRR = train_scores
print("train MAP : ",train_MAP)
print("train MRR : ",train_MRR)

In [None]:
# Inspect the shape of the first test candidate array.
pr_emb_question_candidate_pairs_test[0][0][1].shape

In [None]:
# MAP / MRR of the three-epoch model on the test questions.
test_scores = metric(model_3, pr_emb_question_candidate_pairs_test)
test_MAP, test_MRR = test_scores
print("test MAP : ",test_MAP)
print("test MRR : ",test_MRR)

In [None]:
# Train one fresh model per epoch budget (1..10) and record train / test MAP and MRR.
MODELS, TRAIN_MAPS, TRAIN_MRRS, TEST_MAPS, TEST_MRRS = [], [], [], [], []

for epoch in range(1, 11):
    print("Currently EPOCH IS ", epoch)
    print(" ")
    temp_model = train(qcl_train, questions_emb_train, candidates_emb_train, labels_train, epoch)

    train_MAP, train_MRR = metric(temp_model, pr_emb_question_candidate_pairs_train)
    test_MAP, test_MRR = metric(temp_model, pr_emb_question_candidate_pairs_test)

    print("For epoch " + str(epoch) + ": train MAP : " + str(train_MAP) + " train MRR : " + str(train_MRR) )
    print("For epoch " + str(epoch) + ": test MAP : " + str(test_MAP) + " test MRR : " + str(test_MRR) )
    print(" ")

    MODELS.append(temp_model)
    TRAIN_MAPS.append(train_MAP)
    TRAIN_MRRS.append(train_MRR)
    TEST_MAPS.append(test_MAP)
    TEST_MRRS.append(test_MRR)




In [None]:
# Persist the per-epoch metric curves of this run.
for curve, target_path in (
    (TRAIN_MAPS, './23-503-train_maps.pkl'),
    (TRAIN_MRRS, './23-503-train_mrrs.pkl'),
    (TEST_MAPS, './23-503-test_maps.pkl'),
    (TEST_MRRS, './23-503-test_mrrs.pkl'),
):
    save_pickle(curve, target_path)

In [None]:
# Load the metric curves stored without a prefix. This replaces the in-memory
# TRAIN_* / TEST_* lists.
# NOTE(review): presumably these are the curves of an earlier run - confirm.
TRAIN_MAPS, TRAIN_MRRS, TEST_MAPS, TEST_MRRS = (
    load_pickle(curve_path)
    for curve_path in ('./train_maps.pkl', './train_mrrs.pkl', './test_maps.pkl', './test_mrrs.pkl')
)

# Load the curves saved under the '23-503-' prefix.
new_TRAIN_MAPS, new_TRAIN_MRRS, new_TEST_MAPS, new_TEST_MRRS = (
    load_pickle(curve_path)
    for curve_path in (
        './23-503-train_maps.pkl',
        './23-503-train_mrrs.pkl',
        './23-503-test_maps.pkl',
        './23-503-test_mrrs.pkl',
    )
)

In [None]:
# Epoch budgets 1..10, matching the sweep that produced the curves.
xpoints = list(range(1, 11))

import matplotlib.pyplot as plt

# Test MAP per epoch budget for the two stored runs; labelled so the figure stands alone.
fig, ax = plt.subplots()
# ax.plot(xpoints, new_TRAIN_MAPS, label="new_Train Map")
# ax.plot(xpoints, TRAIN_MAPS, label="Train Map")
ax.plot(xpoints, new_TEST_MAPS, label = "new_Test Map")
ax.plot(xpoints, TEST_MAPS, label = "Test Map")
ax.set_xlabel("Training epochs")
ax.set_ylabel("MAP")
ax.set_title("Test MAP vs. training epochs")
ax.legend()
plt.show()


In [None]:
# Train / test MRR per epoch budget for the two stored runs.
fig, ax = plt.subplots()
ax.plot(xpoints, new_TRAIN_MRRS, label="new_Train MRR")
ax.plot(xpoints, TRAIN_MRRS, label="Train MRR")
ax.plot(xpoints, new_TEST_MRRS, label = "new_Test MRR")
ax.plot(xpoints, TEST_MRRS, label = "Test MRR")
ax.set_xlabel("Training epochs")
ax.set_ylabel("MRR")
ax.set_title("MRR vs. training epochs")
ax.legend()
plt.show()
