In [35]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

import pandas as pd
import numpy as np
import re
import ast
import torch
import random

from sklearn.metrics import accuracy_score

In [9]:
from torch.nn import Softmax

In [10]:
device = 'cpu'

In [11]:
es = Elasticsearch([{'host':'localhost', 'port':9200}])

In [12]:
es.ping()

True

In [13]:
corpus = open('ARC_Corpus.txt', 'r', encoding='utf-8')
corpus_1 = open('Aristo-Mini-Corpus-Dec2016.txt', 'r', encoding='utf-8')

In [14]:
b = {"mappings":{
        "properties":{
            "sentence":{
                "type":"text"
            }
        }
}
    }

ret = es.indices.create(index='corpus', ignore=400, body=b) #ARC Corpus.
ret_2 = es.indices.create(index='corpus_1', ignore=400, body=b) #Aristo Mini Corpus.

In [15]:
def gen_data_arc():
    '''
    This function will yield a sentence from the ARC corpus and the index name.
    '''
    with open('ARC_Corpus.txt', 'r', encoding='utf-8') as viola:
        for row in viola: 
            yield{
                "_index":"corpus",
                 "sentence":row
                 }  

parallel_bulk(es, gen_data_arc())

<generator object parallel_bulk at 0x000002A909237E48>

In [16]:
def gen_data_aristo():
    '''
    This function will yield a sentence from the ARC corpus and the index name.
    '''
    with open('Aristo-Mini-Corpus-Dec2016.txt', 'r', encoding='utf-8') as viola:
        for row in viola: 
            yield{
                "_index":"corpus_1",
                 "sentence":row
                 }

parallel_bulk(es, gen_data_aristo())

<generator object parallel_bulk at 0x000002A909237D48>

In [17]:
def extract_question(question):
    '''
    This function will take a value in question column then, will split the string 
    at (A) or (1). After splitting we will take only first part which is the question.
    '''
    if '(A)' in question:
        x = question.split('(A)')[0]
    elif '(1)' in question:
        x = question.split('(1)')[0]

    return x

In [18]:
def extract_answers(question):
    '''
    This function will take a value in question column then, will split the string 
    at (A) or (1). After splitting we will take only first part which is the question.
    '''
    if '(A)' in question:
        x = '(A) ' + question.split('(A)')[1]
    elif '(1)' in question:
        x = '(1) ' + question.split('(1)')[1]

    return x

In [19]:
def data_generator(a):
    '''
    This function will accept a data point and returns a list of options
    converts the only_options list of options.

    Output will be as follows:

    options = [option_1, option_2, option_3, option_4, option_5]                   

    Note: If there's no option E then it will be written as 'None of the above'.
    '''
    options = []

    if '(A)' in a:
        option_1 = a.split('(B)')[0].replace('(A)','').lstrip()

        b = ' '.join(i for i in a.split() if i not in option_1)
        option_2 = b.split('(C)')[0].replace('(B)','').replace('(A)','').lstrip()

        c = ' '.join(i for i in b.split() if i not in option_2)
        option_3 = c.split('(D)')[0].replace('(C)','').replace('(B)','').replace('(A)','').lstrip()

        if '(D)' not in c:
            option_4 = 'None of the above'
            option_5 = 'None of the above'
        else:
            if '(E)' not in c:
                option_4 = c.split('(D)')[1].lstrip()
                option_5 = 'None of the above'
            else:
                d = ' '.join(i for i in c.split() if i not in option_3)
                option_4 = d.split('(E)')[0].replace('(D)','').replace('(C)','').replace('(B)','').replace('(A)','').lstrip()
                option_5 = d.split('(E)')[1].lstrip()
    else:
        option_1 = a.split('(2)')[0].replace('(1)','').lstrip()

        b = ' '.join(i for i in a.split() if i not in option_1)
        option_2 = b.split('(3)')[0].replace('(2)','').replace('(1)','').lstrip()

        c = ' '.join(i for i in b.split() if i not in option_2)
        option_3 = c.split('(4)')[0].replace('(3)','').replace('(2)','').replace('(1)','').lstrip()

        if '(4)' in c:
            option_4 = c.split('(4)')[1].lstrip()
        else:
            option_4 = 'None of the above'
        option_5 = 'None of the above'

    options = [option_1, option_2, option_3, option_4, option_5]

    return options

In [20]:
def get_context_for_each_candidate(question, options_list):
    '''
    This function will return a context after joining a question
    and its options separately. 
    For example: question + option_1
                 question + option_2
                 question + option_3
                 question + option_4
   '''
    option_1_docs = dict()
    option_2_docs = dict()
    option_3_docs = dict()
    option_4_docs = dict()
    option_5_docs = dict()
    choices = options_list
    
    
    for j in range(len(choices)):
        
        #Defining query to be searched for.
        search_param = {"size": 50,
                        "query": {
                            "bool": {
                                "must": [
                                    {"match": {
                                        "sentence": question + ' ' + f'{choices[j]}'
                                    }}
                                ],
                                "filter": [
                                    {"match": {"sentence": f'{choices[j]}'}}
                                ]
                            }
                        }}

        #Searching in the 'corpus' and 'corpus_1' index.
        a = es.search(index='corpus', body=search_param)
        b = es.search(index='corpus_1', body=search_param)
        
        sentences = []
        for i in a['hits']['hits']:
            
            sent_tf_idf_score = dict()
            sentence = i['_source']['sentence']
            if sentence in sentences:
                continue
            else:
                sentences.append(sentence)
                score = i['_score']
                
                if j == 0:
                    option_1_docs[sentence] = score
                elif j == 1:
                    option_2_docs[sentence] = score
                elif j == 2:
                    option_3_docs[sentence] = score
                elif j == 3:
                    option_4_docs[sentence] = score
                elif j == 4:
                    option_5_docs[sentence] = score
        
        sentences = []
        for i in b['hits']['hits']:
            
            sent_tf_idf_score = dict()
            sentence = i['_source']['sentence']
            if sentence in sentences:
                continue
            else:
                sentences.append(sentence)
                score = i['_score']
                
                if j == 0:
                    option_1_docs[sentence] = score
                elif j == 1:
                    option_2_docs[sentence] = score
                elif j == 2:
                    option_3_docs[sentence] = score
                elif j == 3:
                    option_4_docs[sentence] = score
                elif j == 4:
                    option_5_docs[sentence] = score
                
    
    return pd.Series([option_1_docs, option_2_docs, option_3_docs, option_4_docs, option_5_docs])

In [21]:
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer_scorer = BertTokenizer.from_pretrained('bert-base-uncased')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [22]:
drd_model = BertForSequenceClassification.from_pretrained('DRD_cpu', return_dict=True).to(device)
ard_model = BertForSequenceClassification.from_pretrained('ARD_cpu', return_dict=True).to(device)

In [23]:
def scorer(question, option, document):
    '''
    This function will take 3 things: question, option and document.

    Then it will pass question + document into DRD model and get a score.
    It will also pass question + option + document into ARD model and get another score.
    
    This function returns DRD and ARD scores, i.e., logits values for label 1.
    '''
    ard_input_string = f'[CLS] {question} [SEP] {option} [SEP] {document} [SEP]'

    drd_inputs = tokenizer_scorer(question, document, truncation='only_second', max_length=512, add_special_tokens=True, padding='max_length', return_tensors='pt')
    drd_inputs = drd_inputs.to(device)

    drd_outputs = drd_model(**drd_inputs)
    drd_softmax = Softmax(dim=1)(drd_outputs.logits)
    drd_score = drd_softmax.detach().cpu().numpy()[0][1]

    ard_input_ids = tokenizer_scorer.encode(ard_input_string, add_special_tokens=False, truncation=True, max_length=512, return_tensors='pt')
    ard_attention_mask = torch.ones([1,ard_input_ids.size()[1]])

    ard_input_ids = ard_input_ids.to(device)
    ard_attention_mask = ard_attention_mask.to(device)
    ard_outputs = ard_model(input_ids=ard_input_ids, attention_mask=ard_attention_mask)
    ard_softmax = Softmax(dim=1)(ard_outputs.logits)
    ard_score = ard_softmax.detach().cpu().numpy()[0][1]

    return drd_score, ard_score

In [24]:
def context_retreiver_2(dataset, only_question, options_list):
    '''
    This function will return context for every datapoint in the dataset.
    ''' 
    question = only_question
    options_list = options_list

    scores = dict()
    for j in range(len(options_list)):
        if j == 0:
            current_option = options_list[0]

            option_1_docs = dataset[0]
            option_1_docs_keys = list(option_1_docs.keys())
            option_1_docs_values = list(option_1_docs.values())
            option_1_docs_values = [float(i)+1e-6/(sum(option_1_docs_values)+1e-6) for i in option_1_docs_values]

            for k in range(len(option_1_docs_keys)):
                current_doc = option_1_docs_keys[k]
                current_doc = re.sub('\s+',' ', current_doc)
                drd_score, ard_score = scorer(question, current_option, current_doc)

                tf_idf_score = option_1_docs_values[k]

                total_score = tf_idf_score + drd_score + ard_score

                if current_doc not in scores.keys():
                    scores[current_doc] = total_score
                else:
                    if total_score > scores[current_doc]:
                        scores[current_doc] = total_score
                    else:
                        continue
        elif j == 1:
            current_option = options_list[1]
            option_2_docs = dataset[1]
            option_2_docs_keys = list(option_2_docs.keys())
            option_2_docs_values = list(option_2_docs.values())
            option_2_docs_values = [float(i)+1e-6/(sum(option_2_docs_values)+1e-6) for i in option_2_docs_values]

            for k in range(len(option_2_docs_keys)):
                current_doc = option_2_docs_keys[k]
                current_doc = re.sub('\s+',' ', current_doc)
                drd_score, ard_score = scorer(question, current_option, current_doc)

                tf_idf_score = option_2_docs_values[k]

                total_score = tf_idf_score + drd_score + ard_score

                if current_doc not in scores.keys():
                    scores[current_doc] = total_score
                else:
                    if total_score > scores[current_doc]:
                        scores[current_doc] = total_score
                    else:
                        continue
        elif j == 2:
            current_option = options_list[2]
            option_3_docs = dataset[2]
            option_3_docs_keys = list(option_3_docs.keys())
            option_3_docs_values = list(option_3_docs.values())
            option_3_docs_values = [float(i)+1e-6/(sum(option_3_docs_values)+1e-6) for i in option_3_docs_values]

            for k in range(len(option_3_docs_keys)):
                current_doc = option_3_docs_keys[k]
                current_doc = re.sub('\s+',' ', current_doc)
                drd_score, ard_score = scorer(question, current_option, current_doc)

                tf_idf_score = option_3_docs_values[k]

                total_score = tf_idf_score + drd_score + ard_score

                if current_doc not in scores.keys():
                    scores[current_doc] = total_score
                else:
                    if total_score > scores[current_doc]:
                        scores[current_doc] = total_score
                    else:
                        continue
        elif j == 3:
            current_option = options_list[3]
            option_4_docs = dataset[3]
            option_4_docs_keys = list(option_4_docs.keys())
            option_4_docs_values = list(option_4_docs.values())
            option_4_docs_values = [float(i)+1e-6/(sum(option_4_docs_values)+1e-6) for i in option_4_docs_values]

            for k in range(len(option_4_docs_keys)):
                current_doc = option_4_docs_keys[k]
                current_doc = re.sub('\s+',' ', current_doc)
                drd_score, ard_score = scorer(question, current_option, current_doc)

                tf_idf_score = option_4_docs_values[k]

                total_score = tf_idf_score + drd_score + ard_score

                if current_doc not in scores.keys():
                    scores[current_doc] = total_score
                else:
                    if total_score > scores[current_doc]:
                        scores[current_doc] = total_score
                    else:
                        continue
        elif j == 4:
            current_option = options_list[4]
            option_5_docs = dataset[4]
            option_5_docs_keys = list(option_5_docs.keys())
            option_5_docs_values = list(option_5_docs.values())
            option_5_docs_values = [float(i)+1e-6/(sum(option_5_docs_values)+1e-6) for i in option_5_docs_values]

            for k in range(len(option_5_docs_keys)):
                current_doc = option_5_docs_keys[k]
                current_doc = re.sub('\s+',' ', current_doc)
                drd_score, ard_score = scorer(question, current_option, current_doc)

                tf_idf_score = option_5_docs_values[k]

                total_score = tf_idf_score + drd_score + ard_score

                if current_doc not in scores.keys():
                    scores[current_doc] = total_score
                else:
                    if total_score > scores[current_doc]:
                        scores[current_doc] = total_score
                    else:
                        continue

        context = []
        for l, w in enumerate(sorted(scores, key=scores.get, reverse=True)):
            if l == 20:
                break
            else:
                context.append(w)
    joined_context = ' '.join(i for i in context)
    return joined_context

In [25]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained('Unified_QA_with_context')

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at Unified_QA_with_context and are newly initialized: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [27]:
def run_model(input_ids, **generator_args):
    global tokenizer
    res = model.generate(input_ids, **generator_args)
    return [tokenizer.decode(x) for x in res]

In [36]:
def predict(array_of_input_ids, options_list):
    '''
    This function takes list of string. Each string is a pair of question and 
    answers separated using \n(all lower cased). 
    For example: 
    which of the following is an example of a physical change? \n (a) lighting a match (b) breaking a glass (c) burning of gasoline (d) rusting of iron

    label_map = {'A':0,
                'B':1,
                'C':2,
                'D':3,
                'E':4,
                '1':0,
                '2':1,
                '3':2,
                '4':3,
                '5':4}

    This function will return predicted option. All answers are converted to 0, 1, 2, 3 and 4. 

    '''

    # Converting input ids to torch tensor of Long type.
    input_ids = array_of_input_ids.type(torch.LongTensor).to(device)

    pred = run_model(input_ids.reshape((1,-1)))[0] #Reshaping because model takes input like (batch_size, sequence_length)

    #Pred is in the form like '<pad> answer </s>'. So we have to remove <pad> and </s>.
    pred = pred.replace('<pad>','')
    pred = pred.replace('</s>','')
    pred = pred.lstrip()

    label = None
    for j in range(len(options_list)):
        if pred in options_list[j]:
            label = j

    if label == None:
        label = random.randint(0,4)       

    return pred, label

In [29]:
MAX_LEN = 512

In [56]:
def final_fun_1(question):
    '''
    The ARC dataset contains question in the following format:
    
    Which factor will most likely cause a person to develop a fever?  (A) a leg muscle relaxing after exercise \
    (B) a bacterial population in the bloodstream (C) several viral particles on the skin \
    (D) carbohydrates being digested in the stomach
    
    As we can see here in raw form question contains options within it.
    
    Thus we will take question in same format.
    '''
    only_question = extract_question(question)
    only_answers = extract_answers(question)
    options_list = data_generator(only_answers)
    
    extracted_context = get_context_for_each_candidate(only_question, options_list)
    
    final_context = context_retreiver_2(extracted_context, only_question, options_list)
    
    input_string = only_question + '\\n ' + only_answers + '\\n ' + final_context + ' </s>'
    
    input_ids = tokenizer.encode(input_string, truncation=True, max_length=MAX_LEN, return_tensors='pt')
    
    pred = predict(input_ids, options_list)
    
    return pred

In [59]:
pred_1 = final_fun_1('Which factor will most likely cause a person to develop a fever?  (A) a leg muscle relaxing after exercise (B) a bacterial population in the bloodstream (C) several viral particles on the skin (D) carbohydrates being digested in the stomach')

In [60]:
pred_1

('bacterial population the bloodstream', 1)

In [37]:
def final_fun_2(questions, labels):
    '''
    The ARC dataset contains question in the following format:
    
    Which factor will most likely cause a person to develop a fever?  (A) a leg muscle relaxing after exercise \
    (B) a bacterial population in the bloodstream (C) several viral particles on the skin \
    (D) carbohydrates being digested in the stomach
    
    As we can see here in raw form question contains options within it.
    
    Thus we will take question in same format.
    '''
    pred_labels = []
    
    for i in range(len(questions)):
        
        only_question = extract_question(questions[i])
        only_answers = extract_answers(questions[i])
        options_list = data_generator(only_answers)

        extracted_context = get_context_for_each_candidate(only_question, options_list)

        final_context = context_retreiver_2(extracted_context, only_question, options_list)

        input_string = only_question + '\\n ' + only_answers + '\\n ' + final_context + ' </s>'

        input_ids = tokenizer.encode(input_string, truncation=True, max_length=MAX_LEN, return_tensors='pt')

        pred, label = predict(input_ids, options_list)
        
        pred_labels.append(label)
    
    accuracy = accuracy_score(labels, pred_labels)
    
    return accuracy

In [24]:
questions = ['Which factor will most likely cause a person to develop a fever?  (A) a leg muscle relaxing after exercise (B) a bacterial population in the bloodstream (C) several viral particles on the skin (D) carbohydrates being digested in the stomach',
            'Lichens are symbiotic organisms made of green algae and fungi. What do the green algae supply to the fungi in this symbiotic relationship? (A) carbon dioxide (B) food (C) protection (D) water',
            'Rocks are classified as igneous, metamorphic, or sedimentary according to (1) their color (2) their shape (3) how they formed (4) the minerals they contain']
labels = [1,1,2]

In [29]:
accuracy = final_fun_2(questions, labels)

In [30]:
accuracy

1.0

# Conclusion

By undertaking this project I found following things:

1. Model size does have affect on the final performance.
2. Context is very important to answer the questions. Thus better context retreival techniques should be used.
3. By adding dicriminators we got better performance on the cost of computational speed.
4. Transfer learning in NLP has made many things easy to accomplish.

# Future Work

There are ways by which perfomance can be improved. Some of them are as follows:

1. Using bigger models. I have used only base models due to less computational resources.
2. For scoring context using ARD and DRD, we can use more than one DRD or ARD and take their average scores as final scores.
3. Instead of taking average of ARD and DRD scores, we can use KV attention to get better context(as described in Attentive Ranker paper).
4. Try with decoder based models like GPT, GPT-2, etc.