In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

df_easy_train = pd.read_csv('drive/MyDrive/ARC-V1-Feb2018-2/ARC-Easy-Train.csv')
df_easy_dev = pd.read_csv('drive/MyDrive/ARC-V1-Feb2018-2/ARC-Easy-Dev.csv')
df_easy_test = pd.read_csv('drive/MyDrive/ARC-V1-Feb2018-2/ARC-Easy-Test.csv')

In [4]:
df_challenge_train = pd.read_csv('drive/My Drive/ARC-V1-Feb2018-2/ARC-Challenge-Train.csv')
df_challenge_dev = pd.read_csv('drive/My Drive/ARC-V1-Feb2018-2/ARC-Challenge-Dev.csv')
df_challenge_test = pd.read_csv('drive/My Drive/ARC-V1-Feb2018-2/ARC-Challenge-Test.csv')

In [5]:
print(f'Shape of train easy dataset is {df_easy_train.shape}')
print(f'Shape of dev easy dataset is {df_easy_dev.shape}')
print(f'Shape of test easy dataset is {df_easy_test.shape}')

Shape of train easy dataset is (2251, 12)
Shape of dev easy dataset is (570, 12)
Shape of test easy dataset is (2376, 12)


In [6]:
print(f'Shape of train challenge dataset is {df_challenge_train.shape}')
print(f'Shape of dev challenge dataset is {df_challenge_dev.shape}')
print(f'Shape of test challenge dataset is {df_challenge_test.shape}')

Shape of train challenge dataset is (1119, 12)
Shape of dev challenge dataset is (299, 12)
Shape of test challenge dataset is (1172, 12)


In [7]:
df_easy_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2251 entries, 0 to 2250
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   questionID                2251 non-null   object 
 1   originalQuestionID        2251 non-null   object 
 2   totalPossiblePoint        2251 non-null   int64  
 3   AnswerKey                 2251 non-null   object 
 4   isMultipleChoiceQuestion  2251 non-null   int64  
 5   includesDiagram           2251 non-null   int64  
 6   examName                  2251 non-null   object 
 7   schoolGrade               2251 non-null   int64  
 8   year                      2251 non-null   object 
 9   question                  2251 non-null   object 
 10  subject                   0 non-null      float64
 11  category                  2251 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 211.2+ KB


Information regarding above columns are as follows:

questionID: Unique identifier for the question.

originalQuestionID: Legacy ID used within AI2.

totalPossiblePoint: The point value of the question for grading purposes.

AnswerKey: The letter signifying the correct answer option for the question.

isMultipleChoice: 1 indicates the question is multiple choice.

includesDiagram: 0 indicates the question does not include a diagram.

examName: The name of the source exam for these questions.

schoolGrade: The intended grade level for the question.

year: The year the questions were sourced for AI2.

question: The question and its answer options. Each answer option is indicated by a letter in parentheses, e.g., (A) and (B).

subject: The question's subject; this is left blank in this data set.

category: Whether the question is a Train, Dev, or Test question.

In [8]:
df_easy_train.head(5)

Unnamed: 0,questionID,originalQuestionID,totalPossiblePoint,AnswerKey,isMultipleChoiceQuestion,includesDiagram,examName,schoolGrade,year,question,subject,category
0,Mercury_7220990,7220990,1,B,1,0,Mercury,8,2015,Which factor will most likely cause a person t...,,Train
1,MCAS_2007_8_5189,5189,1,B,1,0,MCAS,8,2007,Lichens are symbiotic organisms made of green ...,,Train
2,Mercury_SC_401169,401169,1,D,1,0,Mercury,5,2015,When a switch is used in an electrical circuit...,,Train
3,MCAS_2004_8_27,27,1,A,1,0,MCAS,8,2004,Which of the following is an example of an ass...,,Train
4,NYSEDREGENTS_2006_8_10,10,1,3,1,0,NYSEDREGENTS,8,2006,"Rocks are classified as igneous, metamorphic, ...",,Train


In [9]:

imp_columns = ['AnswerKey','question']

df_easy_train = df_easy_train[imp_columns]
df_easy_dev = df_easy_dev[imp_columns]
df_easy_test = df_easy_test[imp_columns]

df_challenge_train = df_challenge_train[imp_columns]
df_challenge_dev = df_challenge_dev[imp_columns]
df_challenge_test = df_challenge_test[imp_columns]

### question.

In [10]:
df_easy_train['question'].iloc[0]

'Which factor will most likely cause a person to develop a fever?  (A) a leg muscle relaxing after exercise (B) a bacterial population in the bloodstream (C) several viral particles on the skin (D) carbohydrates being digested in the stomach'

In [11]:
df_easy_train['question'].iloc[4]

'Rocks are classified as igneous, metamorphic, or sedimentary according to (1) their color (2) their shape (3) how they formed (4) the minerals they contain'

In [12]:

file = open('drive/MyDrive/ARC/ARC_Corpus.txt','r')

In [13]:
def extract_question(question):
    '''
    This function will take a value in question column then, will split the string 
    at (A) or (1). After splitting we will take only first part which is the question.
    '''
    if '(A)' in question:
        x = question.split('(A)')[0]
    elif '(1)' in question:
        x = question.split('(1)')[0]

    return x

In [14]:
def extract_answers(question):
    '''
    This function will take a value in question column then, will split the string 
    at (A) or (1). After splitting we will take only first part which is the question.
    '''
    if '(A)' in question:
        x = '(A) ' + question.split('(A)')[1]
    elif '(1)' in question:
        x = '(1) ' + question.split('(1)')[1]

    return x

In [15]:
df_easy_train['only_question'] = df_easy_train['question'].apply(extract_question)
df_easy_dev['only_question'] = df_easy_dev['question'].apply(extract_question)
df_easy_test['only_question'] = df_easy_test['question'].apply(extract_question)
df_challenge_train['only_question'] = df_challenge_train['question'].apply(extract_question)
df_challenge_dev['only_question'] = df_challenge_dev['question'].apply(extract_question)
df_challenge_test['only_question'] = df_challenge_test['question'].apply(extract_question)

In [16]:
df_easy_train['only_answers'] = df_easy_train['question'].apply(extract_answers)
df_easy_dev['only_answers'] = df_easy_dev['question'].apply(extract_answers)
df_easy_test['only_answers'] = df_easy_test['question'].apply(extract_answers)
df_challenge_train['only_answers'] = df_challenge_train['question'].apply(extract_answers)
df_challenge_dev['only_answers'] = df_challenge_dev['question'].apply(extract_answers)
df_challenge_test['only_answers'] = df_challenge_test['question'].apply(extract_answers)

In [17]:
#Adding length of a question as a new column.
df_easy_train['quesiton_length'] = df_easy_train['only_question'].apply(lambda x: len(x))
df_easy_dev['quesiton_length'] = df_easy_dev['only_question'].apply(lambda x: len(x))
df_easy_test['quesiton_length'] = df_easy_test['only_question'].apply(lambda x: len(x))
df_challenge_train['quesiton_length'] = df_challenge_train['only_question'].apply(lambda x: len(x))
df_challenge_dev['quesiton_length'] = df_challenge_dev['only_question'].apply(lambda x: len(x))
df_challenge_test['quesiton_length'] = df_challenge_test['only_question'].apply(lambda x: len(x))

In [18]:
#Saving the newly created dataframes.
df_easy_train.to_csv('df_easy_train',index=False)
df_easy_dev.to_csv('df_easy_dev',index=False)
df_easy_test.to_csv('df_easy_test',index=False)

In [19]:
df_challenge_train.to_csv('df_challenge_train',index=False)
df_challenge_dev.to_csv('df_challenge_dev',index=False)
df_challenge_test.to_csv('df_challenge_test',index=False)

In [20]:
!pip install elasticsearch==7.9.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting elasticsearch==7.9.1
  Downloading elasticsearch-7.9.1-py2.py3-none-any.whl (219 kB)
[K     |████████████████████████████████| 219 kB 7.0 MB/s 
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.9.1


In [21]:
!pip install tensorflow-io

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-io
  Downloading tensorflow_io-0.26.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 1.3 MB/s 
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.26.0


In [22]:
%%bash

wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.1-linux-x86_64.tar.gz
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.1-linux-x86_64.tar.gz.sha512
tar -xzf elasticsearch-oss-7.9.1-linux-x86_64.tar.gz
sudo chown -R daemon:daemon elasticsearch-7.9.1/
shasum -a 512 -c elasticsearch-oss-7.9.1-linux-x86_64.tar.gz.sha512 

elasticsearch-oss-7.9.1-linux-x86_64.tar.gz: OK


In [23]:
%%bash --bg

sudo -H -u daemon elasticsearch-7.9.1/bin/elasticsearch

Starting job # 0 in a separate thread.


In [24]:
import os
import time
# Sleep for few seconds to let the instance start.
time.sleep(20)

In [25]:
%%bash

ps -ef | grep elasticsearch

root         380     378  0 03:09 ?        00:00:00 sudo -H -u daemon elasticsearch-7.9.1/bin/elasticsearch
daemon       381     380 99 03:09 ?        00:00:26 /content/elasticsearch-7.9.1/jdk/bin/java -Xshare:auto -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -XX:+ShowCodeDetailsInExceptionMessages -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dio.netty.allocator.numDirectArenas=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.locale.providers=SPI,COMPAT -Xms1g -Xmx1g -XX:+UseG1GC -XX:G1ReservePercent=25 -XX:InitiatingHeapOccupancyPercent=30 -Djava.io.tmpdir=/tmp/elasticsearch-1125315475300519567 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecou

In [26]:
%%bash

curl -sX GET "localhost:9200/"

{
  "name" : "7c7d7789480b",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "TUqFhBPsRdCLrnbmKY9_qg",
  "version" : {
    "number" : "7.9.1",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "083627f112ba94dffc1232e8b42b73492789ef91",
    "build_date" : "2020-09-01T21:22:21.964974Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [27]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

import pandas as pd
import numpy as np
import re

In [28]:
es = Elasticsearch([{'host':'localhost', 'port':9200}])

In [29]:
es.ping()

True

In [30]:
corpus2 = open('drive/MyDrive/ARC/ARC_Corpus.txt', 'r', encoding='utf-8')

In [31]:
def create_index(es_client):
    """ Creates an Elasticsearch index."""
    is_created = False
    # Index settings
    settings = {
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },
        "mappings": {
            "dynamic": "true",
            "_source": {
            "enabled": "true"
            },
            "properties": {
                "body": {
                    "type": "text"
                }
            }
        }
    }
    print('Creating `corpus2` index...')
    try:
        if es_client.indices.exists('corpus2'):
            es_client.indices.delete(index='corpus2', ignore=[404])
        es_client.indices.create(index='corpus2', body=settings)
        is_created = True
        print('index `corpus2` created successfully.')
    except Exception as ex:
        print(str(ex))
    finally:
        return is_created
    return is_created

In [32]:
create_index(es)

Creating `corpus2` index...
index `corpus2` created successfully.


True

In [33]:
from elasticsearch.helpers import bulk
def index_batch(docs):
    """ Indexes a batch of documents."""
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = 'corpus2'
        request["body"] = doc['body']
        requests.append(request)
    bulk(es, requests)

In [34]:
def index_data(es_client, data, BATCH_SIZE=100000):
    """ Indexs all the rows in data (python questions)."""
    docs = []
    count = 0
    for line in data:
        js_object = {}
        js_object['body'] = line
        docs.append(js_object)
        count += 1

        if count % BATCH_SIZE == 0:
            index_batch(docs)
            docs = []
            print('Indexed {} documents.'.format(count))
    if docs:
        index_batch(docs)
        print('Indexed {} documents.'.format(count))

    es_client.indices.refresh(index='corpus2')
    print("Done indexing.")

In [35]:
index_data(es,corpus2.readlines(), BATCH_SIZE=100000)

Indexed 100000 documents.
Indexed 200000 documents.
Indexed 300000 documents.
Indexed 400000 documents.
Indexed 500000 documents.
Indexed 600000 documents.
Indexed 700000 documents.
Indexed 800000 documents.
Indexed 900000 documents.
Indexed 1000000 documents.
Indexed 1100000 documents.
Indexed 1200000 documents.
Indexed 1300000 documents.
Indexed 1400000 documents.
Indexed 1500000 documents.
Indexed 1600000 documents.
Indexed 1700000 documents.
Indexed 1800000 documents.
Indexed 1900000 documents.
Indexed 2000000 documents.
Indexed 2100000 documents.
Indexed 2200000 documents.
Indexed 2300000 documents.
Indexed 2400000 documents.
Indexed 2500000 documents.
Indexed 2600000 documents.
Indexed 2700000 documents.
Indexed 2800000 documents.
Indexed 2900000 documents.
Indexed 3000000 documents.
Indexed 3100000 documents.
Indexed 3200000 documents.
Indexed 3300000 documents.
Indexed 3400000 documents.
Indexed 3500000 documents.
Indexed 3600000 documents.
Indexed 3700000 documents.
Indexed 38

In [36]:
import json
import time
from functools import reduce
import operator

In [37]:
df_easy_train = pd.read_csv('df_easy_train')
df_easy_dev = pd.read_csv('df_easy_dev')
df_easy_test = pd.read_csv('df_easy_test')

In [38]:
df_challenge_train = pd.read_csv('df_challenge_train')
df_challenge_dev = pd.read_csv('df_challenge_dev')
df_challenge_test = pd.read_csv('df_challenge_test')

In [39]:
df_easy_train.head()

Unnamed: 0,AnswerKey,question,only_question,only_answers,quesiton_length
0,B,Which factor will most likely cause a person t...,Which factor will most likely cause a person t...,(A) a leg muscle relaxing after exercise (B) ...,66
1,B,Lichens are symbiotic organisms made of green ...,Lichens are symbiotic organisms made of green ...,(A) carbon dioxide (B) food (C) protection (D...,139
2,D,When a switch is used in an electrical circuit...,When a switch is used in an electrical circuit...,(A) cause the charge to build. (B) increase a...,63
3,A,Which of the following is an example of an ass...,Which of the following is an example of an ass...,(A) contact lens (B) motorcycle (C) raincoat ...,61
4,3,"Rocks are classified as igneous, metamorphic, ...","Rocks are classified as igneous, metamorphic, ...",(1) their color (2) their shape (3) how they ...,74


In [40]:
def data_generator(a):
    '''
    This function will accept a data point and returns a list of options
    converts the only_options list of options.

    Output will be as follows:

    options = [option_1, option_2, option_3, option_4, option_5]                   

    Note: If there's no option E then it will be written as 'None of the above'.
    '''
    options = []

    if '(A)' in a:
        option_1 = a.split('(B)')[0].replace('(A)','').lstrip()

        b = ' '.join(i for i in a.split() if i not in option_1)
        option_2 = b.split('(C)')[0].replace('(B)','').replace('(A)','').lstrip()

        c = ' '.join(i for i in b.split() if i not in option_2)
        option_3 = c.split('(D)')[0].replace('(C)','').replace('(B)','').replace('(A)','').lstrip()

        if '(D)' not in c:
            option_4 = 'None of the above'
            option_5 = 'None of the above'
        else:
            if '(E)' not in c:
                option_4 = c.split('(D)')[1].lstrip()
                option_5 = 'None of the above'
            else:
                d = ' '.join(i for i in c.split() if i not in option_3)
                option_4 = d.split('(E)')[0].replace('(D)','').replace('(C)','').replace('(B)','').replace('(A)','').lstrip()
                option_5 = d.split('(E)')[1].lstrip()
    else:
        option_1 = a.split('(2)')[0].replace('(1)','').lstrip()

        b = ' '.join(i for i in a.split() if i not in option_1)
        option_2 = b.split('(3)')[0].replace('(2)','').replace('(1)','').lstrip()

        c = ' '.join(i for i in b.split() if i not in option_2)
        option_3 = c.split('(4)')[0].replace('(3)','').replace('(2)','').replace('(1)','').lstrip()

        if '(4)' in c:
            option_4 = c.split('(4)')[1].lstrip()
        else:
            option_4 = 'None of the above'
        option_5 = 'None of the above'

    options = [option_1, option_2, option_3, option_4, option_5]

    return options

In [41]:
df_easy_dev['options_list'] = df_easy_dev['only_answers'].map(data_generator)
df_easy_train['options_list'] = df_easy_train['only_answers'].map(data_generator)
df_easy_test['options_list'] = df_easy_test['only_answers'].map(data_generator)

In [42]:
df_easy_train.head()

Unnamed: 0,AnswerKey,question,only_question,only_answers,quesiton_length,options_list
0,B,Which factor will most likely cause a person t...,Which factor will most likely cause a person t...,(A) a leg muscle relaxing after exercise (B) ...,66,"[a leg muscle relaxing after exercise , bacter..."
1,B,Lichens are symbiotic organisms made of green ...,Lichens are symbiotic organisms made of green ...,(A) carbon dioxide (B) food (C) protection (D...,139,"[carbon dioxide , food , protection , water, N..."
2,D,When a switch is used in an electrical circuit...,When a switch is used in an electrical circuit...,(A) cause the charge to build. (B) increase a...,63,"[cause the charge to build. , increase and dec..."
3,A,Which of the following is an example of an ass...,Which of the following is an example of an ass...,(A) contact lens (B) motorcycle (C) raincoat ...,61,"[contact lens , motorcycle , raincoat , coffee..."
4,3,"Rocks are classified as igneous, metamorphic, ...","Rocks are classified as igneous, metamorphic, ...",(1) their color (2) their shape (3) how they ...,74,"[their color , shape , how they formed , miner..."


In [62]:
df_challenge_dev['options_list'] = df_challenge_dev['only_answers'].map(data_generator)
df_challenge_train['options_list'] = df_challenge_train['only_answers'].map(data_generator)
df_challenge_test['options_list'] = df_challenge_test['only_answers'].map(data_generator)

In [51]:
def get_context(question, options):
    '''
    This function will return a context after joining a question
    and its options separately. 
    For example: question + option_1
                 question + option_2
                 question + option_3
                 question + option_4
   '''
    scores = []
    sentences = []
    choices = []
    
    #Dividing options into separate parts.
    if '(A)' in options:

        split_1 = options.split('(B)')
        option_1 = split_1[0].replace('(A)','')
        choices.append(option_1)

        split_2 = split_1[1].split('(C)')
        option_2 = split_2[0]
        choices.append(option_2)

        if '(D)' not in split_2[1]:
            option_3 = split_2[1]
            choices.append(option_3)

        elif '(D)' in split_2[1]:
            split_3 = split_2[1].split('(D)')
            option_3 = split_3[0]
            choices.append(option_3)

            if '(E)' not in split_3[1]:
                option_4 = split_3[1]
                choices.append(option_4)

            else:
                split_4 = split_3[1].split('(E)')
                option_4 = split_4[0]
                choices.append(option_4)

                option_5 = split_4[1]
                choices.append(option_5)
    else:
        split_1 = options.split('(2)')
        option_1 = split_1[0].replace('(1)','')
        choices.append(option_1)

        split_2 = split_1[1].split('(3)')
        option_2 = split_2[0]
        choices.append(option_2)

        if '(4)' not in split_2[1]:
            option_3 = split_2[1]
            choices.append(option_3)

        elif '(4)' in split_2[1]:
            split_3 = split_2[1].split('(4)')
            option_3 = split_3[0]
            choices.append(option_3)

            if '(5)' not in split_3[1]:
                option_4 = split_3[1]
                choices.append(option_4)

            else:
                split_4 = split_3[1].split('(5)')
                option_4 = split_4[0]
                choices.append(option_4)

                option_5 = split_4[1]
                choices.append(option_5)
    
    for i in [choices]:
  
        query=question+f'{i}'
        search_start = time.time()
        search = {"size":20,"query": {"match": {"body": query}}}
        #print(search)
        response = es.search(index='corpus2', body=json.dumps(search))
        search_time = time.time() - search_start
        #print("{} total hits.".format(response["hits"]["total"]["value"]))
        #print("search time: {:.2f} ms".format(search_time * 1000))

        #Searching in the 'corpus' index.
        a = es.search(index='corpus2', body=json.dumps(search))

        for j in a['hits']['hits']:
            sentence = j['_source']
            out=list(sentence.values())
            out = reduce(operator.concat,out)
            sentences.append(out)

   
    
    context= ' '.join(sentences) #Selecting top 20 sentences for the context.
    
    return context

In [52]:
def get_context_for_each_candidate(question,options_list):
    """ Searches the user query and finds the best matches using elasticsearch."""
    #query = input("Enter query: ")
    """This function will return a context after joining a question
    and its options separately. 
    For example: question + option_1
                 question + option_2
                 question + option_3
                 question + option_4"""
   
    option_1_docs = []
    option_2_docs =  []
    option_3_docs =  []
    option_4_docs =  []
    option_5_docs =  []
    for j in range(len(options_list)):
        query=question+options_list[j]
        search_start = time.time()
        search = {"size": 8,"query": {"match": {"body": query}}}
        #print(search)
        response = es.search(index='corpus2', body=json.dumps(search))
        search_time = time.time() - search_start
        #print("{} total hits.".format(response["hits"]["total"]["value"]))
        #print("search time: {:.2f} ms".format(search_time * 1000))
        sentences = []
        for hit in response["hits"]["hits"]:
            sentence = hit['_source']
            out=list(sentence.values())
            out = reduce(operator.concat,out)
           
            if j == 0:
                option_1_docs.append(out)
            elif j == 1:
                option_2_docs.append(out)
            elif j == 2:
                option_3_docs.append(out)
            elif j == 3:
                option_4_docs.append(out)
            elif j == 4:
                option_5_docs.append(out)
        
    return pd.Series([option_1_docs, option_2_docs, option_3_docs, option_4_docs, option_5_docs])



In [53]:
a = df_easy_train['only_question'].iloc[1139]
b = df_easy_train['options_list'].iloc[1139]

In [56]:
q,w,e,r,t=get_context_for_each_candidate(a,b)

search time: 929.67 ms
search time: 858.03 ms
search time: 341.28 ms
search time: 230.67 ms
search time: 225.32 ms


In [None]:
df_easy_train[['option_1_docs','option_2_docs','option_3_docs','option_4_docs','option_5_docs',]] = df_easy_train[['only_question','options_list']].apply(lambda x: get_context_for_each_candidate(*x), axis=1)
df_easy_dev[['option_1_docs','option_2_docs','option_3_docs','option_4_docs','option_5_docs',]] = df_easy_dev[['only_question','options_list']].apply(lambda x: get_context_for_each_candidate(*x), axis=1)
df_easy_test[['option_1_docs','option_2_docs','option_3_docs','option_4_docs','option_5_docs',]] = df_easy_test[['only_question','options_list']].apply(lambda x: get_context_for_each_candidate(*x), axis=1)

In [59]:
df_easy_train.head()

Unnamed: 0,AnswerKey,question,only_question,only_answers,quesiton_length,options_list,option_1_docs,option_2_docs,option_3_docs,option_4_docs,option_5_docs
0,B,Which factor will most likely cause a person t...,Which factor will most likely cause a person t...,(A) a leg muscle relaxing after exercise (B) ...,66,"[a leg muscle relaxing after exercise , bacter...",[Which will most likely cause variations to oc...,[If a bacterial infection is the suspected cau...,[A person who has been sick for one week with ...,[Which will most likely cause variations to oc...,[Fever: most infections of the lung will cause...
1,B,Lichens are symbiotic organisms made of green ...,Lichens are symbiotic organisms made of green ...,(A) carbon dioxide (B) food (C) protection (D...,139,"[carbon dioxide , food , protection , water, N...",[Lichens are comprised of blue-green or green ...,[Lichens are comprised of blue-green or green ...,[Lichens are comprised of blue-green or green ...,[Lichens are comprised of blue-green or green ...,[Lichens are comprised of blue-green or green ...
2,D,When a switch is used in an electrical circuit...,When a switch is used in an electrical circuit...,(A) cause the charge to build. (B) increase a...,63,"[cause the charge to build. , increase and dec...",[When A Switch Is Closed In An Electrical Circ...,[When A Switch Is Closed In An Electrical Circ...,[When A Switch Is Closed In An Electrical Circ...,[When A Switch Is Closed In An Electrical Circ...,[When A Switch Is Closed In An Electrical Circ...
3,A,Which of the following is an example of an ass...,Which of the following is an example of an ass...,(A) contact lens (B) motorcycle (C) raincoat ...,61,"[contact lens , motorcycle , raincoat , coffee...",[Which of the following is an example of an an...,[Which of the following is an example of an an...,[Which of the following is an example of an an...,[Which of the following is an example of an an...,[Which of the following is an example of an an...
4,3,"Rocks are classified as igneous, metamorphic, ...","Rocks are classified as igneous, metamorphic, ...",(1) their color (2) their shape (3) how they ...,74,"[their color , shape , how they formed , miner...","[Metamorphic rocks can begin as igneous, sedim...","[Metamorphic rocks can begin as igneous, sedim...","[Rocks are formed on Earth as igneous, sedimen...","[Metamorphic rocks can begin as igneous, sedim...","[Metamorphic rocks can begin as igneous, sedim..."


In [60]:
df_easy_train.to_csv('df_easy_train_3.csv', index=False)
df_easy_dev.to_csv('df_easy_dev_3.csv', index=False)
df_easy_test.to_csv('df_easy_test_3.csv', index=False)

In [None]:
df_challenge_train[['option_1_docs','option_2_docs','option_3_docs','option_4_docs','option_5_docs',]] = df_challenge_train[['only_question','options_list']].apply(lambda x: get_context_for_each_candidate(*x), axis=1)
df_challenge_dev[['option_1_docs','option_2_docs','option_3_docs','option_4_docs','option_5_docs',]] = df_challenge_dev[['only_question','options_list']].apply(lambda x: get_context_for_each_candidate(*x), axis=1)
df_challenge_test[['option_1_docs','option_2_docs','option_3_docs','option_4_docs','option_5_docs',]] = df_challenge_test[['only_question','options_list']].apply(lambda x: get_context_for_each_candidate(*x), axis=1)

In [64]:
df_challenge_train.to_csv('df_challenge_train_3.csv', index=False)
df_challenge_dev.to_csv('df_challenge_dev_3.csv', index=False)
df_challenge_test.to_csv('df_challenge_test_3.csv', index=False)

In [None]:
list_of_context = []

for i in range(len(df_easy_train)):
    question = df_easy_train['only_question'].iloc[i]
    answers = df_easy_train['only_answers'].iloc[i]
    
    context = get_context(question, answers)
    list_of_context.append(context)

In [66]:
df_easy_train['context'] = list_of_context

In [67]:
df_easy_train.head()

Unnamed: 0,AnswerKey,question,only_question,only_answers,quesiton_length,options_list,option_1_docs,option_2_docs,option_3_docs,option_4_docs,option_5_docs,context
0,B,Which factor will most likely cause a person t...,Which factor will most likely cause a person t...,(A) a leg muscle relaxing after exercise (B) ...,66,"[a leg muscle relaxing after exercise , bacter...",[Which will most likely cause variations to oc...,[If a bacterial infection is the suspected cau...,[A person who has been sick for one week with ...,[Which will most likely cause variations to oc...,[Fever: most infections of the lung will cause...,If a bacterial infection is the suspected caus...
1,B,Lichens are symbiotic organisms made of green ...,Lichens are symbiotic organisms made of green ...,(A) carbon dioxide (B) food (C) protection (D...,139,"[carbon dioxide , food , protection , water, N...",[Lichens are comprised of blue-green or green ...,[Lichens are comprised of blue-green or green ...,[Lichens are comprised of blue-green or green ...,[Lichens are comprised of blue-green or green ...,[Lichens are comprised of blue-green or green ...,Lichens are comprised of blue-green or green a...
2,D,When a switch is used in an electrical circuit...,When a switch is used in an electrical circuit...,(A) cause the charge to build. (B) increase a...,63,"[cause the charge to build. , increase and dec...",[When A Switch Is Closed In An Electrical Circ...,[When A Switch Is Closed In An Electrical Circ...,[When A Switch Is Closed In An Electrical Circ...,[When A Switch Is Closed In An Electrical Circ...,[When A Switch Is Closed In An Electrical Circ...,VOLTAGE - That force which is generated to cau...
3,A,Which of the following is an example of an ass...,Which of the following is an example of an ass...,(A) contact lens (B) motorcycle (C) raincoat ...,61,"[contact lens , motorcycle , raincoat , coffee...",[Which of the following is an example of an an...,[Which of the following is an example of an an...,[Which of the following is an example of an an...,[Which of the following is an example of an an...,[Which of the following is an example of an an...,Which of the following is an example of an ang...
4,3,"Rocks are classified as igneous, metamorphic, ...","Rocks are classified as igneous, metamorphic, ...",(1) their color (2) their shape (3) how they ...,74,"[their color , shape , how they formed , miner...","[Metamorphic rocks can begin as igneous, sedim...","[Metamorphic rocks can begin as igneous, sedim...","[Rocks are formed on Earth as igneous, sedimen...","[Metamorphic rocks can begin as igneous, sedim...","[Metamorphic rocks can begin as igneous, sedim...","Igneous, sedimentary, and metamorphic rocks; h..."


In [None]:
df_easy_dev['context'] = df_easy_dev[['only_question','only_answers']].apply(lambda x: get_context(*x), axis=1)

In [None]:
df_easy_test['context'] = df_easy_test[['only_question','only_answers']].apply(lambda x: get_context(*x), axis=1)

In [None]:
df_challenge_train['context'] = df_challenge_train[['only_question','only_answers']].apply(lambda x: get_context(*x), axis=1)
df_challenge_dev['context'] = df_challenge_dev[['only_question','only_answers']].apply(lambda x: get_context(*x), axis=1)
df_challenge_test['context'] = df_challenge_test[['only_question','only_answers']].apply(lambda x: get_context(*x), axis=1)

In [75]:
def replace_line_breaks(context):
    '''
    Function to replace '\n' with empty strings('').
    '''
    return context.replace('\n','')

In [76]:
df_easy_train['context'] = df_easy_train['context'].apply(replace_line_breaks)
df_easy_dev['context'] = df_easy_dev['context'].apply(replace_line_breaks)
df_easy_test['context'] = df_easy_test['context'].apply(replace_line_breaks)

df_challenge_train['context'] = df_challenge_train['context'].apply(replace_line_breaks)
df_challenge_dev['context'] = df_challenge_dev['context'].apply(replace_line_breaks)
df_challenge_test['context'] = df_challenge_test['context'].apply(replace_line_breaks)


In [77]:
def preprocess_context(x):

    x = re.sub(r"\xa0", " ", x) #\xa0 represents space in Unicode. Thus removing it.

    #Decontracting the words.
    x = re.sub(r"won't", "will not", x)
    x = re.sub(r"can\'t", "can not", x)
    x = re.sub(r"n\'t", " not", x)
    x = re.sub(r"\'re", " are", x)
    x = re.sub(r"\'s", " is", x)
    x = re.sub(r"\'d", " would", x)
    x = re.sub(r"\'ll", " will", x)
    x = re.sub(r"\'t", " not", x)
    x = re.sub(r"\'ve", " have", x)
    x = re.sub(r"\'m", " am", x)
    x = re.sub(r"b/t", "between", x)
    x = re.sub(r"mph", "miles per hour", x)
    x = re.sub(r"km/h", "kilometers per hour", x)

    x = re.sub(r"[^A-Za-z0-9%+]+", " ", x) #Removing special characters.
    x = re.sub(r"\s+", " ", x) #Removing extra spaces.

    return x

In [79]:
df_easy_train['context'] = df_easy_train['context'].apply(preprocess_context)
df_easy_dev['context'] = df_easy_dev['context'].apply(preprocess_context)
df_easy_test['context'] = df_easy_test['context'].apply(preprocess_context)

df_challenge_train['context'] = df_challenge_train['context'].apply(preprocess_context)
df_challenge_dev['context'] = df_challenge_dev['context'].apply(preprocess_context)
df_challenge_test['context'] = df_challenge_test['context'].apply(preprocess_context)

In [80]:
df_easy_train.to_csv('df_easy_train_2.csv', index=False)
df_easy_dev.to_csv('df_easy_dev_2.csv', index=False)
df_easy_test.to_csv('df_easy_test_2.csv', index=False)

df_challenge_train.to_csv('df_challenge_train_2.csv', index=False)
df_challenge_dev.to_csv('df_challenge_dev_2.csv', index=False)
df_challenge_test.to_csv('df_challenge_test_2.csv', index=False)