# Importing Libraries and Dataset

Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

Importing the COVID-QA dataset (`covid_qa_deepset`), in which every question–answer pair is mapped to its source article (the context)

In [None]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset('covid_qa_deepset')



Reusing dataset covid_qa_deepset (/root/.cache/huggingface/datasets/covid_qa_deepset/covid_qa_deepset/1.0.0/fb886523842e312176f92ec8e01e77a08fa15a694f5741af6fc42796ee9c8c46)


# Data Preprocessing

Converting the dataset dictionary object to a DataFrame

In [None]:
# Wrap the loaded dataset object in a DataFrame. The result is read further
# down through its `train` column, whose entries are per-example dictionaries
# (keys such as 'answers', 'context', 'id', 'document_id', 'question').
data = pd.DataFrame.from_dict(dataset)

In [None]:
# Show the first training example to inspect the nested record layout.
data['train'].loc[0]

{'answers': {'answer_start': [370],
  'text': ['Mother-to-child transmission (MTCT) is the main cause of HIV-1 infection in children worldwide.']},
 'context': "Functional Genetic Variants in DC-SIGNR Are Associated with Mother-to-Child Transmission of HIV-1\n\nhttps://www.ncbi.nlm.nih.gov/pmc/articles/PMC2752805/\n\nBoily-Larouche, Geneviève; Iscache, Anne-Laure; Zijenah, Lynn S.; Humphrey, Jean H.; Mouland, Andrew J.; Ward, Brian J.; Roger, Michel\n2009-10-07\nDOI:10.1371/journal.pone.0007211\nLicense:cc-by\n\nAbstract: BACKGROUND: Mother-to-child transmission (MTCT) is the main cause of HIV-1 infection in children worldwide. Given that the C-type lectin receptor, dendritic cell-specific ICAM-grabbing non-integrin-related (DC-SIGNR, also known as CD209L or liver/lymph node–specific ICAM-grabbing non-integrin (L-SIGN)), can interact with pathogens including HIV-1 and is expressed at the maternal-fetal interface, we hypothesized that it could influence MTCT of HIV-1. METHODS AND FINDIN

Extracting the individual features from the dictionary objects stored in the single `train` column

In [None]:
# Materialise the per-example dictionaries once, then pull each field out into
# its own list. The list names are consumed by the DataFrame-building cell.
records = data.train.tolist()

ans = [record['answers']['text'] for record in records]   # list of answer strings per example
ques = [record['question'] for record in records]
context = [record['context'] for record in records]
id = [record['id'] for record in records]
document_id = [record['document_id'] for record in records]

Creating the final DataFrame from the extracted columns

In [None]:
# Assemble the flat table in one step; the dictionary order fixes the column
# order: id, document_id, context, question, answer.
df = pd.DataFrame({
    'id': id,
    'document_id': document_id,
    'context': context,
    'question': ques,
    'answer': ans,
})

Converting each answer from a list of strings to a single string


In [None]:
def _join_answer_parts(parts):
    """Join the fragments of one answer into a single space-separated string.

    A value that is already a string is returned unchanged, so re-running this
    cell does not split an already-converted answer into single characters.
    """
    if isinstance(parts, str):
        return parts
    return ' '.join(str(elem) for elem in parts)


# Replace the list-valued `answer` column with plain strings. Assigning to the
# existing column overwrites it, so no separate drop step is needed (the
# previous `df.drop(...)` call discarded its result and had no effect).
ans = [_join_answer_parts(parts) for parts in df['answer']]
df['answer'] = ans

Checking the unique set of documents

In [None]:
# List the distinct source documents, in order of first appearance.
df['document_id'].unique()

array([ 630,  650, 1546, 1545, 1552, 1553, 1557, 1565,  641, 1547,  187,
       1582, 1571, 1585, 1575, 2585, 1556, 1569, 1551, 1594, 1570, 1567,
       1563, 1549, 1564, 1560, 1561, 1562, 1573, 1591, 1579, 1583, 1588,
       1589, 1554, 1568, 1566, 1548, 1574, 1581, 1593, 1550, 1572, 1576,
       1590, 1578, 1580, 1584, 1586, 1587, 1592, 1597, 1598, 1600, 1603,
       1606, 1596, 1604, 1599, 1601, 1602, 1605, 1607, 1608, 1632, 1631,
       1628, 2439, 2432, 1620, 2437, 1625, 1627, 2440, 1633, 1634, 2466,
       2458, 1621, 1629, 1623, 2459, 1663, 1660, 1645, 2461, 2463, 1652,
       1656, 2450, 1676, 1661, 1662, 1674, 1666, 1667, 1679, 1671, 2486,
       1664, 1730, 1665, 1595, 1684, 1719, 1686, 1687, 1690, 1689, 1688,
       1691, 1698,  185,  186, 1741, 2527,  188, 2522, 1714, 2519, 1618,
       1722, 2592, 2551, 2555, 2554, 1740, 2526, 2504, 2565, 2683, 2675,
       2653, 2652, 2642, 2669, 2628, 2684, 2620, 2634, 2674, 2651, 1559,
       2668, 2643,  776, 1713])

In [None]:
# Inspect every question/answer pair that belongs to document 1713.
df.loc[df['document_id'] == 1713]

Unnamed: 0,id,document_id,context,question,answer
2014,5315,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",What is the structure of the Ebolavirus?,single-strand RNA filoviruses
2015,5316,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",When was the West African Ebolavirus outbreak?,2013-2016
2016,5317,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",What animals are considered to be maintenance ...,African bats
2017,5318,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",What do circles indicate in Figure 1?,a maintenance function play by the host(s)
2018,5319,1713,"Ebola Virus Maintenance: If Not (Only) Bats, W...",What do arrows indicate in Figure 1?,infectious transmission pathways between hosts


Concatenating the answer and the context into a single input text to feed to the T5 transformer model


In [None]:
# Build the model input as "<answer> {answer} <context> {context}".
answer_segment = '<answer> ' + df['answer'].astype(str)
context_segment = ' <context> ' + df['context'].astype(str)
df['text'] = answer_segment + context_segment

In [None]:
# Preview the first five rows, including the new `text` column.
df.head(n=5)

Unnamed: 0,id,document_id,context,question,answer,text
0,262,630,Functional Genetic Variants in DC-SIGNR Are As...,What is the main cause of HIV-1 infection in c...,Mother-to-child transmission (MTCT) is the mai...,<answer> Mother-to-child transmission (MTCT) i...
1,276,630,Functional Genetic Variants in DC-SIGNR Are As...,What plays the crucial role in the Mother to C...,DC-SIGNR plays a crucial role in MTCT of HIV-1...,<answer> DC-SIGNR plays a crucial role in MTCT...
2,278,630,Functional Genetic Variants in DC-SIGNR Are As...,How many children were infected by HIV-1 in 20...,"more than 400,000 children were infected world...","<answer> more than 400,000 children were infec..."
3,316,630,Functional Genetic Variants in DC-SIGNR Are As...,What is the role of C-C Motif Chemokine Ligand...,"High copy numbers of CCL3L1, a potent HIV-1 su...","<answer> High copy numbers of CCL3L1, a potent..."
4,305,630,Functional Genetic Variants in DC-SIGNR Are As...,What is DC-GENR and where is it expressed?,Dendritic cell-specific ICAM-grabbing non-inte...,<answer> Dendritic cell-specific ICAM-grabbing...


# Storing the data on google drive

Mount Google Drive

In [None]:
# Colab-only step: attach Google Drive under /content/gdrive so that the paths
# used by the following cells (/content/gdrive/MyDrive/...) are available.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Choose a Google Drive directory to store the data.


In [None]:
# Directory on the mounted drive that receives every CSV written below.
download_path = os.path.expanduser('/content/gdrive/MyDrive/QG_dataset')

# Create the directory if needed. `exist_ok=True` tolerates an already
# existing directory while still surfacing real failures (e.g. permission
# errors) instead of silently ignoring them.
os.makedirs(download_path, exist_ok=True)

Saving the whole data to the dataset directory


In [None]:
# Persist the complete processed table (index included) as Dataset.csv.
dataset_csv_path = os.path.join(download_path, 'Dataset.csv')
df.to_csv(dataset_csv_path)

Splitting the data for the T5 transformer for the question generation task


In [None]:
# Hold out 15% of the (question, text) pairs for validation. A fixed
# random_state makes the shuffle deterministic, so the saved train/valid files
# are identical on every run of the notebook.
train_df, val_df = train_test_split(
    df[['question', 'text']],
    test_size=0.15,
    random_state=42,
)

train_df.to_csv(os.path.join(download_path, 'qg_train.csv'))
val_df.to_csv(os.path.join(download_path, 'qg_valid.csv'))

Splitting the data for the BERT transformer for the question evaluation task


In [None]:
# Hold out 15% of the (question, answer) pairs for validation. A fixed
# random_state makes the shuffle deterministic, so the saved train/valid files
# are identical on every run of the notebook.
train_df, val_df = train_test_split(
    df[['question', 'answer']],
    test_size=0.15,
    random_state=42,
)

train_df.to_csv(os.path.join(download_path, 'qa_eval_train.csv'))
val_df.to_csv(os.path.join(download_path, 'qa_eval_valid.csv'))