In [None]:
##################################################################################
# This notebook converts the BioASQ dataset into a SQuAD-like format.
# Depending on which function is called, the contexts are built either
# from snippets or from abstracts.
##################################################################################

In [1]:
import csv
import json
import os

#This class helps convert the unstructured dataset to structured squad like format
class ProcessTrainData:
    """Convert a raw BioASQ question file into a SQuAD-like yes/no dataset.

    Typical use: construct, call ``loadDataset()`` (and ``loadAbstracts()``
    when abstract contexts are wanted), then call one of the
    ``CreateDataset_yesno_*`` methods, which writes a JSON file.

    Generated paragraphs are appended to ``self.yes_no_data`` and are NOT
    cleared between calls, so use a fresh instance per output file.

    Args:
        train_data_file: path of the JSON file holding the ``"questions"`` list.
        abs_file: optional path of a CSV file whose rows are
            ``id, title, abstract``; only needed for abstract contexts.
        out_dir: directory the output file is written to. ``None`` means the
            current working directory.
    """

    def __init__(self,train_data_file,abs_file=None,out_dir=None):
        self.abs_id_dict = {}  # document id -> "title abstract"
        self.output_dir = out_dir
        self.train_data_file = train_data_file
        self.abstract_title_file = abs_file
        self.train_data = ""  # replaced by the parsed JSON in loadDataset()
        self.yes_no_data = {"version":"BioASQ8b","data":[{"title":"BioASQ8b","paragraphs":[]}]}

    def loadAbstracts(self):
        """Fill ``self.abs_id_dict`` from the abstracts CSV.

        Each row is read as ``id, title, abstract``; the stored value is the
        title and abstract joined by a single space. Rows with fewer than
        three columns (e.g. blank lines) are skipped.
        """
        # newline='' lets the csv module handle line breaks inside quoted fields.
        with open(self.abstract_title_file, newline='') as csv_file:
            for row in csv.reader(csv_file, delimiter=','):
                if len(row) < 3:
                    continue
                self.abs_id_dict[row[0]] = row[1]+" "+row[2]

    def loadDataset(self):
        """Parse ``self.train_data_file`` as JSON into ``self.train_data``."""
        with open(self.train_data_file, 'r', encoding='utf-8') as f:
            self.train_data = json.load(f)

    def _build_entry(self,context,ques_id,question,answer):
        """Build one paragraph dict (``context`` plus a single-item ``qas`` list).

        ``answer == "eval"`` marks an unlabeled question: no ``answers`` /
        ``is_impossible`` keys are emitted. Otherwise ``is_impossible`` is
        True exactly when the answer is ``"no"``.
        """
        qa = {"question":question,"id":ques_id}
        if answer!="eval":
            qa["answers"] = answer
            qa["is_impossible"] = (answer=="no")
        return {"context":context,"qas":[qa]}

    def yesno_generator_abstract(self,doc_id,ques_id,question,answer):
        """Build a paragraph whose context is the abstract stored for ``doc_id``.

        Returns:
            The paragraph dict, or the string ``"nothing"`` (after printing a
            notice) when ``doc_id`` is not in ``self.abs_id_dict``.
        """
        if doc_id not in self.abs_id_dict:
            print("Abstract not found :",doc_id)
            return "nothing"
        # Fixed: "no" answers now set is_impossible=True, matching
        # yesno_generator_snippet (previously it was always False here).
        return self._build_entry(self.abs_id_dict[doc_id],ques_id,question,answer)

    def getDocumentId(self,url):
        """Return the last path segment of ``url``, ignoring trailing slashes."""
        return url.rstrip("/").rsplit("/",1)[-1]

    def yesno_generator_snippet(self,snippet_text,ques_id, question,answer):
        """Build a paragraph whose context is ``snippet_text``."""
        return self._build_entry(snippet_text,ques_id,question,answer)

    def getDuplicateId(self,ques_id,cnt):
        """Return ``ques_id`` suffixed with ``_`` and ``cnt`` zero-padded to 3 digits."""
        return ques_id+"_"+str(cnt).zfill(3)

    def _collect_yesno(self,source_key,make_entry):
        """Append one paragraph per ``source_key`` item of every yes/no question.

        Args:
            source_key: question key holding the list to iterate
                (``"documents"`` or ``"snippets"``); a missing key is treated
                as an empty list.
            make_entry: callable ``(item, dup_ques_id, question, answer)``
                returning a paragraph dict or ``"nothing"`` to skip the item.
        """
        paragraphs = self.yes_no_data["data"][0]["paragraphs"]
        for each_ques in self.train_data["questions"]:
            if each_ques.get("type")!="yesno":
                continue
            # Questions without a gold answer are tagged "eval".
            answer = each_ques.get("exact_answer","eval")
            question = each_ques["body"]
            ques_id = each_ques["id"]
            # The counter advances for skipped items too, so an id always
            # reflects the item's position within the question.
            for cnt, item in enumerate(each_ques.get(source_key,[]), start=1):
                entry = make_entry(item,self.getDuplicateId(ques_id,cnt),question,answer)
                if entry=="nothing":
                    continue
                paragraphs.append(entry)

    def CreateDataset_yesno_abstract(self):
        """Create yes/no paragraphs from document abstracts and write
        ``training_8b_abstract.json``. Requires loadAbstracts() and loadDataset()."""
        def from_document(url,dup_ques_id,question,answer):
            doc_id = self.getDocumentId(url)
            return self.yesno_generator_abstract(doc_id,dup_ques_id,question,answer)

        self._collect_yesno("documents",from_document)
        self.WriteCSVFile("training_8b_abstract.json",self.yes_no_data)

    def CreateDataset_yesno_snippet(self):
        """Create yes/no paragraphs from snippet texts and write
        ``training_8b_snippet.json``. Requires loadDataset()."""
        def from_snippet(snippet,dup_ques_id,question,answer):
            return self.yesno_generator_snippet(snippet["text"],dup_ques_id,question,answer)

        self._collect_yesno("snippets",from_snippet)
        self.WriteCSVFile("training_8b_snippet.json",self.yes_no_data)

    def WriteCSVFile(self,filename,json_data):
        """Dump ``json_data`` as JSON (despite the method name) to ``filename``
        inside ``self.output_dir``; a ``None`` output_dir means the current directory."""
        target = os.path.join(self.output_dir or "", filename)
        with open(target, 'w', encoding='utf-8') as outfile:
            json.dump(json_data, outfile,indent=5)

    # TODO: create factoid dataset

    # TODO: create list dataset
    

In [5]:
# Inputs and output location for the 8b training set (abstract contexts).
training_file = "../Dataset/8b_dataset/training8b.json"
abstract_title_file = "../Dataset/8b_dataset/titles_and_abstr_bioasq.csv"
out_dir = '../Dataset/8b_dataset/'

# Build the converter with explicit keyword arguments.
processData = ProcessTrainData(
    train_data_file=training_file,
    abs_file=abstract_title_file,
    out_dir=out_dir,
)

# Abstracts and raw questions must both be loaded before generating.
processData.loadAbstracts()
processData.loadDataset()

# Generate the structured yes/no dataset from abstracts and write it to out_dir.
processData.CreateDataset_yesno_abstract()

Abstract not found : 20639591
Abstract not found : 20301779
Abstract not found : 23304742
Abstract not found : 20301420
Abstract not found : 20301462
Abstract not found : 20301585
Abstract not found : 21249951
Abstract not found : 22550943
Abstract not found : 21952424
Abstract not found : 20301466
Abstract not found : 22129433
Abstract not found : 22787616
Abstract not found : 22787626
Abstract not found : 27940438
Abstract not found : 27940438


In [10]:
##### Abstract yes/no: Task 7B batch 5 golden set #####
training_file = "../Dataset/Task7BGoldenEnriched/7B5_golden.json"
abstract_title_file = "../Dataset/8b_dataset/titles_and_abstr_bioasq.csv"
# Fixed: an output directory must be passed explicitly. Without it the
# converter's output_dir is None and writing the result file fails.
# The directory matches the one used by the snippet run on this same file.
out_dir = '../Dataset/Task7BGoldenEnriched/'
# Initialize
processData = ProcessTrainData(training_file,abstract_title_file,out_dir)
# Load abstracts
processData.loadAbstracts()
# Load the unstructured dataset
processData.loadDataset()
# Create the structured dataset
processData.CreateDataset_yesno_abstract()

In [7]:
##### Snippet yes/no: 8b training set #####

# Snippet contexts come from the question file itself, so no abstracts CSV is passed.
training_file = "../Dataset/8b_dataset/training8b.json"
out_dir = '../Dataset/8b_dataset/'
processData = ProcessTrainData(
    train_data_file=training_file,
    abs_file=None,
    out_dir=out_dir,
)

# Read the raw questions, then generate and write the structured dataset.
processData.loadDataset()
processData.CreateDataset_yesno_snippet()

In [7]:
##### Abstract yes/no: 8b test set 1 #####
training_file = "../Dataset/8b_dataset/test_sets/test1/BioASQ-task8bPhaseB-testset1.json"
abstract_title_file = "../Dataset/8b_dataset/test_sets/test1/titles_and_abstr_bioasq_8b.csv"
out_dir = '../Dataset/8b_dataset/test_sets/test1/'

# Build the converter with explicit keyword arguments.
processData = ProcessTrainData(
    train_data_file=training_file,
    abs_file=abstract_title_file,
    out_dir=out_dir,
)

# Abstracts and raw questions must both be loaded before generating.
processData.loadAbstracts()
processData.loadDataset()

# Generate the structured yes/no dataset from abstracts and write it to out_dir.
processData.CreateDataset_yesno_abstract()

In [4]:
##### Snippet yes/no: 8b test set 2 #####

# Snippet contexts come from the question file itself, so no abstracts CSV is passed.
training_file = "../Dataset/8b_dataset/test_sets/test2/BioASQ-task8bPhaseB-testset2.json"
out_dir = '../Dataset/8b_dataset/test_sets/test2/'
processData = ProcessTrainData(
    train_data_file=training_file,
    abs_file=None,
    out_dir=out_dir,
)

# Read the raw questions, then generate and write the structured dataset.
processData.loadDataset()
processData.CreateDataset_yesno_snippet()

In [11]:
##### Snippet yes/no: Task 7B batch 5 golden set #####

# Snippet contexts come from the question file itself, so no abstracts CSV is passed.
training_file = "../Dataset/Task7BGoldenEnriched/7B5_golden.json"
out_dir = '../Dataset/Task7BGoldenEnriched/'
processData = ProcessTrainData(
    train_data_file=training_file,
    abs_file=None,
    out_dir=out_dir,
)

# Read the raw questions, then generate and write the structured dataset.
processData.loadDataset()
processData.CreateDataset_yesno_snippet()

In [4]:
##### Snippet yes/no: 8b test set 3 #####

# Snippet contexts come from the question file itself, so no abstracts CSV is passed.
# NOTE(review): unlike the other test-set paths in this notebook, this one has
# no ".json" extension -- confirm it matches the file name on disk.
training_file = "../Dataset/8b_dataset/test_sets/test3/BioASQ-task8bPhaseB-testset3"
out_dir = '../Dataset/8b_dataset/test_sets/test3/'
processData = ProcessTrainData(
    train_data_file=training_file,
    abs_file=None,
    out_dir=out_dir,
)

# Read the raw questions, then generate and write the structured dataset.
processData.loadDataset()
processData.CreateDataset_yesno_snippet()