In [3]:
### PREPROCESSING MANUAL ANNOTATED DATA ####
### 50 PMC Articles ####

from xml.etree import ElementTree as ET
import os
import pandas as pd
import random


def read_xml_section(filePath):
    """Collect the section and subsection spans declared in one article XML.

    Every direct child of the XML root whose tag is "section" yields one row
    flagged "No" in the 'subsection?' column. Each direct child of such a
    section (whatever its tag) yields one further row flagged "Yes". Deeper
    nesting levels are not visited.

    Parameters
    ----------
    filePath : str
        Path of the XML file. The article id is obtained by removing ".xml"
        and the hard-coded XML_50 directory prefix from this string.

    Returns
    -------
    pandas.DataFrame
        Columns 'PMCID', 'section_title', 'section_start', 'section_end' and
        'subsection?'. Start/end are the two halves of the "textSpan"
        attribute, kept as strings.

    Raises
    ------
    KeyError
        If a visited element has no "textSpan" or "title" attribute.
    ValueError
        If a "textSpan" value does not split into exactly two parts on "-".
    """
    article_id = filePath.replace(".xml","").replace(
        "/Users/phuong/Documents/skr-consort-master/datasets/XML_50/","")

    collected = {
        'PMCID': [],
        'section_title': [],
        'section_start': [],
        'section_end': [],
        'subsection?': [],
    }

    def _record(node, subsection_flag):
        # Read both attributes before splitting, so a missing attribute is
        # reported ahead of a malformed span.
        span = node.attrib["textSpan"]
        title = node.attrib["title"]
        first_char, last_char = span.split("-")

        collected['PMCID'].append(article_id)
        collected['section_title'].append(title)
        collected['section_start'].append(first_char)
        collected['section_end'].append(last_char)
        collected['subsection?'].append(subsection_flag)

    for node in ET.parse(filePath).getroot():
        if node.tag != "section":
            continue
        # The section itself is always recorded first ...
        _record(node, "No")
        # ... followed by its direct children (none for a leaf section).
        for nested in node:
            _record(nested, "Yes")

    return pd.DataFrame(collected)

                    
def read_xml(filePath):
    """Load the sentences of one article XML and tag each with its section.

    Every direct child of the XML root tagged "sentence" is read; one output
    row is produced per "text" child of that sentence. A sentence is assigned
    the title of a section whose span fully contains the sentence's character
    offsets. When several spans contain it, the last one listed by
    read_xml_section wins; subsections are listed after their parent section,
    so a subsection title takes precedence over the parent's.

    A sentence that lies in no section gets ``None`` as its section. (Before
    this was reset per sentence, such a sentence silently inherited the
    section of the previous sentence, or raised if it was the first one.)

    Parameters
    ----------
    filePath : str
        Path of the XML file. The article id is obtained by removing ".xml"
        and the hard-coded XML_50 directory prefix from this string.

    Returns
    -------
    pandas.DataFrame
        Columns 'PMCID', 'sentence_id', 'sentence_text', 'start_char_pos',
        'end_char_pos' and 'section'. The character positions are kept as the
        strings found in the "charOffset" attribute.

    Raises
    ------
    KeyError
        If a sentence has no "charOffset" or "id" attribute.
    ValueError
        If a "charOffset" value does not split into two integers on "-".
    """
    section_df = read_xml_section(filePath)

    PMCID = filePath.replace(".xml","")
    PMCID = PMCID.replace("/Users/phuong/Documents/skr-consort-master/datasets/XML_50/","")
    root = ET.parse(filePath).getroot()

    # Convert the section bounds to integers once, instead of re-converting
    # them for every (sentence, section) pair.
    section_spans = [
        (int(span_start), int(span_end), span_title)
        for span_doc, span_start, span_end, span_title in zip(
            section_df["PMCID"],
            section_df["section_start"],
            section_df["section_end"],
            section_df["section_title"],
        )
        if span_doc == PMCID
    ]

    document_id_list = []
    sentence_list = []
    sentence_id_list = []
    start_char_list = []
    end_char_list = []
    section_list = []

    for child in root:
        if child.tag != "sentence":
            continue

        charOffSet = child.attrib["charOffset"]
        sentence_id = child.attrib["id"]
        start_char, end_char = charOffSet.split("-")
        sentence_start, sentence_end = int(start_char), int(end_char)

        # Reset for every sentence so an unmatched sentence cannot reuse the
        # section found for an earlier one.
        section = None
        for span_start, span_end, span_title in section_spans:
            if sentence_start >= span_start and sentence_end <= span_end:
                # No break: a later (more specific) match overrides this one.
                section = span_title

        for child_level1 in child:
            if child_level1.tag == "text":
                document_id_list.append(PMCID)
                sentence_id_list.append(sentence_id)
                sentence_list.append(child_level1.text)
                start_char_list.append(start_char)
                end_char_list.append(end_char)
                section_list.append(section)

    df = pd.DataFrame(
    {'PMCID': document_id_list,
     'sentence_id': sentence_id_list,
     'sentence_text': sentence_list,
     'start_char_pos': start_char_list,
     'end_char_pos': end_char_list,
     'section': section_list
    })
    return df

# Directory containing the annotated article XML files to load.
data_path = "/Users/phuong/Documents/skr-consort-master/datasets/XML_50/"
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the exported CSV stable between runs.
filelist = sorted(os.listdir(data_path))

columns=['PMCID', 'sentence_id','sentence_text', 'start_char_pos','end_char_pos','section']

# Parse every article first and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
article_frames = []
for i in filelist:
    # Match the real ".xml" extension, not any name that merely ends in "xml".
    if i.endswith(".xml"):
        file_path = data_path + i
        data_df = read_xml(file_path)
        article_frames.append(data_df)

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries the expected columns when the directory holds no XML file.
if article_frames:
    all_data_df = pd.concat(article_frames)
else:
    all_data_df = pd.DataFrame(columns=columns)

#import consort_golden_label data file
# The file is read as pipe-separated with no header row; the three columns are
# named here.
consort_golden_label_data_file = "/Users/phuong/Documents/skr-consort-master/datasets/gold_50.txt"
consort_golden_label_df = pd.read_csv(consort_golden_label_data_file, header = None, sep = "|")
consort_golden_label_df.columns = ["PMCID","sentence_id","CONSORT_Item"]

# how="right" keeps every sentence from the XML files; sentences without a
# gold label end up with a missing CONSORT_Item.
# NOTE(review): the XML-side keys are strings; confirm read_csv does not parse
# sentence_id as a number, otherwise the keys will not match.
merged_df = pd.merge(consort_golden_label_df, all_data_df, on=["PMCID","sentence_id"], how="right")
merged_df.to_csv("/Users/phuong/Documents/skr-consort-master/datasets/Manual_Annotated_Data_All.csv",index=False)


In [2]:
#SPLIT ROWS THAT HAVE MORE THAN ONE LABEL
import pandas as pd     
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import re
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from itertools import chain

#Get data that has annotation
# .copy() so the column assignment below works on an independent frame rather
# than on a filtered view of merged_df.
data_filter_df = merged_df[merged_df["CONSORT_Item"].notnull()].copy()

# remove whitespace and split by ','
data_filter_df['CONSORT_Item'] = data_filter_df['CONSORT_Item'].str.replace(' ', '').str.split(',')

# construct expanded dataframe: one output row per (sentence, label) pair.
# Every other column is repeated once per label of its row.
labels_per_row = data_filter_df['CONSORT_Item'].map(len)
res = pd.DataFrame({'CONSORT_Item': list(chain.from_iterable(data_filter_df['CONSORT_Item'])),
                    'PMCID': np.repeat(data_filter_df['PMCID'], labels_per_row),
                    'sentence_id': np.repeat(data_filter_df['sentence_id'], labels_per_row),
                    'start_char_pos': np.repeat(data_filter_df['start_char_pos'], labels_per_row),
                    'end_char_pos': np.repeat(data_filter_df['end_char_pos'], labels_per_row),
                    'section': np.repeat(data_filter_df['section'], labels_per_row),
                    'sentence_text': np.repeat(data_filter_df['sentence_text'], labels_per_row)})

# Drop the CONSORT items listed below (all numbered 13 or higher).
# NOTE(review): the original comment said items "<3a and >12" are dropped, but
# no item below 3a is listed and not every item above 12 is - confirm the list.
array = ['13a','13b','14a','15','16','17a','23','24', '25']
main_df = res.loc[~res['CONSORT_Item'].isin(array)]

#split into validation & testing sets (by article, so no article is in both)
PMCID_list = main_df["PMCID"]
# Sort before shuffling and shuffle with a fixed seed: set iteration order and
# an unseeded shuffle would otherwise give a different split on every run.
SPLIT_SEED = 42
unique_PMCID_list = sorted(set(PMCID_list))
random.Random(SPLIT_SEED).shuffle(unique_PMCID_list)

# First third of the shuffled articles -> testing, the rest -> validation.
index = int(len(unique_PMCID_list)/3)
validation_index = unique_PMCID_list[index:]
testing_index = unique_PMCID_list[:index]
print (len(validation_index))
print (len(testing_index))

# Select the rows that ARE in each id list. The previous "~isin" selected the
# complement, which swapped the two sets relative to the sizes printed above.
validation_df = main_df.loc[main_df['PMCID'].isin(validation_index)]
testing_df = main_df.loc[main_df['PMCID'].isin(testing_index)]
assert set(validation_df['PMCID']).isdisjoint(testing_df['PMCID'])

validation_df.to_csv("/Users/phuong/Documents/skr-consort-master/datasets/Manual_Annotated_Data_Validation.csv",index=False)
testing_df.to_csv("/Users/phuong/Documents/skr-consort-master/datasets/Manual_Annotated_Data_Testing.csv",index=False)


34
16


In [9]:
# Import the Halil paper data file: a pipe-separated text file without a
# header row, re-exported below as a CSV with named columns.
consort_manual_label_data_file = "/Users/phuong/Documents/skr-consort-master/phrase_heuristics_out.txt"
consort_manual_label_data_df = pd.read_csv(
    consort_manual_label_data_file,
    sep = "|",
    header = None,
)
# Name the five columns of the file, in file order.
consort_manual_label_data_df.columns = [
    "PMCID",
    "sentence_id",
    "CONSORT_Item",
    "Section",
    "sentence_text",
]
consort_manual_label_data_df.to_csv(
    "/Users/phuong/Documents/skr-consort-master/datasets/Manual_Annotated_Data_Halil.csv",
    index = False,
)
