In [1]:
# Background: extract information from XML files for the RAPID LEAN CRF project.
# The fields to extract are:
# oversight_info/has_dmc, primary_outcome/measure, primary_outcome/time_frame, secondary_outcome/measure, secondary_outcome/time_frame, arm_group/arm_group_label[0], arm_group/description, eligibility/criteria/textblock
# The values extracted from each XML are written as one row of the csv file.

In [1]:
import csv
import os
import re
import xml.etree.ElementTree as ET

import pandas as pd
import requests
from six.moves.html_parser import HTMLParser
from six.moves.urllib.parse import urlencode, urljoin

In [16]:
#get the list of NCTID
def NCTID_input(xlsxfile):
    """
    Get the input NCTIDs as a list.

    Reads sheet 'Sheet1' of the workbook found in the "input" folder and
    returns the values of its 'NCT Number' column with missing cells dropped.
    :param xlsxfile: file name of the Excel workbook inside the "input" folder
    :return: list of the non-null 'NCT Number' values, in sheet order
    """
    # os.path.join uses the separator of the host OS; a hard-coded "input\\"
    # prefix only resolves on Windows.
    filepath = os.path.join("input", xlsxfile)
    df = pd.read_excel(filepath, sheet_name='Sheet1')
    return df['NCT Number'].dropna().tolist()


#call the xml from clinicalTrials.gov using request.get()
BASE_URL = "https://clinicaltrials.gov/ct2/show/"

def get_study(nct_id, timeout=30):
    """
    Pull the XML for the study
    :param nct_id: study identifier; it is joined onto BASE_URL to form the request URL
    :param timeout: seconds handed to requests.get as its ``timeout`` argument, so a
        stalled connection fails instead of blocking the whole batch forever
    :return: the response body as text
    :raises ValueError: when the server answers with any status other than 200
    """
    t = urljoin(BASE_URL, nct_id)
    # the query string asks for the XML rendering of the study page ("displayxml=True")
    full_url = t + "?" + urlencode(dict(displayxml=True))
    response = requests.get(full_url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError("Unable to load study {}".format(nct_id))
    return response.text

# function to get the primary outcome and secondary outcome
def CT_tag(element):
    """
    Record how many ``element`` outcomes the current study has and their details.

    Works on the module-level ``root`` (the parsed study XML) and stores its
    results in the module-level ``xml_dict``:
      * ``element + '_count'``: number of ``element`` nodes that are direct
        children of ``root``
      * ``element + '_list'``: one "measure|time_frame|description" string for
        every ``element`` node found under ``root``
    A child that is missing or has no text contributes an empty string.
    :param element: tag name, e.g. 'primary_outcome' or 'secondary_outcome'
    :return: None (results are written into ``xml_dict``)
    """
    xml_dict[element+'_count'] = len(root.findall(element))

    def child_text(parent, tag):
        # Look the child up by tag name instead of by position, so an absent
        # optional child cannot shift the fields or raise IndexError.
        child = parent.find(tag)
        if child is None or child.text is None:
            return ""
        return child.text

    # primary/secondary outcome info
    Out_list = []
    for Out in root.iter(element):
        # values are read fresh for every outcome, so nothing carries over
        # from the previous one when a child is missing
        Out_measure = child_text(Out, "measure")
        Out_time = child_text(Out, "time_frame")
        # newlines are removed (as CT_arm does for its description) so that one
        # study still ends up on a single output row
        Out_desc = child_text(Out, "description").replace("\n", "")
        Out_list.append(Out_measure+"|"+Out_time+"|"+Out_desc)
    xml_dict[element+'_list'] = Out_list

#function to get the arm_group label and description
def CT_arm(element):
    """
    Record how many ``element`` arms the current study has and their details.

    Works on the module-level ``root`` (the parsed study XML) and stores its
    results in the module-level ``xml_dict``:
      * ``element + '_count'``: number of ``element`` nodes that are direct
        children of ``root``
      * ``element + '_list'``: one "arm_group_label|description" string for
        every ``element`` node found under ``root``; newlines are removed from
        the description
    A child that is missing or has no text contributes an empty string.
    :param element: tag name, e.g. 'arm_group'
    :return: None (results are written into ``xml_dict``)
    """
    xml_dict[element+'_count'] = len(root.findall(element))

    def child_text(parent, tag):
        # Look the child up by tag name instead of by position: the description
        # is not always the third child, and indexing would raise IndexError
        # or return the wrong field.
        child = parent.find(tag)
        if child is None or child.text is None:
            return ""
        return child.text

    arm_list = []
    for arms in root.iter(element):
        # values are read fresh for every arm, so nothing carries over from the
        # previous one when a child is missing
        arms_label = child_text(arms, "arm_group_label")
        arms_description = child_text(arms, "description").replace("\n", "")
        arm_list.append(arms_label+"|"+arms_description)
    xml_dict[element+"_list"] = arm_list

# Ask for the workbook that lists the NCT IDs and for the name of the result file.
NCTIDfile = input('please give the file name contains NCTID list:' )
outfilename = input('please give the file name for output: ')

# os.path.join uses the separator of the host OS; a hard-coded "output\\"
# prefix only resolves on Windows.
outputfile = os.path.join("output", outfilename)

NCTID_list = NCTID_input(NCTIDfile)
print(str(len(NCTID_list))+' files need to be processed.')

j = 0
for i in NCTID_list:
    # xml_dict and root are deliberately module-level names: CT_tag() and
    # CT_arm() read root and fill xml_dict as globals, so both must be rebound
    # for every study before those helpers are called.
    xml_dict = {}
    xml_dict['NCTID'] = i
    xml_str = get_study(i)
    #the get_study function returns xml in string. need to read the tree from a string.
    tree = ET.ElementTree(ET.fromstring(xml_str))
    root = tree.getroot()

    #oversight_info: text of the last has_dmc element, or "n/a" when there is none
    dmc_nodes = root.findall('oversight_info/has_dmc')
    xml_dict['dmc_oversight'] = dmc_nodes[-1].text if dmc_nodes else "n/a"

    #primary and secondary outcome
    CT_tag('primary_outcome')
    CT_tag('secondary_outcome')

    #arm info
    CT_arm('arm_group')

    #eligibility: drop newlines and collapse runs of spaces so the text stays on one row.
    #The key is only set when a criteria/textblock exists; the last one found wins.
    for inclusion_exclusion in root.iter('eligibility'):
        for text in inclusion_exclusion.findall('criteria/textblock'):
            content = str(text.text).replace("\n", "")
            xml_dict['inclusion_exclusion'] = re.sub(' +', ' ', content)

    #mesh_term: condition terms first, then intervention terms
    mesh_term_list = [
        mesh_term.text
        for mesh_path in ('condition_browse/mesh_term', 'intervention_browse/mesh_term')
        for mesh_term in root.findall(mesh_path)
    ]
    xml_dict['mesh_term'] = mesh_term_list

    print(i+" parsed successfully!")

    # Append one row per NCTID to the output file. Each entry is written as
    # "key:value" followed by "||"; list values are written as comma-terminated items.
    # NOTE(review): a list value ends up followed by "||" twice (once right after
    # the items and once as the common delimiter). The format is kept as-is in
    # case a downstream reader depends on it - confirm whether that is intended.
    with open(outputfile, 'a', encoding='utf8') as f:
        for key, value in xml_dict.items():
            if isinstance(value, list):
                f.write(str(key)+":")
                for t in value:
                    f.write(str(t)+",")
                f.write("||")
            else:
                f.write(str(key)+":"+str(value))
            f.write("||")
        f.write("\n")
    # the with-block has already closed the file, no explicit close is needed
    j = j+1
print(str(j)+ " files completed.")

please give the file name contains NCTID list:katsanos.xlsx
please give the file name for output: katsanos_parse_URL.csv
20files need to be processed.
NCT01816412 parsed successfully!
NCT02540018 parsed successfully!
NCT01083030 parsed successfully!
NCT00930813 parsed successfully!
NCT03421561 parsed successfully!
NCT02013193 parsed successfully!
NCT01850056 parsed successfully!
NCT02013193 parsed successfully!
NCT01412541 parsed successfully!
NCT01450722 parsed successfully!
NCT00156624 parsed successfully!
NCT00120406 parsed successfully!
NCT01947478 parsed successfully!
NCT01970579 parsed successfully!
NCT03023098 parsed successfully!
NCT00472472 parsed successfully!
NCT00986752 parsed successfully!
NCT01083394 parsed successfully!
NCT01175850 parsed successfully!
NCT01566461 parsed successfully!
20 files completed.
