Parse the 'outcomes' from \*\*\[independent,CT69,katsanos\]_xmltodict_01_outcome.csv. <br>
Output: CSV files (primary outcomes and secondary outcomes are written to separate CSV files) with the header: NCTID, measure, time_frame, description, and a primary/secondary flag.

In [1]:
import ast
import csv
import json
import os
import re

import pandas as pd

In [2]:
# Use the xmltodict output as the input file. The path is built with
# os.path.join so that no backslash escape (the former "\C") sits inside a
# string literal and the separator is not hard-coded.
data = pd.read_csv(
    os.path.join("output", "CT69_xmltodict_01_outcome.csv"),
    sep="\t",
    names=["NCTID", "Primary Outcome Count", "Primary Outcome",
           "Secondary Outcome Count", "Secondary Outcome"],
)
# Because explicit column names are supplied, the first line of the file is
# read as row 0; it is not part of the data, so drop it.
primaryOut_data = data[['NCTID', 'Primary Outcome Count', 'Primary Outcome']].drop(0, axis=0)
primaryOut_data.head()

Unnamed: 0,NCTID,Primary Outcome Count,Primary Outcome
1,NCT02599389,2,"{""measure"": ""Incremental cost per Qaly gained ..."
2,NCT02710656,1,measure:Efficacy measured by binary restenosis...
3,NCT01221610,1,measure:Assessment of the 6 months late lumen ...
4,NCT01867736,2,"{""measure"": ""Safety: Major adverse event rate ..."
5,NCT00696956,1,measure:Primary endpoint: Late lumen loss (LLL...


In [3]:
# Split the primary outcomes into two frames: rows whose count is the string "1",
# and every other row. The counts are compared as strings.
primOut_1_data = primaryOut_data[primaryOut_data['Primary Outcome Count'].eq("1")]
primOut_2_data = primaryOut_data[primaryOut_data['Primary Outcome Count'].ne("1")]
primOut_2_data.head()

Unnamed: 0,NCTID,Primary Outcome Count,Primary Outcome
1,NCT02599389,2,"{""measure"": ""Incremental cost per Qaly gained ..."
4,NCT01867736,2,"{""measure"": ""Safety: Major adverse event rate ..."
10,NCT03175744,4,"{""measure"": ""Freedom from Major Adverse Limb E..."
13,NCT02063672,2,"{""measure"": ""Percentage of Participants With P..."
14,NCT01566461,2,"{""measure"": ""Primary Patency"", ""time_frame"": ""..."


In [4]:
# Same split for the secondary outcomes, with one extra group for count "0"
# (studies that report no secondary outcome).
secondaryOut_data = data.loc[:, ['NCTID', 'Secondary Outcome Count', 'Secondary Outcome']].drop(index=0)
secOut_1_data = secondaryOut_data[secondaryOut_data['Secondary Outcome Count'].eq("1")]
# everything that is neither "0" nor "1"
secOut_2_data = secondaryOut_data[~secondaryOut_data['Secondary Outcome Count'].isin(["0", "1"])]
secOut_0_data = secondaryOut_data[secondaryOut_data['Secondary Outcome Count'].eq("0")]
secOut_0_data.tail()

Unnamed: 0,NCTID,Secondary Outcome Count,Secondary Outcome
42,NCT01366482,0,
43,NCT02004951,0,
60,NCT03206762,0,
64,NCT03625830,0,
69,NCT01970579,0,


In [5]:
# join the NCTID to the outcome
def joinNCTID(indata,col_name):
    '''
    Prefix every outcome string with its study id, separated by '||'.

    param: indata - dataframe containing an 'NCTID' column and the col_name column
           col_name - either 'Primary Outcome' or 'Secondary Outcome'
    return: a new single-column dataframe (column col_name, same index as indata)
            whose values look like 'NCT02710656||<outcome text>'
    '''
    # Missing cells (NaN) are treated as empty text; joining a float NaN with
    # str.join would raise TypeError.
    nct_ids = indata['NCTID'].fillna('').astype(str)
    outcome_text = indata[col_name].fillna('').astype(str)
    # Vectorized concatenation also works on an empty frame, where a row-wise
    # apply would hand back a DataFrame instead of a Series.
    return pd.DataFrame({col_name: nct_ids + '||' + outcome_text})

# Primary outcomes: prefix each outcome string with its NCTID.
new_outcome_1_data = joinNCTID(indata=primOut_1_data, col_name='Primary Outcome')
new_outcome_2_data = joinNCTID(indata=primOut_2_data, col_name='Primary Outcome')

# Secondary outcomes: same treatment for the "0", "1" and remaining count groups.
new_secout_0_data = joinNCTID(indata=secOut_0_data, col_name='Secondary Outcome')
new_secout_1_data = joinNCTID(indata=secOut_1_data, col_name='Secondary Outcome')
new_secout_2_data = joinNCTID(indata=secOut_2_data, col_name='Secondary Outcome')
new_outcome_1_data.head()

Unnamed: 0,Primary Outcome
2,NCT02710656||measure:Efficacy measured by bina...
3,NCT01221610||measure:Assessment of the 6 month...
5,NCT00696956||measure:Primary endpoint: Late lu...
6,NCT00930813||measure:Angiographic Late Lumen L...
7,NCT01594684||measure:Late Lumen Loss (LLL)|tim...


In [6]:
def dictList_2(data,col_name):
    '''
    Create a list of dictionaries (one per outcome) for studies whose outcome
    cell holds several serialized dictionaries separated by '|'.
    example: NCT01566461||{"measure": "Primary Patency", "time_frame": "..."}|{"measure": "..."}
    param: data - frame whose col_name column holds 'NCTID||{...}|{...}' strings
                  (the rows whose outcome count is not 1)
           col_name - either 'Primary Outcome' or 'Secondary Outcome'
    return: the list of dictionaries, each one extended with an 'NCTID' key
    raises: whatever ast.literal_eval raises (ValueError / SyntaxError) when a
            fragment is neither valid JSON nor a Python literal
    '''
    dict_list = []
    for joined in data[col_name]:
        parts = joined.split("||", 1)
        nct_id, serialized = parts[0], parts[1]
        for fragment in serialized.split("|"):
            fragment = fragment.strip()
            try:
                # The fragments are JSON-style objects; json.loads accepts
                # null/true/false, which ast.literal_eval rejects.
                record = json.loads(fragment)
            except ValueError:
                # Not JSON (e.g. single-quoted Python repr): keep the original parser.
                record = ast.literal_eval(fragment)
            record['NCTID'] = nct_id
            dict_list.append(record)
    return dict_list

def dictList_1(data,col_name):
    '''
    Create a list of dictionaries with NCTID, measure, time_frame and description
    for studies that have exactly one outcome.
    example: NCT02710656||measure:Efficacy measured by binary restenosis rate|time_frame:Until 12 months after procedure|description:
            Binary restenosis rate determined by duplex ultrasonography at 1, 6 and 12 months afte procedure. 
            Binary restenosis is defined as a re-obstruction ≥50% of the target lesion (peak systolic velocity ratio > 2.4).
    param: data - frame whose col_name column holds 'NCTID||key:value|key:value' strings
           col_name - either 'Primary Outcome' or 'Secondary Outcome'
    return: the list of dictionaries (one per row), each with an 'NCTID' key
    raises: ValueError when the outcome text starts with a piece that has no 'key:' part
    '''
    dict_list = []
    for joined in data[col_name]:
        parts = joined.split("||", 1)
        nct_id, serialized = parts[0], parts[1]
        record = {}
        last_key = None
        for fragment in serialized.split('|'):
            # Only the first ':' separates key from value; values may contain ':'.
            key, colon, value = fragment.partition(':')
            if colon:
                record[key] = value
                last_key = key
            elif last_key is not None:
                # No ':' at all: the previous value itself contained '|', so
                # this piece is its continuation - put the '|' back.
                record[last_key] += '|' + fragment
            else:
                raise ValueError(
                    "outcome for %s does not start with a 'key:value' pair: %r"
                    % (nct_id, serialized)
                )
        record['NCTID'] = nct_id
        dict_list.append(record)
    return dict_list

# Primary outcomes: single-outcome rows and multi-outcome rows are parsed by
# different functions, then combined (multi-outcome records first).
dict_list1 = dictList_1(data=new_outcome_1_data, col_name='Primary Outcome')
dict_list2 = dictList_2(data=new_outcome_2_data, col_name='Primary Outcome')
dict_list = [*dict_list2, *dict_list1]

# Secondary outcomes that are present (count is neither "0" nor missing text).
dict_list4 = dictList_1(data=new_secout_1_data, col_name='Secondary Outcome')
dict_list3 = dictList_2(data=new_secout_2_data, col_name='Secondary Outcome')

# for secondary outcome of "None"
def dictList_0(data,col_name) :
    '''
    Create one NCTID-only dictionary per row, for studies without an outcome.
    param: data - frame whose col_name column holds 'NCTID||...' strings
           col_name - column to read, e.g. 'Secondary Outcome'
    return: list of dictionaries of the form {'NCTID': <id>}
    '''
    # Only the text before the first '||' is kept; the rest is ignored.
    return [{'NCTID': joined.split("||", 1)[0]} for joined in data[col_name]]

dict_list0 = dictList_0(data=new_secout_0_data, col_name='Secondary Outcome')

# All secondary records: multi-outcome studies, single-outcome studies, then
# the NCTID-only entries of studies without a secondary outcome.
dict_list_sec = [*dict_list3, *dict_list4, *dict_list0]



In [7]:
def csvoutfile(dict_list,filename):
    '''
    Write a list of outcome dictionaries to a tab-delimited file named
    <filename>.csv inside the "output" directory.

    Some records miss keys, so the header is the union of all keys: 'NCTID'
    first (when present), then the remaining keys in the order they are first
    seen. A key missing from a record produces an empty cell.

    param: dict_list - list of dictionaries, one per output row
           filename - output file name without directory and without '.csv'
    return: None
    '''
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would yield a column order that changes from run to run.
    headers = list(dict.fromkeys(key for record in dict_list for key in record))
    if 'NCTID' in headers:
        headers.remove('NCTID')
        headers.insert(0, 'NCTID')

    os.makedirs("output", exist_ok=True)
    outfilepath = os.path.join("output", filename + ".csv")
    # The with-block closes the file; no explicit close() is needed.
    with open(outfilepath, 'w', encoding = 'utf-8-sig', newline='') as outfile:
        # DictWriter fills keys absent from a record with '' (its restval default).
        csvwriter = csv.DictWriter(outfile, fieldnames=headers, delimiter = '\t')
        csvwriter.writeheader()
        csvwriter.writerows(dict_list)
    return

# Write the CT69 results: one file for primary and one for secondary outcomes.
csvoutfile(dict_list=dict_list, filename='CT69_primaryOutcome')
csvoutfile(dict_list=dict_list_sec, filename='CT69_secondaryOutcome')

Other files: katsanos, independent <br>
These are processed by reusing the functions defined above.

In [8]:
def parse_outcome(project):
    '''
    Run the whole outcome-parsing pipeline for one project.

    Reads <project>_xmltodict_01_outcome.csv (tab-delimited) from the "output"
    directory, splits primary and secondary outcomes by their count, parses
    them into dictionaries and writes them through csvoutfile under the names
    <project>_primaryOutcome and <project>_secondaryOutcome.

    param: project "katsanos" "independent" "CT69"
    return: an empty tuple; the results are the two csv files, one for primary
            outcome, the other for secondary outcome
    '''
    filepath = os.path.join("output", project + "_xmltodict_01_outcome.csv")
    data = pd.read_csv(
        filepath,
        sep="\t",
        names=["NCTID", "Primary Outcome Count", "Primary Outcome",
               "Secondary Outcome Count", "Secondary Outcome"],
    )
    # Explicit names make the first line of the file row 0; it is not data, so drop it.
    primaryOut_data = data[['NCTID', 'Primary Outcome Count', 'Primary Outcome']].drop(0, axis=0)
    primOut_1_data = primaryOut_data.loc[primaryOut_data['Primary Outcome Count'] == "1"]
    primOut_2_data = primaryOut_data.loc[primaryOut_data['Primary Outcome Count'] != "1"]

    secondaryOut_data = data[['NCTID', 'Secondary Outcome Count', 'Secondary Outcome']].drop(0, axis=0)
    secOut_1_data = secondaryOut_data.loc[secondaryOut_data['Secondary Outcome Count'] == "1"]
    secOut_2_data = secondaryOut_data.loc[
        (secondaryOut_data['Secondary Outcome Count'] != "1")
        & (secondaryOut_data['Secondary Outcome Count'] != "0")
    ]
    secOut_0_data = secondaryOut_data.loc[secondaryOut_data['Secondary Outcome Count'] == "0"]

    # use previously defined function joinNCTID
    new_outcome_2_data = joinNCTID(primOut_2_data, 'Primary Outcome')
    new_outcome_1_data = joinNCTID(primOut_1_data, 'Primary Outcome')

    new_secout_2_data = joinNCTID(secOut_2_data, 'Secondary Outcome')
    new_secout_1_data = joinNCTID(secOut_1_data, 'Secondary Outcome')
    new_secout_0_data = joinNCTID(secOut_0_data, 'Secondary Outcome')

    # for primary outcome, use the dictList_2 and dictList_1 functions
    dict_list2 = dictList_2(new_outcome_2_data, 'Primary Outcome')
    dict_list1 = dictList_1(new_outcome_1_data, 'Primary Outcome')
    dict_list = dict_list2 + dict_list1

    # for secondary outcome, dictList_0 adds the studies that have none
    dict_list3 = dictList_2(new_secout_2_data, 'Secondary Outcome')
    dict_list4 = dictList_1(new_secout_1_data, 'Secondary Outcome')
    dict_list0 = dictList_0(new_secout_0_data, 'Secondary Outcome')

    dict_list_sec = dict_list3 + dict_list4 + dict_list0

    # use csvoutfile function; the secondary name is spelled "Outcome" so it
    # matches the CT69 output file name
    prim_out = project + "_primaryOutcome"
    sec_out = project + "_secondaryOutcome"
    csvoutfile(dict_list, prim_out)
    csvoutfile(dict_list_sec, sec_out)
    return ()

In [9]:
# Run the same pipeline for the two remaining projects.
for k in ('katsanos', 'independent'):
    parse_outcome(project=k)