In [1]:
# -*- coding: utf-8 -*-

 
import xml.etree.cElementTree as et
import pandas as pd
import re
import os
import time
import sys

def update_progress(progress):
    barLength = 20 # Modify this to change the length of the progress bar
    status = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
        status = "error: progress var must be float\r\n"
    if progress < 0:
        progress = 0
        status = "Halt...\r\n"
    if progress >= 1:
        progress = 1
        status = "Done...\r\n"
    block = int(round(barLength*progress))
    text = "\rPercent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
    sys.stdout.write(text)
    sys.stdout.flush()

# File Management
def find_files(x, working_dir):
    pattern = re.compile(x)
    file_list = []
    for file in os.listdir(working_dir):
        if re.match(pattern=pattern, string=file):
            file_list.append(file)
    return(file_list)

def move_files(file_list, working_dir, completed_dir):
    for file in file_list:
        old = os.path.join(working_dir, file)
        new = os.path.join(completed_dir, file)
        os.rename(old, new)

# Exploratory Data Analysis (EDA)
def get_children_tags(parent):
    '''Input:  XML root/parent object
    Output:  List of unique children tags'''
    child_list = []
    for x in parent.getchildren():
        if x.tag not in child_list:
            child_list.append(x.tag)
    return(child_list)

def get_children_details(parent, plist):
    for p in plist:
        count = 0
        for x in parent.getchildren():
            if x.tag == p:
                count += 1
        print("Parent Tag:  {0} - Count:  {1}".format(p, count))

def get_children_info(parent):
    '''Input:  XML root/parent object
    Output:  1 dict with 2 Lists, [attribute key names] [unique children tag names]'''
    child_list = get_children_tags(parent)
    attrib_list = parent.keys()
    return({'attributes': attrib_list, 'child_tags': child_list})

def print_info(parent, level):
    pinfo = get_children_info(parent)
    indent = "  " * level
    level += 1
    print(parent.tag)
    print("{2}Attributes ({0}):\t{1}".format(len(pinfo['attributes']), pinfo['attributes'], indent))
    print("{2}Children ({0}):\t{1}".format(len(pinfo['child_tags']), pinfo['child_tags'], indent))
      

# Data Parsing and Extraction
def digger01(elementid, parentid, parent):
    '''
    Inputs:
      elemendid = the index number assigned to the data being processed
      parentid = the index number of the function that called the function
      parent = the XML object to be processed
    Outputs:  
      attributes = LIST containing DICTS the attributes of of each child parsed IF it is also a parent itself
      data = LIST containing DICTS the attributes of of each child parsed IF it has no children.  
      elementid = The last assigned index number.  
    '''
    attributes = []
    data = []
    for child in parent.getchildren():
        parentid = elementid
        elementid += 1
        attribs = child.attrib
        attribs.update({'elementid':  elementid})        
        if len(child.getchildren()) > 0:
            attributes.append(attribs)
            parentid = elementid
            elementid += 1
            digger01(elementid, parentid, child)
        else:
            data.append(attribs)
            parentid = elementid
            elementid += 1
    return(attributes, data, elementid)
        
# Data Consolidation
def get_df(df_list, index_key='elementid'):
    '''
    Input:  
      df_list = LIST containing DataFrames to consolidate
      index_key = Name of column to use as the key, default is the elementid
    Output:  
      df = A single DataFrame containing all of the DF's passed in
      '''
    df = pd.DataFrame()
    for x in df_list:
        df = df.append(pd.DataFrame(data = x, index=[x[index_key]]))
    return(df)
        



In [2]:
# Find files to work with
dir_raw = "./data/raw/"
dir_processed = "./data/processed/"
search_string = ".*.xml"

find_files(search_string, dir_raw)



['export.xml', 'export2.xml', 'export_cda.xml']

In [None]:
# Import File as XML Object
file = 'export.xml'
file_name = os.path.join(dir_raw, file)

parsed_xml = et.parse(file_name)
root = parsed_xml.getroot()

# Initiate parsing of XML
parent = root
parentid = 1
elementid = 2
dug1 = digger01(elementid, parentid, parent)

# Attributes
df1 = get_df(dug1[0])
# Data
df2 = get_df(dug1[1])
# Current elementid
elementid = get_df(dug1[2])

print("Done!")

In [None]:
print("\nAttributes DataFrame:  ")
print(df1.shape)
print(df1.columns)
print("\nData DataFrame:  ")
print(df2.shape)
print(df2.columns)
print("\nLast used Dlement ID:  ")
print(elementid)



In [None]:
dug1

In [23]:
df2[['HKCharacteristicTypeIdentifierBiologicalSex',
       'HKCharacteristicTypeIdentifierBloodType',
       'HKCharacteristicTypeIdentifierDateOfBirth',
       'HKCharacteristicTypeIdentifierFitzpatrickSkinType', 'device', 'sourceName', 'type', 'unit']].drop_duplicates()

Unnamed: 0,HKCharacteristicTypeIdentifierBiologicalSex,HKCharacteristicTypeIdentifierBloodType,HKCharacteristicTypeIdentifierDateOfBirth,HKCharacteristicTypeIdentifierFitzpatrickSkinType,device,sourceName,type,unit
3,,,,,,,,
5,HKBiologicalSexMale,HKBloodTypeNotSet,1979-06-26,HKFitzpatrickSkinTypeNotSet,,,,
7,,,,,,IntelliDrink,HKQuantityTypeIdentifierBloodAlcoholContent,%
31603,,,,,,BACtrack,HKQuantityTypeIdentifierBloodAlcoholContent,%
34859,,,,,,Health,HKQuantityTypeIdentifierHeight,ft
34861,,,,,,Health,HKQuantityTypeIdentifierBodyMass,lb
34863,,,,,,Connect,HKQuantityTypeIdentifierBodyMass,lb
34915,,,,,,Shortcuts,HKQuantityTypeIdentifierBodyMass,lb
35143,,,,,"<<HKDevice: 0x281d91b30>, name:iPhone, manufac...",jefalexa’s iPhone,HKQuantityTypeIdentifierStepCount,count
35145,,,,,"<<HKDevice: 0x281d91a90>, name:iPhone, manufac...",jefalexa’s iPhone,HKQuantityTypeIdentifierStepCount,count


In [None]:

for p in range(11):
    p2 = p/10
    update_progress(p2)
    time.sleep(1)
    