In [2]:
import base64
import xml.etree.ElementTree as ET
import pandas as pd
import os

In [43]:
def set_bit(v, index, x):
    """
        Return v with bit number `index` forced to 1 when x is truthy
        and forced to 0 otherwise. Every other bit of v is left alone.
    """
    bit = 1 << index    # Integer whose only set bit sits at position 'index'.
    if x:
        return v | bit  # Turn the selected bit on.
    return v & ~bit     # Turn the selected bit off.

In [44]:
def decode_xml_file(file_path):
    """
    Parse one waveform XML file and decode every measurement group in it.

    The function visits each ``cpc`` element directly under the document
    root and, inside it, every ``mg`` element found beneath a ``device``
    element. Each ``mg`` produces one output row.

    Parameters
    ----------
    file_path : str or file object
        Passed straight to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per ``mg`` element with the columns
        ``seq``, ``datetime``, ``tzoffset``, ``tz`` (attributes of the
        enclosing ``cpc``, ``None`` when absent), ``mg_name`` (the ``name``
        attribute of the ``mg``), ``hz``, ``points`` (integers read from the
        children named ``Hz`` and ``Points``, 0 when absent) and ``binwave``
        (list of decoded samples, each ``raw * gain + offset``).

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the document is not well-formed XML.
    ValueError, TypeError
        If an ``Offset``, ``Gain``, ``Hz`` or ``Points`` child holds text
        that cannot be converted to a number.
    """
    # Gains that replace the value stored in the file for these groups.
    # The override is only applied when the group has a 'Gain' child at all.
    gain_overrides = {'GE_ART': 0.25, 'INVP1': 0.01}

    root = ET.parse(file_path).getroot()

    data = []

    for cpc in root.findall('cpc'):

        seq = cpc.attrib.get('seq')
        timestamp = cpc.attrib.get('datetime')
        tzoffset = cpc.attrib.get('tzoffset')
        tz = cpc.attrib.get('tz')

        for device in cpc.findall('.//device'):
            for mg in device.findall('.//mg'):
                mg_name = mg.attrib.get('name')
                wave = b''
                offset = 0
                # NOTE(review): when a group has no 'Gain' child the gain stays
                # 0, so every decoded sample equals the offset - confirm intended.
                gain = 0
                points = 0
                hz = 0

                for m in mg:
                    # .get() so that a child without a 'name' attribute is
                    # skipped instead of aborting the whole file with KeyError.
                    field = m.attrib.get('name')
                    if field == 'Offset':
                        offset = int(m.text)
                    elif field == 'Gain':
                        if mg_name in gain_overrides:
                            gain = gain_overrides[mg_name]
                        else:
                            gain = float(m.text)
                    elif field == 'Hz':
                        hz = int(m.text)
                    elif field == 'Points':
                        points = int(m.text)
                    elif field == 'Wave':
                        # An empty <m name="Wave"/> has text None; treat it as
                        # an empty waveform rather than raising TypeError.
                        wave = base64.b64decode(m.text or '')

                # The payload is a sequence of little-endian signed 16-bit
                # integers. A trailing odd byte, if any, is ignored.
                binwave = [
                    int.from_bytes(wave[i:i + 2], 'little', signed=True) * gain + offset
                    for i in range(0, len(wave) - 1, 2)
                ]

                data.append([seq, timestamp, tzoffset, tz,
                             mg_name, hz, points, binwave])

    df = pd.DataFrame(data, columns=['seq', 'datetime', 'tzoffset', 'tz',
                                     'mg_name', 'hz', 'points', 'binwave'])

    return df

In [45]:
# Load the labelled patients and attach a PAT_ID to each of them.

# Change this to the location of your own patient_labeled file.
Patient_labeled = pd.read_csv(r'C:\Users\rodge\OneDrive\Desktop\SML\Group Project\comp90051\data\processedData\trainData_label.csv')

# Change this to the folder that holds your own EPIC_MRN_PAT_ID file.
file_path = r'C:\Users\rodge\OneDrive\Desktop\SML\Group Project\comp90051\data\rawData\unzip_file\\'
patient_id = 'EPIC_MRN_PAT_ID.csv'

# Read the id table and discard repeated rows in a single step.
patient_id = pd.read_csv(os.path.join(file_path, patient_id)).drop_duplicates()

# Keep only the labelled rows that have a matching (LOG_ID, MRN) pair.
patient_labeled = pd.merge(
    left=Patient_labeled,
    right=patient_id,
    how='inner',
    on=['LOG_ID', 'MRN'],
)
print(patient_labeled.shape)

# The first two characters of PAT_ID name the sub-folder holding the XML files.
patient_labeled['folder_name'] = patient_labeled['PAT_ID'].apply(lambda pat_id: pat_id[:2])
patient_labeled.head()




(1604, 4)


Unnamed: 0,LOG_ID,MRN,Cardiovascular,PAT_ID,folder_name
0,8c615f6805325d06,692072abc16db15d,1.0,e4f0d479b4f38b02,e4
1,0ecf38ed306338f6,4c29aee08612d793,1.0,803158b92f707c79,80
2,595439a55deda0d5,71d77695a69b26d2,1.0,161f8ef3c10f2a6f,16
3,36662c223daeac3a,08b0d6a24cf382ad,1.0,cf348d103a4ab8c6,cf
4,1383c875d3b6ffbd,4c29aee08612d793,1.0,803158b92f707c79,80


In [46]:
# Decode every waveform XML file that belongs to a labelled patient and
# write one CSV per patient, sorted by the 'datetime' column.

# Change this to the directory where you extracted the waveform archive.
filePath = r"H:\sml data\epic_wave_2_v2.tar\epic_wave_2_v2\UCI_deidentified_part2_EPIC_08_10\Waveforms"

# Change this to the directory where the decoded waveforms should be saved.
output_directory = r'C:\Users\rodge\OneDrive\Desktop\SML\Group Project\comp90051\data\rawData\processedData\waveform_decode_data'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_directory, exist_ok=True)

visited_list= []

for index, row in patient_labeled.iterrows():

    patient_name = row['PAT_ID']

    # The same PAT_ID can appear in several rows; handle each patient once.
    if patient_name in visited_list:
        continue
    visited_list.append(patient_name)

    folder_name = row['folder_name']
    folder_path = os.path.join(filePath, folder_name)
    file_prefix = patient_name + 'IP'

    # Report a missing folder instead of skipping the patient silently.
    if not os.path.exists(folder_path):
        print(f"no folder for {patient_name}, current index is {index}")
        continue

    data_individual = []

    # sorted() makes the processing order (and the error log) deterministic.
    for file_name in sorted(os.listdir(folder_path)):
        if not (file_name.startswith(file_prefix) and file_name.endswith('.xml')):
            continue

        # Use a dedicated name here: 'file_path' is the CSV directory used by
        # the patient-loading cell and must not be overwritten by this loop.
        xml_path = os.path.join(folder_path, file_name)

        try:
            df = decode_xml_file(xml_path)
            df['PAT_ID'] = patient_name
            data_individual.append(df)

        except Exception as e:
            # Best effort: one unreadable file must not stop the whole run.
            print(f"error processing file {xml_path} for {patient_name}: {e}")

    if data_individual:
        combined_data = pd.concat(data_individual, ignore_index=True)
        combined_data_sorted = combined_data.sort_values(by=['datetime'])
        output_path = os.path.join(output_directory, f"{patient_name}.csv")
        combined_data_sorted.to_csv(output_path, index=False)

        print(f"file for {patient_name} saved, current index is {index}")

    else:
        print(f"no file for {patient_name}, current index is {index}")


file for e4f0d479b4f38b02 saved, current index is 0
file for 803158b92f707c79 saved, current index is 1
file for 161f8ef3c10f2a6f saved, current index is 2
file for cf348d103a4ab8c6 saved, current index is 3
file for 4226e5867c24712b saved, current index is 5
file for 902250375a6c030e saved, current index is 6
no file for 4e8242a8fee9d20d, current index is 7
file for 5ac7123eedcc61cc saved, current index is 8
file for 3607035000e1753a saved, current index is 9
file for c77152790a66c4c4 saved, current index is 10
file for ba2751ccecfdf11e saved, current index is 14
file for 507a45d1654d8044 saved, current index is 16
file for eca09794ace31b68 saved, current index is 21
file for 7ee440630ec152a6 saved, current index is 24
file for 87ed40ee112ffa87 saved, current index is 27
file for 4520773c12187063 saved, current index is 29
file for 901199f219acc57f saved, current index is 31
file for 4fbb87408dc55f79 saved, current index is 46
file for 860444887ca0215f saved, current index is 60
file 