In [88]:
import base64
import xml.etree.ElementTree as ET
import pandas as pd
import os
from datetime import timedelta

In [89]:
def set_bit(v, index, x):
    """
    Return ``v`` with bit number ``index`` forced to 1 when ``x`` is truthy
    and forced to 0 otherwise.  ``v`` itself is not modified.
    """
    bit = 1 << index  # integer whose only set bit sits at position ``index``
    # Setting a bit is an OR with the mask; clearing it is an AND with its inverse.
    return (v | bit) if x else (v & ~bit)

In [81]:
def _decode_wave_samples(raw, gain, offset):
    """
    Convert raw waveform bytes into scaled sample values.

    ``raw`` is read as consecutive little-endian signed 16-bit integers; a
    trailing unpaired byte is ignored.  Every integer ``s`` is returned as
    ``s * gain + offset``.
    """
    return [
        int.from_bytes(raw[i:i + 2], 'little', signed=True) * gain + offset
        for i in range(0, len(raw) - 1, 2)
    ]


def decode_xml_file(file_path):
    """
    Parse one waveform XML file into a DataFrame with one row per ``mg`` element.

    Parameters
    ----------
    file_path : path of the XML file to parse.

    Returns
    -------
    pandas.DataFrame with the columns ``seq``, ``datetime``, ``tzoffset``,
    ``tz`` (attributes of the enclosing ``cpc`` element, ``None`` when absent),
    ``mg_name``, ``hz``, ``points`` and ``binwave`` (list of decoded samples).

    Raises
    ------
    Whatever ``ET.parse`` raises for an unreadable file, and ``ValueError`` /
    ``TypeError`` when a numeric field cannot be converted.

    Notes
    -----
    * When a ``Gain`` element is present, ``GE_ART`` always uses 0.25 and
      ``INVP1`` always uses 0.01; every other group uses the value in the file.
    * Without a ``Gain`` element the gain stays 0, so all samples equal
      ``offset``.  NOTE(review): confirm that this default is intended.
    * Children of ``mg`` that have no ``name`` attribute are skipped, and an
      empty ``Wave`` element yields an empty sample list, so one odd element
      no longer makes the whole file fail.
    """
    # Gains that override whatever the file declares for these groups.
    fixed_gains = {'GE_ART': 0.25, 'INVP1': 0.01}
    columns = ['seq', 'datetime', 'tzoffset', 'tz',
               'mg_name', 'hz', 'points', 'binwave']

    root = ET.parse(file_path).getroot()
    rows = []

    for cpc in root.findall('cpc'):
        seq = cpc.attrib.get('seq')
        timestamp = cpc.attrib.get('datetime')
        tzoffset = cpc.attrib.get('tzoffset')
        tz = cpc.attrib.get('tz')

        for device in cpc.findall('.//device'):
            for mg in device.findall('.//mg'):
                mg_name = mg.attrib.get('name')
                wave = b''
                offset = 0
                gain = 0
                points = 0
                hz = 0

                for m in mg:
                    # .get() instead of ['name']: unnamed children are ignored
                    # rather than raising KeyError.
                    field = m.attrib.get('name')
                    if field == 'Offset':
                        offset = int(m.text)
                    elif field == 'Gain':
                        if mg_name in fixed_gains:
                            gain = fixed_gains[mg_name]
                        else:
                            gain = float(m.text)
                    elif field == 'Hz':
                        hz = int(m.text)
                    elif field == 'Points':
                        points = int(m.text)
                    elif field == 'Wave':
                        # m.text is None for an empty element.
                        wave = base64.b64decode(m.text or '')

                # Decode after the loop so Gain/Offset apply regardless of the
                # order in which the child elements appear.
                binwave = _decode_wave_samples(wave, gain, offset)

                rows.append([seq, timestamp, tzoffset, tz,
                             mg_name, hz, points, binwave])

    return pd.DataFrame(rows, columns=columns)

In [82]:
def true_records(data, pat_id):
    """
    Keep the rows of ``data`` whose (LOG_ID, MRN) pair also appears in ``pat_id``.

    ``data`` is inner-joined with the LOG_ID/MRN columns of ``pat_id`` on
    LOG_ID, rows whose two MRN values differ are discarded, the MRN copy
    coming from ``pat_id`` is dropped and exact duplicate rows are removed.
    """
    id_lookup = pat_id[['LOG_ID', 'MRN']]
    joined = data.merge(id_lookup, on='LOG_ID', how='inner',
                        suffixes=('data', 'pat_id'))

    # Both sides carry an MRN column; the suffixes above tell them apart.
    mrn_agrees = joined['MRNdata'].eq(joined['MRNpat_id'])

    return (
        joined.loc[mrn_agrees]
        .drop(columns=['MRNpat_id'])
        .rename(columns={'MRNdata': 'MRN'})
        .drop_duplicates()
    )

In [83]:
# Adjust time based on timezone offset since ECG data have different timezone
def adjust_time(row):
    """
    Return ``row['datetime']`` shifted by the offset given in ``row['tzoffset']``.

    ``row['tzoffset']`` is a string of the form ``[+-]HH[:MM[:SS]]``
    (for example ``'-07:00:00'``).  The minutes and seconds are honoured as
    well as the hours, and the sign applies to the whole offset, so values
    such as ``'+05:30'`` or ``'-00:30'`` are shifted correctly.

    Raises ``ValueError`` when a field of the offset is not an integer.
    """
    offset_text = row['tzoffset'].strip()
    sign = -1 if offset_text.startswith('-') else 1

    fields = [int(part) for part in offset_text.lstrip('+-').split(':')]
    fields += [0] * (3 - len(fields))  # missing minutes / seconds count as 0
    hours, minutes, seconds = fields[:3]

    shift = timedelta(hours=hours, minutes=minutes, seconds=seconds)
    return row['datetime'] + sign * shift

In [84]:
# Load the labelled patient table that the later cells join everything onto.
# Change this to the path of your own patient_labeled file.
Patient_labeled = pd.read_csv(r'C:\Users\rodge\OneDrive\Desktop\SML\Group Project\comp90051\data\processedData\trainData_label.csv')

# Folder holding the raw CSV files.
# Change this to the folder that contains your own EPIC_MRN_PAT_ID file.
file_path = r'C:\Users\rodge\OneDrive\Desktop\SML\Group Project\comp90051\data\rawData\unzip_file\\'

# Patient ID table, used to identify the true records.
patient_id_csv = os.path.join(file_path, 'EPIC_MRN_PAT_ID.csv')
patient_id = pd.read_csv(patient_id_csv).drop_duplicates()

# Patient information table, reduced to the true records and to the
# AN_START_DATETIME / AN_STOP_DATETIME columns needed later.
# Change this to your own patient_information file.
patient_info_csv = os.path.join(file_path, 'patient_information.csv')
patient_info = true_records(pd.read_csv(patient_info_csv), patient_id)
patient_info = patient_info.drop_duplicates()[
    ['LOG_ID', 'MRN', 'AN_START_DATETIME', 'AN_STOP_DATETIME']
]


In [85]:
# Attach the PAT_ID of every labelled (LOG_ID, MRN) pair; rows without a match are dropped.
patient_labeled1 = Patient_labeled.merge(patient_id, on=['LOG_ID', 'MRN'], how='inner')

# folder_name is the first two characters of PAT_ID.
patient_labeled1['folder_name'] = patient_labeled1['PAT_ID'].map(lambda pid: pid[:2])

# Left join keeps labelled rows even when no start/stop time is available.
patient_labeled2 = patient_labeled1.merge(patient_info, on=['LOG_ID', 'MRN'], how='left')

# Parse both time columns with the same explicit format.
for time_col in ('AN_START_DATETIME', 'AN_STOP_DATETIME'):
    patient_labeled2[time_col] = pd.to_datetime(patient_labeled2[time_col], format='%m/%d/%y %H:%M')

print(patient_labeled2.shape)

patient_labeled2.head()



(1604, 7)


Unnamed: 0,LOG_ID,MRN,Cardiovascular,PAT_ID,folder_name,AN_START_DATETIME,AN_STOP_DATETIME
0,8c615f6805325d06,692072abc16db15d,1.0,e4f0d479b4f38b02,e4,2019-01-28 13:35:00,2019-01-28 15:58:00
1,0ecf38ed306338f6,4c29aee08612d793,1.0,803158b92f707c79,80,2019-08-05 19:56:00,2019-08-05 22:18:00
2,595439a55deda0d5,71d77695a69b26d2,1.0,161f8ef3c10f2a6f,16,2019-08-31 09:13:00,2019-08-31 16:08:00
3,36662c223daeac3a,08b0d6a24cf382ad,1.0,cf348d103a4ab8c6,cf,2019-01-10 11:32:00,2019-01-11 00:56:00
4,1383c875d3b6ffbd,4c29aee08612d793,1.0,803158b92f707c79,80,2019-08-21 11:06:00,2019-08-21 13:55:00


In [87]:
# Decode the waveform XML files of every row in patient_labeled2 and write one
# CSV per LOG_ID into output_directory.

# Change this to the path of your own extracted waveform files.
filePath = r"H:\sml data\epic_wave_2_v2.tar\epic_wave_2_v2\UCI_deidentified_part2_EPIC_08_10\Waveforms"

# Change this to the folder where the decoded waveform CSV files should be saved.
output_directory = r'C:\Users\rodge\OneDrive\Desktop\SML\Group Project\comp90051\data\rawData\processedData\waveform_decode_data'

# Make sure the output folder exists so to_csv does not fail on a fresh machine.
os.makedirs(output_directory, exist_ok=True)

# Messages collected for later manual checking.
error_list = []


def _log_issue(message):
    """Print ``message`` and remember it in ``error_list``."""
    print(message)
    error_list.append(message)


for index, row in patient_labeled2.iterrows():
    # Extract the fields of this row that are needed below.
    folder_name = row['folder_name']
    pat_id = row['PAT_ID']
    log_id = row['LOG_ID']
    file_prefix = pat_id + 'IP'
    start_time = row['AN_START_DATETIME']
    end_time = row['AN_STOP_DATETIME']

    folder_path = os.path.join(filePath, folder_name)

    if not os.path.exists(folder_path):
        _log_issue(f"folder not found for {log_id} "
                   f"and pat_id {pat_id}, current index is {index}")
        continue

    # Decoded frames of every XML file that belongs to this patient.
    data_individual = []

    for file in os.listdir(folder_path):
        # Only the XML files of this patient are of interest.
        if not (file.startswith(file_prefix) and file.endswith('.xml')):
            continue

        # A separate name keeps the raw-data folder variable `file_path` intact.
        xml_path = os.path.join(folder_path, file)

        try:
            df = decode_xml_file(xml_path)
            df['PAT_ID'] = pat_id
            data_individual.append(df)

        # Record the failure and carry on with the next file.
        except Exception as e:
            _log_issue(f"error processing file {file} for pat_id {pat_id} "
                       f"and log_id {log_id}: {e}")

    if not data_individual:
        _log_issue(f"no file for pat_id {pat_id} "
                   f"and log_id {log_id}, current index is {index}")
        continue

    # Combine the data of all files for this log_id.
    combined_data = pd.concat(data_individual, ignore_index=True)

    # The files parsed but held no rows; the row-wise apply below cannot
    # handle an empty frame, so report it and move on.
    if combined_data.empty:
        _log_issue(f"decoded files contain no waveform rows for log_id {log_id} "
                   f"and pat_id {pat_id}, current index is {index}")
        continue

    combined_data['datetime'] = pd.to_datetime(combined_data['datetime'])
    combined_data['adjusted_datetime'] = combined_data.apply(adjust_time, axis=1)
    combined_data['adjusted_datetime'] = combined_data['adjusted_datetime'].dt.tz_localize(None)

    # Keep only the samples recorded between anesthesia start and stop.
    if pd.notna(start_time) and pd.notna(end_time):
        in_window = ((combined_data['adjusted_datetime'] >= start_time) &
                     (combined_data['adjusted_datetime'] <= end_time))
        filtered_data = combined_data[in_window]

    # Without an anesthesia time the data is saved unfiltered.
    else:
        filtered_data = combined_data
        _log_issue(f"no anesthesia time for log_id {log_id} and pat_id {pat_id}")

    filtered_data_sorted = filtered_data.sort_values(by=['datetime'])
    output_path = os.path.join(output_directory, f"{log_id}.csv")

    if not filtered_data_sorted.empty:
        filtered_data_sorted.to_csv(output_path, index=False)
        print(f"file for log_id {log_id} saved, current index is {index}")

    # Everything fell outside the window: keep the unfiltered data for a manual check.
    else:
        error_message = (f"all data have been filtered out for log_id {log_id} "
                         f"and pat_id {pat_id}, current index is {index}")
        print(error_message)
        combined_data.to_csv(output_path, index=False)
        error_list.append(error_message)
            



all data have been filtered out for log_id 8c615f6805325d06 and pat_id e4f0d479b4f38b02, current index is 0
all data have been filtered out for log_id 0ecf38ed306338f6 and pat_id 803158b92f707c79, current index is 1
file for log_id 595439a55deda0d5 saved, current index is 2
file for log_id 36662c223daeac3a saved, current index is 3
file for log_id 1383c875d3b6ffbd saved, current index is 4
all data have been filtered out for log_id 04bc8c7cd226af94 and pat_id 4226e5867c24712b, current index is 5
file for log_id adc7ab049f9dd7d3 saved, current index is 6
no file for pat_id 4e8242a8fee9d20d and log_id 49b960e2ab9e68b9, current index is 7
file for log_id 4df74a9be3018a98 saved, current index is 8
file for log_id 09f75f98b740c3a8 saved, current index is 9
all data have been filtered out for log_id cd02fa4a2f9ae70a and pat_id c77152790a66c4c4, current index is 10
file for log_id c8a9a6c6b011f68d saved, current index is 11
file for log_id 727bc1694a63a57f saved, current index is 12
folder no

KeyboardInterrupt: 

In [20]:
## Manually decode the waveform files of one patient that failed in the automatic run

folder_path = r"H:\sml data\epic_wave_2_v2.tar\epic_wave_2_v2\UCI_deidentified_part2_EPIC_08_10\Waveforms\0e"

# PAT_ID of the patient to decode; also used as the name of the output CSV.
file_prefix = '0ef3dede9b3f478e'
data_individual = []

for file in os.listdir(folder_path):
    if file.startswith(file_prefix) and file.endswith('.xml'):
        file_path = os.path.join(folder_path, file)
        try:
            df = decode_xml_file(file_path)
            df['PAT_ID'] = file_prefix
            data_individual.append(df)

        # Report the failing file and carry on with the next one.
        except Exception as e:
            print(file)
            print(f"error processing file : {e}")

output_directory = r'C:\Users\rodge\OneDrive\Desktop\SML\Group Project\comp90051\data\rawData\processedData\waveform_decode_data'

# pd.concat raises ValueError on an empty list, so only combine and save
# when at least one file was decoded.
if data_individual:
    combined_data = pd.concat(data_individual, ignore_index=True)
    combined_data_sorted = combined_data.sort_values(by=['datetime'])

    os.makedirs(output_directory, exist_ok=True)
    output_path = os.path.join(output_directory, f"{file_prefix}.csv")
    combined_data_sorted.to_csv(output_path, index=False)
else:
    print(f"no decodable xml file found for {file_prefix} in {folder_path}")

Unnamed: 0,seq,datetime,tzoffset,tz,mg_name,hz,points,binwave,PAT_ID
7206,23979,2019-01-21T00:00:00.024Z,-07:00:00,Pacific Standard Time,GE_ECG,180,180,"[18.0, 9.0, 15.0, 7.0, 17.0, 10.0, 18.0, 7.0, ...",0ef3dede9b3f478e
7207,23979,2019-01-21T00:00:00.024Z,-07:00:00,Pacific Standard Time,GE_ART,180,180,"[0.25, 7.25, 0.5, 7.0, -0.25, 6.5, 0.25, 7.25,...",0ef3dede9b3f478e
7208,23980,2019-01-21T00:00:01.566Z,-07:00:00,Pacific Standard Time,GE_ECG,180,180,"[15.0, 9.0, 17.0, 10.0, 17.0, 7.0, 15.0, 9.0, ...",0ef3dede9b3f478e
7209,23980,2019-01-21T00:00:01.566Z,-07:00:00,Pacific Standard Time,GE_ART,180,180,"[0.75, 7.0, -0.25, 6.25, -0.25, 7.25, 0.75, 7....",0ef3dede9b3f478e
7210,23981,2019-01-21T00:00:03.046Z,-07:00:00,Pacific Standard Time,GE_ECG,180,180,"[16.0, 11.0, 19.0, 9.0, 15.0, 7.0, 15.0, 10.0,...",0ef3dede9b3f478e
...,...,...,...,...,...,...,...,...,...
7201,31182,2019-01-21T03:00:04.394Z,-07:00:00,Pacific Standard Time,GE_ART,180,180,"[1.25, 6.25, 1.75, 6.0, 1.75, 6.25, 1.75, 6.0,...",0ef3dede9b3f478e
7202,31183,2019-01-21T03:00:05.815Z,-07:00:00,Pacific Standard Time,GE_ECG,180,180,"[-7.0, 27.0, -7.0, 27.0, -5.0, 29.0, -5.0, 28....",0ef3dede9b3f478e
7203,31183,2019-01-21T03:00:05.815Z,-07:00:00,Pacific Standard Time,GE_ART,180,180,"[1.75, 6.0, 1.75, 6.25, 1.75, 6.0, 1.5, 6.25, ...",0ef3dede9b3f478e
7205,31184,2019-01-21T03:00:07.415Z,-07:00:00,Pacific Standard Time,GE_ART,180,180,"[1.75, 6.25, 1.5, 6.25, 1.5, 6.25, 1.25, 5.75,...",0ef3dede9b3f478e
