In [2]:
import pandas as pd
import itertools
import numpy as np
import gc

: 

In [3]:
csv= '/home/mei/nas/docker/thesis/data/csv/'
hdf= '/home/mei/nas/docker/thesis/data/hdf/'

In [4]:
def round_up(x, base=5):
    return base * round(x/base)

In [5]:
print('==> Loading data from timeseries files...')
timeseries_lab = pd.read_csv(csv + 'timeserieslab.csv', low_memory=False)
timeseries_periodic = pd.read_csv(csv+ 'timeseriesperiodic.csv')

==> Loading data from timeseries files...


In [16]:
print("there are {} patients in the  and {} records in lab table.".format(len(list(timeseries_lab ['patientunitstayid'].unique())),len(timeseries_lab)))
print("there are {} patients in the  and {} records in vital periodic table.".format(len(list(timeseries_periodic ['patientunitstayid'].unique())),len(timeseries_periodic)))

there are 12260 patients in the  and 2337787 records in lab table.
there are 12260 patients in the  and 10671165 records in vital periodic table.


In [17]:
timeseries_lab.set_index(['patientunitstayid','labresultoffset'], inplace=True)
timeseries_periodic.set_index(['patientunitstayid','observationoffset'], inplace=True)
timeseries_lab.rename(round_up, level = 'labresultoffset', inplace = True)
timeseries_periodic.rename(round_up, level = 'observationoffset', inplace = True)
timeseries_lab.sort_index(inplace=True)
timeseries_periodic.sort_index(inplace=True)

In [18]:
def reconfigure_timeseries(timeseries, offset_column, feature_column=None, test=False):
    """
    Reconfigure timeseries data by setting multi-index and pivoting if necessary.
    """
    timeseries.reset_index(inplace=True)
    if test:
        timeseries = timeseries.iloc[:5000]  # Limit for testing
    timeseries.set_index(['patientunitstayid', pd.to_timedelta(timeseries[offset_column], unit='min')], inplace=True)
    timeseries.drop(columns=offset_column, inplace=True)
    if feature_column:
        timeseries = timeseries.pivot_table(columns=feature_column, index=timeseries.index)
    timeseries.index = pd.MultiIndex.from_tuples(timeseries.index, names=['patient', 'time'])
    return timeseries

In [19]:
test=False
print('==> Reconfiguring lab timeseries...')
lab = reconfigure_timeseries(timeseries_lab, 'labresultoffset', 'labname', test)
lab.columns=lab.columns.droplevel()

print('==> Reconfiguring periodic timeseries...')
periodic = reconfigure_timeseries(timeseries_periodic, 'observationoffset', test)

==> Reconfiguring lab timeseries...
==> Reconfiguring periodic timeseries...


In [20]:
flat=pd.read_csv(csv + 'preprocessed_flat_drug.csv')
labels = pd.read_csv(csv + 'preprocessed_labels.csv')
diagnoses= pd.read_csv(csv + 'preprocessed_diagnoses.csv')


In [21]:
common_id = list(set(flat['patient']).intersection(set(labels['patient']).intersection(set(diagnoses['patient']))))

In [22]:
len(common_id)

11698

In [23]:
lab = lab.reset_index()
lab = lab[lab['patient'].isin(common_id)]
# lab['time'] = pd.to_timedelta(lab['time'])
lab = lab[lab['time'] <= pd.to_timedelta('14 days')]

lab = lab.set_index(['patient', 'time'])

periodic = periodic.reset_index()
periodic = periodic[periodic['patient'].isin(common_id)]
periodic = periodic.set_index(['patient', 'time'])

In [24]:
print('==> Combining data together...')
merged = pd.concat([lab, periodic], axis=0, sort=False)

==> Combining data together...


In [25]:
possible_value_ranges = {
   "temperature": (32,43),        # Temperature
    "sao2": (40, 100),             # SpO2
    "heartrate": (30, 400),        # Heart rate
    "respiration": (0, 60),        # Resp. rate
    "cvp": (0, 20),                # CVP
    
    # "etco2": (0, 60),             # EtCO2
    "systemicsystolic": (40, 300), # BP systolic
    "systemicdiastolic": (20, 150),# BP diastolic
    "systemicmean": (30, 200),     # BP mean
}

In [26]:
def filter_vital_signs(data, ranges):
    for column, (min_val, max_val) in ranges.items():
        if column in data.columns:
            data = data[(data[column].isna()) | ((data[column] >= min_val) & (data[column] <= max_val))]
    return data

merged = filter_vital_signs(merged, possible_value_ranges)

# 计算 0.1% 和 99.9% 分位数
low_quantile = merged.quantile(0.001, numeric_only=True)  # 0.1% 分位数
high_quantile= merged.quantile(0.999, numeric_only=True)  # 99.9% 分位数

# 仅保留在 [0.1%, 99.9%] 之间的值
merged = merged[(merged >= low_quantile) & (merged <= high_quantile)]

print("select valid vlaue of vital signs")
print("There are {} patients and {} records in the vital periodic table.".format(
    merged.index.get_level_values('patient').nunique(),
    len(merged)
))


select valid vlaue of vital signs
There are 11698 patients and 10237638 records in the vital periodic table.


In [16]:
merged.to_csv(csv + 'preprocessed_merged.csv')

In [8]:
merged=pd.read_csv(csv + 'preprocessed_merged.csv').set_index(['patient', 'time'])

In [27]:
def gen_patient_chunk(patients, merged, size=500): # 500 is a good size for LSTM
    """
    Generate patient data chunks for processing.
    """
    it = iter(patients)
    chunk = list(itertools.islice(it, size))
    while chunk:
        yield merged.loc[chunk]
        chunk = list(itertools.islice(it, size))

In [50]:
def resample(timeseries, hdf, header):
    resampled_data = []
    for patient, group in timeseries.groupby(level=0):  # process each patient separately
            # **1.resample **
            group = group.droplevel(0)  # remove patient from index
            group.index = group.index.ceil(freq='5min')  # round up to nearest 5 minutes
            resampled = group.resample('5min', closed='right', label='right').mean()  # resample to 5 minutes
            resampled['patient'] = patient  # add patient back to the DataFrame
            resampled = resampled.ffill()  # fill missing values with the last known value
            resampled.fillna(0, inplace=True) # fill remaining missing values with 0
            
            # **2.reset the multi index**
            resampled.reset_index(inplace=True)  # remove the MultiIndex
            resampled['time'] = range(1, len(resampled) + 1) 
            resampled.set_index(['patient', 'time'], inplace=True)  # set new MultiIndex
            
            resampled_data.append(resampled)

 
    final = pd.concat(resampled_data)
#     final.to_csv(csv + 'preprocessed_timeseries.csv', mode='a', header=header)
    final.to_hdf(hdf + "final_timeseries.h5", key="df", mode="w", complevel=5, complib="zlib") # mode='w' to overwrite
#     return final

In [51]:

patients = merged.index.unique(level=0)
gen_chunks = gen_patient_chunk(patients, merged)
header = True
print('==> Initiating main processing loop...')
for i, patient_chunk in enumerate(gen_chunks, start=1):
    final=resample(patient_chunk,hdf,header)
    print(f'==> Processed {i * 500} patients...')
    header = False
    
    # Clear memory
    del patient_chunk
    gc.collect()

==> Initiating main processing loop...
==> Processed 500 patients...
==> Processed 1000 patients...


KeyboardInterrupt: 

In [48]:
final= pd.read_hdf(hdf+ "final_timeseries.h5", key="df")

In [49]:
final

Unnamed: 0_level_0,Unnamed: 1_level_0,-bands,-basos,-eos,-lymphs,-monos,-polys,24 h urine protein,24 h urine urea nitrogen,ALT (SGPT),ANF/ANA,...,sao2,heartrate,respiration,cvp,systemicsystolic,systemicdiastolic,systemicmean,st1,st2,st3
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
511612,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.0,90.0,15.0,0.0,0.0,0.0,0.0,0.00,0.2,0.2
511612,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,98.0,91.0,15.0,0.0,0.0,0.0,0.0,0.00,0.2,0.2
511612,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.0,89.0,17.0,0.0,0.0,0.0,0.0,0.00,0.2,0.3
511612,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.0,93.0,17.0,0.0,0.0,0.0,0.0,0.00,0.2,0.2
511612,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.0,91.0,17.0,0.0,0.0,0.0,0.0,-0.05,0.2,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3348105,1500,0.0,1.0,2.0,12.0,13.0,0.0,0.0,0.0,17.0,0.0,...,98.0,113.0,9.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
3348105,1501,0.0,1.0,2.0,12.0,13.0,0.0,0.0,0.0,17.0,0.0,...,98.0,113.0,9.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
3348105,1502,0.0,1.0,2.0,12.0,13.0,0.0,0.0,0.0,17.0,0.0,...,98.0,113.0,9.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
3348105,1503,0.0,1.0,2.0,12.0,13.0,0.0,0.0,0.0,17.0,0.0,...,98.0,113.0,9.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0


In [37]:
processed_ts = pd.read_hdf(hdf + 'final_timeseries.h5', index_col=[0, 1])

In [38]:
processed_ts

Unnamed: 0_level_0,Unnamed: 1_level_0,-bands,-basos,-eos,-lymphs,-monos,-polys,24 h urine protein,24 h urine urea nitrogen,ALT (SGPT),ANF/ANA,...,sao2,heartrate,respiration,cvp,systemicsystolic,systemicdiastolic,systemicmean,st1,st2,st3
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
511612,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.0,90.0,15.0,0.0,0.0,0.0,0.0,0.00,0.2,0.2
511612,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,98.0,91.0,15.0,0.0,0.0,0.0,0.0,0.00,0.2,0.2
511612,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.0,89.0,17.0,0.0,0.0,0.0,0.0,0.00,0.2,0.3
511612,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.0,93.0,17.0,0.0,0.0,0.0,0.0,0.00,0.2,0.2
511612,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.0,91.0,17.0,0.0,0.0,0.0,0.0,-0.05,0.2,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3348105,1500,0.0,1.0,2.0,12.0,13.0,0.0,0.0,0.0,17.0,0.0,...,98.0,113.0,9.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
3348105,1501,0.0,1.0,2.0,12.0,13.0,0.0,0.0,0.0,17.0,0.0,...,98.0,113.0,9.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
3348105,1502,0.0,1.0,2.0,12.0,13.0,0.0,0.0,0.0,17.0,0.0,...,98.0,113.0,9.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
3348105,1503,0.0,1.0,2.0,12.0,13.0,0.0,0.0,0.0,17.0,0.0,...,98.0,113.0,9.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0


In [40]:
len(processed_ts.index.get_level_values('patient').unique())

198

In [None]:
patient_id =  273703

p = processed_ts.loc[patient_id]
p

In [35]:
merged

Unnamed: 0_level_0,Unnamed: 1_level_0,-bands,-basos,-eos,-lymphs,-monos,-polys,24 h urine protein,24 h urine urea nitrogen,ALT (SGPT),ANF/ANA,...,sao2,heartrate,respiration,cvp,systemicsystolic,systemicdiastolic,systemicmean,st1,st2,st3
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
252784,0 days 00:20:00,,,,,,,,,,,...,,,,,,,,,,
252784,0 days 00:40:00,,,,,,,,,,,...,,,,,,,,,,
252784,0 days 01:20:00,,,,,,,,,,,...,,,,,,,,,,
252784,0 days 01:40:00,,,,,,,,,,,...,,,,,,,,,,
252784,0 days 02:35:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3348105,2 days 19:25:00,,,,,,,,,,,...,,85.0,,,,,,,,
3348105,2 days 19:30:00,,,,,,,,,,,...,,86.0,,,,,,,,
3348105,2 days 19:35:00,,,,,,,,,,,...,,97.0,,,,,,,,
3348105,2 days 19:40:00,,,,,,,,,,,...,,97.0,,,,,,,,
