In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import pickle
import numpy as np

In [3]:
def is_number(s):
    """
    Tell whether a single cell value can be read as a number.

    s: any scalar taken from a DataFrame cell (number, string, None, ...)

    Returns True when `float(s)` succeeds or when `s` is a single Unicode
    character with a numeric value (e.g. a vulgar fraction); False otherwise.
    None is treated like NaN and therefore counts as a number, so missing
    values are kept by callers that filter on this predicate.
    """
    import unicodedata

    if s is None:
        s = np.nan

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # TypeError: float() rejects non-string, non-numeric objects such as
        # lists, dicts or timestamps; those are simply "not a number" here.
        pass

    try:
        # Only accepts a one-character string; anything else raises TypeError.
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass

    return False

def read_data_cn(fname, interval, forecast):
    """
    Read the Chinese-labelled lab-test workbook and build one row per patient and period.

    fname: path of the Excel file; it must provide PATIENT_ID, RE_DATE, the
        admission time ('入院时间'), the discharge time ('出院时间'), the
        discharge outcome ('出院方式') and the three lab columns returned below
    interval: interval of group data, days
    forecast: horizon in hours; 'decompensation' is 'decease' multiplied by a
        0/1 flag telling whether the record was taken less than `forecast`
        hours before discharge

    Returns a float DataFrame with the columns PATIENT_ID, period, decease,
    decompensation, '乳酸脱氢酶', '超敏C反应蛋白' and '淋巴细胞(%)'. Missing
    values stay NaN; cells that are not numeric are replaced by -1.
    """

    # read data
    # Excel workbooks are not plain text, so no `encoding` is passed; recent
    # pandas versions reject that keyword with a TypeError.
    data_df = pd.read_excel(fname, index_col=[0])

    # group by interval: number every record by the `interval`-day period it falls in
    stay_length = data_df['出院时间'] - data_df['入院时间']
    data_df['total_period'] = stay_length.apply(lambda x: x.days // interval)
    time_to_discharge = data_df['出院时间'] - data_df['RE_DATE']
    periods_to_discharge = time_to_discharge.apply(lambda x: x.days // interval)
    data_df['period'] = data_df['total_period'] - periods_to_discharge
    # one row per (patient, period); last() keeps the last non-null value of each column
    data_df = data_df.groupby(['PATIENT_ID', 'period']).last().reset_index()

    # make outcome
    # NOTE(review): '出院方式' is presumably 0 for survivors and non-zero for
    # deceased patients - confirm against the preprocessing of the workbook.
    data_df['decease'] = data_df['出院方式'].values
    remaining = data_df['出院时间'] - data_df['RE_DATE']
    # 1 when the record lies within the next `forecast` hours of discharge
    within_horizon = remaining.apply(lambda x: int(x.total_seconds() / 3600 < forecast))
    data_df['decompensation'] = data_df['decease'] * within_horizon

    # make it cleaner
    # Select the needed columns first so unrelated columns (dates, free text)
    # are neither cleaned nor able to break the float conversion.
    columns = ['PATIENT_ID', 'period','decease', 'decompensation', '乳酸脱氢酶', '超敏C反应蛋白', '淋巴细胞(%)']
    data_df = data_df[columns]

    def _clean(value):
        # Strip the comparison signs of censored readings such as '>100',
        # then replace anything that is still not numeric by -1.
        if isinstance(value, str):
            value = value.replace('>', '').replace('<', '')
        return value if is_number(value) else -1

    # DataFrame.map is the current name of the element-wise apply; fall back
    # to applymap on pandas versions that do not have it yet.
    elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, 'map') else pd.DataFrame.applymap
    data_df = elementwise(data_df, _clean).astype(float)
    return data_df

In [4]:
# Grouping window in days and look-ahead horizon in hours for the labels.
interval, forecast = 3, 72

df_train = read_data_cn('data/time_series_375_prerpocess.xlsx', interval, forecast)
df_test = read_data_cn('data/time_series_test_110_preprocess.xlsx', interval, forecast)

# Fill missing values: linear interpolation first, then forward and backward
# fill for anything that is still empty.
# NOTE(review): the interpolation runs over the whole frame, so a gap at the
# end of one patient's rows can be filled using the next patient's values -
# confirm this is intended.
df_train = df_train.interpolate(method='linear')
df_train = df_train.ffill().bfill()
df_test = df_test.interpolate(method='linear')
df_test = df_test.ffill().bfill()

In [5]:
# Show the interpolated training table (one row per patient and period).
df_train

Unnamed: 0,PATIENT_ID,period,decease,decompensation,乳酸脱氢酶,超敏C反应蛋白,淋巴细胞(%)
0,1.0,0.0,0.0,0.0,306.0,43.10,22.600000
1,1.0,1.0,0.0,0.0,278.0,23.35,24.250000
2,1.0,2.0,0.0,0.0,250.0,3.60,25.900000
3,1.0,3.0,0.0,0.0,200.0,3.10,10.500000
4,1.0,5.0,0.0,0.0,206.0,2.60,25.900000
...,...,...,...,...,...,...,...
1162,374.0,1.0,1.0,1.0,1867.0,61.70,2.300000
1163,375.0,0.0,1.0,0.0,915.0,58.50,9.900000
1164,375.0,1.0,1.0,0.0,1083.5,128.00,8.833333
1165,375.0,2.0,1.0,0.0,1252.0,197.50,7.766667


In [1]:
def creat_batch_data(raw_data, outfile, batch_size=256, out_dir='BatchData'):
    """
    Group patients into batches of equal sequence length and pickle them.

    raw_data: DataFrame with one row per patient and period. Columns are read
        by position: column 1 is the period index and columns 4..6 are the
        three features; 'PATIENT_ID' and 'decease' are read by name.
    outfile: prefix of the output files, e.g. 'Train' produces
        'TrainData.seqs', 'TrainLabel.seqs' and 'TrainInterval.seqs'
    batch_size: maximum number of patients per batch
    out_dir: directory receiving the three files; created when missing

    Three parallel lists are written, one entry per batch:
      *Data.seqs     - per patient, the list of feature rows
      *Label.seqs    - per patient, [1, 0] (survival) or [0, 1] (die)
      *Interval.seqs - per patient, the list of period indices
    Every batch only contains patients with the same number of rows.

    Raises ValueError when batch_size is smaller than 1.
    """
    import os

    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # Number of rows per patient, ordered by that length.
    seq_len = raw_data.groupby(['PATIENT_ID'])['decease'].count()
    seq_len = seq_len.sort_values().reset_index()
    seq_len.columns = ['PATIENT_ID', 'time_length']

    batches = []
    labels = []
    intervals = []
    for length in sorted(seq_len['time_length'].unique()):
        pids = seq_len.loc[seq_len['time_length'] == length, 'PATIENT_ID'].tolist()
        # Chunking the id list keeps the three outputs aligned batch by batch
        # and never produces an empty trailing batch.
        for start in range(0, len(pids), batch_size):
            batch = []
            label = []
            interval = []
            for pid in pids[start:start + batch_size]:
                rows = raw_data[raw_data['PATIENT_ID'] == pid]
                # survival[1,0] die[0,1]
                label.append([1, 0] if rows['decease'].sum() == 0 else [0, 1])
                batch.append(rows.iloc[:, 4:7].values.tolist())  # 3 features
                interval.append(rows.iloc[:, 1].values.tolist())  # time interval
            batches.append(batch)
            labels.append(label)
            intervals.append(interval)

    os.makedirs(out_dir, exist_ok=True)
    outputs = (('Data', batches), ('Label', labels), ('Interval', intervals))
    for suffix, payload in outputs:
        # The context manager closes the file even if pickling fails.
        with open(os.path.join(out_dir, outfile + suffix + '.seqs'), 'wb') as handle:
            pickle.dump(payload, handle, -1)

In [6]:
# Write the pickled batch files for the training and the test cohort.
creat_batch_data(raw_data=df_train, outfile='Train')
creat_batch_data(raw_data=df_test, outfile='Test')

In [10]:
# Sanity check: read the pickled training feature batches back from disk.
path_string = 'BatchData/TrainData.seqs'
with open(path_string, mode='rb') as f:
    a = pickle.load(f)

In [11]:
# Look at one batch: a list of patients, each a list of 3-feature rows per period.
a[7]

[[[238.0, 10.5, 31.8],
  [340.0, 12.3, 33.3],
  [352.0, 6.95, 39.55],
  [364.0, 1.6, 45.8],
  [251.0, 1.2, 44.43333333333333],
  [206.5, 1.4, 43.06666666666667],
  [162.0, 1.6, 41.7],
  [237.0, 25.7, 29.6]]]