In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
from pandarallel import pandarallel

tqdm.pandas()
pandarallel.initialize(nb_workers=8,progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [3]:
pd.options.display.float_format = '{:.5f}'.format
pd.set_option('mode.chained_assignment',  None) # 경고 off
pd.set_option('display.max_seq_items', None)

Data

In [4]:
df = pd.read_csv('processed/preprocess_sepsis.csv',index_col=0)
df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,intime,outtime,admittime,dischtime,dod,deathtime,...,MV,SOFA_Resp,SOFA_Coag,SOFA_Liver,SOFA_Central,SOFA_Renal,SOFA,presumed_onset,sepsis,septic_shock
0,18421337,22413411,30000484,2136-01-14 17:23:32,2136-01-14 17:23:32,2136-01-17 04:53:08,2136-01-14 17:22:00,2136-01-24 16:00:00,2136-02-21,,...,0,,,,3.0,,4.0,1,1,0
1,18421337,22413411,30000484,2136-01-14 21:23:32,2136-01-14 17:23:32,2136-01-17 04:53:08,2136-01-14 17:22:00,2136-01-24 16:00:00,2136-02-21,,...,0,,,,3.0,,4.0,1,1,0
2,18421337,22413411,30000484,2136-01-15 01:23:32,2136-01-14 17:23:32,2136-01-17 04:53:08,2136-01-14 17:22:00,2136-01-24 16:00:00,2136-02-21,,...,0,,0.0,0.0,2.0,1.0,3.0,1,1,0
3,18421337,22413411,30000484,2136-01-15 05:23:32,2136-01-14 17:23:32,2136-01-17 04:53:08,2136-01-14 17:22:00,2136-01-24 16:00:00,2136-02-21,,...,0,,0.0,0.0,2.0,1.0,5.0,1,1,0
4,18421337,22413411,30000484,2136-01-15 09:23:32,2136-01-14 17:23:32,2136-01-17 04:53:08,2136-01-14 17:22:00,2136-01-24 16:00:00,2136-02-21,,...,0,,0.0,0.0,4.0,1.0,8.0,1,1,0


In [5]:
for col in ['charttime', 'intime', 'outtime', 'admittime', 'dischtime', 'dod', 'deathtime']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [6]:
df['traj'] = pd.factorize(df['stay_id'])[0]
df.sort_values(by=['traj','charttime'],inplace=True)

Reward

In [9]:
def Reward(target):

    target = target.reset_index(drop=True)
    
    if target['morta_icu'].sum() != 0:
        target['reward_morta_icu'].iloc[-1] = -1

    if target['morta_hosp'].sum() != 0:
        target['reward_morta_hosp'].iloc[-1] = -1

    if target['morta_90'].sum() != 0:
        target['reward_morta_90'].iloc[-1] = -1
    
    return target

df['reward_morta_icu'] = 0
df['reward_morta_hosp'] = 0
df['reward_morta_90'] = 0

df = df.groupby('traj').parallel_apply(Reward).reset_index(drop=True)

print(df['reward_morta_icu'].value_counts())
print(df['reward_morta_hosp'].value_counts())
print(df['reward_morta_90'].value_counts())

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5275), Label(value='0 / 5275'))), …

reward_morta_icu
 0    1208517
-1       4579
Name: count, dtype: int64
reward_morta_hosp
 0    1207041
-1       6055
Name: count, dtype: int64
reward_morta_90
 0    1203211
-1       9885
Name: count, dtype: int64


0.05 <

In [10]:
from functools import partial

target_columns = [
    'Weight', 'GCS', 'Heartrate', 'Systolic_BP', 'Diastolic_BP', 'Mean_BP', 'Resprate', 'SpO2', 'Temperature', 'FiO2',
    'Potassium', 'Sodium', 'Chloride', 'Glucose', 'BUN', 'SCr', 'Magnesium', 'Calcium', 'SGOT', 'SGPT', 'Total_Bilirubin',
    'Hemoglobin', 'WBC', 'Platelet', 'PTT', 'PT', 'INR', 'Arterial_ph', 'PaO2', 'PaCO2', 'BaseExcess', 'Bicarbonate', 'Lactate'
]

low_col = []
high_col = []

for i in target_columns:
    missing_count = df[i].isna().sum()
    if (missing_count <= df.shape[0] * 0.05) and (missing_count > 0):
        low_col.append(i)
    else:
        high_col.append(i)

print("Columns with low missing values:", low_col)
print("Columns with high missing values:", high_col)

def imputation(target, high_cols):
    def fixgaps(series):
        return series.interpolate(method='linear', limit_direction='both')
    
    target = target.reset_index(drop=True)
    for col in high_cols:
        target[col] = fixgaps(target[col])

    return target

imputation_func = partial(imputation, high_cols=high_col)
df = df.groupby('traj').parallel_apply(imputation_func).reset_index(drop=True)

Columns with low missing values: ['GCS', 'Heartrate', 'Systolic_BP', 'Diastolic_BP', 'Mean_BP', 'Resprate', 'SpO2', 'Temperature', 'Potassium', 'BUN', 'SCr', 'WBC', 'Platelet']
Columns with high missing values: ['Weight', 'FiO2', 'Sodium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium', 'SGOT', 'SGPT', 'Total_Bilirubin', 'Hemoglobin', 'PTT', 'PT', 'INR', 'Arterial_ph', 'PaO2', 'PaCO2', 'BaseExcess', 'Bicarbonate', 'Lactate']


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5275), Label(value='0 / 5275'))), …

Split

In [11]:
from skmultilearn.model_selection import IterativeStratification

def iterative_split(df, test_size, stratify_columns):

    one_hot_cols = [pd.get_dummies(df[col]) for col in stratify_columns]
    one_hot_cols = pd.concat(one_hot_cols, axis=1).to_numpy()
    stratifier = IterativeStratification(
        n_splits=2, order=len(stratify_columns), sample_distribution_per_fold=[test_size, 1-test_size])
    train_indices, test_indices = next(stratifier.split(df.to_numpy(), one_hot_cols))
    train, test = df.iloc[train_indices], df.iloc[test_indices]

    return train, test

In [12]:
target_stay_id = df.groupby('stay_id').tail(1).reset_index(drop=True)
target_stay_id = target_stay_id[['stay_id', 'traj', 'gender', 're_admission', 'reward_morta_icu','reward_morta_hosp', 'reward_morta_90']]

In [13]:
train_id, test_id = iterative_split(target_stay_id[['traj', 'gender', 're_admission', 'reward_morta_icu','reward_morta_hosp', 'reward_morta_90']], test_size=0.25, 
                           stratify_columns=['gender', 're_admission', 'reward_morta_icu'])

In [14]:
test_id, valid_id = iterative_split(test_id[['traj', 'gender', 're_admission', 'reward_morta_icu','reward_morta_hosp', 'reward_morta_90']], test_size=0.2, 
                           stratify_columns=['gender', 're_admission', 'reward_morta_icu'])

In [15]:
train_id.to_csv('processed/train_id.csv')
valid_id.to_csv('processed/valid_id.csv')
test_id.to_csv('processed/test_id.csv')

In [66]:
train_df = pd.merge(df,train_id['traj'],on='traj',how='inner')
valid_df = pd.merge(df,valid_id['traj'],on='traj',how='inner')
test_df = pd.merge(df,test_id['traj'],on='traj',how='inner')

KNN imputation

In [67]:
from sklearn.impute import KNNImputer

def knn_impute_chunkwise(df, target_columns, imputer, chunk_size=10000, fit=True):

    imputed_chunks = []
    
    for start in range(0, len(df), chunk_size):
        end = min(start + chunk_size, len(df))
        chunk = df.iloc[start:end][target_columns]
        if fit :
            imputed_chunk = imputer.fit_transform(chunk)
        else :
            imputed_chunk = imputer.transform(chunk)

        imputed_chunks.append(imputed_chunk)
    
    imputed_df = np.vstack(imputed_chunks)
    df[target_columns] = imputed_df

    if fit :
        return df, imputer
    else :
        return df

target_columns = [
    'Weight', 'GCS', 'Heartrate', 'Systolic_BP', 'Diastolic_BP', 'Mean_BP', 'Resprate', 'SpO2', 'Temperature', 'FiO2',
    'Potassium', 'Sodium', 'Chloride', 'Glucose', 'BUN', 'SCr', 'Magnesium', 'Calcium', 'SGOT', 'SGPT', 'Total_Bilirubin',
    'Hemoglobin', 'WBC', 'Platelet', 'PTT', 'PT', 'INR', 'Arterial_ph', 'PaO2', 'PaCO2', 'BaseExcess', 'Bicarbonate', 'Lactate'
]

imputer = KNNImputer(n_neighbors=1)

chunk_size = 10000

train_df, trained_imputer = knn_impute_chunkwise(train_df, target_columns, imputer, chunk_size, fit=True)

In [68]:
valid_df  = knn_impute_chunkwise(valid_df, target_columns, trained_imputer, chunk_size, fit=False)
test_df = knn_impute_chunkwise(test_df, target_columns, trained_imputer, chunk_size, fit=False)

Derived variables

In [69]:
def calculate_derived_variables(df):

    df['FiO2'] = round((df['FiO2'] / 100), 1)
    df['PaO2/FiO2'] = round(df['PaO2'] / df['FiO2'], 1)
    df['Shock_Index'] = round(df['Heartrate'] / df['Systolic_BP'], 1)

    inf_stays = df[np.isinf(df['Shock_Index']) | np.isinf(df['PaO2/FiO2'])]['stay_id'].unique()
    df = df[~df['stay_id'].isin(inf_stays)]

    df['SIRS'] = ((df['Temperature'] < 36) | (df['Temperature'] > 38)).astype(int) + \
                 (df['Heartrate'] > 90).astype(int) + \
                 ((df['WBC'] < 4) | (df['WBC'] > 12)).astype(int) + \
                 (df['Resprate'] > 20).astype(int)


    df.loc[(df['Mean_BP'] >= 70) & (df['SOFA_Cardio'].isna()), 'SOFA_Cardio'] = 0
    df.loc[(df['Mean_BP'] < 70) & (df['SOFA_Cardio'].isna()), 'SOFA_Cardio'] = 1

    df['SOFA_Resp'] = np.nan
    df.loc[df['PaO2/FiO2'] >= 400, 'SOFA_Resp'] = 0
    df.loc[df['PaO2/FiO2'] < 400, 'SOFA_Resp'] = 1
    df.loc[df['PaO2/FiO2'] < 300, 'SOFA_Resp'] = 2
    df.loc[(df['PaO2/FiO2'] < 200) & (df['MV'] == 1), 'SOFA_Resp'] = 3
    df.loc[(df['PaO2/FiO2'] < 100) & (df['MV'] == 1), 'SOFA_Resp'] = 4

    df['SOFA_Coag'] = np.nan
    df.loc[df['Platelet'] >= 150, 'SOFA_Coag'] = 0
    df.loc[df['Platelet'] < 150, 'SOFA_Coag'] = 1
    df.loc[df['Platelet'] < 100, 'SOFA_Coag'] = 2
    df.loc[df['Platelet'] < 50, 'SOFA_Coag'] = 3
    df.loc[df['Platelet'] < 20, 'SOFA_Coag'] = 4

    df['SOFA_Liver'] = np.nan
    df.loc[df['Total_Bilirubin'] < 1.2, 'SOFA_Liver'] = 0
    df.loc[(df['Total_Bilirubin'] >= 1.2) & (df['Total_Bilirubin'] < 2.0), 'SOFA_Liver'] = 1
    df.loc[(df['Total_Bilirubin'] >= 2.0) & (df['Total_Bilirubin'] < 6.0), 'SOFA_Liver'] = 2
    df.loc[(df['Total_Bilirubin'] >= 6.0) & (df['Total_Bilirubin'] < 12.0), 'SOFA_Liver'] = 3
    df.loc[df['Total_Bilirubin'] >= 12.0, 'SOFA_Liver'] = 4

    df['SOFA_Central'] = np.nan
    df.loc[df['GCS'] == 15, 'SOFA_Central'] = 0
    df.loc[(df['GCS'] >= 13) & (df['GCS'] <= 14), 'SOFA_Central'] = 1
    df.loc[(df['GCS'] >= 10) & (df['GCS'] <= 12), 'SOFA_Central'] = 2
    df.loc[(df['GCS'] >= 6) & (df['GCS'] <= 9), 'SOFA_Central'] = 3
    df.loc[df['GCS'] < 6, 'SOFA_Central'] = 4

    df['SOFA_Renal'] = np.nan
    df.loc[df['SCr'] < 1.2, 'SOFA_Renal'] = 0
    df.loc[(df['SCr'] >= 1.2) & (df['SCr'] < 2.0), 'SOFA_Renal'] = 1
    df.loc[(df['SCr'] >= 2.0) & (df['SCr'] < 3.5), 'SOFA_Renal'] = 2
    df.loc[((df['SCr'] >= 3.5) & (df['SCr'] < 5.0)) | (df['output_4hr'] < (500 / 6)), 'SOFA_Renal'] = 3
    df.loc[(df['SCr'] >= 5.0) | (df['output_4hr'] < (200 / 6)), 'SOFA_Renal'] = 4

    df['SOFA'] = df[['SOFA_Resp', 'SOFA_Coag', 'SOFA_Liver', 'SOFA_Central', 'SOFA_Renal']].sum(axis=1, skipna=True)

    return df

In [70]:
train_df = calculate_derived_variables(train_df)
valid_df = calculate_derived_variables(valid_df)
test_df = calculate_derived_variables(test_df)

RAW

In [71]:
train_df.to_csv('processed/train_df_RAW.csv')
valid_df.to_csv('processed/valid_df_RAW.csv')
test_df.to_csv('processed/test_df_RAW.csv')

Z

In [None]:
from sklearn.preprocessing import StandardScaler

colnorm = ['age', 'Weight', 'GCS', 'Heartrate', 'Systolic_BP', 'Diastolic_BP', 'Mean_BP', 'Resprate', 'Temperature', 'FiO2',
        'Potassium', 'Sodium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium', 'Hemoglobin','WBC', 'Platelet', 'PTT', 'PT',
        'Arterial_ph', 'PaO2', 'PaCO2', 'BaseExcess', 'Bicarbonate', 'Lactate', 'SOFA', 'SIRS', 'Shock_Index','PaO2/FiO2', 'Cumulated_balance', 'elixhauser']

scaler = StandardScaler()

for col in [x for x in colnorm]:
    train_df[col] = scaler.fit_transform(train_df[col].values.reshape(-1, 1)).flatten()
    valid_df[col] = scaler.transform(valid_df[col].values.reshape(-1, 1)).flatten()
    test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1)).flatten()

In [75]:
collog=['SpO2', 'BUN', 'SCr', 'SGOT', 'SGPT', 'Total_Bilirubin', 'INR', 'output_total','output_4hr']

scaler = StandardScaler()

for col in [x for x in collog]:

    train_df[col] = np.log(0.1+train_df[col].values)
    valid_df[col] = np.log(0.1+valid_df[col].values)
    test_df[col] = np.log(0.1+test_df[col].values)

    train_df[col] = scaler.fit_transform(train_df[col].values.reshape(-1, 1)).flatten()
    valid_df[col] = scaler.transform(valid_df[col].values.reshape(-1, 1)).flatten()
    test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1)).flatten()

In [76]:
colbin = [['gender', 're_admission', 'MV']]

for col in [x for x in colbin]:
    train_df[col] = train_df[col].values - 0.5
    valid_df[col] = valid_df[col].values - 0.5
    test_df[col] = test_df[col].values - 0.5

In [77]:
train_df.to_csv('processed/train_df_Z.csv')
valid_df.to_csv('processed/valid_df_Z.csv')
test_df.to_csv('processed/test_df_Z.csv')

Action

In [99]:
def fluid_action(df, bins):
    conditions = [
        df['input_4hr'] == 0,
        (df['input_4hr'] > 0) & (df['input_4hr'] <= bins[0]),
        (df['input_4hr'] > bins[0]) & (df['input_4hr'] <= bins[1]),
        (df['input_4hr'] > bins[1]) & (df['input_4hr'] <= bins[2]),
        df['input_4hr'] > bins[2]
    ]
    choices = [1, 2, 3, 4, 5]
    df['fluid_action'] = np.select(conditions, choices, default=0)
    return df

def vaso_action(df, bins):
    conditions = [
        df['median_vaso'] == 0,
        (df['median_vaso'] > 0) & (df['median_vaso'] <= bins[0]),
        (df['median_vaso'] > bins[0]) & (df['median_vaso'] <= bins[1]),
        (df['median_vaso'] > bins[1]) & (df['median_vaso'] <= bins[2]),
        df['median_vaso'] > bins[2]
    ]
    choices = [1, 2, 3, 4, 5]
    df['vaso_action'] = np.select(conditions, choices, default=0)
    return df

def action(train_df, valid_df, test_df):
    fluid_bins = train_df[train_df['input_4hr'] > 0]['input_4hr'].quantile([0.25, 0.5, 0.75]).values
    vaso_bins = train_df[train_df['median_vaso'] > 0]['median_vaso'].quantile([0.25, 0.5, 0.75]).values
    
    train_df = fluid_action(train_df, fluid_bins)
    train_df = vaso_action(train_df, vaso_bins)
    
    valid_df = fluid_action(valid_df, fluid_bins)
    valid_df = vaso_action(valid_df, vaso_bins)
    
    test_df = fluid_action(test_df, fluid_bins)
    test_df = vaso_action(test_df, vaso_bins)
    
    return train_df, valid_df, test_df

In [100]:
train_df,valid_df,test_df = action(train_df,valid_df,test_df)

In [103]:
def action_set(df):

    for i in range(1, int(np.max(df['fluid_action']) + 1)):
        df[f'fluid_action_{i}'] = (df['fluid_action'] == i).astype(int)
        df[f'vaso_action_{i}'] = (df['vaso_action'] == i).astype(int)

    prev_columns = {}
    for i in range(1, int(np.max(df['fluid_action']) + 1)):
        prev_columns[f'fluid_action_{i}_prev'] = df.groupby('stay_id')[f'fluid_action_{i}'].shift().fillna(0)
        prev_columns[f'vaso_action_{i}_prev'] = df.groupby('stay_id')[f'vaso_action_{i}'].shift().fillna(0)

    df = pd.concat([df, pd.DataFrame(prev_columns)], axis=1)

    df['action'] =  (df['fluid_action']-1) * 5  + (df['vaso_action']-1) + 1

    action_columns = {}
    
    for i in range(1, int(np.max(df['action']) + 1)):
        action_columns[f'action_{i}'] = (df['action'] == i).astype(int)
    df = pd.concat([df, pd.DataFrame(action_columns)], axis=1)

    prev_columns = {}
    for i in range(1, int(np.max(df['action']) + 1)):
        prev_columns[f'action_{i}_prev'] = df.groupby('stay_id')[f'action_{i}'].shift().fillna(0)
    df = pd.concat([df, pd.DataFrame(prev_columns)], axis=1)

    return df

In [104]:
df = pd.concat([train_df,valid_df,test_df])
df['step'] = df.groupby('stay_id').cumcount()
df = action_set(df)

Feature selection & save

In [108]:
df

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,intime,outtime,admittime,dischtime,dod,deathtime,...,action_16_prev,action_17_prev,action_18_prev,action_19_prev,action_20_prev,action_21_prev,action_22_prev,action_23_prev,action_24_prev,action_25_prev
0,12207593,22795209,30000646,2194-04-29 01:39:22,2194-04-29 01:39:22,2194-05-03 18:23:48,2194-04-27 18:43:00,2194-05-06 02:29:00,2194-05-06,2194-05-06 02:29:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
1,12207593,22795209,30000646,2194-04-29 05:39:22,2194-04-29 01:39:22,2194-05-03 18:23:48,2194-04-27 18:43:00,2194-05-06 02:29:00,2194-05-06,2194-05-06 02:29:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2,12207593,22795209,30000646,2194-04-29 09:39:22,2194-04-29 01:39:22,2194-05-03 18:23:48,2194-04-27 18:43:00,2194-05-06 02:29:00,2194-05-06,2194-05-06 02:29:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000
3,12207593,22795209,30000646,2194-04-29 13:39:22,2194-04-29 01:39:22,2194-05-03 18:23:48,2194-04-27 18:43:00,2194-05-06 02:29:00,2194-05-06,2194-05-06 02:29:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
4,12207593,22795209,30000646,2194-04-29 17:39:22,2194-04-29 01:39:22,2194-05-03 18:23:48,2194-04-27 18:43:00,2194-05-06 02:29:00,2194-05-06,2194-05-06 02:29:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237827,10428217,28879895,35269020,2152-03-19 00:22:00,2152-03-10 20:22:00,2152-03-19 18:38:18,2152-03-10 18:43:00,2152-04-11 12:31:00,2152-04-11,2152-04-11 12:31:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
237828,10428217,28879895,35269020,2152-03-19 04:22:00,2152-03-10 20:22:00,2152-03-19 18:38:18,2152-03-10 18:43:00,2152-04-11 12:31:00,2152-04-11,2152-04-11 12:31:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
237829,10428217,28879895,35269020,2152-03-19 08:22:00,2152-03-10 20:22:00,2152-03-19 18:38:18,2152-03-10 18:43:00,2152-04-11 12:31:00,2152-04-11,2152-04-11 12:31:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
237830,10428217,28879895,35269020,2152-03-19 12:22:00,2152-03-10 20:22:00,2152-03-19 18:38:18,2152-03-10 18:43:00,2152-04-11 12:31:00,2152-04-11,2152-04-11 12:31:00,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000


In [106]:
def feature_selection(df,target_reward):

       target = df[['traj', 'step','sepsis',

              'age', 'gender', 'Weight', 're_admission', 'elixhauser',
              'GCS', 'Heartrate', 'Systolic_BP', 'Diastolic_BP',
              'Mean_BP', 'Resprate', 'Temperature', 'FiO2',
              'Potassium', 'Sodium', 'Chloride', 'Glucose',
              'INR', 'Magnesium', 'Calcium', 'Hemoglobin',
              'WBC', 'Platelet', 'PTT', 'PT',
              'Arterial_ph', 'Lactate', 'PaO2', 'PaCO2',
              'PaO2/FiO2', 'Bicarbonate', 'SpO2', 'BUN',
              'SCr', 'SGOT', 'SGPT', 'Total_Bilirubin',
              'output_4hr', 'output_total', 'Cumulated_balance', 'SOFA',
              'SIRS', 'Shock_Index', 'BaseExcess', 'MV',

              'action_1_prev', 'action_2_prev', 'action_3_prev', 'action_4_prev', 'action_5_prev',
              'action_6_prev', 'action_7_prev', 'action_8_prev', 'action_9_prev', 'action_10_prev',
              'action_11_prev', 'action_12_prev', 'action_13_prev', 'action_14_prev', 'action_15_prev',
              'action_16_prev', 'action_17_prev', 'action_18_prev', 'action_19_prev', 'action_20_prev',
              'action_21_prev', 'action_22_prev', 'action_23_prev', 'action_24_prev', 'action_25_prev',


              'action',
              
              target_reward
              ]]
       
       for i,idx in enumerate(target.columns):
              if i < 3 : pass

              elif i >= 3 and i < (len(target.columns) - (1*(9)+1+1)) : 
                     target.rename(columns={idx:'s:'+idx},inplace=True)       

              elif (idx == 'action'): 
                     target.rename(columns={idx:'a:'+idx},inplace=True)

              elif ('action' in idx) and ('prev' in idx): 
                     target.rename(columns={idx:'s:'+idx},inplace=True)

              else : 
                     target.rename(columns={idx:'r:'+idx},inplace=True)

       return target

In [None]:
feature_selection(df,'reward_morta_icu').to_csv(f'processed/morta_icu_RAW.csv', index=False)