In [3]:
from pathlib import Path

import numpy as np
import pandas as pd

# Data acquisition

In [4]:
# Read the clinical records and discard the 'time' column in one chained step.
df = pd.read_csv('heart_failure_clinical_records_dataset.csv').drop(columns=['time'])

df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,1


In [5]:
# Add age_cohort column.
# Cohorts are left-closed intervals [lower, upper):
#   age < 55        -> young
#   55 <= age < 65  -> middle-aged
#   65 <= age < 75  -> old
#   age >= 75       -> elderly
#
# right=False matters because 'age' is a float column: with right-closed
# integer edges such as 54/64/74, a fractional age like 54.5 would fall into
# the next cohort up instead of the one documented above.
# The last edge is np.inf so that every age >= 75 is 'elderly'.

df['age_cohort'] = pd.cut(x=df['age'],
                          bins=[0, 55, 65, 75, np.inf],
                          right=False,
                          labels=['young', 'middle-aged', 'old', 'elderly'])

df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,DEATH_EVENT,age_cohort
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,1,elderly
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,1,middle-aged
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,1,old
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,1,young
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,1,old


## Male and Female records

In [7]:
def getMaleRecords(inputFrame):
    """Return the rows of ``inputFrame`` whose 'sex' column equals 1."""
    sex_is_one = inputFrame['sex'].eq(1)
    return inputFrame.loc[sex_is_one]

def getFemaleRecords(inputFrame):
    """Return the rows of ``inputFrame`` whose 'sex' column equals 0."""
    sex_is_zero = inputFrame['sex'].eq(0)
    return inputFrame.loc[sex_is_zero]

# Split the full frame by sex: df_M receives the male records and
# df_F the female records.
df_M, df_F = getMaleRecords(df), getFemaleRecords(df)

## One-hot encoding age cohorts

In [8]:
# One-hot encode 'age_cohort' into one indicator column per cohort
# (age_cohort_young, age_cohort_middle-aged, ...).
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be written to the exported CSV files as True/False instead.
df_onehot = pd.get_dummies(df, columns=["age_cohort"], dtype=int)

df_onehot.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,DEATH_EVENT,age_cohort_young,age_cohort_middle-aged,age_cohort_old,age_cohort_elderly
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,1,0,0,0,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,1,0,1,0,0
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,1,0,0,1,0
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,1,1,0,0,0
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,1,0,0,1,0


In [38]:
# Indicator column names produced by one-hot encoding 'age_cohort'.
age_cohorts = [f"age_cohort_{label}"
               for label in ("young", "middle-aged", "old", "elderly")]

# Base pair of features shared by every feature set below.
feat_2 = ["serum_creatinine", "ejection_fraction"]

# 3-feature sets: the base pair plus one representation of age
# (raw value, cohort label, or one-hot cohort indicators).
feat_3        = feat_2 + ["age"]
feat_3_cohort = feat_2 + ["age_cohort"]
feat_3_onehot = feat_2 + age_cohorts

# 4-feature sets: same as above with 'sex' added.
feat_4        = feat_3 + ["sex"]
feat_4_cohort = feat_3_cohort + ["sex"]
feat_4_onehot = feat_2 + ["sex"] + age_cohorts

In [42]:
target = ["DEATH_EVENT"]

# DataFrame.to_csv does not create missing directories, so make sure the
# destination folder exists before the first write.
OUTPUT_DIR = Path('output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Full frame: every original feature plus the derived age_cohort column.
df.to_csv(OUTPUT_DIR / 'feat11_cohorts.csv', index=False)

# Each entry is (source frame, feature columns, output file name).
# The one-hot variants read from df_onehot because only that frame
# contains the age_cohort_* indicator columns.
exports = [
    # 2 features
    (df, feat_2, 'feat_2.csv'),
    # 3 features
    (df, feat_3, 'feat_3.csv'),
    (df, feat_3_cohort, 'feat_3_cohort.csv'),
    (df_onehot, feat_3_onehot, 'feat_3_onehot.csv'),
    # 4 features
    (df, feat_4, 'feat_4.csv'),
    (df, feat_4_cohort, 'feat_4_cohort.csv'),
    (df_onehot, feat_4_onehot, 'feat_4_onehot.csv'),
]

# Write each feature subset together with the target column.
for frame, features, filename in exports:
    frame[features + target].to_csv(OUTPUT_DIR / filename, index=False)