In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [2]:
file_name = "../dataset/raw/new_data.xlsx"
sheet_name = "Sheet1"

In [3]:
xl_sheet = pd.read_excel(file_name,sheet_name=sheet_name)

In [4]:
csv_path = "../dataset/raw/new_data.csv"
xl_sheet.to_csv(csv_path, index=False)

In [5]:
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,ID,mortality_30_days,6month_mortality,Grace Score (in hospital),in_Hospital_mortality,Age,Gender_Female,Religion,Presentation,Education,...,OnD_P2Y12_Inhibitors,OnD_Beta_blocker,OnD_Calcium_Channel_Blocker,OnD_ACE_ARB,OnD_Aldosterone_Blocking_Antagonist,OnD_Anticoagulant,OnD_INSULIN,OnD_Oral_Anti_glycemics,OnD_0n_statin_lipid_drugs,Anterior STE or LBBB
0,DT 723,0,0,138,0,46,0,Muslim,Referral,Illiterate,...,1,1,0,1,0,0,0,0,0,0
1,DT 261,0,0,177,0,73,0,Hindu,Referral,Middle School,...,1,1,1,1,0,0,0,0,0,1
2,DT 849,0,0,148,0,70,0,Hindu,Direct,High School Completed,...,1,1,0,1,0,0,0,0,0,1
3,DT 264,0,0,221,0,62,0,Muslim,Referral,Middle School,...,1,0,0,0,0,0,0,0,0,1
4,DT 1297,0,1,216,0,65,0,Hindu,Direct,Illiterate,...,1,0,0,0,0,0,0,0,0,1


In [6]:
def timi_score_to_mortality_percentage(timi_score):
    if timi_score == 0:
        return 0.8
    elif timi_score == 1:
        return 1.6
    elif timi_score == 2:
        return 2.2
    elif timi_score == 3:
        return 4.4
    elif timi_score == 4:
        return 7.3
    elif timi_score == 5:
        return 12.4
    elif timi_score == 6:
        return 16.1
    elif timi_score == 7:
        return 23.4
    elif timi_score == 8:
        return 26.8
    else:
        return 35.9

In [7]:
def replace(val):
    if isinstance(val,str):
        if(val=="No"):
            return 0
        else:
            if(val.isdigit() or val.isdecimal()):
                return float(val)
            else:
                return val
    else:
        return val

In [8]:
def convert_time(t):
    h,m,s = map(int,t.split(':'))
    return h*60 + m + s/60

In [9]:
df['TIMI'] = df['TIMI_new'].apply(timi_score_to_mortality_percentage)
df['Pack_Year'] = df['Number of Cigarettes Beedi per day']/20 + df['Number of years smoked']
df['BMI'] = df['Weight']/((df['Height']/100)**2)

In [10]:
for col in df.columns:
    df[col] = df[col].fillna(0)

In [11]:
columns = [
    'mortality_30_days', 'Age', 'Gender_Female', 'Diabetes Mellitus',
    'Hypertension', 'CardiacStatus_Presentation',
    'EJECTION FRACTION(%)', 'MR_Moderate', 'MR_Severe',
    'VSR', 'LV ANEURYSM', 'KILLIP_new', 'TIMI', 'Pack_Year', 'BMI'
]

In [12]:
df = df[columns]

In [13]:
num = [
    'Age', 'Pack_Year', 'BMI', 'KILLIP_new'
]

cat = [
    'Gender_Female', 'Diabetes Mellitus',
    'Hypertension', 'CardiacStatus_Presentation',
    'EJECTION FRACTION(%)', 'MR_Moderate', 'MR_Severe',
    'VSR', 'LV ANEURYSM', 'KILLIP_new', 'TIMI'
]

dummy = [
    'CardiacStatus_Presentation', 'EJECTION FRACTION(%)'
]

In [14]:
df.columns

Index(['mortality_30_days', 'Age', 'Gender_Female', 'Diabetes Mellitus',
       'Hypertension', 'CardiacStatus_Presentation', 'EJECTION FRACTION(%)',
       'MR_Moderate', 'MR_Severe', 'VSR', 'LV ANEURYSM', 'KILLIP_new', 'TIMI',
       'Pack_Year', 'BMI'],
      dtype='object')

In [15]:
target = df["mortality_30_days"]

In [16]:
df_num = df[num]
df_cat = df[cat]

In [17]:
df_cat = pd.get_dummies(df_cat, columns = dummy, drop_first=False, dtype = 'uint8')

In [18]:
df_cat = df_cat.drop(columns=['CardiacStatus_Presentation_Chest Pain'],axis=1)

In [19]:
df_num = pd.DataFrame(StandardScaler().fit_transform(df_num), columns=df_num.columns)

In [20]:
df = pd.concat([df_num,df_cat],axis=1)
df = pd.concat([df,target],axis=1)

In [21]:
df_minority = df[df['mortality_30_days']==1]
df_majority = df[df['mortality_30_days']==0]

df_minority = resample(df_minority, replace=True, n_samples=1000, random_state=0)
df_majority = resample(df_majority, replace=True, n_samples=1000, random_state=0)

df_resampled = pd.concat([df_majority,df_minority])

In [22]:
csv_path = "../dataset/preprocessed.csv"
csv_path_resampled = "../dataset/resampled.csv"

In [23]:
df.to_csv(csv_path, index=False)
df_resampled.to_csv(csv_path_resampled, index = False)

In [24]:
df.columns

Index(['Age', 'Pack_Year', 'BMI', 'KILLIP_new', 'Gender_Female',
       'Diabetes Mellitus', 'Hypertension', 'MR_Moderate', 'MR_Severe', 'VSR',
       'LV ANEURYSM', 'KILLIP_new', 'TIMI',
       'CardiacStatus_Presentation_Cardiogenic Shock',
       'CardiacStatus_Presentation_Heart failure',
       'EJECTION FRACTION(%)_20-25', 'EJECTION FRACTION(%)_25-30',
       'EJECTION FRACTION(%)_30-35', 'EJECTION FRACTION(%)_35-40',
       'EJECTION FRACTION(%)_40-45', 'EJECTION FRACTION(%)_>50',
       'mortality_30_days'],
      dtype='object')