# Problem Description
## Heart Failure Prediction - 12 clinical features for predicting death events.

Cardiovascular diseases (CVDs) are the number 1 cause of death globally, taking an estimated 17.9 million lives each year, 
which accounts for 31% of all deaths worldwide.

Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.

Most cardiovascular diseases can be prevented by addressing behavioural risk factors such as tobacco use, 
unhealthy diet and obesity, physical inactivity and harmful use of alcohol using population-wide strategies.

People with cardiovascular disease or who are at high cardiovascular risk (due to the presence of one or more risk factors such as hypertension, diabetes, hyperlipidaemia or already established disease) need early detection and management wherein a machine learning model can be of great help.


In [65]:
!pip install scikit-learn==1.0

In [66]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings('ignore')
# Sanity check: with the filter above installed, this warning produces no output.
warnings.warn("this will not show")

import os
# Print the full path of every file found anywhere below the input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [67]:
# Location of the CSV, relative to the notebook's working directory.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
# Load the whole file into the DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [68]:
# Show the column labels of the loaded frame.
df.columns

In [69]:
# Preview the first few rows.
df.head()

# EDA
## Features
    
    
### age
Age of the patient, between 40 and 95 years old
### anaemia
Decrease of red blood cells or hemoglobin (boolean) - 0 = No, 1 = Yes
### creatinine_phosphokinase
Level of the CPK enzyme in the blood (mcg/L)
### diabetes
If the patient has diabetes (boolean) - 0 = No, 1 = Yes
### ejection_fraction
Percentage of blood leaving the heart at each contraction (percentage)
### high_blood_pressure
If the patient has hypertension (boolean) - 0 = No, 1 = Yes
### platelets
Platelets in the blood (kiloplatelets/mL)
### serum_creatinine
Level of serum creatinine in the blood (mg/dL)
### serum_sodium
Level of serum sodium in the blood (mEq/L)
### sex
Woman or man (binary) - Female = 0, Male = 1
### smoking
0 = No, 1 = Yes
### time
Not enough information in the dataset description (likely the follow-up period in days); values range from 4 to 285


## TARGET
### DEATH_EVENT
0 = No, 1 = Yes


In [70]:
# Summarise column dtypes and non-null counts.
df.info()  # Author's observation: no categorical features

In [71]:
# Count missing values per column, largest count first.
df.isnull().sum().sort_values(ascending=False)  # Author's observation: no nulls

In [72]:
# Count how many rows fall into each value of the target column.
df.DEATH_EVENT.value_counts()

In [73]:
# (number of rows, number of columns) of the dataset.
df.shape

In [74]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().T  # alternative for object columns: df.describe(include=object).T

In [75]:
# Number of distinct values in each column.
df.nunique()

In [76]:
# Tally rows flagged as duplicates of an earlier row (True) versus not (False).
df.duplicated().value_counts()  # Author's observation: no duplicated rows

In [77]:
# Per-column skewness, ordered from most to least skewed.
skew_vals = (
    df.skew()
    .sort_values(ascending=False)
)
skew_vals

In [78]:
# Per-column kurtosis, ordered from highest to lowest.
kurtosis_vals = (
    df.kurtosis()
    .sort_values(ascending=False)
)
kurtosis_vals

In [79]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(12, 8))
correlations = df.corr()
sns.heatmap(correlations, annot=True)
# Tilt the x tick labels; the trailing semicolon suppresses the notebook echo.
plt.xticks(rotation=45);

In [80]:
import cufflinks as cf
import plotly.offline
# Put cufflinks into offline mode before any plotting call is made.
cf.go_offline()
# Route pandas' own .plot() calls to plotly from here on (session-wide setting).
pd.options.plotting.backend = "plotly"


# One 30-bin histogram per column, drawn as subplots of a single figure.
df.iplot(kind='hist', subplots=True, bins=30)

In [81]:
# Missing-value count per column (same check as the earlier isnull() cell).
df.isna().sum()

In [82]:
# Re-print dtypes and non-null counts before moving on to modelling.
df.info()

# Model training

In [83]:
#df = df.astype('category')
#df['time'] = pd.cut(df['time'], bins=[3, 115, 285], labels=[0,1])

In [84]:
# Display the frame as it stands before feature selection.
df

In [85]:
#from sklearn.compose import ColumnTransformer
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# Columns fed to the models. The remaining columns are left out here;
# df.drop(columns=['DEATH_EVENT']) would select every predictor instead.
features = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
    'time',
]
# Target vector and design matrix.
y = df['DEATH_EVENT']
X = df[features]

#pipeline = ColumnTransformer([('cat', LabelEncoder(), features)], remainder='passthrough')
#pipeline = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), features)], remainder='passthrough')
#le = LabelEncoder()
#le.fit(X, y)
#le.transform(X, y)


In [86]:
# Length of the target vector.
y.shape

In [87]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

#pipeline = Pipeline([
 #   ('label', LabelEncoder()),
#    ('scale', StandardScaler())
#])

#X = pd.DataFrame(columns=X.columns, data=pipeline.fit_transform(X))
# Label-encode each column of X and display the result.
# NOTE(review): the return value is not assigned, so X itself is unchanged and
# the models below train on the un-encoded values — confirm this is intended.
X.apply(LabelEncoder().fit_transform)

In [88]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#le = LabelEncoder()
#X_train = le.fit_transform(X, y)
#X_test = le.transform(X_test)


In [89]:
# Number of held-out target values.
y_test.shape

In [90]:
from colorama import Fore, Back, Style 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

accuracy_list = []
# Logistic-regression baseline with L2 regularisation (C=0.75).
# Two arguments of the earlier call were removed:
#   * class_weight=0.75 is not a valid value; scikit-learn expects a dict
#     mapping class -> weight, the string 'balanced', or None, and recent
#     releases reject anything else during parameter validation. Use
#     class_weight='balanced' (or an explicit dict) if re-weighting is wanted.
#   * intercept_scaling only applies to the 'liblinear' solver, which is not
#     the solver selected here (the default is used).
lr = LogisticRegression(penalty='l2', dual=False, C=0.75, fit_intercept=True)
lr.fit(X_train, y_train)
# Score on the held-out split.
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)
#accuracy_list.append(100*lr_acc)
print(Fore.BLUE + "Accuracy of LR : ", "{:.2f}%".format(100 * lr_acc))

In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

accuracy_list = []
# Random forest of depth-1 trees; each split considers half of the features.
# The fixed seed makes the fitted forest reproducible.
rf = RandomForestClassifier(max_features=0.5, max_depth=1, random_state=42)
rf.fit(X_train, y_train)
# Score on the held-out split. (A bare `rf_acc` expression that used to sit
# here was removed: it was not the last statement of the cell, so it neither
# displayed nor changed anything.)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
#accuracy_list.append(100*rf_acc)
print(Fore.BLUE + "Accuracy of RF : ", "{:.2f}%".format(100 * rf_acc))

In [92]:
import xgboost
#from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

accuracy_list = []
# Gradient-boosted trees: 20 rounds, depth 5, learning rate 0.3, fixed seed.
# Only model hyper-parameters belong in the constructor. `eval_set` and
# `verbose` are arguments of fit(), so they are passed there instead.
# `early_stopping_rounds=300` was dropped: with only 20 boosting rounds it
# could never trigger, and there is no separate validation set to stop on.
xgb = xgboost.XGBClassifier(
    max_depth=5,
    learning_rate=0.3,
    random_state=42,
    n_estimators=20,
)
# The training split is used purely for monitoring; verbose=False keeps the
# per-round metric log quiet.
xgb.fit(X_train, y_train, eval_set=[(X_train, y_train)], verbose=False)
# Score on the held-out split.
xgb_pred = xgb.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_pred)
#accuracy_list.append(100*xgb_acc)
print(Fore.BLUE + "Accuracy of XGB : ", "{:.2f}%".format(100 * xgb_acc))

In [93]:
# sorted(zip(clf.feature_importances_, X.columns), reverse=True)
# Pair each XGBoost importance score with its column name, sorted ascending.
importance_pairs = sorted(zip(xgb.feature_importances_, X.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])

# Horizontal bar chart with the most important feature at the top.
ranked_importances = feature_imp.sort_values(by="Value", ascending=False)
plt.figure(figsize=(15, 8))
sns.barplot(x="Value", y="Feature", data=ranked_importances)
plt.title('XGB Features')
plt.tight_layout()
plt.show()

# Exporting notebook to script

In [94]:
#Creating a pickle file using serialization
import pickle
# Serialise the fitted random forest to rf.pkl in the working directory.
# The context manager closes the file even if pickle.dump raises.
with open('rf.pkl', 'wb') as pickle_out:
    pickle.dump(rf, pickle_out)

# Model deployment

# Dependency and environment management

# Containerization

## References:
https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp/07-midterm-project
https://docs.google.com/spreadsheets/d/e/2PACX-1vQo-cOOGMA-ddbp6FgxusNBjS_HOmWaOYtvO7z-wk_TcCnPOBAza9s8Uj_eqfKGadoU0741cCGd95qI/pubhtml
https://www.kaggle.com/nayansakhiya/heart-fail-analysis-and-quick-prediction
