### Data analysis using Decision Tree Classifier, Logistic Regression, Random Forest Classifier and Neural Network.
#### The purpose of this project is to predict heart failure using basic ML models. Dataset: heart failure clinical data from https://www.kaggle.com/andrewmvd/heart-failure-clinical-data. After choosing the most accurate model, a UI will be created for more convenient further use.

### 1. Import libraries

In [180]:
import pandas as pd
import numpy as np
import matplotlib as plt
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from keras.models import Sequential 
from keras.layers import Dense


### 2. Data preparation

In [181]:
# Show the current working directory (echoed as the cell output).
os.getcwd()

'/content/drive/My Drive/Heart_failure'

In [191]:
from google.colab import drive

# Mount Google Drive and switch into the project folder so that the relative
# file names used below (and by later cells) resolve inside it.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Heart_failure')

# Raw records exactly as stored in the CSV file.
base_data = pd.read_csv("heart_failure_clinical_records.csv", delimiter=",")
base_data.columns  # not the last expression of the cell, so nothing is displayed

# Work on an independent copy that keeps only the listed columns, leaving out
# columns considered useless for prediction such as "time".
cols = [
    "age",
    "anaemia",
    "creatinine_phosphokinase",
    "diabetes",
    "ejection_fraction",
    "high_blood_pressure",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "sex",
    "smoking",
    "DEATH_EVENT",
]
data = base_data.loc[:, cols].copy()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Per-column flag: True when the column contains at least one missing value.
data.isna().any()

In [None]:
# Pairwise correlation matrix of every column currently in `data`.
corr = data.corr()
# Correlation with the target, restricted to rows whose absolute value exceeds 0.1.
corr.loc[corr['DEATH_EVENT'].abs() > 0.1, 'DEATH_EVENT']

In [194]:
# Rebuild the working frame from the most correlated features plus the target.
cols = [
    "age",
    "ejection_fraction",
    "serum_creatinine",
    "serum_sodium",
    "DEATH_EVENT",
]
data = base_data.loc[:, cols].copy()

In [None]:
# Summary statistics for the columns currently held in `data`.
data.describe()
# Legend for the dataset's columns and their encodings.
# NOTE(review): several of these (sex, diabetes, anaemia, high_blood_pressure,
# smoking) are columns of the full dataset and are not part of the reduced
# `data` frame built from the most correlated features.
# Sex - Gender of patient Male = 1, Female =0
# Age - Age of patient
# Diabetes - 0 = No, 1 = Yes
# Anaemia - 0 = No, 1 = Yes
# High_blood_pressure - 0 = No, 1 = Yes
# Smoking - 0 = No, 1 = Yes
# DEATH_EVENT - 0 = No, 1 = Yes

In [None]:
# Print the first rows of `data` (DataFrame.head with its default row count).
print(data.head())

### 4. Training model

In [197]:
# Column 4 holds the target (death_event); the four columns before it are the
# predictors.
target_position = 4
x = data.iloc[:, :target_position]
y = data.iloc[:, target_position]

# Hold out 30% of the dataset for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
def model(X_train, y_train, hidden_units=(4, 4, 2)):
    """Fit the four candidate classifiers on the training split.

    Trains a random forest, a logistic regression, a decision tree and a small
    fully connected neural network. For the three scikit-learn models the
    accuracy on the *training* data is printed (not a hold-out score); the
    network prints its summary and Keras training log.

    Args:
        X_train: 2-D feature matrix (rows are samples, columns are features).
            The network's input width is taken from its second dimension.
        y_train: Binary target labels (the network is compiled with binary
            cross-entropy and ends in a single sigmoid unit).
        hidden_units: Widths of the ReLU hidden layers of the network, in
            order. The default reproduces the original 4-4-2 architecture.

    Returns:
        Tuple ``(forest, lreg, tree, nnm)`` of the fitted models.
    """
    forest = RandomForestClassifier(n_estimators=20, random_state=0)
    forest.fit(X_train, y_train)
    print("Random Forest: {0}".format(forest.score(X_train, y_train)))

    lreg = LogisticRegression(max_iter=10000)
    lreg.fit(X_train, y_train)
    print("Regresja logistyczna: {0}".format(lreg.score(X_train, y_train)))

    # NOTE: no random_state is set here, so the fitted tree may differ
    # between runs.
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    print("Decision Tree: {0}".format(tree.score(X_train, y_train)))

    # Derive the input width from the data instead of hard-coding it, so the
    # same function works for any number of feature columns.
    n_features = np.shape(X_train)[1]

    # ReLU hidden layers followed by one sigmoid output unit.
    layer_specs = [(units, 'relu') for units in hidden_units] + [(1, 'sigmoid')]

    nnm = Sequential()
    for position, (units, activation) in enumerate(layer_specs):
        # Only the first layer needs to be told the input dimensionality.
        first_layer_args = {'input_dim': n_features} if position == 0 else {}
        nnm.add(Dense(units, kernel_initializer='uniform',
                      activation=activation, **first_layer_args))
    nnm.summary()
    nnm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    nnm.fit(X_train, y_train, batch_size=32, epochs=200)

    return forest, lreg, tree, nnm


forest, lreg, tree, nnm = model(X_train, y_train)

In [None]:
# Accuracy of each fitted model on the held-out test split.
y1_predict = forest.predict(X_test)
forest_accuracy = accuracy_score(y_test, y1_predict)
print("Random Forest {0}".format(forest_accuracy))

y2_predict = lreg.predict(X_test)
lreg_accuracy = accuracy_score(y_test, y2_predict)
print("Logistic Regresion {0}".format(lreg_accuracy))

y3_predict = tree.predict(X_test)
tree_accuracy = accuracy_score(y_test, y3_predict)
print("Decision Tree {0}".format(tree_accuracy))

# The network returns continuous scores; anything above 0.5 is counted as a
# positive prediction.
y4_predict = nnm.predict(X_test) > 0.5
nnm_accuracy = accuracy_score(y_test, y4_predict)
print("Neural network {0}".format(nnm_accuracy))

In [200]:
# Per-class precision/recall/F1 report for every model, in a fixed order.
report_inputs = (
    ("Ocena modelu 1. Random Forest", y1_predict),
    ("Ocena modelu 2. Regresja logistyczna", y2_predict),
    ("Ocena modelu 3. Decision Tree", y3_predict),
    ("Ocena modelu 4. Neural network", y4_predict),
)
for heading, predicted in report_inputs:
    print(heading)
    print(classification_report(y_test, predicted))

Ocena modelu 1. Random Forest
              precision    recall  f1-score   support

           0       0.75      0.89      0.81        62
           1       0.59      0.36      0.44        28

    accuracy                           0.72        90
   macro avg       0.67      0.62      0.63        90
weighted avg       0.70      0.72      0.70        90

Ocena modelu 2. Regresja logistyczna
              precision    recall  f1-score   support

           0       0.74      0.87      0.80        62
           1       0.53      0.32      0.40        28

    accuracy                           0.70        90
   macro avg       0.63      0.60      0.60        90
weighted avg       0.67      0.70      0.68        90

Ocena modelu 3. Decision Tree
              precision    recall  f1-score   support

           0       0.80      0.76      0.78        62
           1       0.52      0.57      0.54        28

    accuracy                           0.70        90
   macro avg       0.66      0.

**Random Forest (0.72) and the Neural Network (0.71) have the highest accuracy. However, the most essential characteristic for heart failure prediction is the precision on a positive death_event (death of patient), and there the Random Forest model scores highest (0.59). Let's try using all columns of the dataframe.**

### 5. Training model (all data)

In [202]:
# Eleven feature columns followed by the target column.
cols = [
    "age",
    "anaemia",
    "creatinine_phosphokinase",
    "diabetes",
    "ejection_fraction",
    "high_blood_pressure",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "sex",
    "smoking",
    "DEATH_EVENT",
]
data = base_data.loc[:, cols].copy()

# Column 11 holds the target (death_event); the eleven columns before it are
# the predictors.
target_position = 11
x = data.iloc[:, :target_position]
y = data.iloc[:, target_position]

# Hold out 30% of the dataset for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

def model(X_train, y_train, hidden_units=(12, 8, 6, 4, 2)):
    """Fit the four candidate classifiers on the training split.

    Trains a random forest, a logistic regression, a decision tree and a
    fully connected neural network. For the three scikit-learn models the
    accuracy on the *training* data is printed (not a hold-out score); the
    network prints its summary and Keras training log.

    Args:
        X_train: 2-D feature matrix (rows are samples, columns are features).
            The network's input width is taken from its second dimension.
        y_train: Binary target labels (the network is compiled with binary
            cross-entropy and ends in a single sigmoid unit).
        hidden_units: Widths of the ReLU hidden layers of the network, in
            order. The default reproduces the original 12-8-6-4-2
            architecture.

    Returns:
        Tuple ``(forest, lreg, tree, nnm)`` of the fitted models.
    """
    forest = RandomForestClassifier(n_estimators=20, random_state=0)
    forest.fit(X_train, y_train)
    print("Random Forest: {0}".format(forest.score(X_train, y_train)))

    lreg = LogisticRegression(max_iter=10000)
    lreg.fit(X_train, y_train)
    print("Regresja logistyczna: {0}".format(lreg.score(X_train, y_train)))

    # NOTE: no random_state is set here, so the fitted tree may differ
    # between runs.
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    print("Decision Tree: {0}".format(tree.score(X_train, y_train)))

    # Derive the input width from the data instead of hard-coding it, so the
    # same function works for any number of feature columns.
    n_features = np.shape(X_train)[1]

    # ReLU hidden layers followed by one sigmoid output unit.
    layer_specs = [(units, 'relu') for units in hidden_units] + [(1, 'sigmoid')]

    nnm = Sequential()
    for position, (units, activation) in enumerate(layer_specs):
        # Only the first layer needs to be told the input dimensionality.
        first_layer_args = {'input_dim': n_features} if position == 0 else {}
        nnm.add(Dense(units, kernel_initializer='uniform',
                      activation=activation, **first_layer_args))
    nnm.summary()
    nnm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    nnm.fit(X_train, y_train, batch_size=32, epochs=200)

    return forest, lreg, tree, nnm


forest, lreg, tree, nnm = model(X_train, y_train)


# Accuracy of each fitted model on the held-out test split.
y1_predict = forest.predict(X_test)
forest_accuracy = accuracy_score(y_test, y1_predict)
print("Random Forest {0}".format(forest_accuracy))

y2_predict = lreg.predict(X_test)
lreg_accuracy = accuracy_score(y_test, y2_predict)
print("Logistic Regresion {0}".format(lreg_accuracy))

y3_predict = tree.predict(X_test)
tree_accuracy = accuracy_score(y_test, y3_predict)
print("Decision Tree {0}".format(tree_accuracy))

# The network returns continuous scores; anything above 0.5 is counted as a
# positive prediction.
y4_predict = nnm.predict(X_test) > 0.5
nnm_accuracy = accuracy_score(y_test, y4_predict)
print("Neural network {0}".format(nnm_accuracy))

# Per-class precision/recall/F1 report for every model, in a fixed order.
report_inputs = (
    ("Ocena modelu 1. Random Forest", y1_predict),
    ("Ocena modelu 2. Regresja logistyczna", y2_predict),
    ("Ocena modelu 3. Decision Tree", y3_predict),
    ("Ocena modelu 4. Neural network", y4_predict),
)
for heading, predicted in report_inputs:
    print(heading)
    print(classification_report(y_test, predicted))


Random Forest: 1.0
Regresja logistyczna: 0.7751196172248804
Decision Tree: 1.0
Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_144 (Dense)           (None, 12)                144       
                                                                 
 dense_145 (Dense)           (None, 8)                 104       
                                                                 
 dense_146 (Dense)           (None, 6)                 54        
                                                                 
 dense_147 (Dense)           (None, 4)                 28        
                                                                 
 dense_148 (Dense)           (None, 2)                 10        
                                                                 
 dense_149 (Dense)           (None, 1)                 3         
                                        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**With all columns in the dataframe, the Random Forest model has the highest accuracy, while the precision for a positive death_event (death of patient) in the Neural Network model is 0.**

### 6. Model export

#### The Random Forest model has the best accuracy and precision, so the Random Forest model trained on all columns of the dataframe will be exported.

In [203]:
# Serialize the fitted Random Forest to a file in the current working
# directory. The context manager guarantees the file handle is flushed and
# closed even if pickling fails part-way.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this model from a trusted location.
filename = "heart_failure_model.sv"
with open(filename, 'wb') as model_file:
    pickle.dump(forest, model_file)