In [None]:
# List the contents of the heart-failure-clinical-data input directory.
!ls /kaggle/input/heart-failure-clinical-data

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns 
%matplotlib inline
import warnings    # to ignore any warnings 
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report, roc_auc_score


# Load the heart-failure clinical records dataset.
# The path uses a forward slash: the former backslash-separated literal relied
# on the invalid escape sequence "\h" and only resolved on Windows, whereas a
# forward slash is accepted on both Windows and POSIX systems.
df = pd.read_csv('datasets/heart_failure_clinical_records_dataset.csv')
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
# Pairwise correlation matrix over the columns of df (features and target).
df.corr()

In [None]:
# Per-column summary statistics of df.
df.describe()

In [None]:
# Total number of missing cells across the whole frame (per-column counts, then summed).
df.isna().sum().sum()

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Print the column names as a plain Python list.
print(df.columns.tolist())

In [None]:
# Name of the target column.
label = 'DEATH_EVENT'

# Columns handled as categorical features.
features_cat = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]

# Columns handled as numeric features.
features_num = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

In [None]:
# One box plot per numeric feature, with the values grouped by the target label.
for feature_name in features_num:
    df.boxplot(by=label, column=feature_name, figsize=(6, 6))
# Render all figures created by the loop.
plt.show()

In [None]:
# One small bar chart per categorical feature showing the relative frequency of each value.
for feature_name in features_cat:
    value_shares = df[feature_name].value_counts(normalize=True)
    value_shares.plot.bar(title=feature_name, figsize=(2, 2))
    plt.show()

In [3]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the last one; target vector: the last column.
y = df.iloc[:, -1].values
X = df.iloc[:, :-1].values

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:

# Logistic-regression baseline.
# The raw columns differ by several orders of magnitude (platelets ~1e5 versus
# serum_creatinine ~1), which can keep the solver from converging within its
# default iteration budget -- and warnings are silenced globally in this notebook,
# so such a failure would go unnoticed. The features are therefore standardized
# inside a pipeline; the scaler is fitted on the training split only.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

log_clas = make_pipeline(StandardScaler(), LogisticRegression(random_state = 0))
log_clas.fit(X_train, y_train)

# Hard class predictions drive the confusion matrix, accuracy and report.
y_pred = log_clas.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
log_clas_score = accuracy_score(y_test, y_pred)
#print(cm)
print(classification_report(y_test, y_pred))
# Column 1 of predict_proba is the probability of class 1, used for the ROC AUC.
y_scores = log_clas.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
print('Accuracy :',log_clas_score)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy criterion, with a fixed seed.
tree_class = DecisionTreeClassifier(random_state=0, criterion='entropy')
tree_class.fit(X_train, y_train)

# Hard predictions feed the confusion matrix, accuracy and report;
# the class-1 probability column feeds the ROC AUC.
y_pred = tree_class.predict(X_test)
y_scores = tree_class.predict_proba(X_test)

cm = confusion_matrix(y_test, y_pred)
tree_class_score = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_scores[:, 1])

print(classification_report(y_test, y_pred))
print('AUC:', auc)
print('Accuracy : ' + str(tree_class_score))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 entropy-criterion trees, with a fixed seed.
forest_class = RandomForestClassifier(
    criterion='entropy',
    n_estimators=200,
    random_state=0,
)
forest_class.fit(X_train, y_train)

# Hard predictions feed the confusion matrix, accuracy and report;
# the class-1 probability column feeds the ROC AUC.
y_pred = forest_class.predict(X_test)
y_scores = forest_class.predict_proba(X_test)

cm = confusion_matrix(y_test, y_pred)
forest_class_score = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_scores[:, 1])

print(classification_report(y_test, y_pred))
print('AUC:', auc)
print('Accuracy : ' + str(forest_class_score))

In [None]:
# Cast every column named in features_cat to the pandas 'category' dtype
# (the columns of df are overwritten), then display the resulting dtypes.
# NOTE(review): X and y are extracted from df with `.values` in another cell, so
# arrays that were already extracted are not affected by this cast -- confirm the
# intended cell order.
for col in features_cat:
    df[col] = df[col].astype('category',copy=False)
df.dtypes

In [None]:
# GridSearchCV: hyper-parameter grid for the random forest
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Number of trees in random forest
n_estimators = [140]
# Number of features to consider at every split.
# 'sqrt' is what the former 'auto' alias meant for classifiers; 'auto' itself is
# rejected by scikit-learn >= 1.3, while 'sqrt' is accepted by every version.
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [4, 6, 8]
# Minimum number of samples required to split a node
min_samples_split = [2]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]

# Create the parameter grid (one list of candidate values per hyper-parameter)
parameters =  {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
pprint(parameters)

In [None]:
# Exhaustive grid search over `parameters` with 3-fold cross-validation,
# using the previously defined forest_class as the base estimator.
model_rfc = forest_class
rf = GridSearchCV(estimator = model_rfc, param_grid = parameters, cv = 3)
# Fit the grid search on the training split
rf.fit(X_train, y_train)

In [None]:
# Show the parameter combination the fitted grid search selected.
print(rf.best_params_)


In [None]:
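# Best parameters reported by successive grid-search runs.
# (Run 1 reports max_depth=10, a value the grid defined above does not contain,
#  so it presumably comes from an earlier version of the grid.)
# 1. {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 140}
# 2. {'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 140}
# 3. {'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 140}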

In [None]:
# TODO: outlier handling
# TODO: regularization
# TODO: feature engineering

In [None]:
# # normalize non categorical features
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with hand-set hyper-parameters.
# - max_features='sqrt' is what the former 'auto' alias meant for classifiers;
#   'auto' is rejected by scikit-learn >= 1.3.
# - random_state is fixed so the reported metrics are repeatable; this was the
#   only forest in the notebook without a seed.
# NOTE(review): n_estimators is 1400 here while the grid search only explored
# 140 -- confirm the larger value is intentional.
forest_class = RandomForestClassifier(n_estimators = 1400, min_samples_split=2,
                                      min_samples_leaf=1, max_features='sqrt',
                                      criterion = 'entropy', max_depth=8,
                                      random_state = 0)
forest_class.fit(X_train, y_train)

# Hard class predictions drive the confusion matrix, accuracy and report.
y_pred = forest_class.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
forest_class_score = accuracy_score(y_test, y_pred)
#print(cm)
print(classification_report(y_test, y_pred))
# Column 1 of predict_proba is the probability of class 1, used for the ROC AUC.
y_scores = forest_class.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
print('Accuracy :',forest_class_score)

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component, to inspect the variance each one carries.
# NOTE(review): X_train is not standardized here, so large-magnitude columns
# will dominate the leading components -- confirm that is intended.
pca=PCA()
pca.fit(X_train)

plt.figure(1,figsize=(12,8))
plt.plot(pca.explained_variance_)
# One tick per fitted component, instead of a hard-coded 0..14 range that does
# not depend on how many components were actually fitted.
plt.xticks(np.arange(0, len(pca.explained_variance_), 1))
plt.xlabel('Principal component index')
plt.ylabel('Explained variance')
plt.title('PCA explained variance per component')
# Render the figure explicitly so the cell does not echo the Line2D repr.
plt.show()

In [None]:
# PCA 

# pca=PCA(n_components=, whiten=True)
# pca.fit(X_train)
# X_train_pca=pd.DataFrame(pca.transform(X_train))
# X_test_pca=pd.DataFrame(pca.transform(X_test))

# forest_class = RandomForestClassifier(n_estimators = 1400, min_samples_split=2, 
#                                       min_samples_leaf=1, max_features='auto', 
#                                       criterion = 'entropy', max_depth=8)
# forest_class.fit(X_train_pca, y_train)

# y_pred = forest_class.predict(X_test_pca)
# cm = confusion_matrix(y_test, y_pred)
# forest_class_score = accuracy_score(y_test, y_pred)
# #print(cm)
# print(classification_report(y_test, y_pred))
# y_scores = forest_class.predict_proba(X_test_pca)
# auc = roc_auc_score(y_test,y_scores[:,1])
# print('AUC: ' + str(auc))
# print('Accuracy :',forest_class_score)

In [None]:
# Feature importance

from sklearn.ensemble import RandomForestClassifier

# Refit a seeded 200-tree entropy forest on the current training split.
forest_class = RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=0)
forest_class.fit(X_train, y_train)

# Print the importance score of each feature, identified by its column position.
importance = forest_class.feature_importances_
for position in range(len(importance)):
    print('Feature: %0d, Score: %.5f' % (position, importance[position]))

# Bar chart of the same scores, one bar per feature position.
plt.bar(range(len(importance)), importance)
plt.show()


In [None]:
# Retrain on seven columns selected by position in df:
# 0 age, 2 creatinine_phosphokinase, 4 ejection_fraction, 6 platelets,
# 7 serum_creatinine, 8 serum_sodium, 11 time.
X_import = X[:, (0,2,4,6,7,8,11)]
X_train, X_test, y_train, y_test = train_test_split(X_import, y, test_size = 0.2, random_state = 1)

# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' is rejected by scikit-learn >= 1.3.
forest_class = RandomForestClassifier(n_estimators = 2000, min_samples_split=2,
                                      min_samples_leaf=1, max_features='sqrt',
                                      criterion = 'gini', max_depth=8, random_state= 42)

forest_class.fit(X_train, y_train)

# Hard class predictions drive accuracy, the report and the weighted F1.
y_pred = forest_class.predict(X_test)
forest_class_score = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
# Column 1 of predict_proba is the probability of class 1, used for the ROC AUC.
y_scores = forest_class.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
print('Accuracy :',forest_class_score)
print('f1_score :', f1_score(y_test, y_pred, average='weighted'))

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Retrain on three columns selected by position in df:
# 4 ejection_fraction, 7 serum_creatinine, 11 time.
X_import = X[:, (4,7,11)]
X_train, X_test, y_train, y_test = train_test_split(X_import, y, test_size = 0.2, random_state = 1)

# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' is rejected by scikit-learn >= 1.3.
forest_class = RandomForestClassifier(n_estimators = 1500, min_samples_split=2,
                                      min_samples_leaf=1, max_features='sqrt',
                                      criterion = 'gini', max_depth=8, random_state= 42)

forest_class.fit(X_train, y_train)

# Hard class predictions drive accuracy, the report and the weighted F1.
y_pred = forest_class.predict(X_test)
forest_class_score = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
# Column 1 of predict_proba is the probability of class 1, used for the ROC AUC.
y_scores = forest_class.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
print('Accuracy :',forest_class_score)
print('f1_score :',f1_score(y_test, y_pred, average='weighted'))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96        46
           1       0.81      0.93      0.87        14

    accuracy                           0.93        60
   macro avg       0.89      0.93      0.91        60
weighted avg       0.94      0.93      0.93        60

AUC: 0.9549689440993789
Accuracy : 0.9333333333333333
f1_score : 0.9348148148148149


In [20]:
# Show the first row restricted to the seven column positions used for X_import.
df.iloc[0, [0,2,4,6,7,8,11]]

age                             75.0
creatinine_phosphokinase       582.0
ejection_fraction               20.0
platelets                   265000.0
serum_creatinine                 1.9
serum_sodium                   130.0
time                             4.0
Name: 0, dtype: float64

In [5]:
import joblib

# Save the model as a pickle file
# (serializes whatever estimator forest_class currently refers to).
filename = './model_cardiaque.pkl'
joblib.dump(forest_class, filename)

['./model_cardiaque.pkl']

In [6]:
# List the working directory to check that the model file was written.
!ls /kaggle/working

__notebook_source__.ipynb  model_cardiaque.pkl


In [7]:
# Load the model from the file
# (joblib files are pickles: only load files that come from a trusted source).
model = joblib.load(filename)

In [9]:
# Single new sample for the 3-feature model; values must follow the training
# column order: ejection_fraction, serum_creatinine, time.
X_new = np.array([[20,2,4]])
# .tolist() yields plain Python numbers, so the sample prints as [20, 2, 4] on
# every NumPy version; list(X_new[0]) holds NumPy scalars, whose repr under
# NumPy >= 2 is "np.int64(20)".
print ('New sample: {}'.format(X_new[0].tolist()))

# Get a prediction
pred = model.predict(X_new)
# Column 1 of predict_proba is the probability of class 1.
proba = model.predict_proba(X_new)
print('Predicted class is {}'.format(pred[0]))
print('Predicted probabilty is {}'.format(proba[0][1]))

New sample: [20, 2, 4]
Predicted class is 1
Predicted probabilty is 0.9883666666666666


In [10]:
# Train a second model on seven columns selected by position in df:
# 0 age, 2 creatinine_phosphokinase, 4 ejection_fraction, 6 platelets,
# 7 serum_creatinine, 8 serum_sodium, 11 time.
X_import = X[:, (0,2,4,6,7,8,11)]
X_train, X_test, y_train, y_test = train_test_split(X_import, y, test_size = 0.2, random_state = 1)

# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' is rejected by scikit-learn >= 1.3.
forest_class_2 = RandomForestClassifier(n_estimators = 2000, min_samples_split=2,
                                      min_samples_leaf=1, max_features='sqrt',
                                      criterion = 'gini', max_depth=8, random_state= 42)

forest_class_2.fit(X_train, y_train)

# Hard class predictions drive accuracy, the report and the weighted F1.
y_pred = forest_class_2.predict(X_test)
forest_class_score = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
# Column 1 of predict_proba is the probability of class 1, used for the ROC AUC.
y_scores = forest_class_2.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
print('Accuracy :',forest_class_score)
print('f1_score :', f1_score(y_test, y_pred, average='weighted'))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95        46
           1       0.85      0.79      0.81        14

    accuracy                           0.92        60
   macro avg       0.89      0.87      0.88        60
weighted avg       0.92      0.92      0.92        60

AUC: 0.9642857142857144
Accuracy : 0.9166666666666666
f1_score : 0.9155714854639585


In [11]:
# Save the model as a pickle file
# (note: this rebinds `filename`, which previously pointed at the first model file).
filename = './model_cardiaque_2.pkl'
joblib.dump(forest_class_2, filename)

# Load the model from the file
# (joblib files are pickles: only load files that come from a trusted source).
model_2 = joblib.load(filename)

In [12]:
# Single new sample for the 7-feature model; values must follow the training
# column order: age, creatinine_phosphokinase, ejection_fraction, platelets,
# serum_creatinine, serum_sodium, time.
X_new = np.array([[75, 580,20, 265000,2, 150,4]])
# .tolist() yields plain Python numbers, so the sample prints as a list of bare
# integers on every NumPy version; list(X_new[0]) holds NumPy scalars, whose
# repr under NumPy >= 2 is "np.int64(75)".
print ('New sample: {}'.format(X_new[0].tolist()))

# Get a prediction
pred = model_2.predict(X_new)
# Column 1 of predict_proba is the probability of class 1.
proba = model_2.predict_proba(X_new)
print('Predicted class is {}'.format(pred[0]))
print('Predicted probabilty is {}'.format(proba[0][1]))

New sample: [75, 580, 20, 265000, 2, 150, 4]
Predicted class is 1
Predicted probabilty is 0.9024823140309982
