In [18]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import RandomizedSearchCV

In [11]:
df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

In [4]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [12]:
df.corr()["DEATH_EVENT"].sort_values(ascending=False)[1:6]

serum_creatinine            0.294278
age                         0.253729
high_blood_pressure         0.079351
anaemia                     0.066270
creatinine_phosphokinase    0.062728
Name: DEATH_EVENT, dtype: float64

In [13]:
df_x = df.iloc[:,1:6]

In [10]:
df.head()

Unnamed: 0,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure
0,0,582,0,20,1
1,0,7861,0,38,0
2,0,146,0,20,0
3,1,111,0,20,0
4,1,160,1,20,0


In [14]:
X = df_x
y = df["DEATH_EVENT"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_train, y_pred))
print(confusion_matrix(y_test, y_pred2))

[[167   1]
 [  3  68]]
[[27  8]
 [16  9]]


In [19]:
params = {"max_depth": [1, 2, 3, 4, 5],
          "subsample": [0.2, 0.4, 0.6, 0.8],
          "n_estimators": [100, 200, 300, 400, 500],
          "learning_rate": [0.05, 0.1, 0.2, 0.3, 0.5, 0.8],
          "min_child_weight": [1, 2, 3, 4],
          }

In [25]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="error")
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred2 = model.predict(X_test)

In [26]:
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=5, n_jobs=-1, verbose=5, cv=5, random_state=42)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric='error', gamma=0,
                                           gpu_id=-1, grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_bin=2...
                                           monotone_constraints='()',
                                           n_estimators=100, n_jobs=

In [27]:
random_search.best_params_

{'subsample': 0.6,
 'n_estimators': 500,
 'min_child_weight': 2,
 'max_depth': 1,
 'learning_rate': 0.5}

In [30]:
model = xgb.XGBClassifier(subsample=0.6,
                          n_estimators=500,
                          min_child_weight=2,
                          max_depth=1,
                          learning_rate=0.5,
                          use_label_encoder=False, 
                          eval_metric="error",
                          booster="gbtree",
                          objective="binary:logistic")

model.fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_train, y_pred))
print(confusion_matrix(y_test, y_pred2))

[[164   4]
 [ 24  47]]
[[30  5]
 [17  8]]


In [32]:
from scipy import stats
import numpy as np

In [33]:
z = np.abs(stats.zscore(df))
dataset = df[((z < 3)).all(axis=1)]

In [34]:
dataset.shape

(280, 13)

In [39]:
dataset.corr()["DEATH_EVENT"].sort_values(ascending=False)[1:]

serum_creatinine            0.329460
age                         0.259299
high_blood_pressure         0.064143
anaemia                     0.039480
sex                         0.012900
diabetes                    0.001009
smoking                    -0.000476
creatinine_phosphokinase   -0.029289
platelets                  -0.048259
serum_sodium               -0.203471
ejection_fraction          -0.303506
time                       -0.515579
Name: DEATH_EVENT, dtype: float64

In [40]:
dataset.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [45]:
X = dataset.iloc[:, [8, 1, 6, 2, 10, 4]]
y = dataset["DEATH_EVENT"]

In [78]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.4, random_state=42)

In [79]:
model2 = xgb.XGBClassifier(max_depth=3,
                           n_estimators=100,
                           booster="gbtree",
                           min_child_weight=1
                           

).fit(X_train2, y_train2)
y_prediction = model2.predict(X_train2)
y_prediction2 = model2.predict(X_test2)

print(accuracy_score(y_train2, y_prediction))
print(confusion_matrix(y_train2, y_prediction))
print(accuracy_score(y_test2, y_prediction2))
print(confusion_matrix(y_test2, y_prediction2))

1.0
[[114   0]
 [  0  54]]
0.7142857142857143
[[69 11]
 [21 11]]


In [80]:
train = xgb.DMatrix(X_train2, label=y_train2)
test = xgb.DMatrix(X_test2, label=y_test2)

In [82]:
epochs=10
params = {"max_depth": 1,
          "eta": 0.3,
          "objective": "multi:softmax",
          "num_class": 2
}

In [83]:
model3 = xgb.train(params, train, epochs)

In [85]:
y_prediction = model3.predict(train)
y_prediction2 = model3.predict(test)

print(accuracy_score(y_train2, y_prediction))
print(confusion_matrix(y_train2, y_prediction))
print(accuracy_score(y_test2, y_prediction2))
print(confusion_matrix(y_test2, y_prediction2))

0.7619047619047619
[[111   3]
 [ 37  17]]
0.75
[[77  3]
 [25  7]]


In [None]:
# Najlepszy model wyszedł w Projekt_ML_v2
# Należy spróbować usunąć outliery i spróbować ponownie