In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score


In [2]:
# Lees de CSV-bestand in (vervang 'data.csv' met de juiste pad naar je bestand)
df = pd.read_csv('data.csv', sep=';')

# Verken de data (optioneel)
print(df.head())
print(df.info())

# Selecteer de features (X) en de target variabele (y)
X = df.drop(['HeartDisease', 'CaseNumber', 'LastName', 'PostCode'], axis=1)
y = df['HeartDisease']

# Converteer categorische kolommen naar numeriek (One-Hot Encoding)
X = pd.get_dummies(X, drop_first=True)


    CaseNumber  LastName  PostCode  Age Sex ChestPainType  RestingBP  \
0  431-06-4243   Richard     99847   40   M           ATA        140   
1  415-39-7809  Sheppard     64192   49   F           NAP        160   
2  517-18-4618    Howard     29132   37   M           ATA        130   
3  634-33-8726    Taylor     12930   48   F           ASY        138   
4  151-40-1619   Mcgrath     79393   54   M           NAP        150   

   Cholesterol  FastingBS  RestingECG  MaxHR ExerciseAngina  Oldpeak ST_Slope  \
0          289          0           0    172              N        0       Up   
1          180          0           0    156              N       10     Flat   
2          283          0           1     98              N        0       Up   
3          214          0           0    108              Y       15     Flat   
4          195          0           0    122              N        0       Up   

   HeartDisease  
0             0  
1             1  
2             0  
3       

In [3]:
# Splits de data in een train- en testset (80% trainen, 20% testen)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Kies het Gradient Boosting model
model = GradientBoostingClassifier()

# Train het model
model.fit(X_train, y_train)


In [7]:
# Voorspel met de testdata
y_pred = model.predict(X_test)  # Voorspel met de testdata
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Krijg de voorspelde waarschijnlijkheden

# Bereken de evaluatie metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

# Print de evaluatie metrics
print(f"Gradient Boosting - Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}, F1 Score: {f1:.4f}")


Gradient Boosting - Accuracy: 0.8525, ROC AUC: 0.9232, F1 Score: 0.8831


In [16]:
# Maak een DataFrame met de werkelijke en voorspelde waarden
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Predicted Probability': y_pred_proba
})

# Selecteer een willekeurige sample van 20 rijen
random_sample = results_df.sample(n=20, random_state=42)

# Print de willekeurige sample
print(random_sample)


     Actual  Predicted  Predicted Probability
72        1          1               0.858937
218       0          0               0.022750
731       0          1               0.862527
660       1          0               0.261032
518       0          1               0.557846
39        0          1               0.858233
70        1          1               0.851344
755       1          0               0.165076
559       0          0               0.045956
394       1          1               0.839931
136       0          0               0.007463
433       0          1               0.725721
888       0          0               0.154407
408       1          1               0.980328
911       1          0               0.388197
235       1          1               0.852486
578       1          1               0.988825
501       0          0               0.136526
209       1          1               0.967188
467       0          1               0.742397


In [13]:
from sklearn.model_selection import GridSearchCV

# Definieer de hyperparameters en de waarden die je wilt testen
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}

# Configureer de GridSearchCV
grid_search = GridSearchCV(estimator=GradientBoostingClassifier(), 
                           param_grid=param_grid, 
                           scoring='roc_auc', 
                           cv=5, 
                           verbose=2)

# Voer de Grid Search uit
grid_search.fit(X_train, y_train)

# Toon de beste hyperparameters
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Train het model met de beste hyperparameters
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END learning_rate=0.01, max_depth

In [14]:
# Maak voorspellingen met het beste model
y_pred_best = best_model.predict(X_test)
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]

# Maak een DataFrame met de werkelijke en voorspelde waarden
results_df_best = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred_best,
    'Predicted Probability': y_pred_proba_best
})

# Selecteer een willekeurige sample van 20 rijen
random_sample_best = results_df_best.sample(n=20, random_state=42)

# Print de willekeurige sample
print(random_sample_best)


     Actual  Predicted  Predicted Probability
72        1          1               0.997981
218       0          0               0.000346
731       0          1               0.999335
660       1          0               0.031905
518       0          0               0.088730
39        0          1               0.992787
70        1          1               0.989433
755       1          0               0.023743
559       0          0               0.000827
394       1          1               0.996608
136       0          0               0.000108
433       0          1               0.977895
888       0          0               0.032233
408       1          1               0.999675
911       1          0               0.018546
235       1          1               0.932687
578       1          1               0.999654
501       0          0               0.001781
209       1          1               0.998333
467       0          1               0.968539
