In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [11]:
# Stap 2: Inlezen en voorbereiden van de data
# Lees de CSV-bestand in (vervang 'path/to/your/dataset.csv' met de daadwerkelijke pad naar je CSV)
# df = pd.read_csv('data.csv', sep=';')
# df = pd.read_csv('cleaned_data.csv', sep=';')
# df = pd.read_csv('cleaned_and_featured_data.csv', sep=';')
# df = pd.read_csv('cleaned_and_featured_data_males.csv', sep=';')
df = pd.read_csv('cleaned_and_featured_data_females.csv', sep=';')

# Verken de data (optioneel)
print(df.head())  # Print de eerste paar regels van de dataset
print(df.info())  # Geef een overzicht van de dataset

# Selecteer de features (X) en de target variabele (y)
# X = df.drop(['HeartDisease', 'CaseNumber', 'LastName', 'PostCode'], axis=1)
X = df.drop(['HeartDisease'], axis=1)  # Verwijder niet-relevante kolommen
y = df['HeartDisease']

# Converteer categorische kolommen naar numeriek (One-Hot Encoding voor categorische features)
X = pd.get_dummies(X, drop_first=True)

# Splits de data in een train- en testset (80% trainen, 20% testen)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


   Age ChestPainType  RestingBP Cholesterol  FastingBS  RestingECG  MaxHR  \
0   46           ASY        100           H          1           1    133   
1   56           NAP        125           H          1           0     98   
2   42           ASY        105           H          1           0    128   
3   38           NAP        100           H          0           0    179   
4   56           ASY        115           H          1           1     82   

   ExerciseAngina _ True  Oldpeak ST_Slope  HeartDisease  
0                      0       26     Flat             1  
1                      0       20     Flat             1  
2                      1       15     Down             1  
3                      0       11       Up             0  
4                      0       10       Up             1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                

In [12]:
# Stap 3: Model Selectie en Training
# Definieer de modellen die je wilt proberen
models = {
    "Linear Regression": LinearRegression(),  # Voorbeeldmodel, gebruik voor vergelijking, niet echt voor classificatie
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Ridge Classifier": RidgeClassifier(),
    "Lasso Regression": Lasso(),  # Voorbeeldmodel, gebruik voor vergelijking, niet echt voor classificatie
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

# Train elk model en evalueer de prestaties
results = {}
for name, model in models.items():
    try:
        model.fit(X_train, y_train)  # Train het model
        
        # Check of predict_proba beschikbaar is, anders gebruik predict direct voor ROC AUC
        if hasattr(model, "predict_proba"):
            y_pred_proba = model.predict_proba(X_test)[:, 1]
        else:
            # Voor lineaire regressie en lasso, cast naar binair resultaat
            y_pred_proba = model.predict(X_test)
            y_pred_proba = (y_pred_proba >= 0.5).astype(int)  # Simuleer classificatie output
        
        y_pred = model.predict(X_test)  # Voorspel met de testdata
        accuracy = accuracy_score(y_test, y_pred)  # Bereken de nauwkeurigheid
        roc_auc = roc_auc_score(y_test, y_pred_proba)  # Bereken de ROC AUC
        f1 = f1_score(y_test, y_pred)  # Bereken de F1-score
        
        results[name] = {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'F1 Score': f1}
        # Print de evaluatie metrics
        print(f"{name} - Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}, F1 Score: {f1:.4f}")
    
    except Exception as e:
        print(f"Error training {name}: {e}")

# Bepaal welk model de beste prestaties levert op basis van ROC AUC
best_model = max(results, key=lambda x: results[x]['ROC AUC'])
print(f"\nBest model based on ROC AUC: {best_model}")

Error training Linear Regression: Classification metrics can't handle a mix of binary and continuous targets
Logistic Regression - Accuracy: 0.9028, ROC AUC: 0.9655, F1 Score: 0.9213
Ridge Classifier - Accuracy: 0.9028, ROC AUC: 0.8970, F1 Score: 0.9213
Error training Lasso Regression: Classification metrics can't handle a mix of binary and continuous targets
Decision Tree - Accuracy: 0.8056, ROC AUC: 0.7906, F1 Score: 0.8444
Random Forest - Accuracy: 0.8889, ROC AUC: 0.9596, F1 Score: 0.9101
Gradient Boosting - Accuracy: 0.9167, ROC AUC: 0.9718, F1 Score: 0.9326

Best model based on ROC AUC: Gradient Boosting
