In [7]:
!pip uninstall scikit-learn -y
!pip install scikit-learn==1.2.2

Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Collecting scikit-learn==1.2.2
  Using cached scikit_learn-1.2.2-cp39-cp39-win_amd64.whl.metadata (11 kB)
Using cached scikit_learn-1.2.2-cp39-cp39-win_amd64.whl (8.4 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.2.2


In [1]:
import joblib
import pandas as pd
import numpy as np
import warnings
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

warnings.simplefilter(action='ignore', category=FutureWarning)

################################################
# 1. Helper Functions (Yardımcı Fonksiyonlar)
################################################

def grab_col_names(dataframe, cat_th=10, car_th=20):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    return cat_cols, num_cols, cat_but_car

def heart_data_prep(df):
    """Veri Ön İşleme ve Özellik Mühendisliği"""
    # Eksik/Hatalı Değerler
    df.loc[df['RestingBP'] == 0, 'RestingBP'] = df['RestingBP'].median()
    df['Cholesterol_Is_Missing'] = (df['Cholesterol'] == 0).astype(int)
    df['Cholesterol'] = df['Cholesterol'].replace(0, np.nan)
    df['Cholesterol'] = df.groupby(['Sex', 'HeartDisease'])['Cholesterol'].transform(lambda x: x.fillna(x.median()))

    # Feature Engineering
    df['Oldpeak_Bin'] = pd.cut(df['Oldpeak'], bins=[-3, 0, 1, 2, 3, 7], labels=['<0', '0-1', '1-2', '2-3', '3+'])
    df['AgeGroup_Optimized'] = pd.cut(df['Age'], bins=[0, 45, 55, 120], labels=['Young', 'Middle', 'Senior+'])
    
    # Yeni Özellikler
    df['HighChol'] = (df['Cholesterol'] > 200).astype(int)
    df['MetabolicRisk'] = ((df['FastingBS'] == 1) & (df['HighChol'] == 1)).astype(int)
    df['StressScore'] = df['Oldpeak'] * df['ExerciseAngina'].map({'Y':1, 'N':0})
    df['DTS_Simulated'] = 1 - (5 * df['Oldpeak']) - (4 * df['ExerciseAngina'].map({'Y':1, 'N':0}))
    df['Age_Oldpeak'] = df['Age'] * df['Oldpeak']
    df['RPP'] = (df['RestingBP'] * df['MaxHR']) / 100

    # Encoding
    ordinal_mappings = {
        'ST_Slope': {'Up': 0, 'Flat': 1, 'Down': 2},
        'Oldpeak_Bin': {'<0': 0, '0-1': 1, '1-2': 2, '2-3': 3, '3+': 4},
        'AgeGroup_Optimized': {'Young': 0, 'Middle': 1, 'Senior+': 2}
    }
    for col, mapping in ordinal_mappings.items():
        df[col] = df[col].map(mapping)

    cat_cols, num_cols, _ = grab_col_names(df)
    cat_cols = [col for col in cat_cols if col not in ordinal_mappings.keys() and col != "HeartDisease"]
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df, scaler, num_cols, ordinal_mappings

################################################
# 2. Modeling Functions (Modelleme)
################################################

def base_models(X, y, scoring="roc_auc"):
    print("Base Models....")
    classifiers = [('LR', LogisticRegression()),
                   ('KNN', KNeighborsClassifier()),
                   ("SVC", SVC()),
                   ("CART", DecisionTreeClassifier()),
                   ("RF", RandomForestClassifier()),
                   ('Adaboost', AdaBoostClassifier()),
                   ('GBM', GradientBoostingClassifier()),
                   #Düzeltildi
                   ('XGBoost', XGBClassifier(eval_metric='logloss', enable_categorical=True, tree_method='hist')),
                   ('LightGBM', LGBMClassifier(verbose=-1))]

    for name, classifier in classifiers:
        cv_results = cross_validate(classifier, X, y, cv=3, scoring=scoring)
def base_models(X, y, scoring="roc_auc"):
    print("Base Models....")
    classifiers = [('LR', LogisticRegression()),
                   ('KNN', KNeighborsClassifier()),
                   ("SVC", SVC()),
                   ("CART", DecisionTreeClassifier()),
                   ("RF", RandomForestClassifier()),
                   ('Adaboost', AdaBoostClassifier()),
                   ('GBM', GradientBoostingClassifier()),
                   # --- DÜZELTME BURADA ---
                   ('XGBoost', XGBClassifier(eval_metric='logloss', enable_categorical=True, tree_method='hist')),
                   ('LightGBM', LGBMClassifier(verbose=-1, enable_categorical=True))]

    for name, classifier in classifiers:
        cv_results = cross_validate(classifier, X, y, cv=3, scoring=scoring)
        print(f"{scoring}: {round(cv_results['test_score'].mean(), 4)} ({name}) ")

def hyperparameter_optimization(X, y, cv=3, scoring="roc_auc"):
    print("\nHyperparameter Optimization....")
    
    knn_params = {"n_neighbors": range(2, 20)}
    cart_params = {'max_depth': range(1, 10), "min_samples_split": range(2, 10)}
    rf_params = {"max_depth": [8, 15, None], "max_features": [5, 7, "sqrt"], "n_estimators": [100, 200]}
    
    # --- DÜZELTME BURADA ---
    xgboost_params = {"learning_rate": [0.1, 0.01], "max_depth": [5, 8], "n_estimators": [100, 200]}
    lightgbm_params = {"learning_rate": [0.01, 0.1], "n_estimators": [300, 500]}

    classifiers = [('KNN', KNeighborsClassifier(), knn_params),
                   ("CART", DecisionTreeClassifier(), cart_params),
                   ("RF", RandomForestClassifier(), rf_params),
                   # XGBoost ve LightGBM nesnelerine parametreleri ekledik
                   ('XGBoost', XGBClassifier(eval_metric='logloss', enable_categorical=True, tree_method='hist'), xgboost_params),
                   ('LightGBM', LGBMClassifier(verbose=-1, enable_categorical=True), lightgbm_params)]

    best_models = {}
    for name, classifier, params in classifiers:
        print(f"########## {name} ##########")
        gs_best = GridSearchCV(classifier, params, cv=cv, n_jobs=-1, verbose=False).fit(X, y)
        final_model = classifier.set_params(**gs_best.best_params_)
        
        cv_results = cross_validate(final_model, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (After GS): {round(cv_results['test_score'].mean(), 4)}")
        best_models[name] = final_model
    return best_models
def voting_classifier(best_models, X, y):
    print("\nVoting Classifier...")
    voting_clf = VotingClassifier(estimators=[('XGBoost', best_models["XGBoost"]), 
                                              ('RF', best_models["RF"]),
                                              ('LightGBM', best_models["LightGBM"])],
                                  voting='soft').fit(X, y)
    
    cv_results = cross_validate(voting_clf, X, y, cv=3, scoring=["accuracy", "f1", "roc_auc"])
    print(f"Accuracy: {round(cv_results['test_accuracy'].mean(), 4)}")
    print(f"F1Score: {round(cv_results['test_f1'].mean(), 4)}")
    print(f"ROC_AUC: {round(cv_results['test_roc_auc'].mean(), 4)}")
    return voting_clf

################################################
# 3. Main Pipeline
################################################

def main():
    # 1. Veri Yükleme (Dosya yoluna dikkat!)
    df = pd.read_csv("dataset/heart.csv") 
    
    # 2. Hazırlık
    df_prepared, scaler, num_cols, mappings = heart_data_prep(df)
    y = df_prepared["HeartDisease"]
    X = df_prepared.drop(["HeartDisease"], axis=1)

    # 3. Temel Modeller
    base_models(X, y)

    # 4. Hiperparametre Optimizasyonu
    best_models = hyperparameter_optimization(X, y)

    # 5. Ensemble Modeli
    voting_clf = voting_classifier(best_models, X, y)

    # 6. Kaydetme
    joblib.dump(voting_clf, "heart_voting_model.pkl")
    joblib.dump(scaler, "scaler.pkl")
    metadata = {"num_cols": num_cols, "ordinal_mappings": mappings, "features": X.columns.tolist()}
    joblib.dump(metadata, "pipeline_metadata.pkl")
    
    print("\n[BİLGİ] Tüm süreç tamamlandı ve modeller rafa kaldırıldı!")
    return voting_clf

if __name__ == "__main__":
    main()

Base Models....
roc_auc: 0.8973 (LR) 
roc_auc: 0.8772 (KNN) 
roc_auc: 0.8985 (SVC) 
roc_auc: 0.7683 (CART) 
roc_auc: 0.9086 (RF) 
roc_auc: 0.8882 (Adaboost) 
roc_auc: 0.9015 (GBM) 
roc_auc: 0.9101 (XGBoost) 
roc_auc: 0.9029 (LightGBM) 

Hyperparameter Optimization....
########## KNN ##########
roc_auc (After GS): 0.8772
########## CART ##########
roc_auc (After GS): 0.8097
########## RF ##########
roc_auc (After GS): 0.905
########## XGBoost ##########
roc_auc (After GS): 0.9007
########## LightGBM ##########
roc_auc (After GS): 0.9075

Voting Classifier...
Accuracy: 0.8399
F1Score: 0.8557
ROC_AUC: 0.908

[BİLGİ] Tüm süreç tamamlandı ve modeller rafa kaldırıldı!


In [4]:
import sklearn
print(sklearn.__version__)

1.2.2
