################################################

1. Exploratory Data Analysis
################################################

In [4]:
import joblib
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

def grab_col_names(dataframe, cat_th=10, car_th=20):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    return cat_cols, num_cols, cat_but_car

def heart_data_prep(df):
    """Veri Ön İşleme ve Özellik Mühendisliği (Feature Engineering)"""
    
    # 1. Eksik Değer & Hatalı Girdi Operasyonları
    df.loc[df['RestingBP'] == 0, 'RestingBP'] = df['RestingBP'].median()
    df['Cholesterol_Is_Missing'] = (df['Cholesterol'] == 0).astype(int)
    df['Cholesterol'] = df['Cholesterol'].replace(0, np.nan)
    df['Cholesterol'] = df.groupby(['Sex', 'HeartDisease'])['Cholesterol'].transform(lambda x: x.fillna(x.median()))

    # 2. Feature Engineering (Yeni Özellikler)
    # Binning
    df['Oldpeak_Bin'] = pd.cut(df['Oldpeak'], bins=[-3, 0, 1, 2, 3, 7], labels=['<0', '0-1', '1-2', '2-3', '3+'])
    df['AgeGroup_Optimized'] = pd.cut(df['Age'], bins=[0, 45, 55, 120], labels=['Young', 'Middle', 'Senior+'])
    
    # Validation Features
    df['HighChol'] = (df['Cholesterol'] > 200).astype(int)
    df['HighResting'] = (df['RestingBP'] > 140).astype(int)
    df['HighHR'] = (df['MaxHR'] > 130).astype(int)
    df['MetabolicRisk'] = ((df['FastingBS'] == 1) & (df['HighChol'] == 1)).astype(int)
    
    # Medical Interaction Features
    df['StressScore'] = df['Oldpeak'] * df['ExerciseAngina'].map({'Y':1, 'N':0})
    df['RPP'] = (df['RestingBP'] * df['MaxHR']) / 100
    df['DTS_Simulated'] = 1 - (5 * df['Oldpeak']) - (4 * df['ExerciseAngina'].map({'Y':1, 'N':0}))
    df['HR_Efficiency'] = df['MaxHR'] / (220 - df['Age'])
    df['Age_Oldpeak'] = df['Age'] * df['Oldpeak']

    # 3. Encoding
    ordinal_mappings = {
        'ST_Slope': {'Up': 0, 'Flat': 1, 'Down': 2},
        'Oldpeak_Bin': {'<0': 0, '0-1': 1, '1-2': 2, '2-3': 3, '3+': 4},
        'AgeGroup_Optimized': {'Young': 0, 'Middle': 1, 'Senior+': 2}
    }
    
    # Ordinal Mapping
    for col, mapping in ordinal_mappings.items():
        df[col] = df[col].map(mapping)

    # Değişkenleri yakala
    cat_cols, num_cols, _ = grab_col_names(df)
    cat_cols = [col for col in cat_cols if col not in ordinal_mappings.keys() and col != "HeartDisease"]

    # One-Hot Encoding
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    
    # Scaling (Ölçeklendirme)
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    
    return df, scaler, num_cols, ordinal_mappings

def heart_pipeline(csv_path):
    """Ana Boru Hattı: Oku -> İşle -> Eğit -> Kaydet"""
    
    # 1. Veri Yükleme
    df = pd.read_csv(csv_path)
    
    # 2. Hazırlık
    df_prepared, scaler, num_cols, mappings = heart_data_prep(df)
    
    # 3. Model Hazırlığı
    y = df_prepared["HeartDisease"]
    X = df_prepared.drop(["HeartDisease"], axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
    
    # 4. Model Eğitimi (XGBoost)
    model_xgb = XGBClassifier(
        n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42,
        eval_metric='logloss', enable_categorical=True, tree_method='hist'
    )
    model_xgb.fit(X_train, y_train)
    
    # 5. Değerlendirme
    y_pred = model_xgb.predict(X_test)
    y_prob = model_xgb.predict_proba(X_test)[:, 1]
    
    print("\n--- Model Performans Raporu ---")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")
    
    # 6. KAYDETME (Paketleme)
    joblib.dump(model_xgb, "heart_model.pkl")
    joblib.dump(scaler, "scaler.pkl")
    
    metadata = {
        "num_cols": num_cols,
        "ordinal_mappings": mappings,
        "features": X.columns.tolist()
    }
    joblib.dump(metadata, "pipeline_metadata.pkl")
    
    print("\n[BİLGİ] Model, Scaler ve Metadata başarıyla kaydedildi!")

# Çalıştırmak için:
if __name__ == "__main__":
    heart_pipeline("dataset/heart.csv") # Veri yolunu kendine göre ayarla


--- Model Performans Raporu ---
              precision    recall  f1-score   support

           0       0.89      0.91      0.90        82
           1       0.93      0.91      0.92       102

    accuracy                           0.91       184
   macro avg       0.91      0.91      0.91       184
weighted avg       0.91      0.91      0.91       184

ROC-AUC: 0.9555

[BİLGİ] Model, Scaler ve Metadata başarıyla kaydedildi!
