In [None]:
### Content

# 1- Libraries
# 2- API Integration
# 3- Download the Files
# 4- Read the Files

# 5- Data Preparation
# 5.1 Drop Unnecessary Columns 
# 5.2 Encoding

# 6- Exploratory Data Analysis
# 6.1- Statistical Analysis
# 6.2- Histograms
# 6.3- Distributions
# 6.4- Scatter-Plotting
# 6.5- Outliers Checking

# 7- Building Model-0 // Benchmark

# 8- Data Cleaning
# 8.1- Drop Outliers
# 8.1.1- Balance
# 8.1.2- EstimatedSalary
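# 8.2- Min-Max Scaling

# 9- Building Model-1 // Improvements
# 9.1- Logistic Regression
# 9.2- Decision Trees and Random Forest
# 9.3- Gradient Boosting Machines (GBM) and XGBoost

# 10- Handling Imbalanced Data
# 11- Feature Engineering
# 11.1- Correlation Test
# 11.2- Recursive Feature Elimination
# 12- Hyperparameter Tuning
# 13- Cross Validation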

### 1- Libraries

In [None]:
### 1- Libraries

# Data Manipulation and Cleaning Libraries
import pandas as pd  # For data manipulation and data frames
import numpy as np  # For numerical operations and arrays

# File and System Operations Libraries
import os  # For operating system interactions, like file handling
import sys
import zipfile  # For working with zip files

# Dataset Access Libraries
from kaggle.api.kaggle_api_extended import KaggleApi  # For accessing datasets from Kaggle
from config import *  # Importing custom configurations
from model_eval_func import evaluate_model_performance, evaluate_model  # Custom model evaluation functions

# Data Visualization Libraries
import seaborn as sns  # For advanced data visualization
from matplotlib import pyplot as plt  # For plotting graphs and charts
import matplotlib  # For customizing matplotlib settings

# Machine Learning Libraries
from sklearn.preprocessing import LabelEncoder, MinMaxScaler  # For converting categorical data to numerical and scaling
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold  # Model selection and evaluation
from sklearn.feature_selection import RFE  # Recursive feature elimination
from sklearn.linear_model import LogisticRegression  # Logistic Regression
from sklearn.tree import DecisionTreeClassifier  # Decision Tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # Ensemble methods
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report  # Model evaluation metrics
from xgboost import XGBClassifier  # XGBoost classifier
from imblearn.over_sampling import RandomOverSampler  # Handling imbalanced data

### 2- API

In [None]:
# Kaggle API: create the client and authenticate it
api = KaggleApi()
api.authenticate()

# NOTE(review): `comp` and `path_1` are not defined anywhere in this notebook;
# presumably they are provided by `from config import *` — confirm.
competition = comp
# Download the competition's files into the folder `path_1`
api.competition_download_files(competition, path=path_1)


### 3- Download the Files

In [None]:
# Location of the downloaded competition archive inside the raw-data folder.
# The archive file name is hard-coded here, so it has to match the file that
# the download step actually produced.
zip_file_path = os.path.join(path_1, 'playground-series-s4e1.zip')

# Unpack every member of the archive into the same raw-data folder
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path_1)

print("Zip Files Saved...")

### 4- Read the Files

In [None]:
# Path of the raw-data folder (same value as `path_1`)
raw_data_directory = path_1

In [None]:
# List the files found in the raw-data folder.
# os.path.isdir() is already False for a path that does not exist, so a single
# check covers both "missing" and "not a directory".
if not os.path.isdir(path_1):
    print("Belirtilen klasör yok veya bir dizin değil.")
else:
    files = os.listdir(path_1)
    print("Files in the Raw Folder:")
    print(" ")
    for file in files:
        print(file)

In [None]:
# List the files in the raw-data folder
files = os.listdir(path_1)

print("Dataframes: ")
print("")

# Read every CSV file and store it as a notebook-level variable
for csv_file in files:
    if csv_file.endswith('.csv'):
        file_path = os.path.join(path_1, csv_file)
        df = pd.read_csv(file_path)
        
        # Derive the variable name from the file name: "<name>.csv" -> "<name>_df"
        var_name = os.path.splitext(csv_file)[0] + '_df'
        
        # Register the DataFrame under that name via globals().
        # Later cells rely on this (e.g. `train_df` is expected to exist,
        # which requires a file called "train.csv" in the folder).
        globals()[var_name] = df

        print(f"{var_name} saved...")

### 5- Data Preparation

#### 5.1 Drop Unnecessary Columns 

In [None]:
# Use train_df as main dataset.
# `train_df` is created dynamically by the CSV-loading cell (via globals());
# working on a copy keeps the loaded frame untouched.
df = train_df.copy()

In [None]:
# Original Data: snapshot of the training data taken before any column is dropped
df_org = df.copy()

In [None]:
# Drop the identifier-like columns ('id', 'CustomerId', 'Surname').
# The result is derived from the untouched snapshot `df_org` instead of from
# `df` itself, so re-running this cell gives the same frame rather than
# raising a KeyError because the columns are already gone.
df = df_org.drop(columns=['id', 'CustomerId', 'Surname'])

In [None]:
# Copy the dataset: df_2 is the frame that gets encoded and explored below,
# leaving `df` (un-encoded) unchanged
df_2 = df.copy()

#### 5.2 Encoding

In [None]:
# Label Encoding: replace each category with an integer code
labelencoder = LabelEncoder()

# Encoding 'Geography' column
df_2['Geography'] = labelencoder.fit_transform(df_2['Geography'])

# Encoding 'Gender' column.
# NOTE: the same encoder object is re-fitted here, so from this point on
# `labelencoder` only holds the 'Gender' classes; the 'Geography' mapping can
# no longer be recovered from it.
df_2['Gender'] = labelencoder.fit_transform(df_2['Gender'])

### 6- Exploratory Data Analysis

In [None]:
# Check datatype and blanks: info() prints dtypes and non-null counts
df_2.info()
# Last expression of the cell: shows the first two rows
df_2.head(2)

#### 6.1- Statistical Analysis

In [None]:
# Statistical Distribution: summary statistics for the columns of df_2
# (the commentary in the next cell is based on this table)
df_2.describe()

In [None]:
'''
CreditScore:
The average credit score is 656.45, with scores ranging from 350 to 850. 
The median score is 659, indicating that half of the observations have a credit score below this value. 
The standard deviation of 80.10 suggests moderate variability in credit scores among the customers.

Geography:
The encoded geography values (0, 1, 2) show an average value of 0.65. 
With a standard deviation of 0.82, this indicates a diverse distribution of customers across different regions. 
The median value is 0, implying that a significant portion of the data is concentrated in the lowest encoded region.

Gender:
The average encoded value for gender is 0.56, reflecting a slightly higher proportion of one gender in the dataset. 
The median value of 1 shows that more than half of the observations are of the higher encoded gender value.

Age:
The average age of the customers is 38.13 years, with ages ranging from 18 to 92 years. 
The median age is 37 years, indicating a young to middle-aged customer base. 
The standard deviation of 8.87 suggests some variability in age distribution.

Tenure:
Customers have an average tenure of 5.02 years, with a range from 0 to 10 years. 
The median tenure is 5 years, suggesting that half of the customers have been with the organization for this duration. 
The standard deviation of 2.81 indicates some variation in tenure.

Balance:
The average account balance is 55,478.09, with balances ranging from 0 to 250,898.09. 
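NOTE: the median balance previously stated here (117,948.00) is identical to the EstimatedSalary median and cannot be correct: with a minimum of 0, a median of 117,948.00 would force the mean to be at least about 58,974, which is above the reported mean of 55,478.09. Re-read the 50% row of df_2.describe() for Balance (TODO confirm the real value).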
A high standard deviation of 62,817.66 indicates significant variability in account balances.

NumOfProducts:
On average, customers have 1.55 products, with a median of 2 products. 
The range is from 1 to 4 products, and the standard deviation of 0.55 indicates some variation in the number of products held by customers.

HasCrCard:
The average value for credit card possession is 0.75, indicating that a majority of customers have a credit card. 
The standard deviation of 0.43 shows a significant portion without credit cards.

IsActiveMember:
The average value for active membership is 0.50, with a median of 1. 
This indicates an equal distribution between active and inactive members.

EstimatedSalary:
The average estimated salary is 112,574.82, with values ranging from 11.58 to 199,992.48. 
The median salary is 117,948.00, suggesting a moderately high-income customer base. 
The standard deviation of 50,292.87 indicates a wide range of salaries.
'''
print('')

#### 6.2- Histograms

In [None]:
# Histograms: one 30-bin histogram per column of df_2 on a shared figure
df_2.hist(bins=30, figsize=(20, 15))
plt.tight_layout()
plt.show()

#### 6.3- Distributions

In [None]:
# Distribution (histogram + KDE) of every numeric column of df_2.
# pandas, seaborn and pyplot are already imported in the libraries cell, so
# they are not re-imported here.

# 'number' selects every numeric dtype, not only float64/int64 (e.g. int32).
numeric_columns = df_2.select_dtypes(include='number').columns

# Size the subplot grid from the number of columns instead of assuming that
# at most 3 x 4 = 12 plots are needed (plt.subplot raises for index 13+).
n_grid_cols = 4
n_grid_rows = int(np.ceil(len(numeric_columns) / n_grid_cols))

plt.figure(figsize=(15, 10))  # Large enough to accommodate all subplots

for i, column in enumerate(numeric_columns, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.histplot(df_2[column], kde=True)
    plt.title(column)
    # Axis labels are blanked: the title already names the column
    plt.xlabel('')
    plt.ylabel('')

plt.tight_layout()
plt.show()


#### 6.4- Scatter-Plotting

In [None]:
# Relationship between columns: EstimatedSalary (x) against Balance (y)
plt.figure(figsize=(8, 3))  # Reduced figure size
plt.scatter(df_2['EstimatedSalary'], df_2['Balance'], label='Data Points')
plt.xlabel('EstimatedSalary')
plt.ylabel('Balance')
plt.title('Relationship between Estimated Salary and Balance')
plt.legend()
plt.show()

#### 6.5- Outliers Checking

In [None]:
# Outlier check: one box per column of df_2, all drawn on the same axis.
# The columns are plotted on their raw (unscaled) values.
plt.figure(figsize=(10, 3))

sns.boxplot(data=df_2)
# Rotate and shrink the tick labels so the column names do not overlap
plt.xticks(rotation=45, fontsize=8)  
plt.show()

### 7- Building Model-0 // Benchmark

In [None]:
# Independent Features: the six columns used by the benchmark model
# (Geography, Gender, Tenure and HasCrCard are left out here)
X = df_2[['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'EstimatedSalary']]

# Dependent Feature: the target column
y = df_2['Exited']

In [None]:
# Train-test split: 70 % train / 30 % test, fixed seed for reproducibility.
# No `stratify` argument is passed, so class proportions may differ slightly
# between the two parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Create and train the Logistic Regression benchmark model
clf = LogisticRegression(solver="liblinear").fit(X_train, y_train)

In [None]:
# Run the Model Evaluation for the benchmark model.
# `evaluate_model` is imported from the project-local `model_eval_func`
# module; what it reports is not visible in this notebook.
evaluate_model(clf, X_train, X_test, y_train, y_test)

In [None]:
# Run Model Success Metrics Visuals for the benchmark model.
# `evaluate_model_performance` also comes from `model_eval_func`.
evaluate_model_performance(clf, X_train, X_test, y_train, y_test)

In [None]:
'''
## Evaluation:

ROC AUC measures the model's classification ability, and the closer it is to 1, the better the model performs. 
The ROC AUC score for the training set is 0.714, while the score for the test set is 0.712. 
These scores indicate that the model's classification ability is reasonable but not perfect.
'''
print('')

In [None]:
'''
## Conclusions and Recommendations:

Performance Improvement:
The model's ROC AUC score is around 0.71, suggesting that there is room for improvement in classification performance. 
This could be achieved by employing more complex models or by performing feature engineering to enhance the model's accuracy.

Balanced Dataset and Other Metrics:
If there is class imbalance in the dataset, the accuracy metric might be misleading. 
In such cases, it is beneficial to consider other metrics such as precision, recall, and F1 score.

Overfitting/Underfitting Analysis:
The training and test accuracies are quite close, indicating that the model is not overfitting and has a reasonable generalization capability. 
However, to further improve the model and capture more complex patterns in the data, more advanced models (e.g., Random Forest, Gradient Boosting) 
can be tried.

In conclusion, the current model performs well as a baseline evaluation. 
However, additional model tuning and data preprocessing steps can be undertaken to enhance performance, 
particularly to achieve higher accuracy and better ROC AUC scores.
'''
print('')

### 8- Data Cleaning

In [None]:
# Copy the dataset: df_3 must be an independent frame, not a second name for
# df_2, so that cleaning steps applied to df_3 can never alter df_2.
df_3 = df_2.copy()

#### 8.1- Drop Outliers

##### 8.1.1- Balance

In [None]:
# Remove rows where Balance is zero.
# NOTE: this discards every zero-balance customer from df_3, not only outliers.
df_3 = df_3[df_3['Balance'] != 0]

# Despite their names, Q1 and Q3 are the 10th and 90th percentiles of the
# remaining (non-zero) Balance values, not the 25th/75th quartiles.
Q1 = df_3['Balance'].quantile(0.10)
Q3 = df_3['Balance'].quantile(0.90)

# Width of the 10th-90th percentile range (kept under the name IQR)
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers:
# 1.5 range-widths below the 10th / above the 90th percentile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask of the non-zero balances that fall outside the bounds
outliers = (df_3['Balance'] < lower_bound) | (df_3['Balance'] > upper_bound)

# Remove the identified outliers
df_3 = df_3[~outliers]

In [None]:
# Check the histogram of Balance after the zero rows and outliers were removed
plt.figure(figsize=(8, 2))  
sns.histplot(df_3['Balance'], bins=50) 
plt.xlabel('Balance')
plt.ylabel('Count')
plt.title('Distribution of Balance')
plt.show()

##### 8.1.2- EstimatedSalary

In [None]:
# Keep only rows with a strictly positive EstimatedSalary
df_3 = df_3[df_3['EstimatedSalary'] > 0]

# Despite their names, Q1 and Q3 are the 10th and 90th percentiles of the
# remaining EstimatedSalary values, not the 25th/75th quartiles.
Q1 = df_3['EstimatedSalary'].quantile(0.10)
Q3 = df_3['EstimatedSalary'].quantile(0.90)

# Width of the 10th-90th percentile range (kept under the name IQR)
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers:
# 1.5 range-widths below the 10th / above the 90th percentile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask of the salaries that fall outside the bounds
outliers = (df_3['EstimatedSalary'] < lower_bound) | (df_3['EstimatedSalary'] > upper_bound)

# Remove the identified outliers
df_3 = df_3[~outliers]

In [None]:
# Check Histogram for EstimatedSalary after the outlier filter
plt.figure(figsize=(8, 2))  
sns.histplot(df_3['EstimatedSalary'], bins=50) 
plt.xlabel('EstimatedSalary')
plt.ylabel('Count')
plt.title('Distribution of EstimatedSalary')
plt.show()

#### 8.2- Min-Max Scaling

In [None]:
# Copy the dataset: df_4 receives the scaled values, df_3 keeps the raw ones
df_4 = df_3.copy()

In [None]:
# List of columns to scale (the continuous / multi-valued numeric features)
columns_to_scale = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

# Initialize the MinMaxScaler
min_max_scaler = MinMaxScaler()

# Fit and transform the selected columns.
# NOTE: the scaler is fitted on all rows of df_4, i.e. before the train/test
# split done in later cells, so rows that end up in the test set also
# influence the min/max used for scaling.
df_4[columns_to_scale] = min_max_scaler.fit_transform(df_4[columns_to_scale])

In [None]:
# Renumber the rows 0..n-1 (the earlier row filters left gaps in the index)
df_4 = df_4.reset_index(drop=True)

# Check the output: dtypes / non-null counts, then the first two rows
df_4.info()
df_4.head(2)

### 9- Building Model-1

#### 9.1- Logistic Regression

In [None]:
# Independent Features: same six columns as the benchmark model, now taken
# from the cleaned and scaled frame df_4
X = df_4[['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember', 'EstimatedSalary']]

# Dependent Feature: the target column
y = df_4['Exited']

In [None]:
# Train-test split: 70 % train / 30 % test with the same seed as the benchmark.
# This rebinds X_train / X_test / y_train / y_test used by the cells below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Train Logistic Regression Model on the cleaned/scaled data
# (rebinds `clf`, replacing the benchmark model)
clf = LogisticRegression(solver="liblinear").fit(X_train, y_train)

In [None]:
# Run the Model Evaluation for the retrained Logistic Regression
# (`evaluate_model` comes from the project-local `model_eval_func` module)
evaluate_model(clf, X_train, X_test, y_train, y_test)

In [None]:
# Run Model Success Metrics Visuals for the retrained Logistic Regression
# (`evaluate_model_performance` comes from `model_eval_func`)
evaluate_model_performance(clf, X_train, X_test, y_train, y_test)

#### 9.2- Decision Trees and Random Forest

In [None]:
# Models initialization.
# NOTE: DecisionTreeClassifier is created without random_state, unlike the
# random forest, so its results may vary between runs.
log_reg = LogisticRegression(solver="liblinear")
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train models on the current training split
log_reg.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Display name -> fitted model, consumed by the evaluation loop in the next cell
models = {
    'Logistic Regression': log_reg,
    'Decision Tree': decision_tree,
    'Random Forest': random_forest
}

In [None]:
# Print test-set accuracy and ROC AUC for every model in `models`
for name, model in models.items():
    y_pred = model.predict(X_test)
    # Probability of the positive class (column 1), needed for ROC AUC
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba)}")
    print("\n")

Based on these scores, Random Forest is the best model, so we take a deeper look at it.

In [None]:
# Run the Model Evaluation for the random forest
# (`evaluate_model` comes from the project-local `model_eval_func` module)
evaluate_model(random_forest, X_train, X_test, y_train, y_test)

In [None]:
# Run Model Success Metrics Visuals for the random forest
# (`evaluate_model_performance` comes from `model_eval_func`)
evaluate_model_performance(random_forest, X_train, X_test, y_train, y_test)

It is obvious that the Random Forest model is overfitting. That's why Logistic Regression is still the best one. 

#### 9.3- Gradient Boosting Machines (GBM) and XGBoost

In [None]:
# Boosted-tree candidates, both with 100 estimators and a fixed seed
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=42)
xgboost = XGBClassifier(n_estimators=100, random_state=42)

# Train models
gradient_boosting.fit(X_train, y_train)
xgboost.fit(X_train, y_train)

# Display name -> fitted model (rebinds `models`, replacing the earlier dict)
models = {
    'Gradient Boosting': gradient_boosting,
    'XGBoost': xgboost
}

# Print test-set accuracy and ROC AUC for both boosted models
for name, model in models.items():
    y_pred = model.predict(X_test)
    # Probability of the positive class (column 1), needed for ROC AUC
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba)}")
    print("\n")

In [None]:
# Run the Model Evaluation for Gradient Boosting
# (`evaluate_model` comes from the project-local `model_eval_func` module)
evaluate_model(gradient_boosting, X_train, X_test, y_train, y_test)

In [None]:
# Run Model Success Metrics Visuals for Gradient Boosting
# (`evaluate_model_performance` comes from `model_eval_func`)
evaluate_model_performance(gradient_boosting, X_train, X_test, y_train, y_test)

In [None]:
# Run the Model Evaluation for XGBoost
# (`evaluate_model` comes from the project-local `model_eval_func` module)
evaluate_model(xgboost, X_train, X_test, y_train, y_test)

In [None]:
# Run Model Success Metrics Visuals for XGBoost
# (`evaluate_model_performance` comes from `model_eval_func`)
evaluate_model_performance(xgboost, X_train, X_test, y_train, y_test)

Both models perform very well.

#### 10- Imbalanced Data Check

In [None]:
# Check the class distribution: share of each target value in `y`
# (when the cells are run in order, `y` is the 'Exited' column of df_4)
class_distribution = y.value_counts(normalize=True)
print(class_distribution)

The dataset is imbalanced, which needs to be fixed.

In [None]:


# Random over-sampling of the whole of df_4 (no train/test split has been
# applied to this data yet). All columns except 'Exited' are used as features
# here, not only the six columns used by the earlier models.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df_4.drop(columns='Exited'), df_4['Exited'])

In [None]:
# Check the class distribution again, this time on the oversampled target
# (rebinds `class_distribution`)
class_distribution = y_resampled.value_counts(normalize=True)
print(class_distribution)

The dataset is balanced

In [None]:
# Split into training and test sets BEFORE balancing the classes.
# Splitting the already oversampled X_resampled / y_resampled would put
# copies of the same minority-class row on both sides of the split, so the
# test scores would be inflated by leakage.
X_all = df_4.drop(columns='Exited')
y_all = df_4['Exited']
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.3, random_state=42, stratify=y_all
)

# Oversample the training part only; the test part keeps the real class
# distribution and contains no duplicated rows.
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Logistic Regression model (rebinds `log_reg`)
log_reg = LogisticRegression(solver="liblinear")
log_reg.fit(X_train, y_train)

# Gradient Boosting model
gradient_boost = GradientBoostingClassifier(n_estimators=100, random_state=42)
gradient_boost.fit(X_train, y_train)

# XGBoost model (rebinds `xgboost`, replacing the earlier fit)
xgboost = XGBClassifier(n_estimators=100, random_state=42)
xgboost.fit(X_train, y_train)

In [None]:
# Models to evaluate: display name -> fitted model (rebinds `models`)
models = {
    'Logistic Regression': log_reg,
    'Gradient Boosting': gradient_boost,
    'XGBoost': xgboost
}

In [None]:
# Print test-set accuracy and ROC AUC for every model in `models`
for name, model in models.items():
    y_pred = model.predict(X_test)
    # Probability of the positive class (column 1), needed for ROC AUC
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    print("")
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba)}")

Deep Dive Gradient Boosting and XGBoost

In [None]:
# Run the Model Evaluation for the XGBoost model trained in this section
# (`evaluate_model` comes from the project-local `model_eval_func` module)
evaluate_model(xgboost, X_train, X_test, y_train, y_test)

In [None]:
# Run Model Success Metrics Visuals for the XGBoost model trained in this section
# (`evaluate_model_performance` comes from `model_eval_func`)
evaluate_model_performance(xgboost, X_train, X_test, y_train, y_test)

In [None]:
# Run the Model Evaluation for the Gradient Boosting model trained in this section
# (`evaluate_model` comes from the project-local `model_eval_func` module)
evaluate_model(gradient_boost, X_train, X_test, y_train, y_test)

In [None]:
# Run Model Success Metrics Visuals for the Gradient Boosting model trained in this section
# (`evaluate_model_performance` comes from `model_eval_func`)
evaluate_model_performance(gradient_boost, X_train, X_test, y_train, y_test)

The best model is XGBoost.
It shows high performance on both the train and test datasets.

#### 11- Feature Engineering

#### 11.1 Correlation Test

In [None]:
# Pairwise correlation matrix of the oversampled feature columns
corr = X_resampled.corr()
plt.figure(figsize=(15,15))

# Annotated heatmap; seaborn returns the matplotlib Axes it drew on
corr_select = sns.heatmap(corr, annot=True, cmap='coolwarm')
bottom, top = corr_select.get_ylim()

# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for heatmap rows being cut off in
# some matplotlib versions — confirm it is still needed.
corr_select.set_ylim(bottom + 0.5, top - 0.5)

In [None]:
# Drop highly correlated columns 

def correlated_columns(X_resampled, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    For every pair of columns the one that appears later in the frame is
    reported, so dropping the returned columns keeps the first column of each
    strongly correlated pair.

    Parameters
    ----------
    X_resampled : pandas.DataFrame
        Feature frame whose pairwise correlations are computed with ``.corr()``.
    threshold : float
        A pair counts as strongly correlated when the absolute value of its
        correlation coefficient is strictly greater than this value.

    Returns
    -------
    set
        Names of the columns to drop (empty when no pair exceeds the threshold).
    """
    # Use the magnitude of the coefficient: a correlation of -0.95 is just as
    # redundant as +0.95 and must not be ignored.
    corr_matrix = X_resampled.corr().abs()

    col_corr = set()
    for i, colname in enumerate(corr_matrix.columns):
        # Compare column i only with the columns before it (lower triangle),
        # so the diagonal and mirrored pairs are never considered.
        if (corr_matrix.iloc[i, :i] > threshold).any():
            col_corr.add(colname)

    return col_corr

In [None]:
# Collect the columns whose correlation with an earlier column is above 0.9
# (90 %); these are the candidates for dropping. The cell displays how many
# such columns were found.

corr_features = correlated_columns(X_resampled, 0.9)
len(corr_features)

No feature pair exceeds the 0.9 correlation threshold, so no columns are dropped.

#### 11.2 Recursive Feature Elimination

In [None]:
# Recursive feature elimination: for every possible number of features, let
# RFE pick that many columns, retrain the model on them and score it on the
# test set.

# The estimator is created in this cell so that the cell also works on a
# fresh kernel (run in order, no `xgb_model` exists before this point).
xgb_model = XGBClassifier(n_estimators=100, random_state=42)

# Loop over the different feature counts
results = []
for n_features in range(1, X_train.shape[1] + 1):
    rfe = RFE(estimator=xgb_model, n_features_to_select=n_features)
    rfe.fit(X_train, y_train)
    
    # Train the model with the selected features only
    # (rfe.support_ is a boolean mask over the columns)
    X_train_rfe = X_train.loc[:, rfe.support_]
    X_test_rfe = X_test.loc[:, rfe.support_]
    xgb_model.fit(X_train_rfe, y_train)
    
    # Evaluate the model on the test set
    y_pred = xgb_model.predict(X_test_rfe)
    y_pred_proba = xgb_model.predict_proba(X_test_rfe)[:, 1]
    
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    results.append((n_features, accuracy, roc_auc))

# Print the results
for n_features, accuracy, roc_auc in results:
    print(f"Number of features: {n_features}, Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")

The best performance is achieved with all features, so there is no need to drop any. 

### 12- Hyperparameter Tuning

In [None]:
# Hyperparameter grid: 3 values for each of 5 parameters = 243 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# XGBoost model
xgb_model = XGBClassifier(random_state=42)

# GridSearchCV: exhaustive search scored by ROC AUC with 3-fold CV
# (243 combinations x 3 folds = 729 fits, run on all CPU cores)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best ROC AUC score: {grid_search.best_score_}")

# Best model (note: `best_model` is rebound by the randomized search cell below)
best_model = grid_search.best_estimator_

# Model evaluation on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")

In [None]:


# Hyperparameter distributions: candidate values that the randomized search
# samples from (a wider range than the grid search above)
param_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=50, stop=200, num=10)],
    'max_depth': [int(x) for x in np.linspace(3, 10, num=8)],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# XGBoost model
xgb_model = XGBClassifier(random_state=42)

# RandomizedSearchCV: 100 sampled combinations x 3 folds = 300 fits,
# scored by ROC AUC, seeded so the sampled combinations are reproducible
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=100, cv=3, scoring='roc_auc', n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

# Best parameters
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best ROC AUC score: {random_search.best_score_}")

# Best model (rebinds `best_model`, replacing the grid-search result)
best_model = random_search.best_estimator_

# Model evaluation on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")


In [None]:
# Train with best parameters

# Hyperparameters of the best model.
# NOTE(review): these values are hard-coded; presumably they were copied from
# the output of the randomized search above — confirm they still match
# `random_search.best_params_` after a fresh run.
best_params = {'subsample': 0.9, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.3, 'colsample_bytree': 1.0}

# Rebuild the best model from those parameters and fit it on the training set
best_model = XGBClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

In [None]:
# Run the Model Evaluation for the tuned XGBoost model
# (`evaluate_model` comes from the project-local `model_eval_func` module)
evaluate_model(best_model, X_train, X_test, y_train, y_test)

In [None]:
# Run Model Success Metrics Visuals for the tuned XGBoost model
# (`evaluate_model_performance` comes from `model_eval_func`)
evaluate_model_performance(best_model, X_train, X_test, y_train, y_test)

### 13- Cross Validation

In [None]:
# Rebuild the best model (unfitted) from `best_params`
best_model = XGBClassifier(**best_params, random_state=42)

# k-fold cross-validation
k = 10  # 10-fold cross-validation
# Stratified folds keep the class proportions of `y` in every fold;
# shuffling is seeded so the folds are reproducible
stratified_kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [None]:
# Per-fold ROC AUC scores, filled by the cross-validation loop
train_roc_aucs = []
test_roc_aucs = []

In [None]:
# Rebind X and y to the oversampled features / target produced by
# RandomOverSampler (these replace the six-column X and y used earlier)
X = X_resampled
y = y_resampled

In [None]:
# Cross-validate on the original (un-resampled) rows of df_4, so X and y are
# rebuilt here instead of being taken from X_resampled / y_resampled.
# Splitting already oversampled data would place copies of the same
# minority-class row in both the training and the validation fold, which
# leaks information and inflates the validation ROC AUC.
X = df_4.drop(columns='Exited')
y = df_4['Exited']

# Start from empty score lists so that re-running this cell does not append
# a second set of fold results to the first one.
train_roc_aucs = []
test_roc_aucs = []

for train_index, test_index in stratified_kfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Balance the classes inside the training fold only; the validation fold
    # keeps its real class distribution and shares no duplicated rows with it.
    X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)

    best_model.fit(X_train, y_train)

    # Evaluation on the (oversampled) training fold
    y_train_proba = best_model.predict_proba(X_train)[:, 1]
    train_roc_auc = roc_auc_score(y_train, y_train_proba)
    train_roc_aucs.append(train_roc_auc)

    # Evaluation on the held-out validation fold
    y_test_proba = best_model.predict_proba(X_test)[:, 1]
    test_roc_auc = roc_auc_score(y_test, y_test_proba)
    test_roc_aucs.append(test_roc_auc)

    # Print the results of this fold
    print("Fold Performance")
    print(f"Train ROC AUC: {train_roc_auc:.4f}")
    print(f"Test ROC AUC: {test_roc_auc:.4f}")
    print("\n")

In [None]:
# Cross-validation results: mean and standard deviation of the per-fold
# ROC AUC scores collected in train_roc_aucs / test_roc_aucs
print("Cross-Validation Results")
print(f"Mean Train ROC AUC: {np.mean(train_roc_aucs):.4f}, Std Train ROC AUC: {np.std(train_roc_aucs):.4f}")
print(f"Mean Test ROC AUC: {np.mean(test_roc_aucs):.4f}, Std Test ROC AUC: {np.std(test_roc_aucs):.4f}")