In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE


In [None]:
from google.colab import files
files.upload()


In [71]:
import zipfile
import pandas as pd

# Zip dosyasının yolunu belirtiyoruz (önceden drive'a yüklediysen path verebilirsin)
zip_path = 'archive.zip'

# Zip dosyasını oku
with zipfile.ZipFile(zip_path, 'r') as z:
    # İçindeki CSV dosyasını aç ve oku
    with z.open('WA_Fn-UseC_-Telco-Customer-Churn.csv') as f:
        df = pd.read_csv(f)

# Veri geldi mi kontrol edelim
print(df.head())



   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [None]:
# Veri tipi kontrolü
df.info()

# Eksik değer kontrolü
df.isnull().sum()


In [None]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


In [None]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


In [None]:
df.drop('customerID', axis=1, inplace=True)


In [None]:
print(df.info())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Hedef değişken dağılımı
plt.figure(figsize=(4,4))
df['Churn'].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=['skyblue','salmon'])
plt.title('Churn Dağılımı')
plt.ylabel('')
plt.show()

# Contract'a göre Churn
plt.figure(figsize=(6,4))
sns.countplot(x='Contract', hue='Churn', data=df, palette='Set2')
plt.title('Contract Türüne Göre Churn')
plt.show()

# InternetService'a göre Churn
plt.figure(figsize=(6,4))
sns.countplot(x='InternetService', hue='Churn', data=df, palette='Set1')
plt.title('Internet Service ve Churn')
plt.show()

# PaymentMethod'a göre Churn
plt.figure(figsize=(6,4))
sns.countplot(x='PaymentMethod', hue='Churn', data=df, palette='Set3')
plt.title('Payment Method ve Churn')
plt.xticks(rotation=45)
plt.show()

# tenure ve MonthlyCharges scatter
plt.figure(figsize=(6,4))
sns.scatterplot(x='tenure', y='MonthlyCharges', hue='Churn', data=df)
plt.title('tenure ve MonthlyCharges')
plt.show()


In [None]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print(cat_cols)


In [None]:
# One-hot encoding
df_encoded = pd.get_dummies(df, drop_first=True)

# Kontrol edelim
print(df_encoded.head())


In [72]:
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)


In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [73]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_res_scaled, y_train_res)


In [None]:
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest modeli
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# SMOTE sonrası veriyi kullanalım
rf_model.fit(X_train_res, y_train_res)


In [None]:
# Tahmin
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Performans metrikleri
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Feature importance
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
importances.sort_values().plot(kind='barh', figsize=(10,8))
plt.title("Feature Importances - Random Forest")
plt.show()


In [None]:
!pip install xgboost


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier


In [None]:
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
    scale_pos_weight=1  # class weight dengesini kontrol etmek için
)

xgb_model.fit(X_train_res, y_train_res)


In [None]:
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_xgb))


In [None]:
import matplotlib.pyplot as plt

xgb.plot_importance(xgb_model, height=0.8, max_num_features=20)
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


In [None]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}


In [None]:
xgb = XGBClassifier(random_state=42)

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                           scoring='roc_auc', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_res, y_train_res)


In [None]:
print("Best parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)


In [None]:
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)
y_proba_best = best_model.predict_proba(X_test)[:, 1]

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_best))


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Tahminleri tekrar hatırlatalım (zaten elinde var)
# y_test, y_pred_best, y_proba_best

# Skorları hesapla
acc = accuracy_score(y_test, y_pred_best)
prec = precision_score(y_test, y_pred_best)
rec = recall_score(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best)
roc = roc_auc_score(y_test, y_proba_best)

# Çıktıyı bastır
print("===== Model Performans Özeti =====")
print(f"Accuracy  : {acc:.2%}")
print(f"Precision : {prec:.2%}")
print(f"Recall    : {rec:.2%}")
print(f"F1-Score  : {f1:.2%}")
print(f"ROC-AUC   : {roc:.2%}")
print("===================================")


In [None]:
#+1 koyuyoruz çünkü tenure=0 olanlar var → sıfıra bölme hatasını engellemek için.
df['AvgMonthlyCharge'] = df['TotalCharges'] / (df['tenure'] + 1)


In [None]:
def tenure_group(tenure):
    if tenure <= 12:
        return '0-1 year'
    elif tenure <= 24:
        return '1-2 year'
    elif tenure <= 48:
        return '2-4 year'
    elif tenure <= 60:
        return '4-5 year'
    else:
        return '5+ year'

df['TenureGroup'] = df['tenure'].apply(tenure_group)


In [None]:
services = ['PhoneService', 'MultipleLines', 'InternetService',
            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
            'TechSupport', 'StreamingTV', 'StreamingMovies']

df['TotalServices'] = df[services].apply(lambda x: sum(x == 'Yes'), axis=1)


In [None]:
df_encoded = pd.get_dummies(df, drop_first=True)


In [None]:
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.20, random_state=42, stratify=y)


In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

xgb = XGBClassifier(random_state=42)

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                           scoring='roc_auc', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_res, y_train_res)


In [None]:
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)
y_proba_best = best_model.predict_proba(X_test)[:, 1]

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Kısa özet çıktı
acc = accuracy_score(y_test, y_pred_best)
prec = precision_score(y_test, y_pred_best)
rec = recall_score(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best)
roc = roc_auc_score(y_test, y_proba_best)

print("===== Model Performans Özeti =====")
print(f"Accuracy  : {acc:.2%}")
print(f"Precision : {prec:.2%}")
print(f"Recall    : {rec:.2%}")
print(f"F1-Score  : {f1:.2%}")
print(f"ROC-AUC   : {roc:.2%}")
print("===================================")


In [None]:
# TotalCharges / tenure (zaten yapmıştık), ek interaction:
df['ChargeTenureInteraction'] = df['MonthlyCharges'] * (df['tenure'] + 1)


In [None]:
df_encoded = pd.get_dummies(df, drop_first=True)


In [None]:
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y)


In [None]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=42)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

xgb = XGBClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=xgb, param_grid=param_grid,
    scoring='roc_auc', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_res, y_train_res)


In [74]:
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)
y_proba_best = best_model.predict_proba(X_test)[:, 1]

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

acc = accuracy_score(y_test, y_pred_best)
prec = precision_score(y_test, y_pred_best)
rec = recall_score(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best)
roc = roc_auc_score(y_test, y_proba_best)

print("===== Boost Model Performans Özeti =====")
print(f"Accuracy  : {acc:.2%}")
print(f"Precision : {prec:.2%}")
print(f"Recall    : {rec:.2%}")
print(f"F1-Score  : {f1:.2%}")
print(f"ROC-AUC   : {roc:.2%}")
print("===================================")


===== Boost Model Performans Özeti =====
Accuracy  : 75.51%
Precision : 52.84%
Recall    : 72.19%
F1-Score  : 61.02%
ROC-AUC   : 82.92%


In [None]:
import numpy as np
from sklearn.metrics import roc_curve

# ROC curve hesapla
fpr, tpr, thresholds = roc_curve(y_test, y_proba_best)

# Youden's J statistic (tpr - fpr) ile en iyi threshold bulma
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold: {optimal_threshold}")


In [None]:
# Yeni tahminleri yap
y_pred_optimal = (y_proba_best >= optimal_threshold).astype(int)


In [75]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

acc = accuracy_score(y_test, y_pred_optimal)
prec = precision_score(y_test, y_pred_optimal)
rec = recall_score(y_test, y_pred_optimal)
f1 = f1_score(y_test, y_pred_optimal)
roc = roc_auc_score(y_test, y_proba_best)

print("===== Threshold Tuned Model Performans Özeti =====")
print(f"Accuracy  : {acc:.2%}")
print(f"Precision : {prec:.2%}")
print(f"Recall    : {rec:.2%}")
print(f"F1-Score  : {f1:.2%}")
print(f"ROC-AUC   : {roc:.2%}")
print("===================================")


===== Threshold Tuned Model Performans Özeti =====
Accuracy  : 72.25%
Precision : 48.64%
Recall    : 81.02%
F1-Score  : 60.78%
ROC-AUC   : 82.92%


In [76]:
from sklearn.ensemble import VotingClassifier

# Ana modeller
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Modelleri tanımla
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=4, random_state=42)

# Ensemble modeli
ensemble = VotingClassifier(estimators=[
    ('lr', lr),
    ('rf', rf),
    ('xgb', xgb)
], voting='soft')  # soft voting: probability ortalaması

# SMOTE/SMOTEENN uygulanmış train seti ile eğit
ensemble.fit(X_train_res, y_train_res)

# Tahmin yap
y_pred_ens = ensemble.predict(X_test)
y_proba_ens = ensemble.predict_proba(X_test)[:, 1]

# Performans ölç
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

acc = accuracy_score(y_test, y_pred_ens)
prec = precision_score(y_test, y_pred_ens)
rec = recall_score(y_test, y_pred_ens)
f1 = f1_score(y_test, y_pred_ens)
roc = roc_auc_score(y_test, y_proba_ens)

print("===== Ensemble Model Performans Özeti =====")
print(f"Accuracy  : {acc:.2%}")
print(f"Precision : {prec:.2%}")
print(f"Recall    : {rec:.2%}")
print(f"F1-Score  : {f1:.2%}")
print(f"ROC-AUC   : {roc:.2%}")
print("===================================")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


===== Ensemble Model Performans Özeti =====
Accuracy  : 75.37%
Precision : 52.60%
Recall    : 72.99%
F1-Score  : 61.14%
ROC-AUC   : 83.69%


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [None]:
# Base modeller
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=4, random_state=42)


In [None]:
# Meta model: Logistic Regression
stacking_model = StackingClassifier(
    estimators=[
        ('lr', lr),
        ('rf', rf),
        ('xgb', xgb)
    ],
    final_estimator=LogisticRegression(),
    cv=3,
    n_jobs=-1
)


In [None]:
# SMOTEENN sonrası dengelenmiş veriyi kullanıyoruz
stacking_model.fit(X_train_res, y_train_res)


In [77]:
y_pred_stack = stacking_model.predict(X_test)
y_proba_stack = stacking_model.predict_proba(X_test)[:, 1]

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

acc = accuracy_score(y_test, y_pred_stack)
prec = precision_score(y_test, y_pred_stack)
rec = recall_score(y_test, y_pred_stack)
f1 = f1_score(y_test, y_pred_stack)
roc = roc_auc_score(y_test, y_proba_stack)

print("===== STACKING Model Performans Özeti =====")
print(f"Accuracy  : {acc:.2%}")
print(f"Precision : {prec:.2%}")
print(f"Recall    : {rec:.2%}")
print(f"F1-Score  : {f1:.2%}")
print(f"ROC-AUC   : {roc:.2%}")
print("===================================")
print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_best))


===== STACKING Model Performans Özeti =====
Accuracy  : 74.17%
Precision : 50.92%
Recall    : 74.33%
F1-Score  : 60.43%
ROC-AUC   : 83.15%
[[794 241]
 [104 270]]
              precision    recall  f1-score   support

           0       0.88      0.77      0.82      1035
           1       0.53      0.72      0.61       374

    accuracy                           0.76      1409
   macro avg       0.71      0.74      0.72      1409
weighted avg       0.79      0.76      0.77      1409

ROC-AUC: 0.8292050944224857
