In [17]:
import pandas as pd
import numpy as np

df = pd.read_csv('Churn_prediction.csv')
df = df.drop('customerID',axis=1)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)
df[['tenure','MonthlyCharges','TotalCharges']] = df[['tenure','MonthlyCharges','TotalCharges']].astype(float)

In [113]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1.0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34.0,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,Male,0,No,No,2.0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45.0,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2.0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24.0,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,Female,0,Yes,Yes,72.0,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,Female,0,Yes,Yes,11.0,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4.0,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [105]:
categorical_data = ['gender','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod']
numerical_data = ['TotalCharges','SeniorCitizen']
target = ['Churn']

x = df[categorical_data+numerical_data]
y = df['Churn'].map({'No': 0, 'Yes': 1})

from sklearn.preprocessing import StandardScaler
x_cat = pd.get_dummies(x[categorical_data])
scaler = StandardScaler()
x_num = pd.DataFrame(scaler.fit_transform(x[numerical_data]), columns=numerical_data)
x_preprocessd = pd.concat([x_cat,x_num],axis=1)

from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=20)
x_selected = selector.fit_transform(x_preprocessd, y)

# train/test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_selected, y, test_size=0.2, random_state=42)

#SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

print("Original class distribution:\n", y_train.value_counts())
print("After SMOTE:\n", y_train_resampled.value_counts())


from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=2,  # ratio of majority/minority class
    learning_rate=0.1,
    max_depth=4,
    n_estimators=100
)
model.fit(x_train_resampled,y_train_resampled)

y_pred = model.predict(x_test)
from sklearn.metrics import accuracy_score, classification_report
y_probs = model.predict_proba(x_test)[:, 1]  # Probabilities for churn = 1

threshold = 0.65
y_pred_thresh = (y_probs >= threshold).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_thresh))
acc = accuracy_score(y_test,y_pred_thresh)
print(acc)

Original class distribution:
 Churn
0    4138
1    1496
Name: count, dtype: int64
After SMOTE:
 Churn
0    4138
1    4138
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.92      0.75      0.83      1036
           1       0.54      0.81      0.65       373

    accuracy                           0.77      1409
   macro avg       0.73      0.78      0.74      1409
weighted avg       0.82      0.77      0.78      1409

0.7672107877927609


In [108]:
import numpy as np
import pandas as pd
print(pd.Series(y_pred).value_counts())
import numpy as np
import pandas as pd
print(pd.Series(y_pred_thresh).value_counts())


0    708
1    701
Name: count, dtype: int64
0    848
1    561
Name: count, dtype: int64


In [109]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.64      0.76      1036
           1       0.47      0.88      0.61       373

    accuracy                           0.70      1409
   macro avg       0.70      0.76      0.68      1409
weighted avg       0.81      0.70      0.72      1409



In [112]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_test, y_pred_thresh).ravel()

print(f"True Positives (Correctly predicted Yes): {tp}")
print(f"False Positives (Predicted Yes but actually No): {fp}")
print(f"True Negatives (Correctly predicted No): {tn}")
print(f"False Negatives (Predicted No but actually Yes): {fn}")


True Positives (Correctly predicted Yes): 303
False Positives (Predicted Yes but actually No): 258
True Negatives (Correctly predicted No): 778
False Negatives (Predicted No but actually Yes): 70
