In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [50]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [51]:
df.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2258,9150-HEPMB,Male,0,No,No,56,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,52.7,3019.7,No
2633,4837-QUSFT,Female,0,Yes,No,65,Yes,Yes,Fiber optic,Yes,...,No,Yes,Yes,No,One year,Yes,Bank transfer (automatic),100.15,6643.5,No
349,4654-DLAMQ,Female,1,Yes,No,64,Yes,No,Fiber optic,Yes,...,Yes,No,No,Yes,One year,No,Bank transfer (automatic),97.0,6430.9,No
841,0392-BZIUW,Female,0,Yes,No,72,Yes,Yes,Fiber optic,Yes,...,Yes,No,Yes,Yes,Two year,Yes,Credit card (automatic),105.0,7589.8,No
3258,7365-BVCJH,Male,0,No,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,24.4,24.4,No


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [53]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [54]:
df.duplicated().sum()

0

In [55]:
df.sample()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
6352,2347-WKKAE,Male,0,Yes,No,42,Yes,Yes,Fiber optic,Yes,...,Yes,No,No,Yes,Month-to-month,Yes,Electronic check,94.4,4014.6,No


In [56]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [57]:
# drop customerID as it's not important
df = df.drop(columns='customerID', axis=1)

In [58]:
df.sample(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
4583,Female,0,Yes,No,71,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Electronic check,90.1,6310.9,No
2885,Male,1,No,No,67,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),116.1,7839.85,No


In [59]:
#converting the dtype of 'TotalCharges' as it's a numeric value rather string
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [61]:
X = df.drop(columns='Churn')
y = df['Churn']

In [62]:
y = y.map({'No':0, 'Yes': 1})

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
numeric_col = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_col = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod']

In [65]:
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')

df[['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies']] = df[['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies']].replace('No internet service', 'No')

In [66]:
num_pipe = make_pipeline(SimpleImputer(strategy='mean'),
                         StandardScaler())

cat_pipe = make_pipeline(OneHotEncoder(drop='first'))

In [67]:
trf = ColumnTransformer([('num', num_pipe, numeric_col),
                         ('cat', cat_pipe, categorical_col)],
                        remainder='passthrough')

In [68]:
X_train_trf = trf.fit_transform(X_train) 

X_test_trf = trf.transform(X_test)

In [90]:
model_lr = LogisticRegression()
model_lr.fit(X_train_trf, y_train)

In [91]:
accuracy_score(y_train, model_lr.predict(X_train_trf))

0.80386936457224

In [92]:
accuracy_score(y_test, model_lr.predict(X_test_trf))

0.8225691980127751

In [72]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [73]:
model_dt = DecisionTreeClassifier()
model_rf = RandomForestClassifier()

In [74]:
model_dt.fit(X_train_trf, y_train)

In [75]:
accuracy_score(y_train, model_dt.predict(X_train_trf))

0.9985800496982605

In [76]:
accuracy_score(y_test, model_dt.predict(X_test_trf))

0.7125621007806955

In [77]:
model_rf.fit(X_train_trf, y_train)

In [78]:
accuracy_score(y_train, model_rf.predict(X_train_trf)), accuracy_score(y_test, model_rf.predict(X_test_trf))

(0.9985800496982605, 0.7934705464868701)

In [79]:
X_trf = trf.fit_transform(X)

In [80]:
np.mean(cross_val_score(model_lr, X_trf, y, cv=5, scoring='accuracy'))

0.8039197085295825

In [81]:
np.mean(cross_val_score(model_dt, X_trf, y, cv=5, scoring='accuracy'))

0.7236991096199754

In [82]:
np.mean(cross_val_score(model_rf, X_trf, y, cv=5, scoring='accuracy'))

0.7898640638105684

In [83]:
full_pipe_rf = Pipeline(steps=[
    ('preprocessor', trf),
    ('model', model_rf)
])

In [84]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [5, 10, 15],
    'model__min_samples_leaf': [1, 5, 10]
}

In [85]:
grid_search = GridSearchCV(
    estimator=full_pipe_rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

print("Starting Grid Search...")
grid_search.fit(X_train, y_train)
print("Grid Search complete.")

Starting Grid Search...
Grid Search complete.


In [86]:
print("Best parameters found:")
print(grid_search.best_params_)

print("\nBest cross-validation accuracy:")
print(grid_search.best_score_)

best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"\nTest set accuracy of best model: {test_score:.4f}")

Best parameters found:
{'model__max_depth': 10, 'model__min_samples_leaf': 5, 'model__n_estimators': 100}

Best cross-validation accuracy:
0.8033355345381645

Test set accuracy of best model: 0.8126


In [87]:
import xgboost as xgb # Use this import

pipe_xgb = Pipeline(steps=[
    ('preprocessor', trf),
    ('model', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')) 
])


param_grid_xgb = {
    'model__n_estimators': [100, 200],         
    'model__learning_rate': [0.1, 0.05],
    'model__max_depth': [3, 5] 
}

grid_xgb = GridSearchCV(pipe_xgb, param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_xgb.fit(X_train, y_train)

print("XGBoost Best Score:", grid_xgb.best_score_)
print("XGBoost Best Params:", grid_xgb.best_params_)

XGBoost Best Score: 0.8040463293202059
XGBoost Best Params: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 200}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
import joblib

best_model_pipeline = grid_xgb.best_estimator_

filename = 'churn_predictor_pipeline.joblib'
joblib.dump(best_model_pipeline, filename)

print(f"Model pipeline saved to {filename}")

Model pipeline saved to churn_predictor_pipeline.joblib


In [93]:
filename = 'churn_predictor_lr_model.joblib'
joblib.dump(model_lr, filename)

print(f"Model pipeline saved to {filename}")

Model pipeline saved to churn_predictor_lr_model.joblib
