In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings 
warnings.filterwarnings('ignore')

In [57]:
df = pd.read_csv('data/telco_customer_churn_preprocessed.csv')

In [58]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [59]:
print(df['gender'].unique())

['Female' 'Male']


In [60]:
for col in df.columns:
    print(f"Column {col} data type: {df[col].dtype}")

Column gender data type: object
Column SeniorCitizen data type: int64
Column Partner data type: object
Column Dependents data type: object
Column tenure data type: int64
Column PhoneService data type: object
Column MultipleLines data type: object
Column InternetService data type: object
Column OnlineSecurity data type: object
Column OnlineBackup data type: object
Column DeviceProtection data type: object
Column TechSupport data type: object
Column StreamingTV data type: object
Column StreamingMovies data type: object
Column Contract data type: object
Column PaperlessBilling data type: object
Column PaymentMethod data type: object
Column MonthlyCharges data type: float64
Column TotalCharges data type: float64
Column Churn data type: object


In [61]:
for col in df.columns:
    print(f"Column {col} values: {df[col].unique()}")

Column gender values: ['Female' 'Male']
Column SeniorCitizen values: [0 1]
Column Partner values: ['Yes' 'No']
Column Dependents values: ['No' 'Yes']
Column tenure values: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
Column PhoneService values: ['No' 'Yes']
Column MultipleLines values: ['No phone service' 'No' 'Yes']
Column InternetService values: ['DSL' 'Fiber optic' 'No']
Column OnlineSecurity values: ['No' 'Yes' 'No internet service']
Column OnlineBackup values: ['Yes' 'No' 'No internet service']
Column DeviceProtection values: ['No' 'Yes' 'No internet service']
Column TechSupport values: ['No' 'Yes' 'No internet service']
Column StreamingTV values: ['No' 'Yes' 'No internet service']
Column StreamingMovies values: ['No' 'Yes' 'No internet service']
Column Contract values: ['Month-to-month' 'One year' 'Two y

In [62]:
binary_columns = ["gender", "Partner", "Dependents", "PhoneService", "PaperlessBilling"]
multi_value_columns = ["MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
dummies_columns = ["Contract", "PaymentMethod"]
numeric_columns = ["tenure", "MonthlyCharges", "TotalCharges"]

In [63]:
def preprocess_binary_columns(df):
    df = df.copy()
    for col in binary_columns:
        df[col] = df[col].apply(lambda x: 1 if x == "Yes" else 0)
    return df

def preprocess_multi_value_columns(df):
    # One-hot encoding
    df = df.copy()
    for col in multi_value_columns:
        # If value is "Yes", then 1, else 0
        df[col] = df[col].apply(lambda x: 1 if x == "Yes" else 0)
    return df

def preprocess_dummies_columns(df):
    df = df.copy()
    df = pd.get_dummies(df, columns=dummies_columns)
    # Convert all columns to 1 and 0
    for col in df.columns:
        # if col starts with Contract or PaymentMethod
        if col.startswith("Contract") or col.startswith("PaymentMethod"):
            df[col] = df[col].apply(lambda x: 1 if x == 1 else 0)
    return df

def preprocess_numeric_columns(df):
    df = df.copy()
    for col in numeric_columns:
        df[col] = df[col].astype(float)
    scaler = StandardScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    return df

def preprocess_target_column(df):
    df = df.copy()
    df['Churn'] = df['Churn'].apply(lambda x: 1 if x == "Yes" else 0)
    return df

def preprocess_df(df):
    df = df.copy()
    df = preprocess_binary_columns(df)
    df = preprocess_multi_value_columns(df)
    df = preprocess_dummies_columns(df)
    df = preprocess_numeric_columns(df)
    df = preprocess_target_column(df)
    return df

In [64]:
df_processed = preprocess_df(df)

In [65]:
df_processed.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,MonthlyCharges,TotalCharges,Churn,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,-1.277445,0,0,0,0,1,...,-1.160323,-0.992611,0,1,0,0,0,0,1,0
1,0,0,0,0,0.066327,1,0,0,1,0,...,-0.259629,-0.172165,0,0,1,0,0,0,0,1
2,0,0,0,0,-1.236724,1,0,0,1,1,...,-0.36266,-0.958066,1,1,0,0,0,0,0,1
3,0,0,0,0,0.514251,0,0,0,1,0,...,-0.746535,-0.193672,0,0,1,0,1,0,0,0
4,0,0,0,0,-1.236724,1,0,0,0,0,...,0.197365,-0.938874,1,1,0,0,0,0,1,0


In [66]:
X = df_processed.drop('Churn', axis=1)
y = df_processed['Churn']

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (5634, 24)
y_train shape: (5634,)
X_test shape: (1409, 24)
y_test shape: (1409,)


In [68]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

xgb_model.fit(X_train, y_train)

In [69]:
y_pred = xgb_model.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred)}")
print(f"AUC: {roc_auc_score(y_test, y_pred)}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n {classification_report(y_test, y_pred)}")

Accuracy: 0.7913413768630234
Precision: 0.6295081967213115
Recall: 0.514745308310992
F1: 0.5663716814159292
AUC: 0.7028359746188164
Confusion matrix:
 [[923 113]
 [181 192]]
Classification report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1036
           1       0.63      0.51      0.57       373

    accuracy                           0.79      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409



In [70]:
# Cross validation
from sklearn.model_selection import cross_val_score

scoring_list = ["accuracy", "precision", "recall", "f1", "roc_auc"]

for scoring in scoring_list:
    scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring=scoring)
    print(f"Scoring: {scoring}")
    print(f"Scores: {scores}")
    print(f"Mean: {scores.mean()}")
    print(f"Standard deviation: {scores.std()}")
    print()

Scoring: accuracy
Scores: [0.79591837 0.78615794 0.7755102  0.77639752 0.76909414]
Mean: 0.7806156333874966
Standard deviation: 0.009396740022667787

Scoring: precision
Scores: [0.62825279 0.61983471 0.59504132 0.59022556 0.58369099]
Mean: 0.6034090744392356
Standard deviation: 0.01743352419640423

Scoring: recall
Scores: [0.56521739 0.50167224 0.48160535 0.52333333 0.4548495 ]
Mean: 0.505335562987737
Standard deviation: 0.03751036845226551

Scoring: f1
Scores: [0.59507042 0.55452865 0.5323475  0.55477032 0.5112782 ]
Mean: 0.5495990182626314
Standard deviation: 0.027873974973627997

Scoring: roc_auc
Scores: [0.83886102 0.81318566 0.80687032 0.81144297 0.80429121]
Mean: 0.8149302353941529
Standard deviation: 0.012377705860559666



In [74]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings('ignore', category=FutureWarning, module='xgboost')

param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "n_estimators": [100, 200, 300],
    "subsample": [0.5, 0.7, 1.0],
    "colsample_bytree": [0.5, 0.7, 1.0],
}

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or

Best score: 0.8042233187969758
Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}


In [75]:
# Using best parameters
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, colsample_bytree=1.0, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.5)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred)}")
print(f"AUC: {roc_auc_score(y_test, y_pred)}")
print(f"Confusion matrix:\n {confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n {classification_report(y_test, y_pred)}")

Accuracy: 0.8076650106458482
Precision: 0.67
Recall: 0.5388739946380697
F1: 0.5973254086181278
AUC: 0.7216570745391122
Confusion matrix:
 [[937  99]
 [172 201]]
Classification report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1036
           1       0.67      0.54      0.60       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

