In [1]:
# Import packages
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import optuna
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no


In [3]:
# Encode target variable
df["churn"] = df["churn"].replace(("yes", "no"), (1, 0))
data = df.drop(["churn"], axis = 1)
target = df["churn"]

In [4]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 0)

In [5]:
# Define categorical and numerical features
categorical_columns = list(X_train.select_dtypes(include = ["object"]).columns.values.tolist())
numeric_columns = list(X_train.select_dtypes(exclude = ["object"]).columns.values.tolist())

In [6]:
# Create feature engineering pipeline functions
def get_total_net_minutes(df):
    df["total_net_minutes"] = df["total_day_minutes"] + df["total_eve_minutes"] + df["total_night_minutes"]
    return df

def get_total_net_calls(df):
    df["total_net_calls"] = df["total_day_calls"] + df["total_eve_calls"] + df["total_night_calls"]
    return df

def get_total_net_charge(df):
    df["total_net_charge"] = df["total_day_charge"] + df["total_eve_charge"] + df["total_night_charge"]
    return df

def cs_calls_per_month(df):
    df["cs_calls_per_month"] = (df["number_customer_service_calls"] + df["number_vmail_messages"]) / df["account_length"]
    return df

In [7]:
# Create a feature engineering pipeline with ColumnTransformer
feature_engineering = ColumnTransformer([
    ("total_net_minutes", FunctionTransformer(get_total_net_minutes, validate = False), ["total_day_minutes", "total_eve_minutes", "total_night_minutes"]),
    ("total_net_calls", FunctionTransformer(get_total_net_calls, validate = False), ["total_day_calls", "total_eve_calls", "total_night_calls"]),
    ("total_net_charge", FunctionTransformer(get_total_net_charge, validate = False), ["total_day_charge", "total_eve_charge", "total_night_charge"]),
    ("cs_calls_per_month", FunctionTransformer(cs_calls_per_month, validate = False), ["account_length", "number_customer_service_calls", "number_vmail_messages"]),
])

In [8]:
# Create a numerical and categorical pipeline
numeric_transformer = SimpleImputer(strategy = "constant")

categorical_transformer = Pipeline(steps = [("imputer", SimpleImputer(strategy = "constant", fill_value = "missing")),
                                        ("onehot", OneHotEncoder(handle_unknown = "ignore"))])

In [9]:
# Create a feature selection and scaling pipeline
feature_selection = Pipeline(steps = [("scaler", StandardScaler()),
                                      ("feature_selection", SelectFromModel(estimator = LogisticRegression(), threshold = "median"))])

In [10]:
# Create a preprocessing pipeline
preprocessor = ColumnTransformer(transformers = [("feature_engineering", feature_engineering, numeric_columns),
                                                 ("numeric_transformers", numeric_transformer, numeric_columns),
                                                 ("categorical_transformers", categorical_transformer, categorical_columns)])

In [11]:
# Create model using XGBClassifier
model = XGBClassifier(random_state = 0)
pipeline = Pipeline(steps = [("preprocessor", preprocessor), ("feature_selection", feature_selection), ("model", model)])

In [12]:
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

In [13]:
# Evaluate model’s performance
print("Accuracy:", round(accuracy_score(y_test, predictions), 3))
print("AUC:", round(roc_auc_score(y_test, predictions), 3))

Accuracy: 0.973
AUC: 0.907


In [14]:
# Use Optuna to tune XGBClassifier hyperparameters
def objective(trial):
    params = {
        "model__n_estimators": trial.suggest_int("model__n_estimators", 100, 1000),
        "model__learning_rate": trial.suggest_float("model__learning_rate", 0.01, 0.1),
        "model__max_depth": trial.suggest_int("model__max_depth", 3, 10),
        "model__min_child_weight": trial.suggest_int("model__min_child_weight", 1, 10),
        "model__gamma": trial.suggest_float("model__gamma", 0.01, 0.1),
        "model__subsample": trial.suggest_float("model__subsample", 0.01, 1.0),
        "model__colsample_bytree": trial.suggest_float("model__colsample_bytree", 0.01, 1.0),
        "model__reg_alpha": trial.suggest_float("model__reg_alpha", 1e-5, 10.0),
        "model__reg_lambda": trial.suggest_float("model__reg_lambda", 1e-5, 10.0),
        "model__scale_pos_weight": trial.suggest_float("model__scale_pos_weight", 1e-5, 10.0),
        "model__n_jobs": 4
    }
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    return roc_auc_score(y_test, predictions)

In [15]:
# Run Optuna optimization
study = optuna.create_study(study_name = "churn model", direction = "maximize")
study.optimize(objective, n_trials = 100)

[I 2024-03-28 15:39:13,361] A new study created in memory with name: churn model
[I 2024-03-28 15:39:13,720] Trial 0 finished with value: 0.9073351079562261 and parameters: {'model__n_estimators': 112, 'model__learning_rate': 0.035815274973332374, 'model__max_depth': 8, 'model__min_child_weight': 2, 'model__gamma': 0.04980012760623743, 'model__subsample': 0.9769556709989113, 'model__colsample_bytree': 0.8417523479445749, 'model__reg_alpha': 7.084635734791159, 'model__reg_lambda': 3.463898830347577, 'model__scale_pos_weight': 1.9063737860998413}. Best is trial 0 with value: 0.9073351079562261.
[I 2024-03-28 15:39:14,092] Trial 1 finished with value: 0.9126293995859214 and parameters: {'model__n_estimators': 447, 'model__learning_rate': 0.09198336912572205, 'model__max_depth': 4, 'model__min_child_weight': 7, 'model__gamma': 0.071007003741068, 'model__subsample': 0.9205010937146388, 'model__colsample_bytree': 0.36183792024365, 'model__reg_alpha': 5.560445223367975, 'model__reg_lambda': 0

In [16]:
print("Best parameters=", study.best_params, end = "\n\n")
print("Best score=", round(study.best_value, 3), end = "\n\n")
print("Best model=", study.best_trial)

Best parameters= {'model__n_estimators': 560, 'model__learning_rate': 0.08147880972543659, 'model__max_depth': 9, 'model__min_child_weight': 3, 'model__gamma': 0.07308435953343387, 'model__subsample': 0.551296973520393, 'model__colsample_bytree': 0.6499015800571791, 'model__reg_alpha': 8.386404011946796, 'model__reg_lambda': 4.333314223121343, 'model__scale_pos_weight': 5.222153010780475}

Best score= 0.92

Best model= FrozenTrial(number=83, state=1, values=[0.9196983141082519], datetime_start=datetime.datetime(2024, 3, 28, 15, 39, 50, 854679), datetime_complete=datetime.datetime(2024, 3, 28, 15, 39, 51, 340326), params={'model__n_estimators': 560, 'model__learning_rate': 0.08147880972543659, 'model__max_depth': 9, 'model__min_child_weight': 3, 'model__gamma': 0.07308435953343387, 'model__subsample': 0.551296973520393, 'model__colsample_bytree': 0.6499015800571791, 'model__reg_alpha': 8.386404011946796, 'model__reg_lambda': 4.333314223121343, 'model__scale_pos_weight': 5.22215301078047

In [17]:
# Re-fit model with the best parameters
pipeline.set_params(**study.best_params)
pipeline.fit(X_train, y_train)

In [18]:
predictions = pipeline.predict(X_test)

In [19]:
# Evaluate the performance of the final model
print("Accuracy:", round(accuracy_score(y_test, predictions), 3))
print("AUC:", round(roc_auc_score(y_test, predictions), 3))

Accuracy: 0.975
AUC: 0.92


In [20]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       735
           1       0.97      0.84      0.90       115

    accuracy                           0.98       850
   macro avg       0.97      0.92      0.94       850
weighted avg       0.98      0.98      0.97       850



**Conclusion:** We get back an accuracy score of 97.6% on the test dataset, with an AUC score of 0.92. The tuned model is performing better than the untuned model and we are now able to predict with very high accuracy which of customers are going to churn and which will be retained, purely based on their tenure, call plan, call charges and usage, and how often they needed to contact customer service.