In [1]:
import pandas as pd
from loguru import logger
import json
from xgboost import XGBClassifier

In [2]:
df_train_featurized = pd.read_parquet('../data/gold/telecom-customer-churn-featurized.parquet')
logger.info(f"Data loaded with shape: {df_train_featurized.shape}")

[32m2024-10-10 02:05:10.610[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mData loaded with shape: (80000, 109)[0m


In [3]:
df_train_featurized

Unnamed: 0,drop_vce_Mean,drop_dat_Mean,blck_vce_Mean,blck_dat_Mean,unan_vce_Mean,unan_dat_Mean,plcd_vce_Mean,plcd_dat_Mean,recv_vce_Mean,recv_sms_Mean,...,crclscod_Z,crclscod_Z1,crclscod_Z2,crclscod_Z4,crclscod_Z5,crclscod_ZA,crclscod_ZY,asl_flag_Y,Customer_ID,churn
0,-0.255636,-0.044145,-0.287568,-0.016558,0.013389,-0.057119,-0.176420,-0.059700,-0.245900,-0.021693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1099794,0
1,-0.218783,-0.044145,-0.097401,-0.016558,0.358164,-0.057119,0.405258,-0.096753,1.649606,-0.021693,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1034754,0
2,3.687658,-0.044145,0.061072,-0.016558,1.073570,-0.057119,1.994895,-0.096753,1.318463,-0.021693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1046686,0
3,-0.587315,-0.044145,0.282934,0.189073,2.056176,0.580459,0.835741,0.384928,0.168980,-0.021693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1054989,0
4,-0.218783,-0.044145,-0.034012,0.189073,0.108202,-0.057119,-0.008426,-0.059700,0.210848,-0.021693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1036759,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,-0.034516,-0.044145,-0.097401,-0.016558,-0.098662,-0.057119,0.413657,-0.096753,-0.101263,-0.021693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1005266,1
79996,0.444575,-0.044145,-0.160790,-0.016558,1.039092,-0.057119,0.598450,-0.096753,-0.211644,-0.021693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1078121,0
79997,-0.329342,-0.044145,0.029377,-0.016558,-0.486533,-0.057119,-0.224718,-0.096753,-0.519949,-0.021693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1022696,0
79998,3.171713,-0.044145,-0.129095,-0.016558,0.789131,-0.057119,1.129729,-0.096753,0.975902,-0.021693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1007424,0


In [4]:
# split the train data to train and test set:
from sklearn.model_selection import train_test_split

X = df_train_featurized.drop(columns=['Customer_ID', 'churn'])
y = df_train_featurized['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99, stratify=y)

## Train Crudely

In [5]:
# train the model
model = XGBClassifier()
model.fit(X_train, y_train)

In [6]:
# evaluate the model
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)

In [7]:
print(clf_report)

              precision    recall  f1-score   support

           0       0.60      0.59      0.59      8070
           1       0.59      0.60      0.59      7930

    accuracy                           0.59     16000
   macro avg       0.59      0.59      0.59     16000
weighted avg       0.59      0.59      0.59     16000



## Hyperparams optimization

In [8]:
# perform hyperparameter tuning
from sklearn.model_selection import GridSearchCV

model = XGBClassifier()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   2.5s
[CV] END ...learning_rate=0.1, max_depth=4, n_estimators=100; total time=   3.2s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   3.3s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   3.4s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   3.8s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time=   4.2s
[CV] END ...learning_rate=0.1, max_depth=4, n_estimators=100; total time=   2.0s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time=   4.4s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time=   4.6s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=300; total time=   4.6s
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=300; total time=   4.7s
[CV] END ...learning_rate=0.1, max_depth=4, n_e

In [9]:
# retrain the model with the best params
model = XGBClassifier(**best_params)
model.fit(X_train, y_train)

# evaluate the model
y_pred = model.predict(X_test)
clf_report = classification_report(y_test, y_pred)
print(clf_report)

              precision    recall  f1-score   support

           0       0.61      0.60      0.60      8070
           1       0.60      0.61      0.60      7930

    accuracy                           0.60     16000
   macro avg       0.60      0.60      0.60     16000
weighted avg       0.60      0.60      0.60     16000



In [10]:
## try with another algorithm

# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# clf_report = classification_report(y_test, y_pred)
# print(clf_report)

              precision    recall  f1-score   support

           0       0.59      0.62      0.60      8070
           1       0.59      0.56      0.57      7930

    accuracy                           0.59     16000
   macro avg       0.59      0.59      0.59     16000
weighted avg       0.59      0.59      0.59     16000



In [11]:
# # try an automl library TPOT
# from tpot import TPOTClassifier

# model = TPOTClassifier(generations=5, population_size=20, verbosity=2, random_state=99)
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# clf_report = classification_report(y_test, y_pred)
# print(clf_report)

In [12]:
# use gradient boosting:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
clf_report = classification_report(y_test, y_pred)
print(clf_report)

              precision    recall  f1-score   support

           0       0.61      0.59      0.60      8070
           1       0.60      0.61      0.60      7930

    accuracy                           0.60     16000
   macro avg       0.60      0.60      0.60     16000
weighted avg       0.60      0.60      0.60     16000



In [14]:
# # gradient boosting with hyperparameter tuning
# model = GradientBoostingClassifier()
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.1, 0.01, 0.001]
# }

# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# best_params = grid_search.best_params_

In [None]:
# # retrain the model with the best params
# model = GradientBoostingClassifier(**best_params)

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# clf_report = classification_report(y_test, y_pred)