In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
data= pd.read_csv('data/Churn_Modelling.csv')

In [3]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
class ChurnModel:
    def __init__(self):
        self.le1=LabelEncoder()
        self.le2=LabelEncoder()
        self.model= XGBClassifier(
            n_estimators= 300,
            max_depth= 3,
            learning_rate= 0.03,
            min_child_weight= 5,
            subsample= 0.7,
            colsample_bytree=0.8,
            gamma= 0.1,
            reg_alpha= 0.5,
            reg_lambda= 1.0,
            tree_method= 'hist',
            random_state=42
        )
        self.features_= None
    def _feature_engineering(self, x, fit=False):
        x= x.copy()
        x= x.drop(columns=['RowNumber','CustomerId','Surname'])
        #encoding values
        if fit:
            x['Geography']= self.le1.fit_transform(x['Geography'])
            x['Gender']= self.le2.fit_transform(x['Gender'])
        else:
            x['Geography']= self.le1.transform(x['Geography'])
            x['Gender']= self.le2.transform(x['Gender'])     

        #feature interactions
        x['old_inactive']= ((x['IsActiveMember']==0)&(x['Age']>=40)).astype(int)
        x['young_active']= ((x['IsActiveMember']==1)&(x['Age']<=40)).astype(int)
        x['log_balance']= np.log1p(x['Balance'])
        x['log_salary']= np.log1p(x['EstimatedSalary'])
        x['log_credit']= np.log1p(x['CreditScore'])
        x['log_products']= np.log1p(x['NumOfProducts'])
        x['sqrt_balance']= np.sqrt(x['Balance'])
        x['sqrt_salary']= np.sqrt(x['EstimatedSalary'])
        x['sqrt_credit']= np.sqrt(x['CreditScore'])
        x['sqrt_products']= np.sqrt(x['NumOfProducts'])

        x['balance x products']= x['Balance']*x['NumOfProducts']
        x['balance x age']= x['Balance']*x['Age']
        x['products x age']= x['NumOfProducts']*x['Age']

        x['no balance no active']= ((x['Balance']==0)&(x['IsActiveMember']==0)).astype(int)
        return x

    def fit(self, x, y):
        x_transformed = self._feature_engineering(x, fit=True)
        self.features_ = x_transformed.columns
        self.model.fit(x_transformed, y)
        return self

    def predict(self, x):
        x_transformed = self._feature_engineering(x, fit=False)
        x_transformed = x_transformed[self.features_]
        return self.model.predict(x_transformed)

    def predict_proba(self, x):
        x_transformed = self._feature_engineering(x, fit=False)
        x_transformed = x_transformed[self.features_]
        return self.model.predict_proba(x_transformed)[:, 1]

In [5]:
model= ChurnModel()

In [6]:
x= data.copy()

In [7]:
x= x.drop(columns=['Exited'])

In [8]:
y=data['Exited']

In [9]:
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, random_state=42)

In [10]:
model.fit(x_train, y_train)

<__main__.ChurnModel at 0x7fdfcc9c5010>

In [11]:
prob = model.predict_proba(x_test)

In [12]:
pred = (prob >= 0.3).astype(int)

In [13]:
pred

array([0, 0, 0, ..., 1, 0, 0], shape=(2000,))

In [14]:
cm= confusion_matrix(y_test, pred)
acs= accuracy_score(y_test, pred)
ps= precision_score(y_test, pred)
rs= recall_score(y_test, pred)
fs= f1_score(y_test, pred)
roc_auc = roc_auc_score(y_test, prob)

In [15]:
print(cm)
print(acs)
print(ps)
print(rs)
print(fs)
print(roc_auc)

[[1442  165]
 [ 130  263]]
0.8525
0.6144859813084113
0.6692111959287532
0.6406820950060901
0.870748363948438


In [23]:
X_test_transformed = model._feature_engineering(x_test, fit=False)
X_test_transformed = X_test_transformed[model.features_]

result = permutation_importance(
    model.model,
    X_test_transformed,
    y_test,
    n_repeats=10,
    random_state=42,
    scoring='accuracy'
)

perm_importance_df = pd.DataFrame({
    'feature': model.features_,
    'importance': result.importances_mean*100
}).sort_values(by='importance', ascending=False)

perm_importance_df

Unnamed: 0,feature,importance
3,Age,4.77
6,NumOfProducts,2.335
10,old_inactive,1.515
1,Geography,0.995
22,products x age,0.765
0,CreditScore,0.415
5,Balance,0.41
20,balance x products,0.405
8,IsActiveMember,0.38
21,balance x age,0.3


In [22]:
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)
scores = []
for train_idx, val_idx in skf.split(x, y):
    X_train, X_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = ChurnModel()
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    scores.append(accuracy_score(y_val, preds))

print("CV Accuracy:", (scores))

CV Accuracy: [0.858, 0.8585, 0.862, 0.8715, 0.8595]
