## 1. Import Libraries

In [1]:
import numpy as np

import os

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	MinMaxScaler,
	FunctionTransformer
)

import matplotlib.pyplot as plt

import warnings

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

## 2. Import Dataset

In [16]:
# Project root. Set the CHURN_PROJECT_DIR environment variable to run the notebook
# on another machine; the original local path is kept as the fallback.
Project_Def = os.environ.get(
    "CHURN_PROJECT_DIR",
    r"C:\Users\Debasish Das\Desktop\CHURN_PREDICTION",
)
# Sub-directory of the project root that holds the CSV splits.
Data_Dir = "Dataset"


def get_data(name, base_dir=None):
    """Load ``<base_dir>/<Data_Dir>/<name>.csv`` into a DataFrame.

    Parameters
    ----------
    name : str
        File stem of the CSV to read (without the ``.csv`` extension).
    base_dir : str or path-like, optional
        Project root to read from. Defaults to ``Project_Def``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.
    """
    root = Project_Def if base_dir is None else base_dir
    file_path = os.path.join(root, Data_Dir, f"{name}.csv")
    return pd.read_csv(file_path)

In [17]:
# Read train.csv from the dataset directory via get_data.
train=get_data("train")

In [18]:
# Read test.csv from the dataset directory via get_data.
test=get_data("test")

In [19]:
# Read val.csv from the dataset directory via get_data.
val=get_data("val")

In [20]:
# Feature frame: everything in the training split except the target column.
x_train = train.drop(columns="exited")
# Target series, copied so later edits cannot touch `train`.
y_train = train["exited"].copy()

## 3. Display Setting

In [21]:
# Show every column when a DataFrame is displayed. The full option name is used
# because abbreviated names rely on pandas' partial matching.
pd.set_option("display.max_columns", None)
# Make scikit-learn transformers return DataFrames instead of NumPy arrays.
sklearn.set_config(transform_output="pandas")
# NOTE(review): this hides every warning for the rest of the session, including
# deprecation and convergence warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [22]:
train

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,Ozoemena,365,Germany,Male,30,0,127760.07,1,1,0,81537.85,1
1,Henry,553,Spain,Male,38,1,181110.13,2,1,0,184544.59,0
2,Romani,778,Germany,Female,29,6,150358.97,1,1,0,62454.01,1
3,Wang,850,France,Male,42,0,0.00,2,1,0,44165.84,0
4,Sung,674,Germany,Female,36,6,100762.64,1,1,0,182156.86,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6395,Howells,466,France,Female,30,3,0.00,1,1,0,193984.60,0
6396,Tu,789,Spain,Female,40,4,0.00,2,1,0,137402.27,0
6397,Nicholson,691,France,Male,28,1,0.00,2,0,0,92865.41,0
6398,Macadam,758,Spain,Male,35,5,0.00,2,1,0,95009.60,0


In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400 entries, 0 to 6399
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   surname          6400 non-null   object 
 1   creditscore      6400 non-null   int64  
 2   geography        6400 non-null   object 
 3   gender           6400 non-null   object 
 4   age              6400 non-null   int64  
 5   tenure           6400 non-null   int64  
 6   balance          6400 non-null   float64
 7   numofproducts    6400 non-null   int64  
 8   hascrcard        6400 non-null   int64  
 9   isactivemember   6400 non-null   int64  
 10  estimatedsalary  6400 non-null   float64
 11  exited           6400 non-null   int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 600.1+ KB


## 4. Transformation

### 4.1 Creditscore

In [27]:
x_train.describe()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
count,6400.0,6400.0,6400.0,6400.0,6400.0,6400.0,6400.0,6400.0
mean,651.445625,38.935469,4.982188,76243.429923,1.533906,0.7025,0.515938,100297.819319
std,96.333724,10.491905,2.879514,62539.512273,0.58081,0.457194,0.499785,57497.848304
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,90.07
25%,585.0,32.0,2.0,0.0,1.0,0.0,0.0,51385.55
50%,653.0,37.0,5.0,96292.52,1.0,1.0,1.0,100038.165
75%,718.0,44.0,7.0,127638.0275,2.0,1.0,1.0,149539.715
max,850.0,83.0,10.0,238387.56,4.0,1.0,1.0,199970.74


In [38]:
def credit_score(x,poor=499,average=649,good=749):
    """Swap the numeric ``creditscore`` column for a ``creditscore_cat`` band label.

    Bands (lower bound inclusive, upper bound exclusive):
    below ``poor`` -> "poor"; [``poor``, ``average``) -> "average";
    [``average``, ``good``) -> "good"; everything else -> "excellent".

    Returns a new DataFrame; the input frame is not modified.
    """
    score = x["creditscore"]
    band_masks = [
        score < poor,
        (score >= poor) & (score < average),
        (score >= average) & (score < good),
    ]
    band_labels = ["poor", "average", "good"]
    # Rows matching none of the masks fall through to the default label.
    banded = np.select(band_masks, band_labels, default="excellent")
    return x.assign(creditscore_cat=banded).drop(columns=["creditscore"])

In [39]:
# Credit-score pipeline: band the raw score into labels, then encode the labels
# as ordered integers (poor=0 ... excellent=3).
credit_transform = Pipeline([
    ("cat", FunctionTransformer(credit_score, validate=False)),
    ("encoder", OrdinalEncoder(categories=[["poor", "average", "good", "excellent"]])),
])

# Preview the result on the training column.
credit_transform.fit_transform(x_train[["creditscore"]])

Unnamed: 0,creditscore_cat
0,0.0
1,1.0
2,3.0
3,3.0
4,2.0
...,...
6395,0.0
6396,3.0
6397,2.0
6398,3.0


### 4.2 Geography

In [42]:
x_train.geography.value_counts()

geography
France     3218
Germany    1613
Spain      1569
Name: count, dtype: int64

In [48]:
# One-hot encode geography; categories unseen at fit time encode as all zeros.
geography_transfrom = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
# Preview the result on the training column.
geography_transfrom.fit_transform(x_train[["geography"]])

Unnamed: 0,geography_France,geography_Germany,geography_Spain
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
...,...,...,...
6395,1.0,0.0,0.0
6396,0.0,0.0,1.0
6397,1.0,0.0,0.0
6398,0.0,0.0,1.0


### 4.3 gender

In [49]:
# One-hot encode gender; categories unseen at fit time encode as all zeros.
gender_transfrom = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
# Preview the result on the training column.
gender_transfrom.fit_transform(x_train[["gender"]])

Unnamed: 0,gender_Female,gender_Male
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
...,...,...
6395,1.0,0.0
6396,1.0,0.0
6397,0.0,1.0
6398,0.0,1.0


### 4.4 age

In [50]:
def age_catagory(x,young=30,middle=45,nold=65):
    """Swap the numeric ``age`` column for an ``age_cat`` band label.

    Bands (lower bound inclusive, upper bound exclusive):
    below ``young`` -> "young"; [``young``, ``middle``) -> "middle";
    [``middle``, ``nold``) -> "nold"; everything else -> "old".

    Returns a new DataFrame; the input frame is not modified.
    """
    age = x["age"]
    band_masks = [
        age < young,
        (age >= young) & (age < middle),
        (age >= middle) & (age < nold),
    ]
    band_labels = ["young", "middle", "nold"]
    # Rows matching none of the masks fall through to the default label.
    banded = np.select(band_masks, band_labels, default="old")
    return x.assign(age_cat=banded).drop(columns=["age"])

In [53]:
# Age pipeline: band the raw age into labels, then encode the labels as ordered
# integers (young=0 ... old=3).
age_transfrom = Pipeline([
    ("cat_age", FunctionTransformer(age_catagory)),
    ("encoder", OrdinalEncoder(categories=[["young", "middle", "nold", "old"]])),
])
# Preview the result on the training column.
age_transfrom.fit_transform(x_train[["age"]])

Unnamed: 0,age_cat
0,1.0
1,1.0
2,0.0
3,1.0
4,1.0
...,...
6395,1.0
6396,1.0
6397,0.0
6398,1.0


### 4.5 tenure

In [55]:
x_train.tenure.value_counts()

tenure
7     670
1     668
5     666
2     663
4     660
8     647
3     635
6     614
9     593
10    313
0     271
Name: count, dtype: int64

### 4.6 balance

In [56]:
x_train.balance

0       127760.07
1       181110.13
2       150358.97
3            0.00
4       100762.64
          ...    
6395         0.00
6396         0.00
6397         0.00
6398         0.00
6399         0.00
Name: balance, Length: 6400, dtype: float64

In [59]:
# Rescale balance with min-max scaling.
balence_tranfrom = Pipeline([
    ("sclar", MinMaxScaler()),
])
# Preview the result on the training column.
balence_tranfrom.fit_transform(x_train[["balance"]])

Unnamed: 0,balance
0,0.535934
1,0.759730
2,0.630733
3,0.000000
4,0.422684
...,...
6395,0.000000
6396,0.000000
6397,0.000000
6398,0.000000


### 4.7 estimatedsalary

In [60]:
x_train.estimatedsalary

0        81537.85
1       184544.59
2        62454.01
3        44165.84
4       182156.86
          ...    
6395    193984.60
6396    137402.27
6397     92865.41
6398     95009.60
6399    156598.23
Name: estimatedsalary, Length: 6400, dtype: float64

In [61]:
# Rescale estimated salary with min-max scaling.
salary_tranfrom=Pipeline(
    steps=[
        ("sclar",MinMaxScaler())
    ]
)
# Preview with the salary pipeline itself. The earlier version called
# balence_tranfrom here, which refitted the balance scaler on salary values
# and never exercised salary_tranfrom.
salary_tranfrom.fit_transform(x_train.loc[:,["estimatedsalary"]])

Unnamed: 0,estimatedsalary
0,0.407482
1,0.922823
2,0.312006
3,0.220510
4,0.910877
...,...
6395,0.970051
6396,0.686971
6397,0.464154
6398,0.474881


## 5. Data Processing

In [68]:
# Route each engineered column to its own pipeline. Columns not listed here are
# passed through unchanged (remainder="passthrough").
Column_Transformer = ColumnTransformer(
    [
        ("cs", credit_transform, ["creditscore"]),
        ("geo", geography_transfrom, ["geography"]),
        ("gen", gender_transfrom, ["gender"]),
        ("age", age_transfrom, ["age"]),
        ("bal", balence_tranfrom, ["balance"]),
        ("es", salary_tranfrom, ["estimatedsalary"]),
    ],
    remainder="passthrough",
)

In [78]:
"""# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) """

# Preprocessing pipeline; the column transformer is its only step.
preprocessor = Pipeline([("ct", Column_Transformer)])

In [89]:
# Fit the preprocessor on the training frame with `surname` removed.
# The target column `exited` is still part of the frame passed as X, so the
# transformer's remainder="passthrough" rule carries it into the output as
# `remainder__exited` (visible in the transformed frame and relied on downstream).
# NOTE(review): this presumably means transform() also expects an `exited`
# column — confirm before applying the preprocessor to unlabeled data.
preprocessor.fit(
    train.drop(columns=["surname"]),
    train.exited.copy()
)

In [90]:
# Transform every split with the same input schema the preprocessor was fitted
# on, i.e. without `surname`. errors="ignore" keeps this safe for a split that
# does not carry that column.
pre_train = preprocessor.transform(train.drop(columns=["surname"], errors="ignore"))
pre_val = preprocessor.transform(val.drop(columns=["surname"], errors="ignore"))
pre_test = preprocessor.transform(test.drop(columns=["surname"], errors="ignore"))
pre_train

Unnamed: 0,cs__creditscore_cat,geo__geography_France,geo__geography_Germany,geo__geography_Spain,gen__gender_Female,gen__gender_Male,age__age_cat,bal__balance,es__estimatedsalary,remainder__tenure,remainder__numofproducts,remainder__hascrcard,remainder__isactivemember,remainder__exited
0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.535934,0.407482,0,1,1,0,1
1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.759730,0.922823,1,2,1,0,0
2,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.630733,0.312006,6,1,1,0,1
3,3.0,1.0,0.0,0.0,0.0,1.0,1.0,0.000000,0.220510,0,2,1,0,0
4,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.422684,0.910877,6,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.000000,0.970051,3,1,1,0,0
6396,3.0,0.0,0.0,1.0,1.0,0.0,1.0,0.000000,0.686971,4,2,1,0,0
6397,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.464154,1,2,0,0,0
6398,3.0,0.0,0.0,1.0,0.0,1.0,1.0,0.000000,0.474881,5,2,1,0,0


## 6. Model

In [92]:



# Split the passthrough target column away from the model features, for the
# training and validation frames alike.
X_train = pre_train.drop(columns="remainder__exited")
y_train = pre_train["remainder__exited"]
X_val = pre_val.drop(columns="remainder__exited")
y_val = pre_val["remainder__exited"]



In [94]:
def fit_and_report(label, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and report how it does on an evaluation split.

    Prints the accuracy line and the classification report, then returns
    ``(predictions, accuracy)`` so callers can keep both.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    print(f"{label} Accuracy on validation set: {accuracy:.4f}")
    print(classification_report(y_eval, predictions))
    return predictions, accuracy


# Same train/validation data for every baseline, so the results are comparable.
baseline_splits = (X_train, y_train, X_val, y_val)

# XGBoost baseline
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
y_val_pred_xgb, xgb_accuracy = fit_and_report("XGBoost", xgb_model, *baseline_splits)

# Random Forest baseline
rf_model = RandomForestClassifier(random_state=42)
y_val_pred_rf, rf_accuracy = fit_and_report("Random Forest", rf_model, *baseline_splits)

# Logistic Regression baseline
lr_model = LogisticRegression(max_iter=1000, random_state=42)
y_val_pred_lr, lr_accuracy = fit_and_report("Logistic Regression", lr_model, *baseline_splits)

# Gradient Boosting baseline
gb_model = GradientBoostingClassifier(random_state=42)
y_val_pred_gb, gb_accuracy = fit_and_report("Gradient Boosting", gb_model, *baseline_splits)


XGBoost Accuracy on validation set: 0.8488
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1273
           1       0.69      0.47      0.56       327

    accuracy                           0.85      1600
   macro avg       0.78      0.71      0.73      1600
weighted avg       0.84      0.85      0.84      1600

Random Forest Accuracy on validation set: 0.8431
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1273
           1       0.68      0.43      0.53       327

    accuracy                           0.84      1600
   macro avg       0.77      0.69      0.72      1600
weighted avg       0.83      0.84      0.83      1600

Logistic Regression Accuracy on validation set: 0.8113
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1273
           1       0.62      0.20      0.30       327

    accuracy                       

In [101]:
""""# Define the XGBoost model
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

# Define the parameter grid
xgb_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [1, 2, 3, 4, 5]
}

# Perform GridSearchCV
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, scoring='accuracy', cv=5, n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)

# Evaluate the best model on the validation set
xgb_best_model = xgb_grid_search.best_estimator_
y_val_pred_xgb = xgb_best_model.predict(X_val)
xgb_accuracy = accuracy_score(y_val, y_val_pred_xgb)

print(f"XGBoost Best Accuracy on validation set: {xgb_accuracy:.4f}")
print(f"XGBoost Best hyperparameters: {xgb_grid_search.best_params_}")
print(classification_report(y_val, y_val_pred_xgb))"""


'"# Define the XGBoost model\nxgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)\n\n# Define the parameter grid\nxgb_param_grid = {\n    \'n_estimators\': [100, 200, 300, 400, 500],\n    \'max_depth\': [3, 4, 5, 6, 7],\n    \'learning_rate\': [0.01, 0.05, 0.1, 0.2],\n    \'gamma\': [0, 0.1, 0.2, 0.3, 0.4],\n    \'min_child_weight\': [1, 2, 3, 4, 5]\n}\n\n# Perform GridSearchCV\nxgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, scoring=\'accuracy\', cv=5, n_jobs=-1)\nxgb_grid_search.fit(X_train, y_train)\n\n# Evaluate the best model on the validation set\nxgb_best_model = xgb_grid_search.best_estimator_\ny_val_pred_xgb = xgb_best_model.predict(X_val)\nxgb_accuracy = accuracy_score(y_val, y_val_pred_xgb)\n\nprint(f"XGBoost Best Accuracy on validation set: {xgb_accuracy:.4f}")\nprint(f"XGBoost Best hyperparameters: {xgb_grid_search.best_params_}")\nprint(classification_report(y_val, y_val_pred_xgb))'

In [None]:
"""# Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Define the parameter grid
rf_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, scoring='accuracy', cv=5, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Evaluate the best model on the validation set
rf_best_model = rf_grid_search.best_estimator_
y_val_pred_rf = rf_best_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, y_val_pred_rf)

print(f"Random Forest Best Accuracy on validation set: {rf_accuracy:.4f}")
print(f"Random Forest Best hyperparameters: {rf_grid_search.best_params_}")
print(classification_report(y_val, y_val_pred_rf))"""


In [None]:
"""# Define the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Define the parameter grid
lr_param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Perform GridSearchCV
lr_grid_search = GridSearchCV(estimator=lr_model, param_grid=lr_param_grid, scoring='accuracy', cv=5, n_jobs=-1)
lr_grid_search.fit(X_train, y_train)

# Evaluate the best model on the validation set
lr_best_model = lr_grid_search.best_estimator_
y_val_pred_lr = lr_best_model.predict(X_val)
lr_accuracy = accuracy_score(y_val, y_val_pred_lr)

print(f"Logistic Regression Best Accuracy on validation set: {lr_accuracy:.4f}")
print(f"Logistic Regression Best hyperparameters: {lr_grid_search.best_params_}")
print(classification_report(y_val, y_val_pred_lr))"""


## 7. Hyperparameter Tuning

In [95]:
# Base estimator for the search; the seed is fixed so repeated runs are comparable.
gb_model = GradientBoostingClassifier(random_state=42)

# Search space: 5 x 5 x 4 x 4 = 400 parameter combinations.
gb_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0]
}

# Exhaustive grid search with 5-fold cross-validation on the training data,
# scored by accuracy and using all available cores (n_jobs=-1).
# With 400 combinations and cv=5 this is 2000 model fits, so the cell is slow to re-run.
gb_grid_search = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gb_grid_search.fit(X_train, y_train)

# Take the best estimator found by the search and score it on the held-out validation set.
gb_best_model = gb_grid_search.best_estimator_
y_val_pred_gb = gb_best_model.predict(X_val)
gb_accuracy = accuracy_score(y_val, y_val_pred_gb)

# Report validation accuracy, the winning hyperparameters and the per-class metrics.
print(f"Gradient Boosting Best Accuracy on validation set: {gb_accuracy:.4f}")
print(f"Gradient Boosting Best hyperparameters: {gb_grid_search.best_params_}")
print(classification_report(y_val, y_val_pred_gb))


Gradient Boosting Best Accuracy on validation set: 0.8681
Gradient Boosting Best hyperparameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.7}
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1273
           1       0.81      0.46      0.59       327

    accuracy                           0.87      1600
   macro avg       0.84      0.72      0.76      1600
weighted avg       0.86      0.87      0.85      1600



In [None]:
"""# Create the Gradient Boosting model
gb_model = GradientBoostingClassifier(learning_rate= 0.01, max_depth= 5, n_estimators= 400, subsample= 0.7)
gb_model.fit(X_train, y_train)

# Predict and evaluate
y_val_pred_gb = gb_model.predict(X_val)
gb_accuracy = accuracy_score(y_val, y_val_pred_gb)

print(f"Gradient Boosting Accuracy on validation set: {gb_accuracy:.4f}")
print(classification_report(y_val, y_val_pred_gb))"""

## 8. Model Evaluation & Save the Model

In [97]:
import pickle

# Write the tuned estimator to disk ...
with open("GB-model", "wb") as sink:
    pickle.dump(gb_best_model, sink)

# ... and read it straight back, so evaluation runs on the persisted copy.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("GB-model", "rb") as source:
    best = pickle.load(source)


In [102]:
def evaluate_model(data,name,model=None):
    """Score a fitted classifier on a preprocessed split and format the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Preprocessed frame that still contains the ``remainder__exited`` target column.
    name : str
        Label for the split, used as the prefix of the returned message.
    model : estimator with ``predict``, optional
        Model to evaluate. Defaults to the notebook-level ``best`` (the reloaded
        pickled model), looked up when the function is called.

    Returns
    -------
    str
        ``"<name> data Accuracy: <accuracy to 4 decimals>"``.
    """
    estimator = best if model is None else model

    features = data.drop(columns="remainder__exited")
    target = data["remainder__exited"]

    accuracy = accuracy_score(target, estimator.predict(features))
    return f"{name} data Accuracy: {accuracy:.4f}"

In [103]:
evaluate_model(pre_test,"Test")

'Test data Accuracy: 0.8545'