In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

In [2]:
np.random.seed(42)
n_samples = 10000
X,y=make_classification(n_samples=n_samples, n_features=6, n_informative=4, n_redundant=2, n_clusters_per_class=2, flip_y=0.1, random_state=42)

In [3]:
df = pd.DataFrame(X, columns=['Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'Feature 6'])
df.drop('Feature 6', axis=1, inplace=True)

In [4]:
df['CustomerID'] = np.arange(1, n_samples + 1)
df['Gender'] = np.random.choice(['Male', 'Female'], size=n_samples)
df['HasCrCard'] = np.random.choice([0,1], size=n_samples)
df['IsActiveMember'] = np.random.choice([0,1], size=n_samples)

In [5]:
df['Churn'] = y
df = df[['CustomerID', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Churn']]

In [6]:
df['Age'] = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min()) * 60 + 18 # Age between 18 and 78
df['Tenure'] = (df['Tenure'] - df['Tenure'].min()) / (df['Tenure'].max() - df['Tenure'].min()) * 10 # Between 0 and 10 year tenure
df['Balance'] = (df['Balance'] - df['Balance'].min()) / (df['Balance'].max() - df['Balance'].min()) * 100000 # Balance between 0 and 100_000
df['EstimatedSalary'] = (df['EstimatedSalary'] - df['EstimatedSalary'].min()) / (df['EstimatedSalary'].max() - df['EstimatedSalary'].min()) * 150000 # Salary between 0 and 150_000

In [7]:
print(df.head())

   CustomerID  Gender        Age    Tenure       Balance  NumOfProducts  \
0           1    Male  41.092634  6.354253  25609.316205      -2.531088   
1           2  Female  45.154855  4.879708  49299.499223      -1.021109   
2           3    Male  42.314889  3.112427  58728.649946      -1.188596   
3           4    Male  54.154353  3.674375  54817.350616       0.091454   
4           5    Male  37.576909  4.839103  56877.919915      -0.672132   

   HasCrCard  IsActiveMember  EstimatedSalary  Churn  
0          0               0    112477.937614      1  
1          1               1     68674.818701      1  
2          0               1     54072.323713      0  
3          1               1     57809.625261      1  
4          1               0     67843.648958      1  


In [8]:
df.to_csv('churn_prediction_dataset.csv', index=False)

In [9]:
import pandas as pd

In [10]:
df = pd.read_csv('churn_prediction_dataset.csv')

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
label_encoder = LabelEncoder()

In [13]:
df['Gender'] = label_encoder.fit_transform(df['Gender'])

In [14]:
X = df.drop(columns=['CustomerID', 'Churn'])
y = df['Churn']

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [18]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [19]:
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [20]:
y_pred = xgb_model.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("Accuracy: ", accuracy)
print('Confusion Matrix:')
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy:  0.8955
Confusion Matrix:
[[893  90]
 [119 898]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.90       983
           1       0.91      0.88      0.90      1017

    accuracy                           0.90      2000
   macro avg       0.90      0.90      0.90      2000
weighted avg       0.90      0.90      0.90      2000

