In [1]:
import pandas as pd


In [2]:
df = pd.read_csv('/content/Bank Customer Churn Prediction.csv')

In [3]:
print(f"Dataset has {df.shape[0]} rows and {df.shape[1]} number of columns")

Dataset has 10000 rows and 12 number of columns


In [4]:
df.dtypes.value_counts()

Unnamed: 0,count
int64,8
object,2
float64,2


In [5]:
df.dtypes

Unnamed: 0,0
customer_id,int64
credit_score,int64
country,object
gender,object
age,int64
tenure,int64
balance,float64
products_number,int64
credit_card,int64
active_member,int64


In [6]:
df.describe()

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [7]:
df.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
target_col = 'churn'
unwanted_col = 'customer_id'

In [9]:

def clean_missing(df):

  for col in df.columns:
    if col != target_col and col != unwanted_col:
      missing_cnt = df[col].isnull().sum()
      missing_pct = 100 * missing_cnt / len(df)
      if missing_pct > 50:
        df.drop(columns = col, inplace = True)
        continue

      if df[col].dtype in ('int64', 'float64'):
        mean = df[col].mean()
        df[col] = df[col].fillna(mean)
        print(f"{col} filled na with mean: {mean:.4f}")

      else:
        mode = df[col].mode()[0]
        df[col] = df[col].fillna(mode)
        print(f"{col} filled na with mode: {mode}")


  return df


clean_missing(df)


credit_score filled na with mean: 650.5288
country filled na with mode: France
gender filled na with mode: Male
age filled na with mean: 38.9218
tenure filled na with mean: 5.0128
balance filled na with mean: 76485.8893
products_number filled na with mean: 1.5302
credit_card filled na with mean: 0.7055
active_member filled na with mean: 0.5151
estimated_salary filled na with mean: 100090.2399


Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,15569892,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,15584532,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,15682355,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [10]:
numerical_cols = []
categorical_cols = []

for col in df.columns:
  if col != target_col and col != unwanted_col:
    if df[col].dtypes in ('int64', 'float64'):
      numerical_cols.append(col)

    else:
      categorical_cols.append(col)



In [11]:
numerical_cols

['credit_score',
 'age',
 'tenure',
 'balance',
 'products_number',
 'credit_card',
 'active_member',
 'estimated_salary']

In [12]:
categorical_cols

['country', 'gender']

In [13]:
# T-TEST
from scipy.stats import ttest_ind

def t_test_analysis(df, numerical_col, target_col = 'churn'):

  churn = df[df[target_col] == 1][numerical_col]
  non_churn = df[df[target_col] == 0][numerical_col]

  t_test, p_value = ttest_ind(churn, non_churn)

  print(f"FEATURE: {numerical_col}")
  print(f"T-TEST value: {t_test:.4f}")
  print(f"p_value value: {p_value:.4f}")

  if p_value < .05:
    print(f"SIGNIFICANT FEATURE. Keep {numerical_col}")

  else:
    print(f"NOT SIGNIFICANT FEATURE. Remove {numerical_col}")

  return t_test, p_value

for num_col in numerical_cols:
  t_test, p_value = t_test_analysis(df, num_col, target_col = 'churn')
  print()

FEATURE: credit_score
T-TEST value: -2.7101
p_value value: 0.0067
SIGNIFICANT FEATURE. Keep credit_score

FEATURE: age
T-TEST value: 29.7668
p_value value: 0.0000
SIGNIFICANT FEATURE. Keep age

FEATURE: tenure
T-TEST value: -1.4001
p_value value: 0.1615
NOT SIGNIFICANT FEATURE. Remove tenure

FEATURE: balance
T-TEST value: 11.9362
p_value value: 0.0000
SIGNIFICANT FEATURE. Keep balance

FEATURE: products_number
T-TEST value: -4.7870
p_value value: 0.0000
SIGNIFICANT FEATURE. Keep products_number

FEATURE: credit_card
T-TEST value: -0.7137
p_value value: 0.4754
NOT SIGNIFICANT FEATURE. Remove credit_card

FEATURE: active_member
T-TEST value: -15.8051
p_value value: 0.0000
SIGNIFICANT FEATURE. Keep active_member

FEATURE: estimated_salary
T-TEST value: 1.2097
p_value value: 0.2264
NOT SIGNIFICANT FEATURE. Remove estimated_salary



In [14]:
cols_to_remove = ['tenure', 'credit_card', 'estimated_salary']

df = df.drop(columns=cols_to_remove)

numerical_cols = [col for col in numerical_cols if col not in cols_to_remove]


In [17]:
# CHI-SQUARE
from scipy.stats import chi2_contingency

def chi_square_test(df, categorical_col, target_col = 'churn'):

  table = pd.crosstab(df[categorical_col], df[target_col])
  chi_2, p_value, dof, expected = chi2_contingency(table)

  print(f"FEATURE: {categorical_col}")
  print(f"CHI SQUARED value: {chi_2:.4f}")
  print(f"p_value value: {p_value:.4f}")
  print(f"Degrees of Freedom: {dof}")

  if p_value < .05:
    print(f"SIGNIFICANT FEATURE. Keep {categorical_col}")

  else:
    print(f"NOT SIGNIFICANT FEATURE. Remove {categorical_col}")

  return chi_2, p_value, dof

for cat_col in categorical_cols:
  chi_2, p_value, dof = chi_square_test(df, cat_col, target_col = 'churn')
  print()

FEATURE: country
CHI SQUARED value: 301.2553
p_value value: 0.0000
Degrees of Freedom: 2
SIGNIFICANT FEATURE. Keep country

FEATURE: gender
CHI SQUARED value: 112.9186
p_value value: 0.0000
Degrees of Freedom: 1
SIGNIFICANT FEATURE. Keep gender



In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

X = df.drop(columns = 'churn')
y = df['churn']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = .4, random_state = 42)
X_train = pd.get_dummies(X_train, columns= ['country', 'gender'])
X_val = pd.get_dummies(X_val, columns= ['country', 'gender'])

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [20]:
X_train_scaled.shape

(6000, 11)

In [21]:
y_train.shape

(6000,)

In [22]:
X_val_scaled.shape

(4000, 11)

In [23]:
y_val.shape

(4000,)

In [31]:
logistic_regression_model = LogisticRegression(max_iter = 1000)
logistic_regression_model.fit(X_train_scaled, y_train)

y_pred = logistic_regression_model.predict(X_val_scaled)
y_prob = logistic_regression_model.predict_proba(X_val_scaled)[:, 1]


accuracy = accuracy_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Accuracy: 0.8077
Recall: 0.2111
Precision: 0.5681
ROC AUC Score: 0.7679


In [33]:
random_forest_model = RandomForestClassifier(n_estimators = 100, max_depth = 5, min_samples_leaf=2, min_samples_split=2)
random_forest_model = RandomForestClassifier(n_estimators = 1000, max_depth = 5, min_samples_leaf=2, min_samples_split=2)
random_forest_model.fit(X_train_scaled, y_train)

y_pred = random_forest_model.predict(X_val_scaled)
y_prob = random_forest_model.predict_proba(X_val_scaled)[:, 1]


accuracy = accuracy_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Accuracy: 0.8492
Recall: 0.3123
Precision: 0.8462
ROC AUC Score: 0.8488


In [34]:
xgb_model = XGBClassifier(learning_rate = .01, n_estimators = 100, max_depth = 3,
                          subsample = 0.2, colsample_bytree = .4,
                          reg_alpha = .05)
xgb_model.fit(X_train_scaled, y_train)

y_pred = xgb_model.predict(X_val_scaled)
y_prob = xgb_model.predict_proba(X_val_scaled)[:, 1]


accuracy = accuracy_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Accuracy: 0.8025
Recall: 0.0247
Precision: 1.0000
ROC AUC Score: 0.8519


In [36]:
xgb_model = XGBClassifier(learning_rate = .01, n_estimators = 1000, max_depth = 6,
                          subsample = 0.4, colsample_bytree = .6,
                          reg_alpha = .05)
xgb_model.fit(X_train_scaled, y_train)

y_pred = xgb_model.predict(X_val_scaled)
y_prob = xgb_model.predict_proba(X_val_scaled)[:, 1]


accuracy = accuracy_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Accuracy: 0.8650
Recall: 0.4728
Precision: 0.7722
ROC AUC Score: 0.8583
