# Importing Libraries and Loading the Data:

In [246]:
import numpy as np
import pandas as pd

In [247]:
from pathlib import Path
import pandas as pd
import urllib.request
import tarfile
import zipfile

def load_data(url, archive_name, extract_dir, csv_relative_path):
    extract_path = Path(extract_dir)
    archive_path = extract_path / archive_name

    extract_path.mkdir(parents=True, exist_ok=True)

    if not archive_path.is_file():
        print(f"Downloading {archive_name} ...")
        urllib.request.urlretrieve(url, archive_path)

    # Extract
    if archive_name.endswith(".zip"):
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall(path=extract_path)
    else:
        raise ValueError("Unsupported archive format!")

    # Load CSV WITHOUT index
    df = pd.read_csv(extract_path / csv_relative_path)

    return df.reset_index(drop=True)

In [248]:
churn = load_data(
    url="https://www.kaggle.com/api/v1/datasets/download/shrutimechlearn/churn-modelling",
    archive_name="datafile.zip",
    extract_dir="datasets",
    csv_relative_path="Churn_Modelling.csv"
)
churn.index = churn.index + 1

In [249]:
churn

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
2,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
4,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
5,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9997,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9998,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9999,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# Preprocessing the Data:

In [250]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 1 to 10000
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [251]:
churn.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [252]:
# Categorical Columns:
churn.dtypes[churn.dtypes == 'object']

Surname      object
Geography    object
Gender       object
dtype: object

In [253]:
churn.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


## Removing Irrelavant features:

In [254]:
churn = churn.drop(['RowNumber','CustomerId','Surname'],axis=1)

In [255]:
churn

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1,619,France,Female,42,2,0.00,1,1,1,101348.88,1
2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
4,699,France,Female,39,1,0.00,2,0,0,93826.63,0
5,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9996,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9997,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9998,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9999,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


## Encoding Categorical Features into Numerical Features:

In [256]:
from sklearn.preprocessing import LabelEncoder

In [257]:
gender_label = LabelEncoder()
churn['Gender'] = gender_label.fit_transform(churn['Gender'])

In [258]:
gender_label.classes_

array(['Female', 'Male'], dtype=object)

In [259]:
from sklearn.preprocessing import OneHotEncoder
geography_label = OneHotEncoder(sparse_output=False)

In [260]:
geography = geography_label.fit_transform(churn[['Geography']])

In [261]:
geography

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], shape=(10000, 3))

In [262]:
geography_label.categories_

[array(['France', 'Germany', 'Spain'], dtype=object)]

In [263]:
geography_label.get_feature_names_out()

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [264]:
geography_df = pd.DataFrame(geography,columns=geography_label.get_feature_names_out())

In [265]:
geography_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


# Updating data with Encoded Features and Removing the old one's:

In [266]:
churn = churn.drop("Geography",axis=1)
churn = pd.concat([churn,geography_df],axis=1)
churn

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
1,619.0,0.0,42.0,2.0,0.00,1.0,1.0,1.0,101348.88,1.0,0.0,0.0,1.0
2,608.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0.0,1.0,0.0,0.0
3,502.0,0.0,42.0,8.0,159660.80,3.0,1.0,0.0,113931.57,1.0,1.0,0.0,0.0
4,699.0,0.0,39.0,1.0,0.00,2.0,0.0,0.0,93826.63,0.0,0.0,0.0,1.0
5,850.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.10,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9997,516.0,1.0,35.0,10.0,57369.61,1.0,1.0,1.0,101699.77,0.0,1.0,0.0,0.0
9998,709.0,0.0,36.0,7.0,0.00,1.0,0.0,1.0,42085.58,1.0,0.0,1.0,0.0
9999,772.0,1.0,42.0,3.0,75075.31,2.0,1.0,0.0,92888.52,1.0,1.0,0.0,0.0
10000,792.0,0.0,28.0,4.0,130142.79,1.0,1.0,0.0,38190.78,0.0,,,


In [267]:
churn.isnull().sum()

CreditScore          1
Gender               1
Age                  1
Tenure               1
Balance              1
NumOfProducts        1
HasCrCard            1
IsActiveMember       1
EstimatedSalary      1
Exited               1
Geography_France     1
Geography_Germany    1
Geography_Spain      1
dtype: int64

In [268]:
churn[churn.isnull().any(axis=1)]

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
10000,792.0,0.0,28.0,4.0,130142.79,1.0,1.0,0.0,38190.78,0.0,,,
0,,,,,,,,,,,1.0,0.0,0.0


## Handling the Missing Values in the Data:

In [269]:
from sklearn.impute import SimpleImputer

In [270]:
numeric_cols = ['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary','Exited']
geo_cols = ['Geography_France', 'Geography_Germany', 'Geography_Spain']

In [271]:
num_imputer = SimpleImputer(strategy='median')
churn[numeric_cols] = num_imputer.fit_transform(churn[numeric_cols])

geo_imputer = SimpleImputer(strategy='most_frequent')
churn[geo_cols] = geo_imputer.fit_transform(churn[geo_cols])

In [272]:
churn.loc[10000]

CreditScore             792.00
Gender                    0.00
Age                      28.00
Tenure                    4.00
Balance              130142.79
NumOfProducts             1.00
HasCrCard                 1.00
IsActiveMember            0.00
EstimatedSalary       38190.78
Exited                    0.00
Geography_France          1.00
Geography_Germany         0.00
Geography_Spain           0.00
Name: 10000, dtype: float64

In [273]:
churn.loc[0]

CreditScore             652.000
Gender                    1.000
Age                      37.000
Tenure                    5.000
Balance               97198.540
NumOfProducts             1.000
HasCrCard                 1.000
IsActiveMember            1.000
EstimatedSalary      100193.915
Exited                    0.000
Geography_France          1.000
Geography_Germany         0.000
Geography_Spain           0.000
Name: 0, dtype: float64

In [274]:
print(churn.isnull().sum())

CreditScore          0
Gender               0
Age                  0
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
Geography_France     0
Geography_Germany    0
Geography_Spain      0
dtype: int64


In [275]:
churn[churn.isnull().any(axis=1)]

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain


## Saving the Encoding Instances with Pickle:

In [276]:
import pickle
import os

In [277]:
# Saving the Gender Encoder created with Label Encoding:
if os.path.exists('gender_label.pkl'):
    print("File Already Exist!")
else:
    with open('gender_label.pkl','wb') as f:
        pickle.dump(gender_label,f)
        print("File created and 'gender_label.pkl' saved.")

File Already Exist!


In [278]:
# Saving the Geography Encoder created with One Hot Encoding:
if os.path.exists('geography_label.pkl'):
    print("File Already Exist!")
else:
    with open('geography_label.pkl','wb') as f:
        pickle.dump(geography_label,f)
        print("File created and 'geography_label.pkl' saved.")

File Already Exist!


# Let's divide the Dataset into Independent and Dependent Features:

In [279]:
X = churn.drop('Exited',axis=1)
y = churn['Exited']
X.shape,y.shape

((10001, 12), (10001,))

In [280]:
X

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
1,619.0,0.0,42.0,2.0,0.00,1.0,1.0,1.0,101348.880,0.0,0.0,1.0
2,608.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.580,1.0,0.0,0.0
3,502.0,0.0,42.0,8.0,159660.80,3.0,1.0,0.0,113931.570,1.0,0.0,0.0
4,699.0,0.0,39.0,1.0,0.00,2.0,0.0,0.0,93826.630,0.0,0.0,1.0
5,850.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.100,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9997,516.0,1.0,35.0,10.0,57369.61,1.0,1.0,1.0,101699.770,1.0,0.0,0.0
9998,709.0,0.0,36.0,7.0,0.00,1.0,0.0,1.0,42085.580,0.0,1.0,0.0
9999,772.0,1.0,42.0,3.0,75075.31,2.0,1.0,0.0,92888.520,1.0,0.0,0.0
10000,792.0,0.0,28.0,4.0,130142.79,1.0,1.0,0.0,38190.780,1.0,0.0,0.0


In [281]:
y

1        1.0
2        0.0
3        1.0
4        0.0
5        0.0
        ... 
9997     0.0
9998     1.0
9999     1.0
10000    0.0
0        0.0
Name: Exited, Length: 10001, dtype: float64

# Saving the Data into csv for Training the Model:

In [282]:
if os.path.exists("X_data.csv"):
    print("File already Exist!")
else:
    X.to_csv("X_data.csv")
    print("File X_data created and saved Successfully.")

File already Exist!


In [283]:
if os.path.exists("y_data.csv"):
    print("File already Exist!")
else:
    y.to_csv("y_data.csv")
    print("File y_data created and saved Successfully.")

File already Exist!
