# Importing Libraries and Loading the Data:

In [420]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [421]:
from pathlib import Path
import pandas as pd
import urllib.request
import tarfile
import zipfile

def load_data(url, archive_name, extract_dir, csv_relative_path):
    extract_path = Path(extract_dir)
    archive_path = extract_path / archive_name

    extract_path.mkdir(parents=True, exist_ok=True)

    if not archive_path.is_file():
        print(f"Downloading {archive_name} ...")
        urllib.request.urlretrieve(url, archive_path)

    # Extract
    if archive_name.endswith((".tgz", ".tar", ".tar.gz")):
        with tarfile.open(archive_path) as tar:
            tar.extractall(path=extract_path)
    elif archive_name.endswith(".zip"):
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall(path=extract_path)
    else:
        raise ValueError("Unsupported archive format!")

    # ⬇ Load CSV WITHOUT index
    df = pd.read_csv(extract_path / csv_relative_path)

    # ⬇ Remove RowNumber if exists
    if "RowNumber" in df.columns:
        df = df.drop(columns=["RowNumber"])

    return df.reset_index(drop=True)

In [422]:
churn = load_data(
    url="https://www.kaggle.com/api/v1/datasets/download/shrutimechlearn/churn-modelling",
    archive_name="datafile.zip",
    extract_dir="datasets",
    csv_relative_path="Churn_Modelling.csv"
)
churn.index = churn.index + 1


In [423]:
churn

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# Preprocessing the Data:

In [424]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


In [425]:
churn.columns

Index(['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age',
       'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited'],
      dtype='object')

In [426]:
# Categorical Columns:
churn.dtypes[churn.dtypes == 'object']

Surname      object
Geography    object
Gender       object
dtype: object

In [427]:
churn.describe()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


## Removing Irrelavant features:

In [428]:
churn = churn.drop(['CustomerId','Surname'],axis=1)

In [429]:
churn

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1,619,France,Female,42,2,0.00,1,1,1,101348.88,1
2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
4,699,France,Female,39,1,0.00,2,0,0,93826.63,0
5,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9996,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9997,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9998,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9999,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


## Encoding Categorical Features into Numerical Features:

In [430]:
from sklearn.preprocessing import LabelEncoder

In [431]:
gender_label = LabelEncoder()
churn['Gender'] = gender_label.fit_transform(churn['Gender'])

In [432]:
gender_label.classes_

array(['Female', 'Male'], dtype=object)

In [433]:
from sklearn.preprocessing import OneHotEncoder
geography_label = OneHotEncoder(sparse_output=False)

In [434]:
geography = geography_label.fit_transform(churn[['Geography']])

In [435]:
geography

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], shape=(10000, 3))

In [436]:
geography_label.categories_

[array(['France', 'Germany', 'Spain'], dtype=object)]

In [437]:
geography_label.get_feature_names_out()

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [438]:
geography_df = pd.DataFrame(geography,columns=geography_label.get_feature_names_out())

In [439]:
geography_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


# Updating data with Encoded Features and Removing the old one's:

In [440]:
churn = churn.drop("Geography",axis=1)
churn = pd.concat([churn,geography_df],axis=1)
churn

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
1,619.0,0.0,42.0,2.0,0.00,1.0,1.0,1.0,101348.88,1.0,0.0,0.0,1.0
2,608.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0.0,1.0,0.0,0.0
3,502.0,0.0,42.0,8.0,159660.80,3.0,1.0,0.0,113931.57,1.0,1.0,0.0,0.0
4,699.0,0.0,39.0,1.0,0.00,2.0,0.0,0.0,93826.63,0.0,0.0,0.0,1.0
5,850.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.10,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9997,516.0,1.0,35.0,10.0,57369.61,1.0,1.0,1.0,101699.77,0.0,1.0,0.0,0.0
9998,709.0,0.0,36.0,7.0,0.00,1.0,0.0,1.0,42085.58,1.0,0.0,1.0,0.0
9999,772.0,1.0,42.0,3.0,75075.31,2.0,1.0,0.0,92888.52,1.0,1.0,0.0,0.0
10000,792.0,0.0,28.0,4.0,130142.79,1.0,1.0,0.0,38190.78,0.0,,,


## Saving the Encoding Instances with Pickle:

In [441]:
import pickle
import os

In [442]:
# Saving the Gender Encoder created with Label Encoding:
if os.path.exists('gender_label.pkl'):
    print("File Already Exist!")
else:
    with open('gender_label.pkl','wb') as f:
        pickle.dump(gender_label,f)
        print("File created and 'gender_label.pkl' saved.")

File Already Exist!


In [443]:
# Saving the Geography Encoder created with One Hot Encoding:
if os.path.exists('geography_label.pkl'):
    print("File Already Exist!")
else:
    with open('geography_label.pkl','wb') as f:
        pickle.dump(geography_label,f)
        print("File created and 'geography_label.pkl' saved.")

File Already Exist!


# Let's divide the Dataset into Independent and Dependent Features:

In [444]:
X = churn.drop('Exited',axis=1)
y = churn['Exited']

In [445]:
X.shape,y.shape

((10001, 12), (10001,))

## Splitting Data into Tranining and Testing sets:

In [446]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [447]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((8000, 12), (2001, 12), (8000,), (2001,))

## Standardizing the Data:

In [448]:
from sklearn.preprocessing import StandardScaler

In [449]:
scaler = StandardScaler()

In [450]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Saving the Standard Scale instance with Pickle:

In [451]:
if os.path.exists("scaler.pkl"):
    print("File already exist!")
else:
    with open("scaler.pkl","wb") as f:
        pickle.dump(scaler,f)
        print("File created and 'scaler.pkl' saved.")

File already exist!
