In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [2]:
# Load the raw customer churn records, then preview the first rows.
churn_csv_path = 'Resources/Customer-Churn-Records.csv'
df = pd.read_csv(churn_csv_path)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [3]:
# List the column labels to decide which to keep, drop, or encode.
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Complain',
       'Satisfaction Score', 'Card Type', 'Point Earned'],
      dtype='object')

In [4]:
# Check each column's dtype and non-null count before choosing encoders.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RowNumber           10000 non-null  int64  
 1   CustomerId          10000 non-null  int64  
 2   Surname             10000 non-null  object 
 3   CreditScore         10000 non-null  int64  
 4   Geography           10000 non-null  object 
 5   Gender              10000 non-null  object 
 6   Age                 10000 non-null  int64  
 7   Tenure              10000 non-null  int64  
 8   Balance             10000 non-null  float64
 9   NumOfProducts       10000 non-null  int64  
 10  HasCrCard           10000 non-null  int64  
 11  IsActiveMember      10000 non-null  int64  
 12  EstimatedSalary     10000 non-null  float64
 13  Exited              10000 non-null  int64  
 14  Complain            10000 non-null  int64  
 15  Satisfaction Score  10000 non-null  int64  
 16  Card 

In [5]:
# Distinct Geography values; this column is one-hot encoded further down.
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [6]:
# Distinct Card Type tiers. unique() lists them in order of appearance, not
# in rank order, so the ordinal ranking has to be spelled out explicitly.
df['Card Type'].unique()

array(['DIAMOND', 'GOLD', 'SILVER', 'PLATINUM'], dtype=object)

**Target Value**  
- Exited  

**Values to be Removed**  
- Remove index columns: `RowNumber` and `CustomerId`  
- Remove `Surname`  

**Numerical X Values**  
- Credit Score  
- Age  
- Tenure  
- Balance  
- NumOfProducts  
- HasCrCard  
- IsActiveMember  
- EstimatedSalary  
- Complain  
- Satisfaction Score  
- Point Earned  

**One-Hot Encoded X Variables**  
- Geography  
- Gender  

**Ordinal X**  
- Card Type

In [7]:
# Target: the Exited column.
y = df['Exited']

In [8]:
# Features: everything except the target, the row/customer identifiers,
# and the surname.
non_feature_columns = ['Exited', 'RowNumber', 'CustomerId', 'Surname']
X = df.drop(columns=non_feature_columns)

In [9]:
# One-hot encoder for the nominal features. sparse_output=False makes it
# return a dense array that can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
# `categories` takes one ordered list of levels per encoded column, so the
# single Card Type ranking must be wrapped in an outer list. A flat list of
# strings is read as one entry per column and fails with a shape-mismatch
# error when fitted on the single Card Type column.
ordinal_encoder = OrdinalEncoder(categories=[['SILVER', 'GOLD', 'PLATINUM', 'DIAMOND']])

In [10]:
# Nominal features: no inherent order, so they are one-hot encoded.
categorical_columns = [
    'Geography',
    'Gender',
]
# Ranked feature: card tiers are mapped to ordered integer codes.
ordinal_columns = ['Card Type']

In [11]:
# One-hot encode the nominal features taken from the feature frame.
encoded_columns = encoder.fit_transform(X[categorical_columns])

# Wrap the dense array in a DataFrame that reuses X's index, so the
# column-wise concat below aligns row-for-row even if X is ever filtered
# or re-indexed upstream (a default RangeIndex would misalign and add NaNs).
one_hot_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X.index,
)

# Swap the raw categorical columns in X for their one-hot counterparts.
df_encoded = pd.concat([X.drop(columns=categorical_columns), one_hot_df], axis=1)
df_encoded.head()


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Card Type,Point Earned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,2,DIAMOND,464,1.0,0.0,0.0,1.0,0.0
1,608,41,1,83807.86,1,0,1,112542.58,1,3,DIAMOND,456,0.0,0.0,1.0,1.0,0.0
2,502,42,8,159660.8,3,1,0,113931.57,1,3,DIAMOND,377,1.0,0.0,0.0,1.0,0.0
3,699,39,1,0.0,2,0,0,93826.63,0,5,GOLD,350,1.0,0.0,0.0,1.0,0.0
4,850,43,2,125510.82,1,1,1,79084.1,0,5,GOLD,425,0.0,0.0,1.0,1.0,0.0


In [12]:
# Card tiers listed in ascending rank, so SILVER encodes to 0 and DIAMOND
# to 3. OrdinalEncoder expects one ordered list of levels per encoded
# column, hence the outer list.
ordinal_encoder = OrdinalEncoder(categories=[['SILVER', 'GOLD', 'PLATINUM', 'DIAMOND']])

# Fit on the raw frame rather than df_encoded so this cell can be re-run
# after df_encoded's Card Type column has already been replaced by codes.
ordinal_column = ordinal_encoder.fit_transform(df[ordinal_columns])

# Reuse the source index so the column-wise concat aligns row-for-row.
ordinal_df = pd.DataFrame(
    ordinal_column,
    columns=ordinal_encoder.get_feature_names_out(ordinal_columns),
    index=df.index,
)

# Replace the raw Card Type strings with their ordinal codes.
df_encoded = pd.concat([df_encoded.drop(columns=ordinal_columns), ordinal_df], axis=1)
df_encoded.head()


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Point Earned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Card Type
0,619,42,2,0.0,1,1,1,101348.88,1,2,464,1.0,0.0,0.0,1.0,0.0,3.0
1,608,41,1,83807.86,1,0,1,112542.58,1,3,456,0.0,0.0,1.0,1.0,0.0,3.0
2,502,42,8,159660.8,3,1,0,113931.57,1,3,377,1.0,0.0,0.0,1.0,0.0,3.0
3,699,39,1,0.0,2,0,0,93826.63,0,5,350,1.0,0.0,0.0,1.0,0.0,1.0
4,850,43,2,125510.82,1,1,1,79084.1,0,5,425,0.0,0.0,1.0,1.0,0.0,1.0
