In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F

## Data Wrangling

In [20]:
df = pd.read_csv("dataset/BankChurners.csv")
df.head()

Unnamed: 0,CustomerId,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel
0,15762418,Spain,3,121681.82,1,1,0,128643.35,1,8
1,15749905,Spain,6,0.0,1,1,0,50213.81,1,7
2,15600911,France,2,182888.08,1,1,0,3061.0,0,7
3,15572762,Germany,2,102278.79,2,1,0,89822.48,0,2
4,15627848,France,7,109346.13,2,1,0,102665.92,0,7


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       9000 non-null   int64  
 1   Geography        9000 non-null   object 
 2   Tenure           9000 non-null   int64  
 3   Balance          9000 non-null   float64
 4   NumOfProducts    9000 non-null   int64  
 5   HasCrCard        9000 non-null   int64  
 6   IsActiveMember   9000 non-null   int64  
 7   EstimatedSalary  9000 non-null   float64
 8   Exited           9000 non-null   int64  
 9   CreditLevel      9000 non-null   int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 703.2+ KB


In [22]:
df.describe()

Unnamed: 0,CustomerId,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel
count,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0
mean,15690900.0,5.016111,76168.443138,1.531778,0.706778,0.513556,99868.712786,0.205556,6.374222
std,71922.27,2.90025,62418.871634,0.580732,0.455265,0.499844,57593.168344,0.404129,1.786207
min,15565700.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0,1.0
25%,15628530.0,2.0,0.0,1.0,0.0,0.0,50773.8975,0.0,5.0
50%,15690740.0,5.0,96889.925,1.0,1.0,1.0,99691.065,0.0,6.0
75%,15753160.0,8.0,127591.8825,2.0,1.0,1.0,149373.5275,0.0,8.0
max,15815690.0,10.0,250898.09,4.0,1.0,1.0,199970.74,1.0,10.0


## Data Preprocessing

Target: Convert the "Geography" data to one-hot numeric array

In [23]:
df["Geography"].unique()

array(['Spain', 'France', 'Germany'], dtype=object)

In [24]:
df["Geography"].unique()

array(['Spain', 'France', 'Germany'], dtype=object)

In [33]:
geo_onehot = pd.get_dummies(df["Geography"], prefix="Geo")
geo_onehot.head()

Unnamed: 0,Geo_France,Geo_Germany,Geo_Spain
0,0,0,1
1,0,0,1
2,1,0,0
3,0,1,0
4,1,0,0


In [26]:
train_df = df.copy()
train_df = train_df.join(geo_onehot)
train_df.head()

Unnamed: 0,CustomerId,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel,France,Germany,Spain
0,15762418,Spain,3,121681.82,1,1,0,128643.35,1,8,0,0,1
1,15749905,Spain,6,0.0,1,1,0,50213.81,1,7,0,0,1
2,15600911,France,2,182888.08,1,1,0,3061.0,0,7,1,0,0
3,15572762,Germany,2,102278.79,2,1,0,89822.48,0,2,0,1,0
4,15627848,France,7,109346.13,2,1,0,102665.92,0,7,1,0,0


In [29]:
train_df = train_df.drop(columns=["Geography"])
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       9000 non-null   int64  
 1   Tenure           9000 non-null   int64  
 2   Balance          9000 non-null   float64
 3   NumOfProducts    9000 non-null   int64  
 4   HasCrCard        9000 non-null   int64  
 5   IsActiveMember   9000 non-null   int64  
 6   EstimatedSalary  9000 non-null   float64
 7   Exited           9000 non-null   int64  
 8   CreditLevel      9000 non-null   int64  
 9   France           9000 non-null   uint8  
 10  Germany          9000 non-null   uint8  
 11  Spain            9000 non-null   uint8  
dtypes: float64(2), int64(7), uint8(3)
memory usage: 659.3 KB


In [36]:
df["CreditLevel"].unique()

array([ 8,  7,  2,  6,  5,  4, 10,  3,  9,  1])

In [64]:
df.loc[0][["Tenure","Balance","NumOfProducts","HasCrCard","IsActiveMember","EstimatedSalary","Exited"]].to_list()

[3, 121681.82, 1, 1, 0, 128643.35, 1]

## Design Customize Data Loader

In [119]:
train_df.loc[1][["France","Germany","Spain"]]

France     0.0
Germany    0.0
Spain      1.0
Name: 1, dtype: float64

In [138]:
class BankChurnersDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file):
        df = pd.read_csv(csv_file)
        # create one-hot encoder for geography
        geo_onehot = pd.get_dummies(df["Geography"], prefix="Geo")
        # combine with the orignal dataset
        df = df.join(geo_onehot)
        # drop unusfull columns 
        df.drop(columns=["CustomerId", "Geography"], inplace=True)
        # set the label ranges to 0-9
        df["CreditLevel"] = df["CreditLevel"]-1
        self.df = df

    def __getitem__(self, index):
        # set the features
        train_columns = ["Geo_France","Geo_Germany","Geo_Spain","Tenure","Balance","NumOfProducts","HasCrCard","IsActiveMember","EstimatedSalary","Exited"]
        # set the label
        label_column = "CreditLevel"
        # return feature, label
        return self.df.loc[index][train_columns].to_numpy(np.float32), self.df.loc[index][label_column].astype(np.int64)

    def __len__(self):
        return len(self.df)


In [127]:
train_dataset = BankChurnersDataset("dataset/BankChurners.csv")
print(len(train_dataset))

9000


In [128]:
class BCM(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(10, 20)
        self.fc2 = nn.Linear(20, 15)
        self.fc3 = nn.Linear(15, 10)

    def forward(self, x):
        # x = self.pool(F.relu(self.conv1(x)))
        # x = self.pool(F.relu(self.conv2(x)))
        # x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x    

In [129]:
learning_rate = 0.01
batch_size = 64

In [130]:
model = BCM()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr =learning_rate)

In [143]:
train_dataset = BankChurnersDataset("dataset/BankChurners.csv")
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Train the network
for epoch in range(100):
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        x, y = data
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i % 1000 == 1:
            print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss/2000))
            running_loss = 0.0
print('Finished Training')            


[1,     2] loss: 0.002
[2,     2] loss: 0.002
[3,     2] loss: 0.002
[4,     2] loss: 0.002
[5,     2] loss: 0.002
[6,     2] loss: 0.002
[7,     2] loss: 0.002
[8,     2] loss: 0.002
[9,     2] loss: 0.002
[10,     2] loss: 0.002
[11,     2] loss: 0.002
[12,     2] loss: 0.002
[13,     2] loss: 0.002
[14,     2] loss: 0.002
[15,     2] loss: 0.002
[16,     2] loss: 0.002
[17,     2] loss: 0.002
[18,     2] loss: 0.002
[19,     2] loss: 0.002
[20,     2] loss: 0.002
[21,     2] loss: 0.002
[22,     2] loss: 0.002
[23,     2] loss: 0.002
[24,     2] loss: 0.002
[25,     2] loss: 0.002
[26,     2] loss: 0.002
[27,     2] loss: 0.002
[28,     2] loss: 0.002
[29,     2] loss: 0.002
[30,     2] loss: 0.002
[31,     2] loss: 0.002
[32,     2] loss: 0.002
[33,     2] loss: 0.002
[34,     2] loss: 0.002
[35,     2] loss: 0.002
[36,     2] loss: 0.002
[37,     2] loss: 0.002
[38,     2] loss: 0.002
[39,     2] loss: 0.002
[40,     2] loss: 0.002
[41,     2] loss: 0.002
[42,     2] loss: 0.002
[

In [163]:
# save state
PATH = './bankchurn_model.pth'
torch.save(model.state_dict(), PATH)

In [170]:
test_dataset = BankChurnersDataset("dataset/New_BankChurners.csv")
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = BankChurnersDataset("dataset/BankChurners.csv")
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

In [180]:
model = BCM()
model.load_state_dict(torch.load(PATH))

correct  = 0
total = 0

with torch.no_grad():
    for data in valid_loader:
        inputs, labels = data
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        if (labels.size(0)!=predicted.size(0)):
            print(labels.size(0), predicted.size(0))
            print(labels)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 9000 test: %d %%' % (100 * correct / total))       

Accuracy of the network on the 9000 test: 21 %


In [181]:
print(correct)

1899
