In [1]:
import numpy as np

In [2]:
X_train = np.array([[1000.0, 2.0], [1500.0, 3.0]])

In [3]:
X_train

array([[1000.,    2.],
       [1500.,    3.]])

In [4]:
#grab all the rows of the first column
X_train[:,0]

array([1000., 1500.])

In [5]:
#grab all rows of the second column
X_train[:,1]

array([2., 3.])

In [10]:
#minimum value of the first column
np.min(X_train[:,0])
# X_train[:,0].min()

1000.0

In [12]:
#mean of the first column
np.mean(X_train[:,0])
# X_train[:,0].mean()

1250.0

In [29]:
#Obtain the minimum value and divide it by the range of the column
def max_min_s(X):
    return (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    

In [30]:
max_min_s(X_train)

array([[0., 0.],
       [1., 1.]])

In [31]:
#Using sklearn's built in function for max_min_s
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X_train)
X_minmax

array([[0., 0.],
       [1., 1.]])

In [35]:
#Calculate the value minus the mean, over the standard deviation
def standard_s(X):
    return (X - X.mean(axis=0)) / (X.std(axis=0))
    

In [36]:
standard_s(X_train)

array([[-1., -1.],
       [ 1.,  1.]])

In [37]:
#Using sklearn's built in function for standard_s

standard_scaler = preprocessing.StandardScaler()
X_ss = standard_scaler.fit_transform(X_train)
X_ss

array([[-1., -1.],
       [ 1.,  1.]])

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Read in the CSV file
df = pd.read_csv('Notebooks/Datasets/Churn_Modelling.csv')

# Print out the first 5 rows of each column in a readable format
print(df.head())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [2]:
# Print the unique values from the Geography column
print(df['Geography'].unique())

['France' 'Spain' 'Germany']


In [3]:
# Print the unique values from the Gender column
print(df['Gender'].unique())

['Female' 'Male']


In [4]:
# Feature matrix
# We don't care about first 3 columns (RowNumber, CustomerId, Surname),
# as those don't factor in to whether a person churns or not
X = df.iloc[:, 3:13].values

In [5]:
# Target column
# We want to predict the exit value,
# which therefore makes it our target column
y = df.iloc[:, 13].values

In [6]:
print(X[0:10,:])

[[619 'France' 'Female' 42 2 0.0 1 1 1 101348.88]
 [608 'Spain' 'Female' 41 1 83807.86 1 0 1 112542.58]
 [502 'France' 'Female' 42 8 159660.8 3 1 0 113931.57]
 [699 'France' 'Female' 39 1 0.0 2 0 0 93826.63]
 [850 'Spain' 'Female' 43 2 125510.82 1 1 1 79084.1]
 [645 'Spain' 'Male' 44 8 113755.78 2 1 0 149756.71]
 [822 'France' 'Male' 50 7 0.0 2 1 1 10062.8]
 [376 'Germany' 'Female' 29 4 115046.74 4 1 0 119346.88]
 [501 'France' 'Male' 44 4 142051.07 2 0 1 74940.5]
 [684 'France' 'Male' 27 2 134603.88 1 1 1 71725.73]]


In [7]:
from sklearn.preprocessing import LabelEncoder
#France, Spain, and Germany changed to numbers
#Male and Female also changed to numbers
label_encoder_X_1 = LabelEncoder()
X[:, 1] = label_encoder_X_1.fit_transform(X[:, 1])
label_encoder_X_2 = LabelEncoder()
X[:, 2] = label_encoder_X_2.fit_transform(X[:, 2])
print(X[0:10,:])
print(X.shape)

[[619 0 0 42 2 0.0 1 1 1 101348.88]
 [608 2 0 41 1 83807.86 1 0 1 112542.58]
 [502 0 0 42 8 159660.8 3 1 0 113931.57]
 [699 0 0 39 1 0.0 2 0 0 93826.63]
 [850 2 0 43 2 125510.82 1 1 1 79084.1]
 [645 2 1 44 8 113755.78 2 1 0 149756.71]
 [822 0 1 50 7 0.0 2 1 1 10062.8]
 [376 1 0 29 4 115046.74 4 1 0 119346.88]
 [501 0 1 44 4 142051.07 2 0 1 74940.5]
 [684 0 1 27 2 134603.88 1 1 1 71725.73]]
(10000, 10)


In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

one_hot_encoder = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [1, 2])], remainder='passthrough')
X = one_hot_encoder.fit_transform(X)
print(pd.DataFrame(X[0:10,:]))

     0    1    2    3    4    5    6      7     8    9         10   11   12  \
0  1.0  0.0  1.0  0.0  1.0  1.0  0.0  619.0  42.0  2.0       0.00  1.0  1.0   
1  1.0  0.0  0.0  1.0  0.0  1.0  0.0  608.0  41.0  1.0   83807.86  1.0  0.0   
2  1.0  0.0  1.0  0.0  1.0  1.0  0.0  502.0  42.0  8.0  159660.80  3.0  1.0   
3  1.0  0.0  1.0  0.0  1.0  1.0  0.0  699.0  39.0  1.0       0.00  2.0  0.0   
4  1.0  0.0  0.0  1.0  0.0  1.0  0.0  850.0  43.0  2.0  125510.82  1.0  1.0   
5  1.0  0.0  0.0  1.0  0.0  0.0  1.0  645.0  44.0  8.0  113755.78  2.0  1.0   
6  1.0  0.0  1.0  0.0  1.0  0.0  1.0  822.0  50.0  7.0       0.00  2.0  1.0   
7  0.0  1.0  1.0  0.0  0.0  1.0  0.0  376.0  29.0  4.0  115046.74  4.0  1.0   
8  1.0  0.0  1.0  0.0  1.0  0.0  1.0  501.0  44.0  4.0  142051.07  2.0  0.0   
9  1.0  0.0  1.0  0.0  1.0  0.0  1.0  684.0  27.0  2.0  134603.88  1.0  1.0   

    13         14  
0  1.0  101348.88  
1  1.0  112542.58  
2  0.0  113931.57  
3  0.0   93826.63  
4  1.0   79084.10  
5  0.0  14

In [13]:
import pandas as pd

X = df.iloc[:, 3:13]
y = df.iloc[:, 13]
pd.get_dummies(X).head(10)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,1,0
3,699,39,1,0.0,2,0,0,93826.63,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,1,0
5,645,44,8,113755.78,2,1,0,149756.71,0,0,1,0,1
6,822,50,7,0.0,2,1,1,10062.8,1,0,0,0,1
7,376,29,4,115046.74,4,1,0,119346.88,0,1,0,1,0
8,501,44,4,142051.07,2,0,1,74940.5,1,0,0,0,1
9,684,27,2,134603.88,1,1,1,71725.73,1,0,0,0,1


In [14]:
pd.get_dummies(X).head(10).values

array([[6.1900000e+02, 4.2000000e+01, 2.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0134888e+05,
        1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00],
       [6.0800000e+02, 4.1000000e+01, 1.0000000e+00, 8.3807860e+04,
        1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.1254258e+05,
        0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.0000000e+00,
        0.0000000e+00],
       [5.0200000e+02, 4.2000000e+01, 8.0000000e+00, 1.5966080e+05,
        3.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.1393157e+05,
        1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00],
       [6.9900000e+02, 3.9000000e+01, 1.0000000e+00, 0.0000000e+00,
        2.0000000e+00, 0.0000000e+00, 0.0000000e+00, 9.3826630e+04,
        1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        0.0000000e+00],
       [8.5000000e+02, 4.3000000e+01, 2.0000000e+00, 1.2551082e+05,
        1.0000000e+0