# Import Python Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import the Data Set

In [2]:
dataset = pd.read_csv('/users/jyu/desktop/datapreprocess.csv')
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Independent variable vector
X = dataset.iloc[:, :-1].values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# Dependent variable vector
Y = dataset.iloc[:, 3].values
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Handling Missing Data

In [8]:
from sklearn.impute import SimpleImputer

In [10]:
missingvalues = SimpleImputer(missing_values = np.nan, strategy = 'mean', verbose = 0)

In [11]:
missingvalues = missingvalues.fit(X[:, 1:3])
# 1:3 upperbound is excluded, lowerbound is included

In [12]:
X[:, 1:3] = missingvalues.transform(X[:, 1:3])

In [13]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Encoding Categorical Data

In [14]:
# Encoding independent variable
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [15]:
ct = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder = 'passthrough')

In [17]:
X = np.array(ct.fit_transform(X), dtype = np.float)
X

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.40000000e+01, 7.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        2.70000000e+01, 4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        3.00000000e+01, 5.40000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.80000000e+01, 6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        4.00000000e+01, 6.37777778e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.50000000e+01, 5.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        3.87777778e+01, 5.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        4.80000000e+01, 7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        5.00000000e+01, 

In [18]:
# Encoding dependent variable
from sklearn.preprocessing import LabelEncoder

In [20]:
Y = LabelEncoder().fit_transform(Y)
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting Data Set into Training Set and Test Set

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# Feature Scaling

In [23]:
from sklearn.preprocessing import StandardScaler 

In [25]:
sc_X = StandardScaler()

In [30]:
X_train = sc_X.fit_transform(X_train)
X_train

array([[ 1.        , -1.        ,  2.64575131, -0.77459667,  0.26306757,
         0.12381479],
       [-1.        ,  1.        , -0.37796447, -0.77459667, -0.25350148,
         0.46175632],
       [ 1.        , -1.        , -0.37796447,  1.29099445, -1.97539832,
        -1.53093341],
       [ 1.        , -1.        , -0.37796447,  1.29099445,  0.05261351,
        -1.11141978],
       [-1.        ,  1.        , -0.37796447, -0.77459667,  1.64058505,
         1.7202972 ],
       [ 1.        , -1.        , -0.37796447,  1.29099445, -0.0813118 ,
        -0.16751412],
       [-1.        ,  1.        , -0.37796447, -0.77459667,  0.95182631,
         0.98614835],
       [-1.        ,  1.        , -0.37796447, -0.77459667, -0.59788085,
        -0.48214934]])

In [31]:
X_test = sc_X.transform(X_test)
X_test

array([[ 1.        , -1.        ,  2.64575131, -0.77459667, -1.45882927,
        -0.90166297],
       [ 1.        , -1.        ,  2.64575131, -0.77459667,  1.98496442,
         2.13981082]])

In [32]:
sc_Y = StandardScaler()
sc_Y

StandardScaler(copy=True, with_mean=True, with_std=True)

In [33]:
Y_train = sc_Y.fit_transform(Y_train.reshape(-1,1))
Y_train

array([[ 0.77459667],
       [ 0.77459667],
       [ 0.77459667],
       [-1.29099445],
       [ 0.77459667],
       [-1.29099445],
       [-1.29099445],
       [ 0.77459667]])