# Handelling Missing Values, Categorical Data

In [1]:
# Data Preprocessing
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the raw CSV into a pandas DataFrame
data_path = 'Data Preprocessing/Data.csv'
dataset = pd.read_csv(data_path)

In [3]:
# Confirm that read_csv returned a pandas DataFrame
type(dataset)

pandas.core.frame.DataFrame

In [4]:
# Show the full raw table; note the NaN entries in the Age and Salary columns
print(dataset)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [5]:
# Build the matrix of features: every column except the last one.
# In the imported dataset these are Country, Age and Salary (the independent variables).
n_feature_columns = dataset.shape[1] - 1
X = dataset.iloc[:, :n_feature_columns].values

In [6]:
# Inspect the feature matrix; the missing Age and Salary values are still nan at this point
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [7]:
# Creating the dependent variable
# Take the last column (Purchased) by position -1 so this stays in step with
# the `:-1` slice used for X instead of relying on the hard-coded index 3
y = dataset.iloc[:, -1].values

In [8]:
# Inspect the dependent variable; it still holds the 'Yes'/'No' strings
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [9]:
# `.values` hands back a NumPy array rather than a DataFrame
type(X)

numpy.ndarray

In [10]:
# Columns 1 and 2 (Age, Salary) are the numeric columns that contain the missing values
X[:,1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, nan],
       [35.0, 58000.0],
       [nan, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

In [11]:
from sklearn.preprocessing import Imputer # Taking care of missing data

In [12]:
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0) # Creating the object of Imputer class

In [13]:
# Fit the imputer on columns 1 and 2 of X (Age, Salary) so it learns each column's fill value
imputer = imputer.fit(X[:, 1:3])

In [14]:
# Write the imputed columns back into X, replacing each missing value with its column mean
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [15]:
# Inspect X again: the former nan entries now hold the column means
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [16]:
# Encoding categorical data
# Step 1: turn the Country strings into integer codes (dummy variables are created in a later cell)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Learn the mapping from the first column (Country) of matrix X
labelencoder_X = LabelEncoder().fit(X[:, 0])

# Overwrite the Country column with its integer codes
X[:, 0] = labelencoder_X.transform(X[:, 0])

In [17]:
# Inspect X: the Country column now holds integer codes instead of strings
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [18]:
# Creating dummy variables
# The `categorical_features` argument of OneHotEncoder was removed in scikit-learn 0.22;
# a ColumnTransformer is now used to choose which column(s) are one-hot encoded.
from sklearn.compose import ColumnTransformer

# Creating the object of OneHotEncoder class
onehotencoder = OneHotEncoder()

# One-hot encode column 0 (Country) and pass the remaining columns through unchanged.
# The encoded columns come first in the output, followed by the passthrough columns.
# sparse_threshold = 0 forces a dense result, which replaces the old `.toarray()` call.
# NOTE: ColumnTransformer fits a clone, so the fitted encoder is
# country_transformer.named_transformers_['country'], not `onehotencoder` itself.
country_transformer = ColumnTransformer(
    transformers = [('country', onehotencoder, [0])],
    remainder = 'passthrough',
    sparse_threshold = 0,
)

# Fit to matrix X and replace it with a plain float array
# (the stacked result is object-typed because X is an object array)
X = np.asarray(country_transformer.fit_transform(X), dtype = np.float64)

In [19]:
# Inspect X: three dummy columns for Country, followed by Age and Salary
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [20]:
# Encode the dependent variable (Purchased) as integers.
# Learn the label-to-integer mapping from y ...
labelencoder_y = LabelEncoder().fit(y)

# ... then replace y with the encoded vector
y = labelencoder_y.transform(y)

In [21]:
# Show the encoded dependent variable
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [22]:
# Total number of elements in X (rows times columns)
X.size

50

In [23]:
# Splitting the dataset into training set and test set
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# train_test_split now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Choosing 20% data as test data, so we will have 80% data in training set.
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)



In [24]:
# Report the element count of each split, in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.size)

40
10
8
2


# Feature Scaling - go to part b
EXP-1b-Pre-processing

In [25]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Create the scaler and learn its statistics from the training set only
sc_X = StandardScaler().fit(X_train)

# Apply the training-set statistics to the training set ...
X_train = sc_X.transform(X_train)

# ... and reuse the same statistics for the test set (no refitting)
X_test = sc_X.transform(X_test)

In [26]:
# Inspect the scaled training features
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [27]:
# Inspect the test features, scaled with the statistics learned from the training set
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])