In [1]:
# this first lesson appears to be a study in data pre-processing steps and techniques to help
# prepare a dataset for developing a ML model.
#
# import the number processing libraries, "np" and "pd" are defacto aliases for these packages
import numpy as np
import pandas as pd

In [5]:
# import the data set, you'll have to adjust the path for your directory layout
datasetPath = '../Materials/datasets/'
dataset = pd.read_csv(datasetPath + 'Data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:,3].values
print x
print y

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [7]:
# handle any missing data using imputation
# imputation utilizes existing data in the data set as a basis for creating data instances for missing data
# in this example, the mean of the existing data in a given column is used to fill in missing data values
# if you look at the dataset, only two columns have numbers, so only those columns, for all rows, are targeted to impute
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)
imputer = imputer.fit(x[ : , 1:3])
x[ : , 1:3] = imputer.transform(x[ : , 1:3])
#
print x

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [8]:
# encoding categorical data
# categorical data is typically data used to put data values into categories or bins, e.g., male-female, yes-no,
#   child-adult-senior, France-Spain-Germany
# the data set has one column of categorical data, yes-no
# when you encode categorical data, you convert the category values to unqiue numerical values so mathematical analysis
# can be applied to them along with the other data values, e.g, yes=1, no=0, and you add a new column for each transformed
# category variable. These new columns are represented by "dummy" variables
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_x = LabelEncoder()
x[ : , 0] = labelencoder_x.fit_transform(x[ : , 0])
# now create the dummay variables for the transformed category variables
onehotencoder = OneHotEncoder(categorical_features = [0])
x = onehotencoder.fit_transform(x).toarray()
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
#
print x
print y

[[  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]]
[0 1 0 0 1 1 0 1 0 1]


In [9]:
# when training a model, a known set of results are needed along with the training data
# so typically a dataset of known values are split into a training dataset and a test dataset
# a common split size is 80% training, 20% test
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
#
print x_train
print x_test
print y_train
print y_test

[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   4.00000000e+01
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.70000000e+01
    6.70000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.87777778e+01
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.80000000e+01
    7.90000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   3.80000000e+01
    6.10000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.40000000e+01
    7.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   3.50000000e+01
    5.80000000e+04]]
[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   3.00000000e+01
    5.40000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   5.00000000e+01
    8.30000000e+04]]
[1 1 1 0 1 0 0 1]
[0 0]


In [12]:
# feature scaling is essentially data normalization applied to the dataset to bring all of the data within
# a particular range. Normalized data can help speed the calculations in the modeling effort but this activity
# should be used sparingly in data pre-processing, and only after an initial analysis of the data is performed
# because feature scaling can hide outlier behaviors, and cause other undesireable data transformations
from sklearn.preprocessing import StandardScaler
x_norm = StandardScaler()
x_train = x_norm.fit_transform(x_train)
x_test = x_norm.fit_transform(x_test)
#
print x_train
print x_test

[[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
[[ 0.  0.  0. -1. -1.]
 [ 0.  0.  0.  1.  1.]]
