## Ch 4: Building Good Training Datasets -- Data Preprocessing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np

###### Identifying missing values in tabular data

In [2]:
# Build a small CSV document in memory to use as the example dataset.
# The empty fields (column C in the second data row, column D in the third)
# are the missing values examined in the following cells.
from io import StringIO

csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])

In [3]:
# Echo the raw text: rows are separated by '\n' and missing fields are simply empty
csv_data

'A,B,C,D\n1.0,2.0,3.0,4.0\n5.0,6.0,,8.0\n10.0,11.0,12.0,'

In [4]:
# StringIO wraps the string in a file-like object, so read_csv can parse it as if it were a CSV file
df = pd.read_csv(StringIO(csv_data))

In [5]:
# Display the parsed frame; the two empty CSV fields show up as missing values
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [6]:
# isnull() yields a boolean mask (True where a value is missing);
# sum() then counts the True values, giving the number of missing values per column
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [7]:
# .values exposes the data as a NumPy array.
# Scikit-learn was built around NumPy arrays, although it generally works well with pandas DataFrames too
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

###### Eliminating training examples or features w/ missing values

In [8]:
# Removing rows or columns w/ NULLs is easy but limits training/testing data

In [9]:
# axis=0 operates on rows: every row containing at least one NaN is dropped
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [10]:
# axis=1 operates on columns: every column containing at least one NaN is dropped
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [11]:
# how='all' only drops rows where all columns are NaN; no row here qualifies, so nothing is removed
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [12]:
# thresh=4 keeps only rows with at least 4 non-NaN values,
# i.e. it drops rows that have fewer than 4 real values
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [13]:
# subset restricts the NaN check to the listed columns:
# only rows with NaN in 'C' are dropped, so the row whose NaN is in 'D' is kept
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


###### Imputing missing values

In [14]:
# Can use interpolation techniques to estimate missing values from other training examples in our dataset

- Mean imputation - replacing missing value w/ mean value

In [15]:
# Mean imputation with scikit-learn's SimpleImputer
from sklearn.impute import SimpleImputer

# missing_values names the placeholder that marks a missing entry (NaN here).
# Other options for strategy: 'median', 'most_frequent' ('most_frequent' is useful for categorical data).
# The statistic is computed per feature (column), not per row.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed_data = imr.fit(df.values).transform(df.values)  # fit() returns the fitted imputer itself

print(f'df.values: \n {df.values}')
print('\n')
print(f'imputed_data: \n {imputed_data}')

df.values: 
 [[ 1.  2.  3.  4.]
 [ 5.  6. nan  8.]
 [10. 11. 12. nan]]


imputed_data: 
 [[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [10.  11.  12.   6. ]]


In [16]:
# A more convenient way of imputing missing values with pandas alone:
# df.mean() gives the per-column means and fillna() substitutes them into the matching columns.
# The result is a new frame; df itself is not modified.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


### Handling Categorical Data

***
- Ordinal: can be ordered (t-shirt size)
- Nominal: cannot be ordered (colors)

In [17]:
# Example frame mixing a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [18]:
# Encode the ordinal 'size' feature as integers that preserve its order (M < L < XL).
# NOTE: this overwrites the column, so re-running the cell would map the integers to NaN.
size_mapping = dict(XL=3, L=2, M=1)
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [19]:
# Swap keys and values to obtain the reverse lookup (integer code -> size label)
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
# map() looks up each column value in the dict and returns the mapped value;
# the result is only displayed here, df itself is left unchanged
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [20]:
# items() exposes the (key, value) pairs that are swapped to build the inverse mapping
size_mapping.items()

dict_items([('XL', 3), ('L', 2), ('M', 1)])

In [21]:
# The inverted dict: integer code -> original size label
inv_size_mapping

{3: 'XL', 2: 'L', 1: 'M'}

#### Encoding Class Labels

- Class labels are the nominal categorical values
- Many ML libraries can convert class labels to integers automatically, but it is good practice to do it explicitly so that you know the mapping
- Because class labels have no order, it doesn't matter which integer is assigned to which label -- we only need to know what is mapped where

In [22]:
# Number the distinct class labels 0, 1, ... in the order np.unique returns them
labels = np.unique(df['classlabel'])
class_mapping = dict(zip(labels, range(len(labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [23]:
# The distinct class labels that enumerate() numbers when building class_mapping
np.unique(df['classlabel'])

array(['class1', 'class2'], dtype=object)

In [24]:
# Iterating over enumerate() yields (index, label) tuples
for pair in enumerate(np.unique(df['classlabel'])):
    print(pair)

(0, 'class1')
(1, 'class2')


In [25]:
enumerate(np.unique(df['classlabel']))

# enumerate() returns a lazy iterator, so displaying it shows only the object's repr
# (type and memory address), not its contents -- iterate over it or wrap it in list() to see the pairs.
# Background: https://stackoverflow.com/questions/51561509/why-is-the-output-not-printing-when-using-printiterable

<enumerate at 0x1f1222eda40>

In [26]:
# Transform class labels into integers in the dataframe.
# NOTE: this overwrites the 'classlabel' column in place, so the cell is not idempotent --
# the next cell maps the integers back to the original strings.
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [27]:
# Undo the encoding: build the integer -> label lookup and map the column back to strings
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [28]:
# Instead of hand-written dictionaries, scikit-learn's LabelEncoder can do the
# label <-> integer conversion in both directions
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
# fit() learns the set of labels and transform() encodes them;
# fit_transform() is just a shortcut for calling the two in sequence
y = class_le.fit(df['classlabel'].values).transform(df['classlabel'].values)
y

array([1, 0, 1])

In [29]:
# The original string labels, for comparison with the encoded array y
df['classlabel'].values

array(['class2', 'class1', 'class2'], dtype=object)

In [30]:
# inverse_transform maps the integer codes back to the original string labels
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

#### Performing one-hot encoding on nominal features

In [31]:
# LabelEncoder can also be applied to a nominal feature (see the caveat below the output)
# .values gives a separate NumPy array here, so df is not modified by the assignment below
X = df.loc[:, ['color', 'size', 'price']].values

color_le = LabelEncoder()
color_le.fit(X[:, 0])
X[:, 0] = color_le.transform(X[:, 0])  # replaces the first column (green, red, blue) with integer codes
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

- The problem with the above is that an ML algorithm could treat the column of colors (nominal values) as having an order based on the integer encoding (e.g., red = 2 > green = 1 > blue = 0)
    - This is where one-hot encoding is used
    - The idea behind one-hot encoding is to create a new dummy feature for each unique value of the nominal column; each dummy feature is 1 when the example has that color and 0 otherwise

In [32]:
from sklearn.preprocessing import OneHotEncoder

# Rebuild X from df so the color column holds the original strings again
X = df.loc[:, ['color', 'size', 'price']].values
X

array([['green', 1, 10.1],
       ['red', 2, 13.5],
       ['blue', 3, 15.3]], dtype=object)

In [33]:
color_ohe = OneHotEncoder()
# The encoder expects a 2-D input, so select the color column as an (n_samples, 1) array.
# The result has one dummy column per unique color; toarray() makes it a dense array for display.
color_column = X[:, [0]]
color_ohe.fit_transform(color_column).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [34]:
# The previous cell transformed a single-feature array (X[:, 0]) only.
# To selectively transform columns of a multi-feature array, use ColumnTransformer,
# which accepts a list of (name, transformer, columns) tuples.

from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values
transformers = [
    ('onehot', OneHotEncoder(), [0]),    # apply OneHotEncoder() to the first column
    ('nothing', 'passthrough', [1, 2]),  # leave the other columns untouched
]
c_transf = ColumnTransformer(transformers)

c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [35]:
# pandas can also do one-hot encoding:
# get_dummies only converts the string columns and leaves all other columns unchanged.
# dtype=int pins the dummy columns to 0/1 integers; without it, pandas >= 2.0 returns
# True/False booleans, which would no longer match the numeric encodings shown in this notebook.
pd.get_dummies(df[['price', 'color', 'size']], dtype=int)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


- When we use one-hot encoding we have to keep in mind that this introduces multicollinearity
    - This can be an issue for certain methods (ex: methods that require matrix inversion)
        - Matrices whose columns are highly correlated are computationally difficult to invert, which can lead to numerically unstable estimates
    - To reduce the correlation among variables, we can simply remove one feature column from the one-hot encoded array --> we don't lose any information, since all remaining color columns being zero indicates that the dropped feature would have been 1

In [36]:
# drop_first=True removes the first dummy column (color_blue) to avoid redundant, collinear features.
# dtype=int pins the dummy columns to 0/1 integers; without it, pandas >= 2.0 returns
# True/False booleans instead of the numeric encoding shown here.
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True, dtype=int)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0


In [37]:
# The same first-column drop is available in scikit-learn:
# pass drop='first' (together with categories='auto') to OneHotEncoder.
# X is the feature array assigned in an earlier cell.
color_ohe = OneHotEncoder(drop='first', categories='auto')
c_transf = ColumnTransformer(transformers=[
    ('onehot', color_ohe, [0]),
    ('nothing', 'passthrough', [1, 2]),
])

c_transf.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

#### Encoding Ordinal Features

- If we are unsure about the numerical differences between the categories of ordinal features, we can encode them using a threshold encoding with 0/1 values
- Ex: we can split the feature 'size' with values M, L, and XL into 2 new features 'x > M' and 'x > L'

In [38]:
# Recreate the original example frame so that 'size' holds the string labels again
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [39]:
# Threshold encoding of the ordinal 'size' feature with two 0/1 indicator columns:
#   'x > M' is 1 for sizes larger than M (L or XL)
#   'x > L' is 1 for sizes larger than L (XL only)
df['x > M'] = df['size'].isin(['L', 'XL']).astype('int64')
df['x > L'] = df['size'].eq('XL').astype('int64')

# The original column is no longer needed once the indicators exist
df = df.drop(columns=['size'])
df

Unnamed: 0,color,price,classlabel,x > M,x > L
0,green,10.1,class2,0,0
1,red,13.5,class1,1,0
2,blue,15.3,class2,1,1


### Partitioning a dataset into separate training and test datasets 

In [40]:
# Load the wine dataset from the public UCI repository.
# header=None because the file is read without a header row; columns are named in a later cell.
WINE_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df_wine = pd.read_csv(WINE_DATA_URL, header=None)

In [41]:
# Preview the raw frame; columns are still numbered 0-13 because no header was supplied
df_wine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [42]:
# Attach descriptive column headers: the class label first, then the 13 measured features.
# Fix: the optical-density column is spelled with the letter O ('OD280/OD315'),
# not the digit zero ('0D280/0D315').
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [43]:
# Check the first five rows with the new column headers applied
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,0D280/0D315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [44]:
# List the distinct values of the 'Class label' column.
# Each class refers to a type of grape.
print('Class labels', np.unique(df_wine['Class label']))

Class labels [1 2 3]


In [45]:
# Randomly partition the dataset into training (70%) and test (30%) sets
from sklearn.model_selection import train_test_split

X = df_wine.iloc[:, 1:].values  # features: every column after the class label
y = df_wine.iloc[:, 0].values   # target: the class label column

# stratify=y keeps the proportion of each class ('y') the same in the train and test sets;
# random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [46]:
# All rows, every column after the first one (i.e. the features, without the class label)
df_wine.iloc[:, 1:].values

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [47]:
# All rows, first column only (just the class labels)
df_wine.iloc[:, 0].values

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3], dtype=int64)

- Commonly used splits for train/test data are 60:40, 70:30, and 80:20
- For large datasets can increase your train %

#### Feature Scaling

- Decision trees and random forest are two of the few ML algorithms that don't need feature scaling

In [48]:
# Min-max scaling (feature normalization)
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training data only, then apply that same fit to both splits,
# so the test set is transformed with the parameters learned from the training set
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

In [49]:
# Compare min-max normalization with standardization on a tiny example
ex = np.array([0, 1, 2, 3, 4, 5])

# Normalization: shift by the minimum and divide by the value range
ex_min, ex_max = ex.min(), ex.max()
normalized = (ex - ex_min) / (ex_max - ex_min)

# Standardization: center on the mean and divide by the standard deviation
standardized = (ex - ex.mean()) / ex.std()

pd.DataFrame(
    data={'standardized': standardized, 'normalized': normalized},
    index=ex,
)

Unnamed: 0,standardized,normalized
0,-1.46385,0.0
1,-0.87831,0.2
2,-0.29277,0.4
3,0.29277,0.6
4,0.87831,0.8
5,1.46385,1.0


In [50]:
# Standardization (feature scaling)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, then apply that same fit to both splits,
# so the test set is transformed with the parameters learned from the training set
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

- Other more advanced methods of feature scaling exist and are available in scikit-learn

#### Selecting Meaningful Features