## Chapter7

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import (ensemble, preprocessing, tree)
from sklearn.metrics import (auc, confusion_matrix, roc_auc_score, roc_curve)
from sklearn.model_selection import (train_test_split, StratifiedKFold)
from yellowbrick.classifier import(ConfusionMatrix, ROCAUC)
from yellowbrick.model_selection import (LearningCurve)

## Introduction
- This chapter explores common preprocessing steps using the sample data defined below


In [5]:
# Toy frame for the scaling examples: "a" is evenly spaced, while "b" spans a
# much wider range and ends in one large value.
X2 = pd.DataFrame(
    {
        "a": range(5),
        "b": [-100, -50, 0, 200, 1000],
    }
)

In [6]:
X2

Unnamed: 0,a,b
0,0,-100
1,1,-50
2,2,0
3,3,200
4,4,1000


## Standardize

- Some algorithms, e.g., SVM, perform better when the data is standardized
- Each column should have a mean of 0 and a standard deviation of 1
- Sklearn provides a fit_transform() method that combines both .fit() and .transform()
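- Standardization is equivalent to (X - X.mean()) / X.std(ddof=0), i.e. it uses the population standard deviation (note that pandas' default .std() uses ddof=1)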

In [7]:
from sklearn import preprocessing

In [8]:
std = preprocessing.StandardScaler()

In [9]:
std.fit_transform(X2)

array([[-1.41421356, -0.75995002],
       [-0.70710678, -0.63737744],
       [ 0.        , -0.51480485],
       [ 0.70710678, -0.02451452],
       [ 1.41421356,  1.93664683]])

After fitting, there are various attributes we can inspect

In [10]:
std.scale_

array([  1.41421356, 407.92156109])

In [11]:
std.mean_

array([  2., 210.])

In [12]:
std.var_


array([2.000e+00, 1.664e+05])

## Scale to Range

- Scaling to range is translating data so it is between 0 and 1 inclusive
- Be careful using this if you have outliers: one extreme value squeezes the remaining values toward 0 (see column b in the output below)
- Use MinMaxScaler of preprocessing module

In [15]:
from sklearn import preprocessing  # same import as in the first cell, repeated here
# MinMaxScaler rescales each column linearly so that its minimum maps to 0 and
# its maximum maps to 1.
mms = preprocessing.MinMaxScaler()
# Fit on X2 and return the rescaled values as a NumPy array.
mms.fit_transform(X2)


array([[0.        , 0.        ],
       [0.25      , 0.04545455],
       [0.5       , 0.09090909],
       [0.75      , 0.27272727],
       [1.        , 1.        ]])

## Dummy Variables
- Pandas can be used to create dummy variables from categorical data
- This is also referred to as one-hot encoding or indicator encoding
- They are useful if the data is nominal (unordered)
- The get_dummies function in pandas creates multiple columns for a categorical column, each with a 1 or 0 if the original column had that value

In [16]:
# Two small categorical columns (musician name and instrument) used by the
# encoding examples that follow.
X_cat = pd.DataFrame(
    {
        "name": ["George", "Paul"],
        "inst": ["Bass", "Guitar"],
    }
)

In [17]:
X_cat

Unnamed: 0,name,inst
0,George,Bass
1,Paul,Guitar


In [18]:
# One indicator column per (column, value) pair. dtype=int pins the indicators
# to 0/1 integers: pandas >= 2.0 would otherwise return True/False columns.
pd.get_dummies(X_cat, dtype=int)

Unnamed: 0,name_George,name_Paul,inst_Bass,inst_Guitar
0,1,0,1,0
1,0,1,0,1


In [19]:
# drop_first=True omits the first level of each column, which removes the
# perfectly collinear indicator. dtype=int keeps the result as 0/1 integers
# (pandas >= 2.0 defaults to boolean columns).
pd.get_dummies(X_cat, drop_first=True, dtype=int)

Unnamed: 0,name_Paul,inst_Guitar
0,0,0
1,1,1


If we have high cardinality data, we can use label encoding

## Label Encoder

- An alternative to dummy variable encoding is label encoding
- This will take categorical data and assign each value a number
- It is useful for high cardinality data
- This encoder imposes ordinality which may or may not be desired
- It can take up less space than one-hot encoding and some (tree) algorithms can deal with this encoding

In [30]:
from sklearn import preprocessing  # same import as in the first cell, repeated here
lab = preprocessing.LabelEncoder()
# LabelEncoder handles a single 1-D column at a time, so pass one Series.
# For this column "Bass" is encoded as 0 and "Guitar" as 1.
lab.fit_transform(X_cat['inst'])

array([0, 1])

In [31]:
# if we have encoded values, applying lab.inverse_transform method decodes them
lab.inverse_transform([1,1,1,0,0,1,1])

array(['Guitar', 'Guitar', 'Guitar', 'Bass', 'Bass', 'Guitar', 'Guitar'],
      dtype=object)

## Frequency Encoding
- Another option for handling high cardinality categorical data is to frequency encode it
- This means replacing the name of the category with the count it had in the training data

In [32]:
# Count how often each name occurs; the resulting Series (name -> count) is the
# lookup table for frequency encoding. Bracket indexing is used because
# attribute-style access to a column called "name" is easy to confuse with the
# .name attribute that Series and Index objects carry.
mapping = X_cat["name"].value_counts()

In [33]:
# Replace each name with its occurrence count from `mapping`. Bracket indexing
# avoids the ambiguous attribute-style access to a column called "name".
X_cat["name"].map(mapping)

0    1
1    1
Name: name, dtype: int64

## Date Feature Engineering

In [52]:
from fastai.tabular.data import add_datepart
# add_datepart mutates the DataFrame, so be careful

In [69]:
# Two example dates written in different styles. Each string is parsed on its
# own with pd.Timestamp: pd.to_datetime in pandas >= 2.0 infers a single format
# from the first element and raises ValueError when a later string does not
# match it.
dates = pd.DataFrame(
    {"A": pd.to_datetime([pd.Timestamp(s) for s in ("9/17/2001", "Jan 1, 2002")])}
)

In [70]:
dates

Unnamed: 0,A
0,2001-09-17
1,2002-01-01


In [71]:
add_datepart(dates.copy(), "A")[['AMonth', 'ADay']]

Unnamed: 0,AMonth,ADay
0,9,17
1,1,1


In [72]:
dates

Unnamed: 0,A
0,2001-09-17
1,2002-01-01


## Manual Feature Engineering

In [73]:
# NOTE(review): df is not created in any cell visible in this part of the
# notebook — presumably the Titanic passenger data loaded in an earlier cell.
# Confirm that cell exists so Restart & Run All succeeds.
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [76]:
# Per-cabin summary statistics (min, max, mean, sum) for every numeric column.
# Only numeric columns are aggregated: text columns such as name or ticket have
# no mean, and pandas >= 2.0 raises TypeError for them instead of silently
# dropping them as older versions did.
agg = (
    df.groupby("cabin")[df.select_dtypes(include="number").columns.tolist()]
    .agg(["min", "max", "mean", "sum"])
    .reset_index()
)

In [77]:
agg

Unnamed: 0_level_0,cabin,pclass,pclass,pclass,pclass,survived,survived,survived,survived,age,...,parch,parch,fare,fare,fare,fare,body,body,body,body
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,sum,min,max,mean,sum,min,...,mean,sum,min,max,mean,sum,min,max,mean,sum
0,A10,1,1,1.0,1,0,0,0.0,0,36.0,...,0.0,0,40.1250,40.1250,40.1250,40.1250,,,,0.0
1,A11,1,1,1.0,1,1,1,1.0,1,33.0,...,0.0,0,27.7208,27.7208,27.7208,27.7208,,,,0.0
2,A14,1,1,1.0,1,0,0,0.0,0,,...,0.0,0,52.0000,52.0000,52.0000,52.0000,,,,0.0
3,A16,1,1,1.0,1,1,1,1.0,1,48.0,...,0.0,0,39.6000,39.6000,39.6000,39.6000,,,,0.0
4,A18,1,1,1.0,1,0,0,0.0,0,39.0,...,0.0,0,29.7000,29.7000,29.7000,29.7000,133.0,133.0,133.0,133.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,F33,2,2,2.0,8,1,1,1.0,4,22.0,...,0.0,0,10.5000,13.0000,11.1250,44.5000,,,,0.0
182,F38,3,3,3.0,3,0,0,0.0,0,,...,0.0,0,7.7500,7.7500,7.7500,7.7500,,,,0.0
183,F4,2,2,2.0,8,1,1,1.0,4,1.0,...,1.5,6,39.0000,39.0000,39.0000,156.0000,,,,0.0
184,G6,3,3,3.0,15,0,1,0.6,3,1.0,...,1.2,6,10.4625,16.7000,14.2050,71.0250,,,,0.0


In [78]:
# Flatten the two-level (column, statistic) header into single names such as
# "fare_mean"; the key column ("cabin", "") becomes plain "cabin" via strip("_").
# Labels that are already flat strings are left untouched, so re-running this
# cell is harmless (joining a plain string would turn "cabin" into "c_a_b_i_n").
agg.columns = [
    "_".join(label).strip("_") if isinstance(label, tuple) else label
    for label in agg.columns
]

In [79]:
agg.columns


Index(['cabin', 'pclass_min', 'pclass_max', 'pclass_mean', 'pclass_sum',
       'survived_min', 'survived_max', 'survived_mean', 'survived_sum',
       'age_min', 'age_max', 'age_mean', 'age_sum', 'sibsp_min', 'sibsp_max',
       'sibsp_mean', 'sibsp_sum', 'parch_min', 'parch_max', 'parch_mean',
       'parch_sum', 'fare_min', 'fare_max', 'fare_mean', 'fare_sum',
       'body_min', 'body_max', 'body_mean', 'body_sum'],
      dtype='object')

In [80]:
# Attach the per-cabin statistics to each passenger row. merge defaults to an
# inner join, so passengers whose cabin has no row in agg (e.g. a missing cabin)
# are dropped — the result shown below has far fewer rows than df.
agg_df = df.merge(agg, on="cabin")

In [81]:
agg_df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,parch_mean,parch_sum,fare_min,fare_max,fare_mean,fare_sum,body_min,body_max,body_mean,body_sum
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,...,0.5,1,211.3375,211.3375,211.3375,422.675,,,,0.0
1,1,1,"Madill, Miss. Georgette Alexandra",female,15.0000,0,1,24160,211.3375,B5,...,0.5,1,211.3375,211.3375,211.3375,422.675,,,,0.0
2,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,...,2.0,8,151.5500,151.5500,151.5500,606.200,135.0,135.0,135.0,135.0
3,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,...,2.0,8,151.5500,151.5500,151.5500,606.200,135.0,135.0,135.0,135.0
4,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,...,2.0,8,151.5500,151.5500,151.5500,606.200,135.0,135.0,135.0,135.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,3,1,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24.0000,0,2,PP 9549,16.7000,G6,...,1.2,6,10.4625,16.7000,14.2050,71.025,,,,0.0
291,3,1,"Sandstrom, Miss. Marguerite Rut",female,4.0000,1,1,PP 9549,16.7000,G6,...,1.2,6,10.4625,16.7000,14.2050,71.025,,,,0.0
292,3,0,"Strom, Miss. Telma Matilda",female,2.0000,0,1,347054,10.4625,G6,...,1.2,6,10.4625,16.7000,14.2050,71.025,,,,0.0
293,3,0,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29.0000,1,1,347054,10.4625,G6,...,1.2,6,10.4625,16.7000,14.2050,71.025,,,,0.0


In [82]:
# Element-wise missing-value mask (True where a cell is null); chain .sum() to
# get a per-column count instead of the full boolean frame.
agg_df.isnull()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,parch_mean,parch_sum,fare_min,fare_max,fare_mean,fare_sum,body_min,body_max,body_mean,body_sum
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
291,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
292,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
293,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
