In [None]:
# Last amended: 30th June, 2020
# My folder:    /home/ashok/Documents/5.decisiontree

# Objectives:
#     i)    Read and explore data
#    ii)    Deal with missing values 
#   iii)    OneHotEncode categorical features
#   iv)     Use Pipeline and ColumnTransformer 
#            for data transformation
#    v)     Pipeline for modeling
#    vi)    Nested pipes

In [1]:
# 1.0 Reset memory
%reset -f
# 1.1 Call libraries
import numpy as np
import pandas as pd

# 1.2 for data splitting
from sklearn.model_selection import train_test_split
# 1.3 Class for imputing missing values
from sklearn.impute import SimpleImputer
# 1.4 One hot encode categorical data--Convert to dummy
from sklearn.preprocessing import OneHotEncoder as onehot
# 1.5 Scale numeric data
from sklearn.preprocessing import StandardScaler
# 1.6 Class for applying multiple data transformation jobs
from sklearn.compose import ColumnTransformer
# 1.7 Pipeline class
from sklearn.pipeline import Pipeline
# 1.8 Label encode target column
from sklearn.preprocessing import LabelEncoder

# 1.9 Modeler
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# User guide: https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier 

In [2]:
# 1.10 Display outputs of all commands from a cell--not just of the last command
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# 2.0 Import warnings module
import warnings
# 2.1 Do not print warnings on screen
warnings.filterwarnings("ignore")

In [5]:
# 3.0 Create a toy dataset with six columns
#     Every column has at least one NaN except column: 'creditability'
#     'creditability' is our target column so no NaN
df = pd.DataFrame({
                    'creditability' : ['yes','yes','yes','yes','yes','yes','no','no','no','no','no','no','no','no'], # Target column
                    'acc_balance'   : [1,2,1,np.nan,1,2,1,2,1,2,2,np.nan,np.nan,np.nan],
                    'house_owned'   : ['big','small',np.nan,'small','big',np.nan,np.nan,'big','small','big','big','small',np.nan,'small'],
                    'age'           : [21,45,np.nan,40,34,89,23,65,87,np.nan,90,np.nan,60,np.nan],
                    'income'        : [np.nan,7.8,3.4,5.5,2.1,8.9,3.9,np.nan,6.9,9.0,np.nan,8.0,8.5,np.nan],  
                    'credit_amount' : [1011,np.nan,3211,np.nan,1000,2323,1010,1500,1300,1782,1212,np.nan,1232,np.nan]
                  }
               )

df    # (14,6)

Unnamed: 0,creditability,acc_balance,house_owned,age,income,credit_amount
0,yes,1.0,big,21.0,,1011.0
1,yes,2.0,small,45.0,7.8,
2,yes,1.0,,,3.4,3211.0
3,yes,,small,40.0,5.5,
4,yes,1.0,big,34.0,2.1,1000.0
5,yes,2.0,,89.0,8.9,2323.0
6,no,1.0,,23.0,3.9,1010.0
7,no,2.0,big,65.0,,1500.0
8,no,1.0,small,87.0,6.9,1300.0
9,no,2.0,big,,9.0,1782.0


In [6]:
# 3.1 Engineer some new categorical features from 'age' and 'credit_amount'
#     We will have NaN values both in 'age_cat' and 'credit_amount_cat' columns
#     (Note: Strictly speaking this method of creating features outside pipeline is 
#     not recommended as it leaks information about X_test (to be created, see below)
#     to X_train. Recommended way is to wrap it up inside sklearn's FunctionTransformer
#     and then use wrapped transformer within a processing pipeline. See:
#     https://scikit-learn.org/stable/modules/preprocessing.html#custom-transformers)

df['age_cat'] = pd.cut(df['age'],               # Equal interval cuts between min and max
                       3,                       # Three cuts
                       labels=["1","2", "3"]    # Label for each cut
                       )

# 3.1.1
df['credit_amount_cat'] = pd.qcut(df['credit_amount'],       # Equal freq cut
                                  3,
                                  labels=["low","medium", "high"])
df   # (13,8)


Unnamed: 0,creditability,acc_balance,house_owned,age,income,credit_amount,age_cat,credit_amount_cat
0,yes,1.0,big,21.0,,1011.0,1.0,low
1,yes,2.0,small,45.0,7.8,,2.0,
2,yes,1.0,,,3.4,3211.0,,high
3,yes,,small,40.0,5.5,,1.0,
4,yes,1.0,big,34.0,2.1,1000.0,1.0,low
5,yes,2.0,,89.0,8.9,2323.0,3.0,high
6,no,1.0,,23.0,3.9,1010.0,1.0,low
7,no,2.0,big,65.0,,1500.0,2.0,medium
8,no,1.0,small,87.0,6.9,1300.0,3.0,medium
9,no,2.0,big,,9.0,1782.0,,high


In [7]:
# 3.2 Randomly shuffle data as values 
#     in 'credibility' column have an order
df = df.sample(frac = 1) 
df    # (13,8)

Unnamed: 0,creditability,acc_balance,house_owned,age,income,credit_amount,age_cat,credit_amount_cat
2,yes,1.0,,,3.4,3211.0,,high
3,yes,,small,40.0,5.5,,1.0,
0,yes,1.0,big,21.0,,1011.0,1.0,low
11,no,,small,,8.0,,,
9,no,2.0,big,,9.0,1782.0,,high
5,yes,2.0,,89.0,8.9,2323.0,3.0,high
7,no,2.0,big,65.0,,1500.0,2.0,medium
10,no,2.0,big,90.0,,1212.0,3.0,low
12,no,,,60.0,8.5,1232.0,2.0,medium
13,no,,small,,,,,


### Data splitting

In [8]:
# 3.3    Popout target
#        to separate predictors and target

y = df.pop('creditability')
y[:3]      # Pandas Series

# 3.4   Create an alias of predictors dataset 
X = df     # X is another name for df
X.shape    # (13,7)

2    yes
3    yes
0    yes
Name: creditability, dtype: object

(14, 7)

In [9]:
# 4.0 Split dataset. We will preprocess X_train and apply that
#     processing to X_test later
X_train,X_test, y_train, y_test = train_test_split(
                                                    X,                   # Data features
                                                    y,                   # Target column
                                                    test_size = 0.3      # split-ratio
                                                    )

# 4.1 Note the use of f-string for printing
f"X_train shape: {X_train.shape}"    # (9,7)
f"X_test.shape : {X_test.shape}"     # (4,7)
f"y_train shape: {y_train.shape}"    # (9,)
f"y_test shape : {y_test.shape}"     # (4,)

'X_train shape: (9, 7)'

'X_test.shape : (5, 7)'

'y_train shape: (9,)'

'y_test shape : (5,)'

#### Make copy of data set for two separate ways processing

In [10]:
# 4.2   Make a copy of X_train
#       and X_test for two separate
#       ways of data processing
#       without using pipes and with pipes

X_train_c = X_train.copy()
X_test_c  = X_test.copy()

### Separate out categorical and numerical features

In [11]:
### 4.3
###    Which columns are categorical
###    but disguised as integers

# 4.3 How many unique values per column.
#     Check every column
#     We will assume that if number of unique values
#      are 4 or less it is categorical column else numeric

f"Total no of unique values per column are:"
X_train_c.nunique()        # Total no of unique values in each column

# 4.4 If no. of unique values less than 5, it is categorical
f"True are categorical and False are numerical:"
X_train_c.nunique() < 5    # All True are categorical


'Total no of unique values per column are:'

acc_balance          2
house_owned          2
age                  6
income               8
credit_amount        7
age_cat              3
credit_amount_cat    3
dtype: int64

'True are categorical and False are numerical:'

acc_balance           True
house_owned           True
age                  False
income               False
credit_amount        False
age_cat               True
credit_amount_cat     True
dtype: bool

In [12]:
# 4.5 Extract list of cat_cols and num_cols:

# 4.6 First note which are cat and which are num
dg = (X_train_c.nunique() < 5)  
dg    # All True are cat and all False are num

# 4.7 Then filter out names from Series index 
cat_cols = dg[dg==True].index.tolist()
num_cols = dg[dg==False].index.tolist()

acc_balance           True
house_owned           True
age                  False
income               False
credit_amount        False
age_cat               True
credit_amount_cat     True
dtype: bool

In [13]:
# 4.8 Here are the columns
cat_cols    #  4
num_cols    #  3

['acc_balance', 'house_owned', 'age_cat', 'credit_amount_cat']

['age', 'income', 'credit_amount']

In [14]:
# 4.9 We will create two subsets of num_cols
#      One set we will impute using 'mean' 
#       and the other using 'median'
num_cols_mean   = ['age']
num_cols_median = ['income', 'credit_amount']

In [15]:
# 4.10 We will create two sets of cat_cols
#      One set we will fill with 'most_frequent'
#       and the other using a constant value

cat_cols_mf       = ['acc_balance', 'house_owned']       # 'most_frequent' fill
cat_cols_constant = ['age_cat', 'credit_amount_cat']     # 'constant' fill

In [16]:
# 4.11 So we have four datasets for imputing: These are:
X_train[num_cols_mean]              # Num dataset, impute by 'mean'   strategy
X_train[num_cols_median]            # Num dataset, impute by 'median' strategy
X_train[cat_cols_mf]                # Cat dataset, impute by 'most_frequent' strategy
X_train[cat_cols_constant]          # Cat dataset, impute by 'constant' strategy

Unnamed: 0,age
12,60.0
3,40.0
5,89.0
2,
8,87.0
6,23.0
4,34.0
13,
9,


Unnamed: 0,income,credit_amount
12,8.5,1232.0
3,5.5,
5,8.9,2323.0
2,3.4,3211.0
8,6.9,1300.0
6,3.9,1010.0
4,2.1,1000.0
13,,
9,9.0,1782.0


Unnamed: 0,acc_balance,house_owned
12,,
3,,small
5,2.0,
2,1.0,
8,1.0,small
6,1.0,
4,1.0,big
13,,small
9,2.0,big


Unnamed: 0,age_cat,credit_amount_cat
12,2.0,medium
3,1.0,
5,3.0,high
2,,high
8,3.0,medium
6,1.0,low
4,1.0,low
13,,
9,,high


## Part I
## Data preprocessing without Pipelining
Pre-process each one of the four subsets of data separately. And finally manually concatenate all results to create final dataset.

### Impute missing values

#### Pre-process the two subsets of numerical columns first

In [17]:
## 5.1 Impute NaN first in 'num_cols_median'
#      Median imputer

# 5.1.1 Instantiate SimpleImputer object
#        Note the strategy of filling NaN

si_median = SimpleImputer(strategy = 'median')

# 5.1.2 Next, use 'si_median' object to fit 
#       and transform at one go and overwrite 
#       our data-subset

X_train_c[num_cols_median] = si_median.fit_transform(X_train_c[num_cols_median])

# 5.1.3 Observe result. It should have no NaNs
X_train_c[num_cols_median]

Unnamed: 0,income,credit_amount
12,8.5,1232.0
3,5.5,1300.0
5,8.9,2323.0
2,3.4,3211.0
8,6.9,1300.0
6,3.9,1010.0
4,2.1,1000.0
13,6.2,1300.0
9,9.0,1782.0


In [18]:
# 5.2 Mean imputer: Same steps as above but
#     on different data-subset

si_mean = SimpleImputer(strategy = 'mean')
X_train_c[num_cols_mean] = si_mean.fit_transform(X_train_c[num_cols_mean])
X_train_c[num_cols_mean]

Unnamed: 0,age
12,60.0
3,40.0
5,89.0
2,55.5
8,87.0
6,23.0
4,34.0
13,55.5
9,55.5


#### Pre-process two subsets of categorical columns, next

In [19]:
# 5.3 Next impute subset of categorical columns
#     with most_frequent

si_mf = SimpleImputer(strategy = 'most_frequent')
X_train_c[cat_cols_mf] = si_mf.fit_transform(X_train_c[cat_cols_mf])
X_train_c[cat_cols_mf]

Unnamed: 0,acc_balance,house_owned
12,1.0,small
3,1.0,small
5,2.0,small
2,1.0,small
8,1.0,small
6,1.0,small
4,1.0,big
13,1.0,small
9,2.0,big


In [20]:
# 5.4 Next impute subset of categorical columns
#     with constant value. Our constant value= 'missing'

si_constant = SimpleImputer(strategy = 'constant', fill_value = 'missing')
X_train_c[cat_cols_constant] = si_constant.fit_transform(X_train_c[cat_cols_constant])
X_train_c[cat_cols_constant]

Unnamed: 0,age_cat,credit_amount_cat
12,2,medium
3,1,missing
5,3,high
2,missing,high
8,3,medium
6,1,low
4,1,low
13,missing,missing
9,missing,high


### One hot encoding all categorical columns

In [21]:
# 6.0 What does OneHotEncoder do?
#     Demo with 'sparse = False'
# 6.1
ohe = onehot(sparse = False)
# 6.2 Learn data characteristics. 
#     Our demo data is: 4 X 2

ohe.fit([['big', 'yes'], ['small', 'no'], ['medium', 'yes'],['big','no']])

# 6.3 Transform demo data to dummy values

ohe.transform([['big', 'yes'], ['small', 'no'], ['medium', 'yes'],['big','no']])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)

array([[1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.]])

In [22]:
# 6.1 What does OneHotEncoder do?
#    In one line, it transforms dense data to dummy values (1,0)
#     Demo with 'sparse = True' -- Output is stored in a special compressed format
#     See here dense matrix to sparse matrix conversion example:
#     http://www.btechsmartclass.com/data_structures/ds_images/Triplet_Representation_of_Sparse_Matrix.png
#     https://www.researchgate.net/publication/328995968/figure/fig4/AS:693582436528129@1542374347304/Illustration-of-the-sparse-matrix-format-A-Example-matrix-of-size-8-8-with-5.png

# 6.1.1

ohe = onehot(sparse = True)


# 6.1.2
sp = ohe.fit_transform([['big', 'yes'], ['small', 'no'], ['medium', 'yes'],['big','no']])

# 6.1.3
sp

# 6.1.4
sp.toarray()

<4x5 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

array([[1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.]])

In [23]:
# 5.4 One Hot Encode all categorical columns, cat_cols 
#     Note, by now all NaNs have been filled

ohe = onehot(sparse = False)
ohe.fit_transform(X_train_c[cat_cols])

array([[1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0.]])

### Standard scaling all numeric columns

In [24]:
# 5.5 Scale all numeric variables in the same manner
#     Note, by now all NaNs have been dealth with
ss = StandardScaler()
ss.fit_transform(X_train_c[num_cols])

array([[ 0.21635338,  1.03030583, -0.54398076],
       [-0.74521719, -0.22843885, -0.44519256],
       [ 1.6106307 ,  1.19813845,  1.0409887 ],
       [ 0.        , -1.10956012,  2.33104633],
       [ 1.51447364,  0.35897533, -0.44519256],
       [-1.56255217, -0.89976934, -0.86649517],
       [-1.03368836, -1.65501615, -0.88102284],
       [ 0.        ,  0.06526824, -0.44519256],
       [ 0.        ,  1.24009661,  0.25504142]])

#### Concatenate pre-processed data

In [25]:
# 6.0 So complete dataset is:
a = ohe.transform(X_train_c[cat_cols])
b = ss.transform(X_train_c[num_cols])
# 6.1 Concatenate now
Xtrain = np.hstack([a,b]) # It is not X_train
Xtrain.shape # (8,14)

(9, 15)

### Label encoding target
Code text values in target column to digits

In [26]:
# 5.6 Label encode target feature
# 5.6.1 Our target
f"Unencoded target column is:"
y_train
# 5.6.2 Encode now
le = LabelEncoder()
le.fit(y_train)
y_train = le.fit_transform(y_train)
f"Encoded y_train is: {y_train}"   # Encoded y

'Unencoded target column is:'

12     no
3     yes
5     yes
2     yes
8      no
6      no
4     yes
13     no
9      no
Name: creditability, dtype: object

LabelEncoder()

'Encoded y_train is: [0 1 1 1 0 0 1 0 0]'

### Decision tree Modeling

In [27]:
# 6.0 Train model using Xtrain
from sklearn.tree import DecisionTreeClassifier 

# 6.1 Instantiate DecisionTreeClassifier class

dt = DecisionTreeClassifier()

# 6.2 Use the classifier object to train 
#     on our data

dt.fit(Xtrain,y_train)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [28]:
# 6.3 We now want to predict values for X_test_c

# 6.4 First, transform X_test_c in the same manner as we did for X_train
#      But this time, there will be no 'fit()'. We will use
#       already fitted objects for transformations.

X_test_c[num_cols_median]  = si_median.transform(X_test_c[num_cols_median])
X_test_c[num_cols_mean]    = si_mean.transform(X_test_c[num_cols_mean])
X_test_c[cat_cols_constant]= si_constant.transform(X_test_c[cat_cols_constant])
X_test_c[cat_cols_mf]      = si_mf.transform(X_test_c[cat_cols_mf]) 
a                          = ohe.transform(X_test_c[cat_cols])
b                          = ss.transform(X_test_c[num_cols])
Xtest = np.hstack([a,b])   # Final transformed X_test

In [29]:
# 6.5 Also label encode, y_test
#     Using earlier fitted 'le' object
f"Values in y_test are:"
y_test

y_test = le.transform(y_test)
f"Transformed y_test is: {y_test}"

'Values in y_test are:'

10     no
11     no
0     yes
1     yes
7      no
Name: creditability, dtype: object

'Transformed y_test is: [0 0 1 1 0]'

In [30]:
# 6.3 Make prediction for Xtest
dt.predict(Xtest)

array([0, 0, 0, 1, 0])

## Part II
## Data preprocessing with Pipelining
Pre-process each one of the four subsets of data though a pipe and also perform modeling in pipe. 

In [31]:
# 7.0 Create pipes for all transformations
# Ref: https://scikit-learn.org/stable/modules/compose.html#pipeline

#     ColumnTransformer: Applies specified transformations parallely to each data-subset
#     Pipeline         :  Applies transformation sequentially through transformers. 
#                         Input to pipe is one data-subset. Output of one transformer
#                         is fed to another.

#     Big picture
#     i)   top_pipeline = Pipeline(ColumnTranformer, Estimator)
#     ii)  ColumnTransformer([(pipe_mean),(pipe_median),(pipe_mf),(pipe_constant],[list of columns])
#     iii) pipe_mean = Pipeline([(imputer), (StandardScaler)]) 
#      iv) pipe_median=....
#
#     The pipeline can be used as any other estimator
#     and avoids leaking the test set into the train set

In [32]:
# 7.1 Instantiate Pipeline object for processing numerical data. Impute = mean

pipe_mean = Pipeline(
                      [
                       ('si', SimpleImputer(strategy='mean')),
                       ('ss1', StandardScaler())
                      ]
                    )

# 7.1.1 Train pipe, just to test if it works
pipe_mean.fit_transform(X_train[num_cols_mean])

array([[ 0.21635338],
       [-0.74521719],
       [ 1.6106307 ],
       [ 0.        ],
       [ 1.51447364],
       [-1.56255217],
       [-1.03368836],
       [ 0.        ],
       [ 0.        ]])

In [33]:
# 7.2 Instantiate Pipeline object for processing numerical data. Impute = median
pipe_median = Pipeline(
                        [
                            ('sm', SimpleImputer(strategy='median')),
                            ('ss2', StandardScaler())
                        ]
                       )

In [34]:
# 7.3 Instantiate Pipeline object for processing cat data. Impute = most_frequent
pipe_mf = Pipeline(
                    [
                        ('mf', SimpleImputer(strategy='most_frequent')),
                        ('ohe', onehot())
                     ]
                   )

In [35]:
# 7.4 Instantiate Pipeline object for processing cat data. Impute = constant
sc = SimpleImputer(strategy="constant", fill_value = 'missing')
pipe_constant = Pipeline(
                          [
                             ('cons', sc),
                             ('ohe', onehot())
                          ]
                        )

In [36]:
# 7.5 Collecting all pipes in column transformer
#     along with column names
#                       some-name  transformer     col-names
ct = ColumnTransformer(
                        [
                          ('pm',    pipe_mean,     num_cols_mean    ),
                          ('pme',   pipe_median,   num_cols_median  ),
                          ('pmf',   pipe_mf,       cat_cols_mf      ),
                          ('pcons', pipe_constant, cat_cols_constant)
                        ]
                      )

In [37]:
# 7.6 Final pipeline for transformation and modeling
final_pipe = Pipeline(
                        [
                            ('ct', ct),                      # Column transformer object
                            ('dt', DecisionTreeClassifier()) # Estimator
                        ]
                      )

#### Train final_pipe on data

In [38]:
# 8.0 Train on data using final_pipe
#     We use (X_train, y_train)
final_pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pm',
                                                  Pipeline(memory=None,
                                                           steps=[('si',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                                               

In [39]:
# 8.1 Make prediction on test data
#     Note that there is no need to separately
#     transform X_test. Pipes take care of that

final_pipe.predict(X_test)

array([0, 0, 0, 0, 0])

In [40]:
# 8.2 But what is the actual y_test
y_test
# le.transform(y_test)

array([0, 0, 1, 1, 0])

In [None]:
######## That's all folks ##########