In [None]:
# Last amended:  10th July, 2020
# My folder:    /home/ashok/Documents/5.decisiontree
# VM: lubuntu_machinelearning_I
# Ref Why dummy encoding:
#        https://www.statisticssolutions.com/dummy-coding-the-how-and-why/

# Objectives:
#     i)    Read and explore data
#    ii)    Deal with missing values 
#   iii)    OneHotEncode categorical features
#   iv)     Use Pipeline and ColumnTransformer 
#            for data transformation
#    v)     Pipeline for modeling
#    vi)    Nested pipes

In [None]:
# 1.0 Reset memory
%reset -f
# 1.1 Call libraries

## Data manipulation
import numpy as np
import pandas as pd

# 1.2 for data splitting
from sklearn.model_selection import train_test_split

## Transformers:
# 1.3 Class for imputing missing values
from sklearn.impute import SimpleImputer
# 1.4 One hot encode categorical data--Convert to dummy
from sklearn.preprocessing import OneHotEncoder as onehot
# 1.5 Scale numeric data
from sklearn.preprocessing import StandardScaler
# 1.6 Label encode target column
from sklearn.preprocessing import LabelEncoder


## Composite Transformers
# 1.7 Class for applying multiple data transformation jobs
from sklearn.compose import ColumnTransformer
# 1.8 Pipeline class
from sklearn.pipeline import Pipeline

# 1.9 Estimator
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# User guide: https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier 

In [None]:
# 1.10 Display the output of every expression statement in a cell,
#      not just that of the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 2.0 Import warnings module
import warnings
# 2.1 Do not print warnings on screen.
#     NOTE(review): the "ignore" filter is installed without any category
#     or module restriction, so it is not limited to one kind of warning.
warnings.filterwarnings("ignore")

In [None]:
# 3.0 Toy dataset: 14 rows, six columns.
#     'creditability' is the target column and has no NaN;
#     each of the five predictor columns has at least one NaN.
df = pd.DataFrame(
    data = dict(
        creditability = ['yes'] * 6 + ['no'] * 8,      # Target column
        acc_balance   = [1, 2, 1, np.nan, 1, 2, 1, 2, 1, 2, 2, np.nan, np.nan, np.nan],
        house_owned   = ['big', 'small', np.nan, 'small', 'big', np.nan, np.nan,
                         'big', 'small', 'big', 'big', 'small', np.nan, 'small'],
        age           = [21, 45, np.nan, 40, 34, 89, 23, 65, 87, np.nan, 90, np.nan, 60, np.nan],
        income        = [np.nan, 7.8, 3.4, 5.5, 2.1, 8.9, 3.9, np.nan, 6.9, 9.0, np.nan, 8.0, 8.5, np.nan],
        credit_amount = [1011, np.nan, 3211, np.nan, 1000, 2323, 1010,
                         1500, 1300, 1782, 1212, np.nan, 1232, np.nan],
    )
)

df    # (14,6)

In [None]:
# 3.1 Derive two categorical features, 'age_cat' and 'credit_amount_cat',
#     from 'age' and 'credit_amount'. Both new columns will contain NaN.
#     (Caveat: building features outside a pipeline, before the train/test
#     split, is strictly speaking not recommended because it leaks information
#     about X_test (created below) into X_train. The recommended way is to wrap
#     the step in sklearn's FunctionTransformer and use it inside a processing
#     pipeline. See:
#     https://scikit-learn.org/stable/modules/preprocessing.html#custom-transformers)
#     Alternatively, use sklearn's KBinsDiscretizer.

# Three equal-width intervals between min and max of 'age', labelled "1".."3"
df['age_cat'] = pd.cut(x = df['age'], bins = 3, labels = ["1", "2", "3"])

# 3.1.1 Three equal-frequency (quantile) intervals of 'credit_amount'
df['credit_amount_cat'] = pd.qcut(x = df['credit_amount'], q = 3, labels = ["low", "medium", "high"])
df   # (14,8)


In [None]:
# 3.2 Randomly shuffle rows, as values in the
#     'creditability' column are ordered ('yes' rows first, then 'no').
#     random_state is fixed so that 'Restart & Run All' reproduces the
#     same row order; change the value to obtain a different shuffle.
df = df.sample(frac = 1, random_state = 42)
df    # same rows and columns as before, in shuffled order

### Data splitting

In [None]:
# 3.3    Separate predictors and target.
#        The target is selected (not popped) and the predictors are built
#        with drop(), so 'df' is left untouched and this cell can be
#        re-run without raising a KeyError.

y = df['creditability']
y[:3]      # Pandas Series

# 3.4   Predictors: every column except the target
X = df.drop(columns = ['creditability'])
X.shape    # (14,7)

In [None]:
# 4.0 Split dataset. We will preprocess X_train and apply that
#     processing to X_test later.
#     random_state is fixed so that the split is repeatable for a given
#     row order of X; change the value to draw a different split.
X_train,X_test, y_train, y_test = train_test_split(
                                                    X,                   # Data features
                                                    y,                   # Target column
                                                    test_size = 0.3,     # split-ratio
                                                    random_state = 42    # repeatable split
                                                    )

# 4.1 Note the use of f-string for printing
#     (with 14 rows, test_size = 0.3 gives 5 test rows and 9 train rows)
f"X_train shape: {X_train.shape}"    # (9,7)
f"X_test.shape : {X_test.shape}"     # (5,7)
f"y_train shape: {y_train.shape}"    # (9,)
f"y_test shape : {y_test.shape}"     # (5,)

#### Make copy of data set for two separate ways processing

In [None]:
# 4.2   Work on copies of X_train and X_test in Part I
#       (processing without pipes), so that the originals
#       stay unmodified for Part II (processing with pipes)

X_train_c, X_test_c = X_train.copy(), X_test.copy()

### Separate out categorical and numerical features

In [None]:
### 4.3
###    Spot columns that are categorical
###    even though they may hold numbers

# 4.3 Count unique values in each column.
#     Rule used here: a column with 4 or fewer unique values
#     is treated as categorical, otherwise as numeric

f"Total no of unique values per column are:"
X_train_c.nunique()        # Number of unique values in each column

# 4.4 Fewer than 5 unique values => categorical
f"True are categorical and False are numerical:"
X_train_c.nunique().lt(5)    # All True are categorical


In [None]:
# 4.5 Build the lists cat_cols and num_cols

# 4.6 Boolean Series indexed by column name:
#     True => categorical, False => numeric
dg = X_train_c.nunique().lt(5)
dg

# 4.7 Use the Series as its own mask (and its negation)
#     and keep the column names from the index
cat_cols = dg[dg].index.tolist()
num_cols = dg[~dg].index.tolist()

In [None]:
# 4.8 Display the two lists of column names built in step 4.7
cat_cols    #  categorical columns (fewer than 5 unique values in X_train_c)
num_cols    #  remaining columns, treated as numeric

In [None]:
# 4.9 Split the numeric columns into two groups,
#     one per imputation strategy:
#       'age'                       -> filled with the mean
#       'income', 'credit_amount'   -> filled with the median
num_cols_mean, num_cols_median = ['age'], ['income', 'credit_amount']

In [None]:
# 4.10 Split the categorical columns into two groups,
#      one per imputation strategy:
#        'acc_balance', 'house_owned'     -> 'most_frequent' fill
#        'age_cat', 'credit_amount_cat'   -> 'constant' fill

cat_cols_mf, cat_cols_constant = (
    ['acc_balance', 'house_owned'],
    ['age_cat', 'credit_amount_cat'],
)

In [None]:
# 4.11 So we have four data-subsets of X_train, one per imputation strategy.
#      Each line below only displays the subset; nothing is modified.
X_train[num_cols_mean]              # Num dataset, impute by 'mean'   strategy
X_train[num_cols_median]            # Num dataset, impute by 'median' strategy
X_train[cat_cols_mf]                # Cat dataset, impute by 'most_frequent' strategy
X_train[cat_cols_constant]          # Cat dataset, impute by 'constant' strategy

## Part I
## Data preprocessing without Pipelining
Pre-process each one of the four subsets of data separately. And finally manually concatenate all results to create final dataset. We will henceforth use two terms:<br>
<ul>
    <li>
    transformers: Which transform a dataset. Examples: <i>StandardScaler()</i>, <i>Normalizer()</i>, <i>SimpleImputer()</i>,  <i>OneHotEncoder()</i>, <i>Pipeline()</i>, <i>ColumnTransformer()</i>. Transformers have <i>fit()</i>, <i>transform()</i> and <i>fit_transform()</i> methods. Transformers do not make any predictions and hence there is no <i>predict()</i> method.
    </li>
    <li>
estimators:   Which estimate the pattern in data. Examples: <i>DecisionTreeClassifier</i>, <i>KMeans</i>, <i>GMM</i>.      Estimators have <i>fit()</i> and <i>predict()</i> methods. There is generally no need for any transformation and hence usually no <i>transform()</i> method.
    </li>
</ul>


### Impute missing values

#### Pre-process the two subsets of numerical columns first

In [None]:
## 5.1 Fill NaN in the 'num_cols_median' columns
#      using the column median

# 5.1.1 Create the imputer; 'strategy' decides
#        what value replaces NaN

si_median = SimpleImputer(strategy = 'median')

# 5.1.2 Let 'si_median' learn from the data-subset (fit),
#       then fill the NaNs (transform) and write the
#       result back over the same columns

X_train_c[num_cols_median] = si_median.fit(X_train_c[num_cols_median]).transform(X_train_c[num_cols_median])

# 5.1.3 Check the result: no NaN should remain
X_train_c[num_cols_median]

In [None]:
# 5.2 Mean imputer: identical procedure, applied
#     to the 'num_cols_mean' columns

si_mean = SimpleImputer(strategy = 'mean')
X_train_c[num_cols_mean] = si_mean.fit(X_train_c[num_cols_mean]).transform(X_train_c[num_cols_mean])
X_train_c[num_cols_mean]

#### Pre-process two subsets of categorical columns, next

In [None]:
# 5.3 Categorical columns in 'cat_cols_mf':
#     fill NaN with the most frequent value of each column

si_mf = SimpleImputer(strategy = 'most_frequent')
X_train_c[cat_cols_mf] = si_mf.fit(X_train_c[cat_cols_mf]).transform(X_train_c[cat_cols_mf])
X_train_c[cat_cols_mf]

In [None]:
# 5.4 Categorical columns in 'cat_cols_constant':
#     fill NaN with the constant value 'missing'

si_constant = SimpleImputer(strategy = 'constant', fill_value = 'missing')
X_train_c[cat_cols_constant] = si_constant.fit(X_train_c[cat_cols_constant]).transform(X_train_c[cat_cols_constant])
X_train_c[cat_cols_constant]

In [None]:
## A Summary
#  This cell only displays objects created so far; nothing is modified.
# 5.5 So our fitted transformers are:

si_median         # To transform num_cols_median
si_mean           # To transform num_cols_mean
si_mf             # To transform cat_cols_mf
si_constant       # To transform cat_cols_constant

# 5.6 Our column-collections are:

cat_cols          # cat_cols_mf + cat_cols_constant
num_cols          # num_cols_mean + num_cols_median

cat_cols_mf
cat_cols_constant
num_cols_mean
num_cols_median

# 5.7 Our datasets are (first two rows of each):

X_train[:2],X_test[:2],y_train[:2],y_test[:2]

### One hot encoding all categorical columns

In [None]:
# 6.0 What does OneHotEncoder do?
#     Demo with 'sparse = False', i.e. output is a dense array
#     NOTE(review): scikit-learn 1.2 renamed this keyword to 'sparse_output'
#     and later releases dropped 'sparse' -- confirm against the installed version.

# 6.1 Instantiate onehot class
#     through 'ohe' object

ohe = onehot(sparse = False)

# 6.2 Let 'ohe' learn relevant data
#     properties. Our demo data is: 4 X 2
#     (four rows, two categorical columns)

ohe.fit(
         [                            # A list of lists
            ['big'   , 'yes' ],
            ['small' , 'no'  ],
            ['medium', 'yes' ],
            ['big'   , 'no'  ]
         ]
       )

# 6.3 Use 'ohe' to transform the same demo data
#     to dummy values

ohe.transform([['big', 'yes'], ['small', 'no'], ['medium', 'yes'],['big','no']])



In [None]:
# 6.4 From now on the fitted 'ohe' can transform any data
#     made up of the levels it has seen during fit().
#     For example:

ohe.transform(
               [
                   ['small'  , 'yes'],
                   ['medium' , 'no' ]
               ])

In [None]:
# 6.5 What does OneHotEncoder do?
#    In one line, it transforms categorical data to dummy values (1,0)
#     Demo with 'sparse = True' -- Output is stored in a special compressed format
#     See here dense matrix to sparse matrix conversion example:
#     http://www.btechsmartclass.com/data_structures/ds_images/Triplet_Representation_of_Sparse_Matrix.png
#     https://www.researchgate.net/publication/328995968/figure/fig4/AS:693582436528129@1542374347304/Illustration-of-the-sparse-matrix-format-A-Example-matrix-of-size-8-8-with-5.png

# 6.5.1 A new encoder object; it replaces the earlier 'ohe'

ohe = onehot(sparse = True)


# 6.5.2 Fit and transform the demo data in one go
sp = ohe.fit_transform([['big', 'yes'], ['small', 'no'], ['medium', 'yes'],['big','no']])

# 6.5.3 Display the sparse result
sp


In [None]:
# 6.5.4  Convert the sparse result 'sp' to its dense array form
sp.toarray()

In [None]:
# 6.6 One Hot Encode all categorical columns, cat_cols 
#     Note, by now all NaNs have been filled.
#     handle_unknown = 'ignore': the train/test split is random and tiny,
#     so a level may occur only in the test rows. With this setting such a
#     level is encoded as all zeros for that column, instead of raising an
#     error when 'ohe' is later used to transform the test data.

ohe = onehot(sparse = False, handle_unknown = 'ignore')
ohe.fit_transform(X_train_c[cat_cols])

### Standard scaling all numeric columns

In [None]:
# 6.7 Standardize every numeric column the same way.
#     Note, by now all NaNs have been dealt with
ss = StandardScaler()
ss.fit(X_train_c[num_cols]).transform(X_train_c[num_cols])

#### Concatenate pre-processed data

In [None]:
# 7.0 Assemble the complete processed training set:
#     dummy-encoded categorical block and scaled numeric block
a = ohe.transform(X_train_c[cat_cols])
b = ss.transform(X_train_c[num_cols])

# 7.1 Join the two blocks side by side (column-wise)
Xtrain = np.concatenate((a, b), axis = 1) # It is not X_train
Xtrain.shape # (rows of X_train_c, dummy columns + numeric columns)

### Label encoding target
Encode the text values in the target column as digits. Our <i>y_train</i> (the <i>'creditability'</i> column) has values 'yes', 'no'...

In [None]:
# 8.0 Label encode target feature
# 8.0.1 Our target
f"Unencoded target column is:"
y_train

# 8.0.2 Encode now.
#       fit_transform() both learns the labels and encodes them, so a
#       separate fit() call beforehand is not needed.
#       y_train is overwritten with its encoded values.
le = LabelEncoder()
y_train = le.fit_transform(y_train)

# 8.0.3 Just print transformed y_train
f"Encoded y_train is: {y_train}"   # Encoded y

### Decision tree Modeling

In [None]:
# 9.0 Train model using Xtrain
#     (DecisionTreeClassifier is already imported in the
#      import cell at the top of the notebook)

# 9.1 Instantiate DecisionTreeClassifier class.
#     random_state is fixed so that the same training data
#     always yields the same fitted tree.

dt = DecisionTreeClassifier(random_state = 42)

# 9.2 Use the classifier object to train 
#     on our data

dt.fit(Xtrain,y_train)


## Transform X_test_c
Before making predictions, we need to transform the columns of <i>X_test_c</i> in the same manner as we did for <i>X_train_c</i>.

In [None]:
# 9.3 Goal: predict values for X_test_c

# 9.4 X_test_c must first go through the same transformations as the
#      training copy. No 'fit()' this time: only the objects
#      already fitted on the training data are used.

#     NOTE: If you hit an error about new levels being present in the
#           test data, re-create the train/test split (from #4.0 onwards)
#           so that every level in X_test_c also occurs in X_train_c.

X_test_c[num_cols_median]   = si_median.transform(X_test_c[num_cols_median])
X_test_c[num_cols_mean]     = si_mean.transform(X_test_c[num_cols_mean])
X_test_c[cat_cols_mf]       = si_mf.transform(X_test_c[cat_cols_mf])
X_test_c[cat_cols_constant] = si_constant.transform(X_test_c[cat_cols_constant])
a     = ohe.transform(X_test_c[cat_cols])
b     = ss.transform(X_test_c[num_cols])
Xtest = np.concatenate((a, b), axis = 1)   # Final transformed X_test

In [None]:
# 9.5 Also label encode y_test,
#     using the 'le' object fitted earlier on y_train
f"Values in y_test are:"
y_test

# y_test is overwritten here with its encoded values
y_test = le.transform(y_test)
f"Transformed y_test is: {y_test}"

In [None]:
# 9.6 Make predictions for Xtest with the trained tree 'dt'
dt.predict(Xtest)

## Part II
## Data preprocessing with Pipelining
Pre-process each one of the four subsets of data through a pipe and also perform modeling in a pipe. 

In [None]:
# 10.0 Create pipes for all transformations
# Ref: https://scikit-learn.org/stable/modules/compose.html#pipeline

#     ColumnTransformer: Applies specified transformations in parallel, one to each data-subset
#     Pipeline         :  Applies transformation sequentially through transformers. 
#                         Input to pipe is one data-subset. Output of one transformer
#                         is fed to another.

#     Big picture
#     i)   top_pipeline = Pipeline([ColumnTransformer, Estimator])
#     ii)  ColumnTransformer([(pipe_mean, cols_mean),(pipe_median, cols_median),
#                             (pipe_mf,   cols_mf),  (pipe_constant,cols_const)])
#     iii) pipe_mean = Pipeline([(imputer), (StandardScaler)]) 
#      iv) pipe_median=....
#
#     The pipeline can be used as any other estimator
#     and avoids leaking the test set into the train set

In [None]:
# 10.1 Pipeline for numeric data imputed with the mean:
#      mean imputation followed by standard scaling.
#      Here a Pipeline acts as a composite transformer.

pipe_mean_transformer = Pipeline(
    steps = [
        ('si',  SimpleImputer(strategy = 'mean')),
        ('ss1', StandardScaler()),
    ]
)

# 10.1.1 Fit and transform once, just to check that the pipe works
pipe_mean_transformer.fit_transform(X_train[num_cols_mean])

In [None]:
# 10.2 Pipeline for numeric data imputed with the median:
#      median imputation followed by standard scaling
pipe_median_transformer = Pipeline(
    steps = [
        ('sm',  SimpleImputer(strategy = 'median')),
        ('ss2', StandardScaler()),
    ]
)

In [None]:
# 10.3 Instantiate Pipeline object for processing cat data. Impute = most_frequent
#      Steps: most-frequent imputation, then one hot encoding.
#      handle_unknown = 'ignore': a level that was not present in the data
#      used for fitting is encoded as all zeros at transform/predict time,
#      instead of raising an error.
pipe_mf_transfomer = Pipeline(
                                [
                                  ('mf', SimpleImputer(strategy='most_frequent')),
                                  ('ohe', onehot(handle_unknown='ignore'))
                                ]
                              )

In [None]:
# 10.4 Instantiate Pipeline object for processing cat data. Impute = constant
#      Steps: fill NaN with the constant 'missing', then one hot encoding.
#      handle_unknown = 'ignore': a level that was not present in the data
#      used for fitting is encoded as all zeros at transform/predict time,
#      instead of raising an error.
sc = SimpleImputer(strategy="constant", fill_value = 'missing')

pipe_constant_transformer = Pipeline(
                                       [
                                          ('cons', sc),
                                          ('ohe', onehot(handle_unknown='ignore'))
                                        ]
                                      )

In [None]:
# 10.5 Gather the four pipes in one ColumnTransformer.
#      Each entry is a triple: (some-name, transformer, col-names)
ct_transformer = ColumnTransformer(
    transformers = [
        ('pm',    pipe_mean_transformer,     num_cols_mean),
        ('pme',   pipe_median_transformer,   num_cols_median),
        ('pmf',   pipe_mf_transfomer,        cat_cols_mf),
        ('pcons', pipe_constant_transformer, cat_cols_constant),
    ]
)

In [None]:
# 10.6 Final pipeline for transformation and modeling:
#      the column transformer (data transformation) feeds
#      the decision tree (estimator)
final_pipe_transformer_estimator = Pipeline(
    steps = [
        ('ct', ct_transformer),              # Column transformer object
        ('dt', DecisionTreeClassifier()),    # Estimator
    ]
)

#### Train final_pipe on data

In [None]:
# 11.0 Train on data using final_pipe
#     We use (X_train, y_train): X_train is the unprocessed training
#     data; y_train was already label-encoded in step 8.0

final_pipe_transformer_estimator.fit(X_train,y_train)

In [None]:
# 11.1 Make prediction on test data
#     Note that there is no need to separately
#     transform X_test. The fitted pipes apply the
#     transformations to X_test before the tree predicts

final_pipe_transformer_estimator.predict(X_test)

In [None]:
# 11.2 But what is the actual y_test?
#      y_test was already label-encoded in step 9.5, so no further
#      le.transform() is needed before comparing it with the predictions
y_test

In [None]:
######## That's all folks ##########