In [1]:
# Last amended: 29th June, 2020
# My folder:    /home/ashok/Documents/5.decisiontree

# Objectives:
#     i)    Read and explore data
#    ii)    Deal with missing values 
#   iii)    OneHotEncode categorical features
#   iv)     Use Pipeline and ColumnTransformer 
#            for data transformation
#    v)     Pipeline for modeling
#    vi)    Nested pipes

In [3]:
# 1.0 Reset memory
%reset -f
# 1.1 Call libraries
import numpy as np
import pandas as pd

# 1.2 for data splitting
from sklearn.model_selection import train_test_split
# 1.3 Class for imputing missing values
from sklearn.impute import SimpleImputer
# 1.4 One hot encode categorical data--Convert to dummy
from sklearn.preprocessing import OneHotEncoder as onehot
# 1.5 Scale numeric data
from sklearn.preprocessing import StandardScaler
# 1.6 Class for applying multiple data transformation jobs
from sklearn.compose import ColumnTransformer
# 1.7 Pipeline class
from sklearn.pipeline import Pipeline
# 1.8 Label encode target column
from sklearn.preprocessing import LabelEncoder

# 1.9 Modeler
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# User guide: https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier 

In [4]:
# 1.10 Display outputs of all commands from a cell--not just of the last command
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
# 2.0 Import warnings module
import warnings
# 2.1 Do not print warnings on screen
warnings.filterwarnings("ignore")

# 2.2 Change ipython options to display all data columns
pd.options.display.max_columns = 300
np.set_printoptions(threshold=np.inf)

In [18]:
# 3.0 Create a toy dataset with six columns
#     Every column has at least one NaN except column: 'creditability'
#     'creditability' is our target column so no NaN
df = pd.DataFrame({
                    'creditability' : ['yes','yes','yes','yes','yes','yes','no','no','no','no','no','no','no','no'], # Target column
                    'acc_balance'   : [1,2,1,np.nan,1,2,1,2,1,2,2,np.nan,np.nan,np.nan],
                    'house_owned'   : ['big','small',np.nan,'small','big',np.nan,np.nan,'big','small','big','big','small',np.nan,'small'],
                    'age'           : [21,45,np.nan,40,34,89,23,65,87,np.nan,90,np.nan,60,np.nan],
                    'income'        : [np.nan,7.8,3.4,5.5,2.1,8.9,3.9,np.nan,6.9,9.0,np.nan,8.0,8.5,np.nan],  
                    'credit_amount' : [1011,np.nan,3211,np.nan,1000,2323,1010,1500,1300,1782,1212,np.nan,1232,np.nan]
                  }
               )

df    # (14,6)

Unnamed: 0,creditability,acc_balance,house_owned,age,income,credit_amount
0,yes,1.0,big,21.0,,1011.0
1,yes,2.0,small,45.0,7.8,
2,yes,1.0,,,3.4,3211.0
3,yes,,small,40.0,5.5,
4,yes,1.0,big,34.0,2.1,1000.0
5,yes,2.0,,89.0,8.9,2323.0
6,no,1.0,,23.0,3.9,1010.0
7,no,2.0,big,65.0,,1500.0
8,no,1.0,small,87.0,6.9,1300.0
9,no,2.0,big,,9.0,1782.0


In [19]:
# 3.1 Engineer some new categorical features from 'age' and 'credit_amount'
#     We will have NaN values both in 'age_cat' and 'credit_amount_cat' columns

df['age_cat'] = pd.cut(df['age'],               # Equal interval cuts between min and max
                       3,                       # Three cuts
                       labels=["1","2", "3"]    # Label for each cut
                       )

# 3.1.1
df['credit_amount_cat'] = pd.qcut(df['credit_amount'],       # Equal freq cut
                                  3,
                                  labels=["high","medium", "low"])
df   # (13,8)


Unnamed: 0,creditability,acc_balance,house_owned,age,income,credit_amount,age_cat,credit_amount_cat
0,yes,1.0,big,21.0,,1011.0,1.0,high
1,yes,2.0,small,45.0,7.8,,2.0,
2,yes,1.0,,,3.4,3211.0,,low
3,yes,,small,40.0,5.5,,1.0,
4,yes,1.0,big,34.0,2.1,1000.0,1.0,high
5,yes,2.0,,89.0,8.9,2323.0,3.0,low
6,no,1.0,,23.0,3.9,1010.0,1.0,high
7,no,2.0,big,65.0,,1500.0,2.0,medium
8,no,1.0,small,87.0,6.9,1300.0,3.0,medium
9,no,2.0,big,,9.0,1782.0,,low


In [20]:
# 3.2 Randomly shuffle data as values 
#     in 'credibility' column have an order
df = df.sample(frac = 1) 
df    # (13,8)

Unnamed: 0,creditability,acc_balance,house_owned,age,income,credit_amount,age_cat,credit_amount_cat
3,yes,,small,40.0,5.5,,1.0,
13,no,,small,,,,,
12,no,,,60.0,8.5,1232.0,2.0,medium
6,no,1.0,,23.0,3.9,1010.0,1.0,high
0,yes,1.0,big,21.0,,1011.0,1.0,high
11,no,,small,,8.0,,,
10,no,2.0,big,90.0,,1212.0,3.0,high
4,yes,1.0,big,34.0,2.1,1000.0,1.0,high
5,yes,2.0,,89.0,8.9,2323.0,3.0,low
7,no,2.0,big,65.0,,1500.0,2.0,medium


### Data splitting

In [21]:
# 3.3    Popout target
#        Separate predictors and target

y = df.pop('creditability')
y[:3]      # Pandas Series

# 3.4   Create an alias of df
X = df     # X is another name for df
X.shape    # (13,7)

3     yes
13     no
12     no
Name: creditability, dtype: object

(14, 7)

In [29]:
# 4.0 Split dataset. We will preprocess X_train and apply that
#     processing to X_test later
X_train,X_test, y_train, y_test = train_test_split(
                                                    X,                   # Data features
                                                    y,                   # Target column
                                                    test_size = 0.3      # split-ratio
                                                    )

# 4.1 Note the use of f-string for printing
f"X_train shape: {X_train.shape}"    # (9,7)
f"X_test.shape : {X_test.shape}"     # (4,7)
f"y_train shape: {y_train.shape}"    # (9,)
f"y_test shape : {y_test.shape}"     # (4,)

'X_train shape: (9, 7)'

'X_test.shape : (5, 7)'

'y_train shape: (9,)'

'y_test shape : (5,)'

#### In our case of very less data, the following check is important

In [31]:
# 4.2   Make a copy of X_train
#       and X_test for two separate
#       ways of data processing
#       without using pipes and with pipes

X_train_c = X_train.copy()
X_test_c  = X_test.copy()

### Separate out categorical and numerical features

In [35]:
### 4.2
###    We now want to know which of the columns are categorical
###    but disguised as integers

# 4.3 How many unique vales per column.
#     Check every column
#     We will assume that if unique values are 4 or less
#     it is categorical column else numeric

f"Total no of unique values per column are:"
X_train_c.nunique()        # Total no of unique values in each column

# 4.4 If no. of unique values less than 5, it is categorical
f"True are categorical and False are numerical:"
X_train_c.nunique() < 5    # All True are categorical


'Total no of unique values per column:'

acc_balance          2
house_owned          2
age                  6
income               7
credit_amount        6
age_cat              3
credit_amount_cat    3
dtype: int64

'True are categorical and False are numerical:'

acc_balance           True
house_owned           True
age                  False
income               False
credit_amount        False
age_cat               True
credit_amount_cat     True
dtype: bool

In [38]:
# 4.5 Extract list of cat_cols and num_cols:

# 4.6 First note which are cat and which are num
dg = (X_train_c.nunique() < 5)  
dg    # All True are cat and all False are num

# 4.7 Then filter out names from Series index 
cat_cols = dg[dg==True].index.tolist()
num_cols = dg[dg==False].index.tolist()

acc_balance           True
house_owned           True
age                  False
income               False
credit_amount        False
age_cat               True
credit_amount_cat     True
dtype: bool

In [39]:
# 4.8 Here are the columns
cat_cols    #  4
num_cols    #  3

['acc_balance', 'house_owned', 'age_cat', 'credit_amount_cat']

['age', 'income', 'credit_amount']

In [41]:
# 4.9 We will create two sets of num_cols
#      One set we will impute using 'mean' 
#       and the other using 'median'
num_cols_mean   = ['age']
num_cols_median = ['income', 'credit_amount']

In [44]:
# 4.10 We will create two sets of cat_cols
#      One set we will fill with 'most_frequent'
#       and the other using a constant value

cat_cols_mf       = ['acc_balance', 'house_owned']       # 'most_frequent' fill
cat_cols_constant = ['age_cat', 'credit_amount_cat']     # 'constant' fill

In [45]:
# 4.11 So our datasets for imputing are:
X_train[num_cols_mean]              # Num dataset, impute by 'mean'   strategy
X_train[num_cols_median]            # Num dataset, impute by 'median' strategy
X_train[cat_cols_mf]                # Cat dataset, impute by 'most_frequent' strategy
X_train[cat_cols_constant]          # Cat dataset, impute by 'constant' strategy

Unnamed: 0,age
1,45.0
3,40.0
12,60.0
8,87.0
13,
9,
10,90.0
4,34.0
2,


Unnamed: 0,income,credit_amount
1,7.8,
3,5.5,
12,8.5,1232.0
8,6.9,1300.0
13,,
9,9.0,1782.0
10,,1212.0
4,2.1,1000.0
2,3.4,3211.0


Unnamed: 0,acc_balance,house_owned
1,2.0,small
3,,small
12,,
8,1.0,small
13,,small
9,2.0,big
10,2.0,big
4,1.0,big
2,1.0,


Unnamed: 0,age_cat,credit_amount_cat
1,2.0,
3,1.0,
12,2.0,medium
8,3.0,medium
13,,
9,,low
10,3.0,high
4,1.0,high
2,,low


## Data preprocessing without Pipelining
Pre-process a subset of columns, at a time. And finally manually concatenate all results

### Impute missing values

#### Pre-process subsets of numerical columns first

In [46]:
## 5.  Create transformers
#  5.1 Impute NaN first in num cols
#      Median imputer

# 5.1.1 Instnatiate SimpleImputer object
si_median = SimpleImputer(strategy = 'median')

# 5.1.2 Use 'si_median' object to fit and transform at one go
#       and overwrite our data
X_train_c[num_cols_median] = si_median.fit_transform(X_train_c[num_cols_median])

# 5.1.3 Final result without NaNs
X_train_c[num_cols_median]

Unnamed: 0,income,credit_amount
1,7.8,1266.0
3,5.5,1266.0
12,8.5,1232.0
8,6.9,1300.0
13,6.9,1266.0
9,9.0,1782.0
10,6.9,1212.0
4,2.1,1000.0
2,3.4,3211.0


In [47]:
# 5.2 Mean imputer: Same as above but on different dataset
si_mean = SimpleImputer(strategy = 'mean')
X_train_c[num_cols_mean] = si_mean.fit_transform(X_train_c[num_cols_mean])
X_train_c[num_cols_mean]

Unnamed: 0,age
1,45.0
3,40.0
12,60.0
8,87.0
13,59.333333
9,59.333333
10,90.0
4,34.0
2,59.333333


#### Pre-process subsets of categorical columns, next

In [18]:
# 5.3 Next impute subset of categorical columns with most_frequent
si_mf = SimpleImputer(strategy = 'most_frequent')
X_train_c[cat_cols_mf] = si_mf.fit_transform(X_train_c[cat_cols_mf])
X_train_c[cat_cols_mf]

Unnamed: 0,account_balance,apartment_owned
8,2.0,big
9,1.0,big
2,1.0,big
10,1.0,small
1,2.0,small
7,2.0,small
6,1.0,big
4,1.0,big


In [47]:
# 5.4 Next impute subset of categorical columns with constant value
si_constant = SimpleImputer(strategy = 'constant', fill_value = 'missing')
X_train_c[cat_cols_constant] = si_constant.fit_transform(X_train_c[cat_cols_constant])
X_train_c[cat_cols_constant]

Unnamed: 0,age_cat,credit_amount_cat
8,3,medium
9,missing,low
2,missing,low
10,3,missing
1,2,missing
7,2,medium
6,1,high
4,1,high


### One hot encoding all categorical columns

In [20]:
# 6.0 What does OneHotEncoder do?
#     Demo with 'sparse = False'
# 6.1
ohe = onehot(sparse = False)
# 6.2
ohe.fit([['big', 'yes'], ['small', 'no'], ['medium', 'yes'],['big','no']])
# 6.3
ohe.transform([['big', 'yes'], ['small', 'no'], ['medium', 'yes'],['big','no']])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)

array([[1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.]])

In [21]:
# 6.1 What does OneHotEncoder do?
#     Demo with 'sparse = True' -- Output is stored in a special compressed format
#     See here dense matrix to sparse matrix conversion example:
#     http://www.btechsmartclass.com/data_structures/ds_images/Triplet_Representation_of_Sparse_Matrix.png
#     https://www.researchgate.net/publication/328995968/figure/fig4/AS:693582436528129@1542374347304/Illustration-of-the-sparse-matrix-format-A-Example-matrix-of-size-8-8-with-5.png
# 6.1.1
ohe = onehot(sparse = True)
# 6.1.2
sp = ohe.fit_transform([['big', 'yes'], ['small', 'no'], ['medium', 'yes'],['big','no']])
# 6.1.3
sp
# 6.1.4
sp.toarray()

<4x5 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

array([[1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.]])

In [22]:
# 5.4 One Hot Encode all categorical columns
#     Note, by now all NaNs have been dealt with
ohe = onehot(sparse = False)
ohe.fit_transform(X_train_c[cat_cols])

array([[0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.]])

### Scaling all numeric columns

In [23]:
# 5.5 Scale all numeric variables in the same manner
#     Note, by now all NaNs have been dealth with
ss = StandardScaler()
ss.fit_transform(X_train_c[num_cols])

array([[ 1.34686254,  0.64768413, -0.41563411],
       [ 0.        ,  1.61061274,  0.31186708],
       [ 0.        , -0.95719689,  2.46871189],
       [ 1.48306212, -0.04012203, -0.26470025],
       [-0.55993162,  1.06036782, -0.26470025],
       [ 0.3480656 , -0.04012203, -0.11376639],
       [-1.55872856, -0.72792818, -0.85334229],
       [-1.05933009, -1.55329556, -0.86843568]])

#### Concatenate pre-processed data

In [24]:
# So complete dataset is:
a = ohe.transform(X_train_c[cat_cols])
b = ss.transform(X_train_c[num_cols])
Xtrain = np.hstack([a,b])
Xtrain.shape # (8,14)

(8, 15)

### Label encoding target

In [25]:
# 5.6 Label encode target feature
# 5.6.1 Our target
y_train
# 5.6.2 Encode now
le = LabelEncoder()
le.fit(y_train)
y_train = le.fit_transform(y_train)
y_train   # Encoded y

8      no
9      no
2     yes
10     no
1     yes
7      no
6      no
4     yes
Name: creditability, dtype: object

LabelEncoder()

array([0, 0, 1, 0, 1, 0, 0, 1])

### Decision tree Modeling

In [26]:
# 6.0 Train model using Xtrain
from sklearn.tree import DecisionTreeClassifier 
dt = DecisionTreeClassifier()
dt.fit(Xtrain,y_train)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [27]:
# 6.1 Transform X_test_c in the same manner as we did for X_train
#      But this time, there will be no 'fit().'. We will use
#       already fitted objects.

X_test_c[num_cols_median]  = si_median.transform(X_test_c[num_cols_median])
X_test_c[num_cols_mean]    = si_mean.transform(X_test_c[num_cols_mean])
X_test_c[cat_cols_constant]= si_constant.transform(X_test_c[cat_cols_constant])
X_test_c[cat_cols_mf]      = si_mf.transform(X_test_c[cat_cols_mf]) 
a                        = ohe.transform(X_test_c[cat_cols])
b                        = ss.transform(X_test_c[num_cols])
Xtest = np.hstack([a,b])   # In X_train also ohe transform came first

In [28]:
# 6.2 Also label encode, y_test
# 6.2 Also label encode, y_test
y_test
le.transform(y_test)
f"Transformed y_test is: {le.transform(y_test)}"

3     yes
0     yes
11     no
5      no
Name: creditability, dtype: object

array([1, 1, 0, 0])

'Transformed y_test is: [1 1 0 0]'

In [29]:
# 6.3 Make prediction for X_test
dt.predict(Xtest)

array([0, 0, 0, 0])

### Creating transformational pipes

In [30]:
# 7.0 Create pipes for all transformations
#     Big picture
#     i)   top_pipeline = Pipeline(ColumnTranformer, Estimator)
#     ii)  ColumnTransformer([(pipe_mean),(pipe_median),(pipe_mf),(pipe_constant],[list of columns])
#     iii) pipe_mean = Pipeline((imputer), (StandardScaler)) 
#      iv) pipe_median=....

In [33]:
# 7.1 Instantiate Pipeline object for processing numerical data. Impute = mean
pipe_mean = Pipeline([('si', SimpleImputer(strategy='mean')),('ss1', StandardScaler())])
# 7.1.1 Train pipe, just to test if it works
pipe_mean.fit_transform(X_train[num_cols_mean])

array([[ 1.34686254],
       [ 0.        ],
       [ 0.        ],
       [ 1.48306212],
       [-0.55993162],
       [ 0.3480656 ],
       [-1.55872856],
       [-1.05933009]])

In [34]:
# 7.2 Instantiate Pipeline object for processing numerical data. Impute = median
pipe_median = Pipeline([('sm', SimpleImputer(strategy='median')),('ss2', StandardScaler())])

In [35]:
# 7.3 Instantiate Pipeline object for processing cat data. Impute = most_frequent
pipe_mf = Pipeline([('mf', SimpleImputer(strategy='most_frequent')), ('ohe', onehot())])

In [36]:
# 7.4 Instantiate Pipeline object for processing cat data. Impute = constant
sc = SimpleImputer(strategy="constant", fill_value = 'missing')
pipe_constant = Pipeline([('cons', sc),('ohe', onehot())])

In [37]:
# 7.5 Collecting all pipes in column transformer
#     along with column names
ct = ColumnTransformer(
                        [
                          ('pm',pipe_mean,num_cols_mean),
                          ('pme',pipe_median,num_cols_median),
                          ('pmf',pipe_mf, cat_cols_mf),
                          ('pcons', pipe_constant,cat_cols_constant)
                        ]
                      )

In [41]:
# 7.6 Final pipeline for transformation and modeling
final_pipe = Pipeline([('ct', ct),('dt', DecisionTreeClassifier())])

#### Train final_pipe on data

In [42]:
# 8.0 Train on data using final_pipe
#     We use (X_train, y_train)
final_pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pm',
                                                  Pipeline(memory=None,
                                                           steps=[('si',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                                               

In [43]:
# 8.1 Make prediction on test data
#     Note that there is no need to separately
#     transform X_test. Pipes take care of that

final_pipe.predict(X_test)

array([0, 0, 0, 0])

In [44]:
# 8.2 But what is the actual y_test
le.transform(y_test)

array([1, 1, 0, 0])

In [None]:
######## That's all folks ##########