# **3. Initial Characteristic Analysis**
---

Our objectives:
- Bin each predictor
- Obtain WOE for each bin (attribute) in characteristic
- Obtain IV for each characteristic

### **3.1 Characteristic Binning**
---

In [17]:
# Import library
import pandas as pd
import numpy as np

# Load configuration
import src.utils as utils

- We concat the predictors (X) & response (y) data for train set first.
- Update the config file to have those concated data path.

In [18]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/german_credit_data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'Risk',
 'test_size': 0.2,
 'num_variable': ['Age', 'Credit_amount', 'Duration'],
 'cat_variable': ['Sex',
  'Job',
  'Housing',
  'Saving_accounts',
  'Checking_account',
  'Purpose'],
 'missing_columns': ['Saving_accounts', 'Checking_account'],
 'num_of_bins': 4,
 '

In [19]:
def concat_data(type):
    """Concat the input (X) & output (y) data"""
    X = utils.pickle_load(CONFIG_DATA[f'{type}_path'][0])
    y = utils.pickle_load(CONFIG_DATA[f'{type}_path'][1])
    
    # Concatenate X and y
    data = pd.concat((X, y),
                     axis = 1)

    # Validate data
    print(f'Data shape:', data.shape)

    # Dump concatenated data
    utils.pickle_dump(data, CONFIG_DATA[f'data_{type}_path'])
   
    return data

In [20]:
# Check the function for train data
data_train = concat_data(type='train')
data_train.head()

Data shape: (800, 10)


Unnamed: 0,Age,Sex,Job,Housing,Saving_accounts,Checking_account,Credit_amount,Duration,Purpose,Risk
485,47,male,3,own,little,moderate,1209,6,car,1
390,30,male,3,own,little,,1820,18,car,0
23,44,male,2,own,moderate,moderate,1804,12,car,0
814,46,male,2,free,little,little,3931,48,car,1
107,32,male,2,own,little,moderate,6078,12,car,0


- Then we bin the concatenated data.
- Categorical columns are already binned, thus we only create binning function for numerical columns.
- Update the config file to have:
    - The numerical column names
    - The categorical column names
    - The missing column names
    - The number of bins
    - The path for binned train set

In [21]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/german_credit_data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'Risk',
 'test_size': 0.2,
 'num_variable': ['Age', 'Credit_amount', 'Duration'],
 'cat_variable': ['Sex',
  'Job',
  'Housing',
  'Saving_accounts',
  'Checking_account',
  'Purpose'],
 'missing_columns': ['Saving_accounts', 'Checking_account'],
 'num_of_bins': 4,
 '

In [22]:
# Create a function for binning the numerical predictor
def create_num_binning(data, predictor_label, num_of_bins):
    """Bin the numerical predictor"""
    # Create a new column containing the binned predictor
    data[predictor_label + "_bin"] = pd.qcut(data[predictor_label],
                                             q = num_of_bins)

    return data

In [23]:
def bin_data(type):
    """Bin the numerical and missing data"""
    # Load the concatenated data
    data = utils.pickle_load(CONFIG_DATA[f'data_{type}_path'])

    # Bin the numerical columns
    num_columns = CONFIG_DATA['num_variable']
    num_of_bins = CONFIG_DATA['num_of_bins']

    for column in num_columns:
        data_binned = create_num_binning(data = data,
                                         predictor_label = column,
                                         num_of_bins = num_of_bins)

    # Bin missing values
    missing_columns = CONFIG_DATA['missing_columns']

    for column in missing_columns:
        # Add category 'Missing' to replace the missing values
        # data_binned[column] = (data_binned[column]
        #                             .cat
        #                             .add_categories('Missing'))

        # Replace missing values with category 'Missing'
        data_binned[column].fillna(value = 'Missing',
                                   inplace = True)

    # Validate
    print(f"Original data shape : ", data.shape)
    print(f"Binned data shape  : ", data_binned.shape)

    # Dump binned data
    utils.pickle_dump(data_binned, CONFIG_DATA[f'data_{type}_binned_path'])
        
    return data_binned

In [24]:
# Check the function
binned_train = bin_data(type='train')
binned_train.head()

Original data shape :  (800, 13)
Binned data shape  :  (800, 13)


Unnamed: 0,Age,Sex,Job,Housing,Saving_accounts,Checking_account,Credit_amount,Duration,Purpose,Risk,Age_bin,Credit_amount_bin,Duration_bin
485,47,male,3,own,little,moderate,1209,6,car,1,"(41.25, 75.0]","(249.999, 1359.5]","(3.999, 12.0]"
390,30,male,3,own,little,Missing,1820,18,car,0,"(27.0, 33.0]","(1359.5, 2309.0]","(12.0, 18.0]"
23,44,male,2,own,moderate,moderate,1804,12,car,0,"(41.25, 75.0]","(1359.5, 2309.0]","(3.999, 12.0]"
814,46,male,2,free,little,little,3931,48,car,1,"(41.25, 75.0]","(2309.0, 3973.75]","(24.0, 60.0]"
107,32,male,2,own,little,moderate,6078,12,car,0,"(27.0, 33.0]","(3973.75, 18424.0]","(3.999, 12.0]"


### **3.2 WoE and IV**
---  

- To assess the strength of each characteristic individually as a predictor of the credit performance.
- Update the config file to have
    - crosstab list path
    - WOE table path
    - IV table path

In [25]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/german_credit_data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'Risk',
 'test_size': 0.2,
 'num_variable': ['Age', 'Credit_amount', 'Duration'],
 'cat_variable': ['Sex',
  'Job',
  'Housing',
  'Saving_accounts',
  'Checking_account',
  'Purpose'],
 'missing_columns': ['Saving_accounts', 'Checking_account'],
 'num_of_bins': 4,
 '

In [26]:
def create_crosstab_list():
    """Generate the crosstab list (contingency table) for WOE and IV calculation. Only in training data"""
    # load the binned train data
    data_train_binned = utils.pickle_load(CONFIG_DATA['data_train_binned_path'])

    # load the response variable (we will summarize based on the response variable)
    response_variable = CONFIG_DATA['response_variable']

    # iterate over numercial columns
    crosstab_num = []
    num_columns = CONFIG_DATA['num_variable']
    for column in num_columns:
        # Create a contingency table
        crosstab = pd.crosstab(data_train_binned[column + "_bin"],
                               data_train_binned[response_variable],
                               margins = True)

        # Append to the list
        crosstab_num.append(crosstab)

    # iterate over categorical columns
    crosstab_cat = []
    cat_columns = CONFIG_DATA['cat_variable']
    for column in cat_columns:
        # Create a contingency table
        crosstab = pd.crosstab(data_train_binned[column],
                               data_train_binned[response_variable],
                               margins = True)

        # Append to the list
        crosstab_cat.append(crosstab)

    # Put all two in a crosstab_list
    crosstab_list = crosstab_num + crosstab_cat

    # Validate the crosstab_list
    print('number of num bin : ', [bin.shape for bin in crosstab_num])
    print('number of cat bin : ', [bin.shape for bin in crosstab_cat])

    # Dump the result
    utils.pickle_dump(crosstab_list, CONFIG_DATA['crosstab_list_path'])

    return crosstab_list


In [27]:
# Check the function
crosstab_list = create_crosstab_list()
crosstab_list[0]

number of num bin :  [(5, 3), (5, 3), (5, 3)]
number of cat bin :  [(3, 3), (5, 3), (4, 3), (6, 3), (5, 3), (9, 3)]


Risk,0,1,All
Age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(18.999, 27.0]",149,82,231
"(27.0, 33.0]",123,62,185
"(33.0, 41.25]",144,40,184
"(41.25, 75.0]",144,56,200
All,560,240,800


In [28]:
crosstab_list[8]

Risk,0,1,All
Purpose,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
business,56,26,82
car,181,86,267
domestic appliances,7,3,10
education,28,19,47
furniture/equipment,95,47,142
radio/TV,177,48,225
repairs,10,6,16
vacation/others,6,5,11
All,560,240,800


In [29]:
def WOE_and_IV():
    """Get the WoE and IV"""
    # Load the crosstab list
    crosstab_list = utils.pickle_load(CONFIG_DATA['crosstab_list_path'])

    # Create initial storage for WoE and IV
    WOE_list, IV_list = [], []
    
    # Perform the calculation for all crosstab list
    for crosstab in crosstab_list:
        # Calcualte the WoE and IV
        crosstab['p_good'] = crosstab[0]/crosstab[0]['All']                                 # Calculate % Good
        crosstab['p_bad'] = crosstab[1]/crosstab[1]['All']                                  # Calculate % Bad
        crosstab['WOE'] = np.log(crosstab['p_good']/crosstab['p_bad'])                      # Calculate the WOE
        crosstab['contribution'] = (crosstab['p_good']-crosstab['p_bad'])*crosstab['WOE']   # Calculate the contribution value for IV
        IV = crosstab['contribution'][:-1].sum()                                            # Calculate the IV
        
        # Append to list
        WOE_list.append(crosstab)

        add_IV = {'Characteristic': crosstab.index.name, 
                  'Information Value': IV}
        IV_list.append(add_IV)


    # CREATE WOE TABLE
    # Create initial table to summarize the WOE values
    WOE_table = pd.DataFrame({'Characteristic': [],
                              'Attribute': [],
                              'WOE': []})
    for i in range(len(crosstab_list)):
        # Define crosstab and reset index
        crosstab = crosstab_list[i].reset_index()

        # Save the characteristic name
        char_name = crosstab.columns[0]

        # Only use two columns (Attribute name and its WOE value)
        # Drop the last row (average/total WOE)
        crosstab = crosstab.iloc[:-1, [0,-2]]
        crosstab.columns = ['Attribute', 'WOE']

        # Add the characteristic name in a column
        crosstab['Characteristic'] = char_name

        WOE_table = pd.concat((WOE_table, crosstab), 
                                axis = 0)

        # Reorder the column
        WOE_table.columns = ['Characteristic',
                            'Attribute',
                            'WOE']
    

    # CREATE IV TABLE
    # Create the initial table for IV
    IV_table = pd.DataFrame({'Characteristic': [],
                             'Information Value' : []})
    IV_table = pd.DataFrame(IV_list)

    # Define the predictive power of each characteristic
    strength = []

    # Assign the rule of thumb regarding IV
    for iv in IV_table['Information Value']:
        if iv < 0.02:
            strength.append('Unpredictive')
        elif iv >= 0.02 and iv < 0.1:
            strength.append('Weak')
        elif iv >= 0.1 and iv < 0.3:
            strength.append('Medium')
        else:
            strength.append('Strong')

    # Assign the strength to each characteristic
    IV_table = IV_table.assign(Strength = strength)

    # Sort the table by the IV values
    IV_table = IV_table.sort_values(by='Information Value')
    
    # Validate
    print('WOE table shape : ', WOE_table.shape)
    print('IV table shape  : ', IV_table.shape)

    # Dump data
    utils.pickle_dump(WOE_table, CONFIG_DATA['WOE_table_path'])
    utils.pickle_dump(IV_table, CONFIG_DATA['IV_table_path']) 

    return WOE_table, IV_table

In [30]:
# Check the function
WOE_table, IV_table = WOE_and_IV()

WOE table shape :  (38, 3)
IV table shape  :  (9, 3)


In [31]:
WOE_table.head(10)

Unnamed: 0,Characteristic,Attribute,WOE
0,Age_bin,"(18.999, 27.0]",-0.250071
1,Age_bin,"(27.0, 33.0]",-0.162248
2,Age_bin,"(33.0, 41.25]",0.433636
3,Age_bin,"(41.25, 75.0]",0.097164
0,Credit_amount_bin,"(249.999, 1359.5]",-0.070452
1,Credit_amount_bin,"(1359.5, 2309.0]",0.251314
2,Credit_amount_bin,"(2309.0, 3973.75]",0.447748
3,Credit_amount_bin,"(3973.75, 18424.0]",-0.524524
0,Duration_bin,"(3.999, 12.0]",0.454913
1,Duration_bin,"(12.0, 18.0]",0.018868


In [32]:
IV_table

Unnamed: 0,Characteristic,Information Value,Strength
4,Job,0.012696,Unpredictive
3,Sex,0.035177,Weak
0,Age_bin,0.066729,Weak
8,Purpose,0.083705,Weak
5,Housing,0.090442,Weak
1,Credit_amount_bin,0.136457,Medium
2,Duration_bin,0.157111,Medium
6,Saving_accounts,0.204444,Medium
7,Checking_account,0.704839,Strong
