## **4.1 Pre-processing Training Set**
---

- In this part, we preprocess the training set by replacing each value with the WOE of the bin (or category) it falls into.
- We will save the WOE-transformed training set to the path defined in the config file.

In [1]:
# Import library
import pandas as pd
import numpy as np

# Load configuration
import src.utils as utils

Update the config file to include `WOE_map_dict_path`, the location where the WOE mapping dictionary will be saved.

In [2]:
# Load the project configuration and display it to check that `WOE_map_dict_path` is present
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/german_credit_data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'Risk',
 'test_size': 0.2,
 'num_variable': ['Age', 'Credit_amount', 'Duration'],
 'cat_variable': ['Sex',
  'Job',
  'Housing',
  'Saving_accounts',
  'Checking_account',
  'Purpose'],
 'missing_columns': ['Saving_accounts', 'Checking_account'],
 'num_of_bins': 4,
 '

In [3]:
# Function to generate the WOE mapping dictionary
def get_woe_map_dict():
    """Build the WOE mapping dictionary, pickle it, and return it.

    The WOE table is loaded from ``CONFIG_DATA['WOE_table_path']`` (the
    module-level ``CONFIG_DATA``); its ``Characteristic``, ``Attribute`` and
    ``WOE`` columns are used.

    Returns
    -------
    dict
        * one key per characteristic, holding ``{attribute: WOE}`` for every
          attribute other than ``'Missing'``;
        * a ``'Missing'`` key, holding ``{characteristic: WOE}`` with the WOE
          of that characteristic's ``'Missing'`` attribute, or ``np.nan`` when
          the characteristic has no such attribute.

    Side effects
    ------------
    Prints the number of top-level keys and dumps the dictionary to
    ``CONFIG_DATA['WOE_map_dict_path']``.
    """
    # Load the WOE table
    WOE_table = utils.pickle_load(CONFIG_DATA['WOE_table_path'])

    # Initialize the dictionary
    WOE_map_dict = {'Missing': {}}

    # `.unique()` keeps the order of first appearance, so the key order is the
    # same on every run (iterating a `set` of strings is not).
    for char in WOE_table['Characteristic'].unique():
        # Get the Attribute & WOE info for this characteristic
        current_data = WOE_table.loc[WOE_table['Characteristic'] == char,
                                     ['Attribute', 'WOE']]

        attribute_to_woe = {}
        missing_woe = np.nan        # default when there is no 'Missing' attribute
        missing_found = False

        # Walk the two columns together instead of looking rows up by index
        # label, so repeated index labels cannot break the lookup.
        for attribute, woe in zip(current_data['Attribute'], current_data['WOE']):
            if attribute != 'Missing':
                attribute_to_woe[attribute] = woe
            elif not missing_found:
                # Only the first 'Missing' row is used
                missing_woe = woe
                missing_found = True

        WOE_map_dict[char] = attribute_to_woe
        WOE_map_dict['Missing'][char] = missing_woe

    # Validate data
    print('Number of key : ', len(WOE_map_dict))

    # Dump
    utils.pickle_dump(WOE_map_dict, CONFIG_DATA['WOE_map_dict_path'])

    return WOE_map_dict
    

In [4]:
# Build the mapping (this also pickles it to `WOE_map_dict_path`) and display it
WOE_map_dict = get_woe_map_dict()
WOE_map_dict

Number of key :  10


{'Missing': {'Saving_accounts': 0.5728980524083682,
  'Job': nan,
  'Duration_bin': nan,
  'Housing': nan,
  'Checking_account': 1.1836912979066854,
  'Credit_amount_bin': nan,
  'Purpose': nan,
  'Sex': nan,
  'Age_bin': nan},
 'Saving_accounts': {'little': -0.27853439951499387,
  'moderate': -0.13645110272785735,
  'quite rich': 1.0354333870465782,
  'rich': 1.0986122886681098},
 'Job': {0: -0.15415067982725836,
  1: 0.1686227124357929,
  2: 0.007692345623155645,
  3: -0.2006706954621511},
 'Duration_bin': {Interval(3.999, 12.0, closed='right'): 0.4549133835221455,
  Interval(12.0, 18.0, closed='right'): 0.018868484304382736,
  Interval(18.0, 24.0, closed='right'): -0.03053672386008165,
  Interval(24.0, 60.0, closed='right'): -0.6136830092056983},
 'Housing': {'free': -0.6241543090729936,
  'own': 0.19027853769453706,
  'rent': -0.3441942827151232},
 'Checking_account': {'little': -0.902357637570231,
  'moderate': -0.32909212932359044,
  'rich': 0.4336359850748606},
 'Credit_amount_b

In [5]:
# WOE used for missing values, per characteristic (NaN where there is no 'Missing' attribute)
WOE_map_dict['Missing']

{'Saving_accounts': 0.5728980524083682,
 'Job': nan,
 'Duration_bin': nan,
 'Housing': nan,
 'Checking_account': 1.1836912979066854,
 'Credit_amount_bin': nan,
 'Purpose': nan,
 'Sex': nan,
 'Age_bin': nan}

- Next, transform the input data based on the mapping dictionary above.
- Update the config file to include the path (`X_train_woe_path`) for the new dataset that contains the WOE values.

In [6]:
# Reload the configuration after editing the config file, and display it to check `X_train_woe_path`
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/german_credit_data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'Risk',
 'test_size': 0.2,
 'num_variable': ['Age', 'Credit_amount', 'Duration'],
 'cat_variable': ['Sex',
  'Job',
  'Housing',
  'Saving_accounts',
  'Checking_account',
  'Purpose'],
 'missing_columns': ['Saving_accounts', 'Checking_account'],
 'num_of_bins': 4,
 '

In [7]:
# Function to replace the raw data in the train set with WOE values
def transform_woe(raw_data=None, type=None, CONFIG_DATA=None):
    """Replace every value of a predictor table with its WOE.

    Parameters
    ----------
    raw_data : pandas.DataFrame, optional
        Table to transform. Ignored (overwritten) when ``type`` is given.
    type : str, optional
        Name of a saved split (the notebook uses ``'train'``). When given, the
        input is loaded from the first entry of ``CONFIG_DATA[f'{type}_path']``
        and the result is dumped to ``CONFIG_DATA[f'X_{type}_woe_path']``.
    CONFIG_DATA : dict, optional
        Configuration mapping. Loaded with ``utils.config_load()`` when omitted.

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which each column has been mapped through the
        WOE dictionary stored at ``CONFIG_DATA['WOE_map_dict_path']``.

    Raises
    ------
    ValueError
        If neither ``raw_data`` nor ``type`` is supplied.
    KeyError
        If a column has no entry in the WOE mapping dictionary.
    """
    # Fall back to the project configuration instead of failing on `None[...]`
    if CONFIG_DATA is None:
        CONFIG_DATA = utils.config_load()

    # Nothing to transform: fail early with an explicit message
    if raw_data is None and type is None:
        raise ValueError('Provide either `raw_data` or `type`.')

    # Load the numerical columns
    num_cols = CONFIG_DATA['num_variable']

    # Load the WOE_map_dict
    WOE_map_dict = utils.pickle_load(CONFIG_DATA['WOE_map_dict_path'])

    # Load the saved data if type is not None
    if type is not None:
        raw_data = utils.pickle_load(CONFIG_DATA[f'{type}_path'][0])

    # Map the data
    woe_data = raw_data.copy()
    for col in woe_data.columns:
        # Numerical columns are stored in the dictionary under '<col>_bin'.
        # NOTE(review): those entries are presumably keyed by bin intervals, so
        # raw numbers are matched to the interval containing them - confirm.
        map_col = col + '_bin' if col in num_cols else col

        # Values without a match (missing, unseen category, out of range) come
        # out of `.map` as NaN and then receive the characteristic's 'Missing'
        # WOE; where that WOE is itself NaN they stay NaN.
        woe_data[col] = (woe_data[col]
                            .map(WOE_map_dict[map_col])
                            .fillna(value=WOE_map_dict['Missing'][map_col]))

    # Validate
    print('Raw data shape : ', raw_data.shape)
    print('WOE data shape : ', woe_data.shape)

    # Dump data
    if type is not None:
        utils.pickle_dump(woe_data, CONFIG_DATA[f'X_{type}_woe_path'])

    return woe_data

In [8]:
# Transform the train set: the input is loaded from CONFIG_DATA['train_path'][0]
# and the result is also dumped to CONFIG_DATA['X_train_woe_path']
X_train_woe = transform_woe(type='train', CONFIG_DATA=CONFIG_DATA)

Raw data shape :  (800, 9)
WOE data shape :  (800, 9)


In [9]:
# Preview the first 10 rows of the WOE-transformed train set
X_train_woe.head(10)

Unnamed: 0,Age,Sex,Job,Housing,Saving_accounts,Checking_account,Credit_amount,Duration,Purpose
485,0.097164,0.127017,-0.200671,0.190279,-0.278534,-0.329092,-0.070452,0.454913,-0.103148
390,-0.162248,0.127017,-0.200671,0.190279,-0.278534,1.183691,0.251314,0.018868,-0.103148
23,0.097164,0.127017,0.007692,0.190279,-0.136451,-0.329092,0.251314,0.454913,-0.103148
814,0.097164,0.127017,0.007692,-0.624154,-0.278534,-0.902358,0.447748,-0.613683,-0.103148
107,-0.162248,0.127017,0.007692,0.190279,-0.278534,-0.329092,-0.524524,0.454913,-0.103148
973,0.433636,0.127017,0.007692,-0.344194,-0.278534,-0.902358,-0.524524,-0.613683,-0.080043
704,-0.162248,-0.277765,0.007692,0.190279,-0.278534,-0.329092,0.447748,-0.613683,-0.080043
118,-0.250071,-0.277765,0.007692,0.190279,1.035433,-0.902358,-0.524524,-0.613683,-0.143569
918,-0.162248,0.127017,0.007692,0.190279,-0.136451,-0.902358,0.447748,-0.030537,-0.143569
627,-0.162248,0.127017,0.007692,0.190279,-0.136451,-0.329092,0.251314,0.454913,-0.103148
