# **1. Data Preparation**
---

## **1.1 Read Data**
---

In [1]:
# Import library
import pandas as pd

# Load configuration
import src.utils as utils

Load the config file, which holds the paths and settings used to load and dump data.

In [2]:
# Load the project configuration through the utils helper; the functions
# defined in this notebook read it as a module-level global.
CONFIG_DATA = utils.config_load()
# Display the loaded configuration for inspection.
CONFIG_DATA

{'raw_dataset_path': 'data/raw/german_credit_data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'Risk',
 'test_size': 0.2,
 'num_variable': ['Age', 'Credit_amount', 'Duration'],
 'cat_variable': ['Sex',
  'Job',
  'Housing',
  'Saving_accounts',
  'Checking_account',
  'Purpose'],
 'missing_columns': ['Saving_accounts', 'Checking_account'],
 'num_of_bins': 4,
 '

In [3]:
def read_data():
    """Load the raw data set, clean it up, and dump it as a pickle.

    Steps
    -----
    1. Read the CSV at ``CONFIG_DATA['raw_dataset_path']``.
    2. Drop the ``'Unnamed: 0'`` column when the file contains one.
    3. Replace spaces in the column names with underscores.
    4. Encode the response column (``CONFIG_DATA['response_variable']``)
       as ``'good' -> 0`` and ``'bad' -> 1``.
    5. Print the data shape and dump the frame to
       ``CONFIG_DATA['dataset_path']`` via ``utils.pickle_dump``.

    Returns
    -------
    pandas.DataFrame
        The cleaned data set.

    Raises
    ------
    ValueError
        If the response column contains a non-missing label other than
        ``'good'`` or ``'bad'``.
    """
    raw_path = CONFIG_DATA['raw_dataset_path']
    response_variable = CONFIG_DATA['response_variable']
    label_encoding = {'good': 0, 'bad': 1}

    # errors='ignore' keeps the load working for a CSV that was saved
    # without the unnamed index column.
    data = pd.read_csv(raw_path).drop(columns=['Unnamed: 0'], errors='ignore')

    # Normalise the column names BEFORE looking up the response column, so
    # the underscore-style name from the config resolves even when the raw
    # header uses spaces.
    data.columns = [name.replace(' ', '_') for name in data.columns]

    # Encode the response with an explicit mapping. Series.replace relies on
    # implicit object -> int downcasting (deprecated in pandas 2.2) and lets
    # unknown labels through silently.
    labels = data[response_variable]
    unexpected = set(labels.dropna().unique()) - set(label_encoding)
    if unexpected:
        raise ValueError(
            f"Unexpected labels in '{response_variable}': {sorted(map(str, unexpected))}; "
            f"expected only {sorted(label_encoding)}"
        )
    data[response_variable] = labels.map(label_encoding)

    # Validate data shape
    print("Data shape       :", data.shape)

    # Pickle dumping (save the result)
    dump_path = CONFIG_DATA['dataset_path']
    utils.pickle_dump(data, dump_path)

    return data

In [4]:
# Check the function: run read_data() (prints the shape and dumps the pickle),
# then preview the first rows of the returned frame.
data = read_data()
data.head()

Data shape       : (1000, 10)


Unnamed: 0,Age,Sex,Job,Housing,Saving_accounts,Checking_account,Credit_amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,0
1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,49,male,1,own,little,,2096,12,education,0
3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,53,male,2,free,little,little,4870,24,car,1


## **1.2 Sample Splitting**
---

- Split input & output data and dump them
- Update the config file to contain
    - The input & output data path
    - The output variable name
    - The input columns name

In [5]:
# Reload the configuration before the input/output split
# (presumably to pick up edits made to the config file — confirm).
CONFIG_DATA = utils.config_load()
# Display the loaded configuration for inspection.
CONFIG_DATA

{'raw_dataset_path': 'data/raw/german_credit_data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'Risk',
 'test_size': 0.2,
 'num_variable': ['Age', 'Credit_amount', 'Duration'],
 'cat_variable': ['Sex',
  'Job',
  'Housing',
  'Saving_accounts',
  'Checking_account',
  'Purpose'],
 'missing_columns': ['Saving_accounts', 'Checking_account'],
 'num_of_bins': 4,
 '

In [6]:
def split_input_output():
    """Separate the dumped data set into predictors (X) and response (y).

    Loads the pickle at ``CONFIG_DATA['dataset_path']``, takes the column named
    by ``CONFIG_DATA['response_variable']`` as ``y`` and all remaining columns
    as ``X``, prints both shapes, and dumps ``X`` / ``y`` to
    ``CONFIG_DATA['predictors_set_path']`` / ``CONFIG_DATA['response_set_path']``.

    Returns
    -------
    tuple
        ``(X, y)`` — the predictors frame and the response series.
    """
    # Fetch the full data set produced by the read step
    full_data = utils.pickle_load(CONFIG_DATA['dataset_path'])

    # Response: the single target column; predictors: everything else
    target_name = CONFIG_DATA['response_variable']
    y = full_data[target_name]
    X = full_data.drop(columns=[target_name])

    # Report the shapes so the split can be checked by eye
    for label, part in (('y shape :', y), ('X shape :', X)):
        print(label, part.shape)

    # Persist predictors first, then the response
    for part, path_key in ((X, 'predictors_set_path'), (y, 'response_set_path')):
        utils.pickle_dump(part, CONFIG_DATA[path_key])

    return X, y

In [7]:
# Check the function: split the dumped data into predictors X and response y
# (prints both shapes and dumps both objects).
X, y = split_input_output()

y shape : (1000,)
X shape : (1000, 9)


In [8]:
# Preview the first rows of the predictors.
X.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving_accounts,Checking_account,Credit_amount,Duration,Purpose
0,67,male,2,own,,little,1169,6,radio/TV
1,22,female,2,own,little,moderate,5951,48,radio/TV
2,49,male,1,own,little,,2096,12,education
3,45,male,2,free,little,little,7882,42,furniture/equipment
4,53,male,2,free,little,little,4870,24,car


In [9]:
# Preview the first values of the response.
y.head()

0    0
1    1
2    0
3    0
4    1
Name: Risk, dtype: int64

Next, split the predictors (X) and the response (y) into training and testing sets.
- Set `stratify = y` so that the training and testing sets keep the same class proportions as the response y.
- Set `test_size = 0.2` for holding 20% of the sample as a testing set.
- Set `random_state = 123` for reproducibility.

In [10]:
# Import library 
from sklearn.model_selection import train_test_split

Update the config file to have train & test data path and test size.

In [11]:
# Reload the configuration before the train/test split
# (presumably to pick up edits made to the config file — confirm).
CONFIG_DATA = utils.config_load()
# Display the loaded configuration for inspection.
CONFIG_DATA

{'raw_dataset_path': 'data/raw/german_credit_data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'Risk',
 'test_size': 0.2,
 'num_variable': ['Age', 'Credit_amount', 'Duration'],
 'cat_variable': ['Sex',
  'Job',
  'Housing',
  'Saving_accounts',
  'Checking_account',
  'Purpose'],
 'missing_columns': ['Saving_accounts', 'Checking_account'],
 'num_of_bins': 4,
 '

In [12]:
def split_train_test():
    """Create a stratified train/test split of the dumped X and y and save it.

    Loads the predictors and response from ``CONFIG_DATA['predictors_set_path']``
    and ``CONFIG_DATA['response_set_path']``, splits them with
    ``train_test_split`` (stratified on ``y``, ``test_size`` taken from
    ``CONFIG_DATA['test_size']``, ``random_state = 123``), prints the four
    shapes, and dumps the parts to the paths in ``CONFIG_DATA['train_path']``
    and ``CONFIG_DATA['test_path']`` (index 0 for X, index 1 for y).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    # Fetch the previously dumped predictors and response
    predictors = utils.pickle_load(CONFIG_DATA['predictors_set_path'])
    response = utils.pickle_load(CONFIG_DATA['response_set_path'])

    # Stratified split with a fixed seed for reproducibility
    split_options = {
        'stratify': response,
        'test_size': CONFIG_DATA['test_size'],
        'random_state': 123,
    }
    X_train, X_test, y_train, y_test = train_test_split(predictors,
                                                        response,
                                                        **split_options)

    # Report the shape of every part
    shape_report = (
        ('X_train shape :', X_train),
        ('y_train shape :', y_train),
        ('X_test shape  :', X_test),
        ('y_test shape  :', y_test),
    )
    for label, part in shape_report:
        print(label, part.shape)

    # Persist each part; the config lists hold [X path, y path]
    dump_plan = (
        (X_train, 'train_path', 0),
        (y_train, 'train_path', 1),
        (X_test, 'test_path', 0),
        (y_test, 'test_path', 1),
    )
    for part, path_key, position in dump_plan:
        utils.pickle_dump(part, CONFIG_DATA[path_key][position])

    return X_train, X_test, y_train, y_test

In [13]:
# Check the function: run the stratified train/test split
# (prints the four shapes and dumps the four parts).
X_train, X_test, y_train, y_test = split_train_test()

X_train shape : (800, 9)
y_train shape : (800,)
X_test shape  : (200, 9)
y_test shape  : (200,)
