In [1]:
import sys
import os
from os.path import join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import warnings
# NOTE(review): this silences *every* warning category for the whole notebook,
# including pandas deprecation/chained-assignment warnings. Consider narrowing
# it to specific categories once the notebook runs clean.
warnings.simplefilter(action='ignore')

# Paths are relative to the notebook's working directory: the project root is
# the parent directory, and the data files live in its 'data' subfolder.
PROJ_ROOT = os.pardir
data_dir = join(PROJ_ROOT, 'data')

# Show all columns when displaying wide DataFrames.
pd.set_option('display.max_columns', None)

In [2]:
# Keep the file path and the loaded frame under separate names so that
# `bank_data` only ever refers to the DataFrame.
bank_data_path = join(PROJ_ROOT, 'data', 'bank-additional-full.csv')

# The file is semicolon-delimited.
bank_data = pd.read_csv(bank_data_path, sep=';')

In [3]:
# convert 'unknown' to np.nan so pandas treats it as a missing value
bank_data = bank_data.replace('unknown', np.nan)

# drop duration, should not be used for a predictive model
bank_data = bank_data.drop(columns='duration')

# convert response to 0 & 1's
# Assign the result back instead of calling an in-place method on the
# `bank_data['y']` selection: in-place updates through a chained selection are
# deprecated and do not modify the parent frame under pandas Copy-on-Write.
# Any value other than 'no'/'yes' would become NaN here.
bank_data['y'] = bank_data['y'].map({'no': 0, 'yes': 1})

In [4]:
# Stratified 80/20 train/validation split with a fixed seed.
# `bank_data` is passed whole, so the 'y' column stays inside bank_train and
# bank_val as well as being returned separately.
# NOTE(review): response_train / response_val are captured at split time; the
# later cells that drop rows from bank_train / bank_val do not update them.
response = bank_data['y']
bank_train, bank_val, response_train, response_val = train_test_split(
    bank_data,
    response,
    test_size=0.2,
    stratify=response,
    random_state=12,
)

## Missing Values

We will add a new 'Missing' label for the missing values in two features, *default* and *education*, because these features displayed some predictive power in the visualizations in the previous notebook.

In [5]:
# Give missing values in these features their own explicit 'Missing' category,
# applied identically to the training and validation frames.
missing_label_cols = ['default', 'education']
for frame in (bank_train, bank_val):
    frame[missing_label_cols] = frame[missing_label_cols].fillna('Missing')

In [6]:
# Remove the rows where 'job' or 'marital' is missing (a row is dropped if
# either value is null), for both the training and validation frames.
sparse_missing_cols = ['job', 'marital']
bank_train = bank_train.dropna(subset=sparse_missing_cols, how='any')
bank_val = bank_val.dropna(subset=sparse_missing_cols, how='any')

## Nonpredictive variables

*Loan* and *housing* did not provide any signal; therefore, we will discard these features. 

In [7]:
# Discard the two features that showed no signal, in both frames.
nonpredictive_cols = ['loan', 'housing']
bank_train = bank_train.drop(columns=nonpredictive_cols)
bank_val = bank_val.drop(columns=nonpredictive_cols)

In [8]:
# sanity check: print the columns that still contain nulls in each frame
# (an empty list means no missing values remain)
for frame in (bank_train, bank_val):
    print(frame.columns[frame.isnull().any()].tolist())

[]
[]


## Engineer *pdays* feature

In [9]:
# Sentinel value in 'pdays' that this notebook maps to prev_contact == 'no'.
PDAYS_NO_PREV_CONTACT = 999

# Binary flag derived from pdays: 'no' when pdays equals the sentinel,
# 'yes' otherwise. Vectorized comparison instead of a per-row lambda.
bank_train['prev_contact'] = np.where(
    bank_train['pdays'] == PDAYS_NO_PREV_CONTACT, 'no', 'yes')
bank_val['prev_contact'] = np.where(
    bank_val['pdays'] == PDAYS_NO_PREV_CONTACT, 'no', 'yes')

# drop original pdays, was just noise
bank_train = bank_train.drop(columns='pdays')
bank_val = bank_val.drop(columns='pdays')

### Rare Labels

We will remove the rare labels *illiterate* from *education* (14 instances) and *yes* from *default* (3 instances), each of which accounts for less than 0.005% of the instances in its feature.

In [10]:
# Remove rows carrying the rare labels. Compare for exact equality rather than
# using str.contains, which performs regex *substring* matching and would also
# remove any label that merely contains 'illiterate' or 'yes'.
bank_train = bank_train[bank_train['education'] != 'illiterate']
bank_val = bank_val[bank_val['education'] != 'illiterate']

bank_train = bank_train[bank_train['default'] != 'yes']
bank_val = bank_val[bank_val['default'] != 'yes']

In [13]:
# save cleaned & processed data for model selection
# Output paths sit in the project's data folder; the row index is not written.
bank_train_data_clean = join(PROJ_ROOT, 'data', 'bank_train_clean.csv')
bank_val_data_clean = join(PROJ_ROOT, 'data', 'bank_val_clean.csv')

for frame, out_path in ((bank_train, bank_train_data_clean),
                        (bank_val, bank_val_data_clean)):
    frame.to_csv(out_path, index=False)