## Data Cleaning Summary

### Drop N/A Values
- delinq_2yrs 
- earliest_cr_line
- inq_last_6mths
- open_acc 
- pub_rec
- total_acc
- revol_util
- purpose
- annual_inc

### Drop Columns
- collections_12_mths_ex_med
    - All values are either 0 or N/A
- pymnt_plan, initial_list_status
    - Huge imbalances, will likely lead to overfitting. Double check.
    
### Drop Outliers/Nonsensical Data
- revol_bal == 1207359, this is an outlier value.
- revol_util > 100

### Impute Values
- mths_since_last_record
- mths_since_last_delinq

### Simplify/Cleanup Categorical Data
- emp_title (extra)
    - This can be dramatically cleaned up.
- home_ownership, verification_status
    - Simplify

### Distill Commentary (extra)
- Notes
- purpose
- purpose_cat

In [47]:
import datetime
import numpy as np
import pandas as pd
import pickle

In [48]:
## Helper Functions

def class_balance(df):
    print()
    print('Class Balance:')
    print(df.groupby('is_bad').count()['Id'])

In [49]:
df = pd.read_csv('../data/DR_Demo_Lending_Club.csv')

In [50]:
class_balance(df)


Class Balance:
is_bad
0    8705
1    1295
Name: Id, dtype: int64


# Clean Data

## Dates and Convert String to Int

In [51]:
df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'])
current_date = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
df['days_since_earliest_cr_line'] = (current_date - df['earliest_cr_line']).dt.days

df = df.drop('earliest_cr_line', axis=1)

df.loc[df['emp_length'] == 'na', 'emp_length'] = np.nan
df['emp_length'] = df['emp_length'].astype('float64')

## Drop NA Values

In [52]:
drop_na_columns = ['delinq_2yrs',
'inq_last_6mths',
'open_acc',
'pub_rec',
'total_acc',
'revol_util',
'purpose',
'annual_inc']

df = df.dropna(axis=0, subset=drop_na_columns)

print('Rows dropped: {}'.format((len(df) - len(df))))
print('New data length: {}'.format(len(df)))

class_balance(df)

Rows dropped: 0
New data length: 9970

Class Balance:
is_bad
0    8679
1    1291
Name: Id, dtype: int64


## Drop Columns

In [53]:
## Won't lead to class balance changes
# df_post_col_drop = df_post_na_drop.drop(['collections_12_mths_ex_med', 'pymnt_plan', 'initial_list_status'], axis=1)

df = df.drop(['collections_12_mths_ex_med', 
             'pymnt_plan', 
             'initial_list_status',
             'mths_since_last_record',
             'mths_since_last_delinq',
             'zip_code'], axis=1)

## Drop Outliers/Nonsensical Data

In [54]:
df = df[df['revol_bal'] != 1207359]
df = df[df['revol_util'] <= 100]

class_balance(df)


Class Balance:
is_bad
0    8677
1    1291
Name: Id, dtype: int64


## Impute Values

For a tree based model, I am okay with imputing these values to -1. XGBoost should be able to handle this.

In [55]:
# df_post_outlier_drop['mths_since_last_record'] = -1
# df_post_outlier_drop['mths_since_last_delinq'] = -1

## Simplify/Cleanup Categorical Data

First pass I am only going to cleanup "home_ownership", however, if I have more time, I will clean up the employer names.

In [56]:
df.groupby(['home_ownership']).count()

Unnamed: 0_level_0,Id,is_bad,emp_title,emp_length,annual_inc,verification_status,Notes,purpose_cat,purpose,addr_state,...,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,mths_since_last_major_derog,policy_code,days_since_earliest_cr_line
home_ownership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MORTGAGE,4436,4436,4181,4326,4436,4436,3028,4436,4436,4436,...,4436,4436,4436,4436,4436,4436,4436,4436,4436,4436
OTHER,34,34,33,34,34,34,34,34,34,34,...,34,34,34,34,34,34,34,34,34,34
OWN,770,770,676,723,770,770,515,770,770,770,...,770,770,770,770,770,770,770,770,770,770
RENT,4728,4728,4493,4638,4728,4728,3233,4728,4728,4728,...,4728,4728,4728,4728,4728,4728,4728,4728,4728,4728


In [57]:
class_balance(df[df['home_ownership'] == 'OTHER'])


Class Balance:
is_bad
0    30
1     4
Name: Id, dtype: int64


In [58]:
value_map = {
    'VERIFIED - income': 'VERIFIED - income',
    'VERIFIED - income source': 'VERIFIED - income',
    'not verified': 'not verified'
}
df['verification_status'] = [value_map[x] for x in df['verification_status']]

## Dropping Unused Columns

In [59]:
df = df.drop(['emp_title', 'Notes','purpose', 'purpose_cat'], axis=1)

## Output to Pickle

In [182]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold


class XGBoostModelData:
    
    def __init__(self, X, y, test_size=0.15, random_state=1234):
        self.test_size = test_size
        self.random_state = random_state      
        self.X_raw = X
        self.y_raw = y
        self.transform()
        self.X_train, self.X_validate, self.y_train, self.y_validate = train_test_split(self.X_transformed, 
                                                                                y, 
                                                                                test_size=test_size, 
                                                                                stratify=y, 
                                                                                random_state=random_state)
        self._create_k_folds(k=5)
        
    def transform(self):
        self.X_transformed = self._create_categorical_dummies()
        
    def _create_categorical_dummies(self):
        df_transform = self.X_raw.copy()
        
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        categorical_columns = df_transform.select_dtypes(exclude=numerics)
        
        for col in categorical_columns:
            dummies = pd.get_dummies(categorical_columns[col])
            df_transform = df_transform.drop(col, axis=1)
            df_transform = pd.concat([df_transform, dummies], axis=1)
            
        return df_transform
    
    def _create_k_folds(self, k=5):
        skf = StratifiedKFold(n_splits=k, random_state=self.random_state, shuffle=True)
        i = 0
        self.k_folds = {}
        for train, test in skf.split(self.X_train, self.y_train):
            self.k_folds['fold_{}'.format(i)] = {
                'X_train':self.X_train.iloc[train],
                'y_train':self.y_train.iloc[train],
                'X_test':self.X_train.iloc[test],
                'y_test':self.y_train.iloc[test],
            }
            i+=1

In [185]:
X=df.drop(['is_bad','Id'], axis=1)
y=df['is_bad']

xgb_train_validation_data = XGBoostModelData(X,y)


In [188]:
pickle.dump(df, open('XGBoost_clean_data_all.p', 'wb'))
pickle.dump(xgb_train_validation_data, open('xgb_train_validation_data.p', 'wb'))

# Transform Data for XGBoost

## Categorical Transforms

In [172]:
xgboost_data = pickle.load(open('xgb_train_validation_data.p', 'rb'))

<__main__.XGBoostModelData at 0x1318456a0>