In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Show every column when a DataFrame is rendered (None disables truncation).
# Use the documented top-level `pd.set_option` rather than reaching it through
# the incidental `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)

import warnings
# NOTE(review): this silences *all* warning categories for the rest of the
# session, including pandas/sklearn deprecation notices; narrow the filter
# if those should stay visible.
warnings.simplefilter(action='ignore')

In [2]:
# Read the raw lending table from the CSV that sits next to the notebook.
lending_data = pd.read_csv(filepath_or_buffer='lending_train.csv')

In [3]:
# clean features
def remove_percentage_sign(df, feat):
    """Return a copy of ``df`` with column ``feat`` converted from percent strings to floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is never modified.
    feat : str
        Name of the column holding values such as ``'13.5%'``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``feat`` column has dtype float, with every
        ``'%'`` character removed before conversion. Missing values stay NaN.

    Notes
    -----
    The function is idempotent: if ``feat`` is already numeric (for example
    because the cell was re-run), it is only cast to float instead of
    failing on the ``.str`` accessor.
    """
    df = df.copy()
    column = df[feat]
    if pd.api.types.is_numeric_dtype(column):
        # Already converted; `.str` would raise AttributeError on numeric data.
        df[feat] = column.astype('float')
        return df
    # '%' is a literal character, so state that explicitly rather than relying
    # on the pandas-version-dependent default of `regex`.
    df[feat] = column.str.replace('%', '', regex=False).astype('float')
    return df

# Strip the percent sign from both percentage-valued columns, one after the other.
lending_data = remove_percentage_sign(
    remove_percentage_sign(lending_data, 'int_rate'),
    'revol_util',
)

In [4]:
# Hold out 10% of the rows for validation; the seed keeps the split reproducible.
# NOTE(review): `int_rate` is passed as the target but also stays a column of
# `lending_train` / `lending_val` -- confirm it is removed from the features
# before any model is fitted.
lending_train, lending_val, rate_train, rate_val = train_test_split(
    lending_data,
    lending_data.loc[:, 'int_rate'],
    test_size=0.1,
    random_state=12,
)

## Missing Values

### High Percentage

In [5]:
def feats_with_na_above_thresh(df, min_thresh, max_thresh=1.0):
    """Return the missing-value fraction of columns that falls in ``(min_thresh, max_thresh]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    min_thresh : float
        Exclusive lower bound on the fraction of missing values.
    max_thresh : float, default 1.0
        Inclusive upper bound on the fraction of missing values.

    Returns
    -------
    pandas.Series
        Indexed by column name; values are the fraction of missing entries,
        sorted from highest to lowest. Empty when no column qualifies.
    """
    # Compute every column's missing fraction once and reuse it for both the
    # range filter and the returned values.
    na_frac = df.isnull().mean()
    in_range = (na_frac > min_thresh) & (na_frac <= max_thresh)
    return na_frac[in_range].sort_values(ascending=False)

# Training-set columns in which more than 90% of the values are missing.
high_perc_feats = feats_with_na_above_thresh(df=lending_train, min_thresh=0.9)

In [6]:
# Remove the columns that are > 90% missing in the training data; the same
# column list is applied to the validation frame so both keep matching schemas.
lending_train = lending_train.drop(columns=high_perc_feats.index)
lending_val = lending_val.drop(columns=high_perc_feats.index)

### Categorical Variables

In [7]:
# Object-dtype (categorical/text) training columns that contain at least one
# missing value.
cat_feats_with_na = [
    col for col, col_dtype in lending_train.dtypes.items()
    if col_dtype == 'O' and lending_train[col].isnull().any()
]

# Fraction of missing entries in each of those columns.
lending_train[cat_feats_with_na].isnull().mean()

term                    0.000015
emp_title               0.073038
emp_length              0.071984
home_ownership          0.000015
verification_status     0.000015
issue_d                 0.000015
pymnt_plan              0.000015
purpose                 0.000015
title                   0.000015
zip_code                0.000018
addr_state              0.000015
earliest_cr_line        0.000015
initial_list_status     0.000015
application_type        0.000015
hardship_flag           0.000015
disbursement_method     0.000015
debt_settlement_flag    0.000015
dtype: float64

In [8]:
# The two columns with roughly 7% missing values get an explicit 'Missing'
# category instead of NaN, in both the training and validation frames.
lending_train = lending_train.fillna({'emp_title': 'Missing', 'emp_length': 'Missing'})
lending_val = lending_val.fillna({'emp_title': 'Missing', 'emp_length': 'Missing'})

In [9]:
# Categorical columns that still contain missing values in the training data
# (the two ~7% columns were already filled with 'Missing').
cat_feats_with_small_na = [
    feat for feat in cat_feats_with_na if lending_train[feat].isnull().sum() > 0
]

# drop data points w/ very little missing
lending_train = lending_train.dropna(subset=cat_feats_with_small_na)
lending_val = lending_val.dropna(subset=cat_feats_with_small_na)

# Keep the target series in step with the feature frames: the rows removed
# above would otherwise remain in rate_train / rate_val and leave features
# and targets with different lengths. Selecting by the surviving index labels
# is a no-op when the cell is re-run.
rate_train = rate_train.loc[lending_train.index]
rate_val = rate_val.loc[lending_val.index]

In [10]:
# Sanity check: list any categorical column that still has missing values,
# first for the training frame and then for the validation frame.
# Both printed lists should be empty.
print([col for col in cat_feats_with_na if lending_train[col].isnull().any()])
print([col for col in cat_feats_with_na if lending_val[col].isnull().any()])

[]
[]
