In [1]:
import sys

import numpy as np
import pandas as pd
# Show full cell contents when displaying DataFrames. pandas >= 1.0 spells
# "no limit" as None (and newer releases no longer accept -1); older pandas
# only accepts an integer for this option, where -1 means "no limit".
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Always print NumPy arrays in full. threshold=np.nan is rejected by newer
# NumPy ("threshold must be non-NAN"); sys.maxsize is the documented way to
# disable summarization and works on old and new releases alike.
np.set_printoptions(threshold=sys.maxsize)

# Data preparation

In [7]:
# Load the LendingClub loan records into a DataFrame. low_memory=False asks
# pandas to parse the file in one pass instead of in chunks, which avoids
# mixed-type column inference.
loans = pd.read_csv(
    'lending-club-data.csv',
    low_memory=False,
)

In [9]:
# Recode the target to match the convention used in the lectures:
# +1 marks a safe loan (bad_loans == 0) and -1 marks a risky loan.
loans['safe_loans'] = (loans['bad_loans'] == 0).map({True: 1, False: -1})

# Only a subset of the available columns is used as features.
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home ownership status: own, mortgage or rent
    'dti',                    # debt-to-income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has the borrower had a delinquency
    'last_major_derog_none',  # has the borrower had a 90-day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 means risky.
target = 'safe_loans'

# Keep only the feature columns followed by the target column.
selected_columns = features + [target]
loans = loans[selected_columns]

In [11]:
# Split the data into training and validation sets. The two JSON files hold
# the row positions (used with .iloc) that belong to each set.

import json

with open('module-5-assignment-1-validation-idx.json') as valid_file:
    valid_index = json.load(valid_file)

with open('module-5-assignment-1-train-idx.json') as train_file:
    train_index = json.load(train_file)

validation_data = loans.iloc[valid_index]
train_data = loans.iloc[train_index]

### As we explored above, our data is disproportionately full of safe loans. Let's create two datasets: one with just the safe loans (safe_loans_raw) and one with just the risky loans (risky_loans_raw).

In [13]:
# Partition the loans by label: +1 is a safe loan, -1 is a risky loan.
safe_loans_raw = loans[loans['safe_loans'] == 1]
risky_loans_raw = loans[loans['safe_loans'] == -1]

# Report the size of each class. The second line previously repeated the
# "safe loans" label for the risky count. A single parenthesised argument
# keeps print valid both as a Python 2 statement and a Python 3 function call.
print("Number of safe loans: %s" % len(safe_loans_raw))
print("Number of risky loans: %s" % len(risky_loans_raw))

Number of safe loans: 99457
Number of safe loans: 23150


### One way to combat class imbalance is to undersample the larger class until the class distribution is approximately half and half. Here, we will undersample the larger class (safe loans) in order to balance out our dataset. This means we are throwing away many data points. We used seed=1 so everyone gets the same results.



In [22]:
# Risky loans are the minority class, so the ratio risky/safe is the
# fraction of safe loans to keep when undersampling them.
percentage = float(len(risky_loans_raw)) / len(safe_loans_raw)

# Every risky loan is kept as-is.
risky_loans = risky_loans_raw

# Seed Python's built-in random module so results are repeatable.
# NOTE(review): this does not seed NumPy/pandas sampling -- confirm which
# generator the sampling code that follows actually uses.
import random
random.seed(1)