In [263]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
%matplotlib inline
pd.set_option('display.max_columns', None)

In [264]:
# Columns from kaggle dataset
old_col= '''Index(['Loan ID', 'Customer ID', 'Loan Status', 'Current Loan Amount', 'Term',
       'Credit Score', 'Annual Income', 'Years in current job',
       'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History',
       'Months since last delinquent', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
      dtype='object')'''

# Read in CSV

In [265]:
#df = pd.read_csv('~/Downloads/LoanStats3c.csv',header = 1)
#df.to_pickle('./data/loan_data_2014.zip',compression = 'zip')

# Drop unnecessary columns:

In [266]:
df = pd.read_pickle('./data/loan_data_2014.zip',compression = 'zip')
df.shape

(235631, 145)

In [267]:
# add categorical variable - HARdship END FLAG EXISTS
# THIS EXISTS AS HARDSHIP_STATUS

In [268]:
df=df.drop(columns = [
        # Only looking at loans for individuals, not joint
        'annual_inc_joint',
        'application_type',
        'dti_joint',
        'verification_status_joint',
        'revol_bal_joint', 


        # Don't need specific ID info
        'id',
        'member_id',
    
        # Only care about information available pre loan
        'collection_recovery_fee',
        'collections_12_mths_ex_med',
        # Hard to classify
        'emp_title',
    
        # Keeping the hardship flag but looking at pre-loan metrics
        'hardship_type',
        'hardship_reason',
        'deferral_term',
        'hardship_amount',
        'hardship_start_date',
        'hardship_end_date',
        'payment_plan_start_date',
        'hardship_length',
        'hardship_dpd',
        'hardship_loan_status',
        'orig_projected_additional_accrued_interest',
    
        'last_pymnt_amnt',
        'last_pymnt_d',
        'next_pymnt_d',
        'pymnt_plan',
        'total_pymnt',
        'total_pymnt_inv',
    
        # Trying to extrapolate beyond certain issuing dates of loans
        'issue_d',
        'mths_since_last_major_derog',
        'mths_since_last_record',
        'num_tl_30dpd',
        'num_tl_120dpd_2m',
        
        'debt_settlement_flag',
        'debt_settlement_flag_date',
        'desc',
    
        # This is for investors / loan funding
        'initial_list_status',
        'funded_amnt',
        'funded_amnt_inv',              
        # Probably useful metrics, but all NaN in this dataset              
        'open_acc_6m',                 
        'open_il_12m',
        'open_il_24m',                     
        'open_act_il',   
        'open_rv_12m',
        'open_rv_24m',
                      
        'out_prncp', 
        'out_prncp_inv',
        'policy_code',
        'recoveries',
        # Columns related to debt settlement; we only care about pre loan features              
        'settlement_status',
        'settlement_date',
        'settlement_amount',
        'settlement_percentage',
        'settlement_term',
        'title',
        'total_cu_tl',
                      

        'url',
    
        'zip_code'])

print('Before dropping NaN Rows: ',df.shape)
df.dropna(how='all', inplace = True)
print('After dropping NaN Rows: ',df.shape)

Before dropping NaN Rows:  (235631, 88)
After dropping NaN Rows:  (235629, 88)


In [269]:
df.sample(1)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,open_acc,pub_rec,revol_bal,revol_util,total_acc,total_rec_prncp,total_rec_int,total_rec_late_fee,last_credit_pull_d,acc_now_delinq,tot_coll_amt,tot_cur_bal,mths_since_rcnt_il,total_bal_il,il_util,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_status,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method
169636,6000.0,36 months,13.65%,204.05,C,C1,8 years,RENT,49000.0,Source Verified,Charged Off,credit_card,CT,18.98,0.0,Apr-1995,0.0,65.0,12.0,0.0,18056.0,66.6%,20.0,3547.18,1145.97,0.0,Mar-2019,0.0,0.0,34938.0,,,,,,27100.0,,,3.0,3494.0,7530.0,65.6,0.0,0.0,127.0,228.0,5.0,5.0,0.0,5.0,,13.0,81.0,2.0,7.0,9.0,7.0,10.0,2.0,11.0,17.0,9.0,12.0,0.0,2.0,80.0,71.4,0.0,0.0,47261.0,34938.0,21900.0,20161.0,,,,,,,,,,,N,,,,Cash


In [270]:
#df[df['num_tl_120dpd_2m'] >= 1][['num_tl_30dpd','num_tl_120dpd_2m','num_tl_90g_dpd_24m']]

In [271]:
# df[['int_rate','grade','sub_grade']].sort_values(by = 'int_rate')

# Clean Columns

In [272]:
def drop_percentage_signs(df, column_name):
    new_column_name = column_name + '_percent'
    df[new_column_name] = df[column_name].astype(str).str[:-1].astype(np.float64)
    return df.drop(columns = [column_name])

In [273]:
df.shape

(235629, 88)

In [276]:
df['revol_util'].fillna('0%',inplace = True)
df['revol_util'] =df['revol_util'].astype(str).str[:-1]


In [None]:
df['revol_util']

In [275]:
df.loc[df['revol_util'] == '']

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,open_acc,pub_rec,revol_bal,revol_util,total_acc,total_rec_prncp,total_rec_int,total_rec_late_fee,last_credit_pull_d,acc_now_delinq,tot_coll_amt,tot_cur_bal,mths_since_rcnt_il,total_bal_il,il_util,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_status,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method
1170,26400.0,60 months,17.86%,668.38,D,D5,10+ years,MORTGAGE,141000.00,Verified,Fully Paid,home_improvement,VA,13.90,0.0,Nov-1997,0.0,41.0,5.0,0.0,0.0,,31.0,26400.00,6078.33,0.00,Jun-2015,0.0,0.0,307546.0,,,,,,0.0,,,5.0,61509.0,,,0.0,0.0,130.0,205.0,6.0,6.0,4.0,,,7.0,,1.0,0.0,0.0,0.0,17.0,9.0,0.0,17.0,0.0,5.0,0.0,1.0,96.8,,0.0,0.0,376963.0,42051.0,0.0,57144.0,,,,,,,,,,,N,,,,Cash
3437,28000.0,60 months,8.19%,570.29,A,A5,10+ years,MORTGAGE,180000.00,Source Verified,Current,credit_card,PA,3.29,0.0,Mar-1991,0.0,,3.0,0.0,69840.0,,16.0,23710.22,5937.78,0.00,Mar-2019,0.0,0.0,516172.0,,,,,,75000.0,,,0.0,172057.0,,,0.0,0.0,131.0,168.0,63.0,63.0,1.0,,,16.0,,0.0,0.0,1.0,0.0,4.0,6.0,1.0,8.0,1.0,3.0,0.0,0.0,100.0,,0.0,0.0,572458.0,80107.0,0.0,0.0,,,,,,,,,,,N,,,,Cash
4159,8950.0,36 months,16.49%,316.83,D,D3,8 years,MORTGAGE,30012.00,Source Verified,Charged Off,other,NC,20.52,0.0,Feb-2002,1.0,24.0,3.0,0.0,0.0,,13.0,1203.75,680.83,0.00,Mar-2019,0.0,0.0,28004.0,,,,,,0.0,,,0.0,9335.0,,,0.0,0.0,122.0,154.0,96.0,70.0,0.0,,,1.0,24.0,2.0,0.0,0.0,0.0,0.0,5.0,0.0,8.0,0.0,3.0,0.0,0.0,61.5,,0.0,0.0,54590.0,28004.0,0.0,54590.0,,,,,,,,,,,N,,,,Cash
5695,4850.0,36 months,19.24%,178.38,E,E2,5 years,RENT,80000.00,Source Verified,Fully Paid,debt_consolidation,MI,13.61,4.0,Aug-1999,0.0,7.0,8.0,0.0,2910.0,,41.0,4850.00,755.05,0.00,Mar-2019,0.0,0.0,114508.0,,,,,,3100.0,,,3.0,14314.0,190.0,,0.0,0.0,183.0,184.0,36.0,2.0,0.0,36.0,7.0,2.0,7.0,0.0,2.0,2.0,2.0,10.0,28.0,2.0,13.0,2.0,8.0,0.0,2.0,94.3,100.0,0.0,0.0,116537.0,114508.0,3100.0,113437.0,,,,,,,,,,,N,,,,Cash
7133,8500.0,36 months,8.67%,269.00,B,B1,2 years,MORTGAGE,75000.00,Not Verified,Fully Paid,credit_card,CA,12.29,3.0,Mar-1990,0.0,9.0,4.0,0.0,0.0,,12.0,8500.00,1180.19,0.00,Mar-2019,0.0,0.0,356907.0,,,,,,0.0,,,0.0,89227.0,,,0.0,0.0,109.0,296.0,125.0,34.0,6.0,,,1.0,,1.0,0.0,0.0,0.0,1.0,3.0,0.0,3.0,0.0,4.0,0.0,0.0,58.3,,0.0,0.0,382951.0,11601.0,0.0,32951.0,,,,,,,,,,,N,,,,Cash
7322,18000.0,60 months,17.14%,448.71,D,D4,3 years,RENT,65000.00,Source Verified,Current,moving,IL,11.58,0.0,Jun-2006,0.0,35.0,6.0,0.0,0.0,,17.0,14349.72,8600.21,0.00,Mar-2019,0.0,0.0,15938.0,,,,,,0.0,,,2.0,2656.0,,,0.0,0.0,86.0,101.0,38.0,11.0,0.0,,,,35.0,0.0,0.0,0.0,0.0,4.0,9.0,0.0,6.0,0.0,6.0,0.0,1.0,94.1,,0.0,0.0,20124.0,15938.0,0.0,17000.0,,,,,,,,,,,N,,,,Cash
8651,8000.0,36 months,17.14%,285.78,D,D4,,OWN,30000.00,Verified,Fully Paid,debt_consolidation,PA,7.40,0.0,Mar-1997,1.0,,3.0,0.0,0.0,,16.0,8000.00,2272.82,0.00,Dec-2017,0.0,0.0,64597.0,,,,,,0.0,,,2.0,21532.0,,,0.0,0.0,118.0,212.0,149.0,18.0,3.0,,,0.0,,0.0,0.0,0.0,0.0,7.0,5.0,0.0,8.0,0.0,3.0,0.0,0.0,100.0,,0.0,0.0,70900.0,5259.0,0.0,8000.0,,,,,,,,,,,N,,,,Cash
9180,1000.0,36 months,12.39%,33.41,C,C1,6 years,MORTGAGE,85000.00,Not Verified,Fully Paid,major_purchase,IL,21.09,3.0,Mar-2002,0.0,17.0,3.0,0.0,0.0,,26.0,1000.00,62.56,0.00,Jan-2019,0.0,0.0,169790.0,,,,,,0.0,,,2.0,56597.0,,,0.0,0.0,152.0,150.0,40.0,10.0,2.0,,17.0,10.0,17.0,0.0,0.0,0.0,0.0,12.0,7.0,0.0,17.0,0.0,3.0,0.0,2.0,88.5,,0.0,0.0,193100.0,19841.0,0.0,31150.0,,,,,,,,,,,N,,,,Cash
10400,25225.0,60 months,14.31%,591.01,C,C4,10+ years,RENT,55000.00,Verified,Fully Paid,credit_card,CA,22.43,0.0,Mar-1996,1.0,,5.0,0.0,0.0,,37.0,25225.00,8145.24,0.00,Mar-2019,0.0,0.0,173755.0,,,,,,0.0,,,1.0,173755.0,,,0.0,0.0,201.0,224.0,49.0,19.0,0.0,,,6.0,,0.0,0.0,0.0,0.0,11.0,25.0,0.0,12.0,0.0,5.0,0.0,0.0,100.0,,0.0,0.0,163498.0,173755.0,0.0,163498.0,,,,,,,,,,,N,,,,Cash
11258,34850.0,60 months,15.59%,839.92,D,D1,6 years,MORTGAGE,140000.00,Verified,Current,debt_consolidation,NY,13.26,4.0,Jul-1992,0.0,10.0,5.0,0.0,0.0,,13.0,27723.13,15168.95,0.00,Mar-2019,0.0,0.0,438215.0,,,,,,15300.0,,,0.0,109554.0,15300.0,,0.0,0.0,165.0,268.0,40.0,38.0,1.0,268.0,10.0,6.0,10.0,1.0,0.0,0.0,0.0,3.0,5.0,2.0,7.0,0.0,4.0,1.0,0.0,33.3,0.0,0.0,0.0,506795.0,25237.0,15300.0,44737.0,,,,,,,,,,,N,,,,Cash


In [253]:
df.sort_values(by = ['revol_util'])

AttributeError: 'list' object has no attribute 'astype'

In [204]:
# Clean columns
df['term'] = df['term'].str.replace('36 months','short').str.replace('60 months','long').str.strip()
df['int_rate'].dropna(axis = 0,how = 'any', inplace = True)
df=drop_percentage_signs(df,'int_rate')


# Drop co

print('Before dropping NaN Rows: ',df.shape)
df['revol_util'].fillna(value = 0,inplace = True)
print('After dropping NaN Rows: ',df.shape)

df=drop_percentage_signs(df, 'revol_util')

Before dropping NaN Rows:  (235629, 88)
After dropping NaN Rows:  (235629, 88)


ValueError: could not convert string to float: 

In [205]:
df.shape

(235629, 88)

In [57]:
#df.dtypes

In [46]:
df['pymnt_plan'].value_counts()

n    235622
y         7
Name: pymnt_plan, dtype: int64