# Author: Justin Hsi

In [1]:
import time
import dir_constants as dc

# Set some constants __________________________________________________________
now = time.strftime("%Y_%m_%d_%Hh_%Mm_%Ss")
platform = 'lendingclub'

# Set data_path _______________________________________________________________
data_path = dc.home_path + '/rsync_dl_rig/unzipped_lc_csvs'

# Get the loan_info csvs to iterate over ______________________________________
files = os.listdir(data_path)
print(files)
loan_info_files = [
    file_ for file_ in files
    if not (file_.startswith('.') | file_.startswith('lc_') |
            file_.startswith('PMTHIST') | file_.startswith('LCData'))
]

to_concat = []
for file_ in loan_info_files:
    to_concat.append(
        pd.read_csv(
            data_path + '/' + file_, header=1, engine='python', skipfooter=2))

loan_info = pd.concat(to_concat)

# Block to ensure that rows that aren't actually loans are dropped ____________
# All loans must have int/term/funded 
loan_info = loan_info[loan_info['term'].notnull()]
loan_info['int_rate'] = loan_info['int_rate'].str.strip('%').astype(float)
loan_info['term'] = loan_info['term'].str[:3].astype(int)
loan_info = loan_info[(loan_info['int_rate'] > 0) & (loan_info['term'] > 0) &
                      (loan_info['funded_amnt'] > 0)]


# Reset index and set id to string, also set index to id ______________________
loan_info.reset_index(drop=True, inplace=True)
loan_info['id'] = loan_info['id'].astype(str)
loan_info.set_index('id', drop=False, inplace=True)

# Save in HDFStore ____________________________________________________________
# store = pd.HDFStore(
#     dc.home_path + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
#     format(platform),
#     append=True)
# store['loan_info_merged'] = loan_info
# print("{:,}".format(len(loan_info)) + " loans saved " +
#       'for {0}'.format(platform))
# print(store.keys())
# store.close()

['PMTHIST_ALL_201808.csv', 'LoanStats_securev1_2017Q4.csv', 'LoanStats_securev1_2018Q2.csv', 'LoanStats3c_securev1.csv', 'LoanStats_securev1_2016Q2.csv', 'LoanStats3b_securev1.csv', 'LoanStats_securev1_2016Q4.csv', 'LoanStats_securev1_2018Q1.csv', 'LoanStats_securev1_2017Q3.csv', 'LoanStats_securev1_2017Q1.csv', 'LoanStats_securev1_2016Q3.csv', 'LoanStats_securev1_2017Q2.csv', 'LoanStats_securev1_2016Q1.csv', 'LoanStats3a_securev1.csv', 'LoanStats3d_securev1.csv']


In [2]:
loan_info.shape

(2004062, 153)

In [3]:
loan_info.head()

Unnamed: 0_level_0,acc_now_delinq,acc_open_past_24mths,addr_state,all_util,annual_inc,annual_inc_joint,application_type,avg_cur_bal,bc_open_to_buy,bc_util,...,total_pymnt,total_pymnt_inv,total_rec_int,total_rec_late_fee,total_rec_prncp,total_rev_hi_lim,url,verification_status,verification_status_joint,zip_code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
125907898,0.0,4.0,WI,46.0,211000.0,,Individual,10882.0,27572.0,20.3,...,3840.56,3840.56,905.28,0.0,2935.28,34600.0,https://lendingclub.com/browse/loanDetail.acti...,Not Verified,,530xx
126419316,0.0,0.0,MD,67.0,65000.0,,Individual,13736.0,,,...,2874.98,2874.98,1512.74,0.0,1362.24,0.0,https://lendingclub.com/browse/loanDetail.acti...,Source Verified,,208xx
126378185,0.0,2.0,MN,94.0,80000.0,,Individual,20724.0,2867.0,79.8,...,1967.89,1967.89,518.98,0.0,1448.91,15600.0,https://lendingclub.com/browse/loanDetail.acti...,Verified,,558xx
126165519,0.0,8.0,GA,74.0,150000.0,,Individual,22139.0,16626.0,79.6,...,1030.04,1030.04,398.04,0.0,632.0,96200.0,https://lendingclub.com/browse/loanDetail.acti...,Not Verified,,300xx
126415456,0.0,1.0,TX,64.0,66560.0,,Individual,35799.0,6826.0,69.4,...,2753.44,2753.44,458.07,0.0,2295.37,27300.0,https://lendingclub.com/browse/loanDetail.acti...,Source Verified,,770xx


In [4]:
loan_info['issue_d'].value_counts(dropna=False)

Mar-2016    61992
Oct-2015    48631
May-2018    46311
Jul-2015    45962
Dec-2015    44343
Aug-2017    43573
Apr-2018    42928
Nov-2017    42343
Jun-2018    41533
Sep-2017    39713
Feb-2016    39529
Jul-2017    39415
Oct-2014    38783
Mar-2018    38771
Dec-2017    38154
Oct-2017    38151
Jun-2017    38087
May-2017    37681
Nov-2015    37530
Mar-2017    37181
Apr-2016    36432
Jan-2018    36347
Aug-2016    36280
Dec-2016    36183
Aug-2015    35886
Apr-2015    35427
Jan-2015    35107
Jul-2016    34696
Nov-2016    34591
Jun-2016    33019
            ...  
Dec-2009      658
Oct-2009      604
Sep-2009      507
Aug-2009      446
Jul-2009      411
Jun-2009      406
Mar-2008      402
May-2009      359
Apr-2009      333
Mar-2009      324
Feb-2008      306
Jan-2008      305
Feb-2009      302
Jan-2009      269
Apr-2008      259
Dec-2008      253
Nov-2008      209
Dec-2007      172
Jul-2008      141
Jun-2008      124
Oct-2008      122
May-2008      115
Nov-2007      112
Oct-2007      105
Aug-2008  

In [8]:
# trying out feather data format
PATH = '/home/justin/all_data/lendingclub/'
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(f'{PATH}loan_info.fth')