# Author: Justin Hsi

In [1]:
import os
import time

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import dir_constants as dc

# Set some constants __________________________________________________________
# Timestamp string captured once, when this cell runs.
now = time.strftime("%Y_%m_%d_%Hh_%Mm_%Ss")
platform = 'lendingclub'

# Set data_path _______________________________________________________________
data_path = dc.home_path + '/rsync_dl_rig/unzipped_lc_csvs'

# Get the loan_info csvs to iterate over ______________________________________
# os.listdir() returns entries in arbitrary order; sorting makes the load order
# (and therefore the row order of the merged frame) reproducible across runs.
files = sorted(os.listdir(data_path))

# Names starting with any of these prefixes are not loan_info csvs and are
# skipped. str.startswith accepts a tuple, so one call replaces the chain of
# bitwise-or'ed checks.
SKIP_PREFIXES = ('.', 'lc_', 'PMTHIST', 'LCData')
loan_info_files = [
    file_ for file_ in files if not file_.startswith(SKIP_PREFIXES)
]
print(loan_info_files)

# Read every loan_info csv into its own frame, then stack them into one.
# header=1 takes the column names from the second line of each file and
# skipfooter=2 ignores the last two lines (skipfooter requires the python
# parsing engine).
to_concat = [
    pd.read_csv(
        f'{data_path}/{csv_name}', header=1, engine='python', skipfooter=2)
    for csv_name in loan_info_files
]

loan_info = pd.concat(to_concat)

# Block to ensure that rows that aren't actually loans are dropped ____________
# A real loan must have a term, a positive interest rate, a positive term and a
# positive funded amount.
#
# The steps are chained (filter -> assign -> filter) so the converted columns
# are written onto a fresh frame rather than onto a boolean-filtered slice,
# which is the pattern that triggers pandas' SettingWithCopyWarning.
loan_info = (
    # 1. Rows without a term cannot be parsed below, so drop them first.
    loan_info[loan_info['term'].notnull()]
    .assign(
        # 2. Strip the '%' sign and parse the rate as a float.
        int_rate=lambda df: df['int_rate'].str.strip('%').astype(float),
        # 3. Parse the first three characters of term as an int
        #    (presumably values look like ' 36 months' -- TODO confirm).
        term=lambda df: df['term'].str[:3].astype(int),
    )
    # 4. Keep only rows where all three quantities are strictly positive.
    .loc[lambda df: (df['int_rate'] > 0) & (df['term'] > 0) &
         (df['funded_amnt'] > 0)]
)


# Replace the index left over from concatenation/filtering with a fresh
# 0..n-1 range, then store id as an integer column.
loan_info = loan_info.reset_index(drop=True)
loan_info['id'] = loan_info['id'].astype(int)
# Not enabled: using id as the index instead of the positional range.
# loan_info.set_index('id', drop=False, inplace=True)

# Save in HDFStore ____________________________________________________________
# store = pd.HDFStore(
#     dc.home_path + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
#     format(platform),
#     append=True)
# store['loan_info_merged'] = loan_info
# print("{:,}".format(len(loan_info)) + " loans saved " +
#       'for {0}'.format(platform))
# print(store.keys())
# store.close()

['LoanStats3c_securev1.csv', 'LoanStats_securev1_2016Q2.csv', 'LoanStats3b_securev1.csv', 'LoanStats_securev1_2016Q4.csv', 'LoanStats_securev1_2017Q3.csv', 'LoanStats_securev1_2017Q1.csv', 'LoanStats_securev1_2016Q3.csv', 'LoanStats_securev1_2017Q2.csv', 'LoanStats_securev1_2016Q1.csv', 'LoanStats3a_securev1.csv', 'LoanStats3d_securev1.csv']


In [2]:
# Summarise the merged frame: row count, column dtypes and memory footprint.
# memory_usage='deep' measures the actual size of object columns instead of
# estimating it.
loan_info.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1646778 entries, 0 to 1646777
Columns: 153 entries, acc_now_delinq to zip_code
dtypes: float64(114), int64(2), object(37)
memory usage: 4.5 GB


In [3]:
# Sanity check: show the id of every row whose id already appeared earlier.
# An empty Series means ids are unique.
loan_info.loc[loan_info.duplicated(subset='id'), 'id']

Series([], Name: id, dtype: int64)

# A bit of cleaning and dtype conversion so I can store the data compactly

In [4]:
# Fix awkward column types.
# revol_util is assumed to be a string with a trailing '%'. Strip the sign
# explicitly (the same way int_rate is handled when the csvs are loaded) rather
# than dropping the last character whatever it is, which would silently eat a
# digit from any value without a '%'. Then rescale from percent to a fraction.
loan_info['revol_util'] = (
    loan_info['revol_util'].str.strip('%').astype(float) / 100
)

# Drop columns that take too much space to store.
loan_info = loan_info.drop(columns=['desc', 'emp_title', 'title', 'url'])

In [5]:
# Summary statistics for the columns still stored as generic Python objects.
# The column labels of this summary drive the categorical conversion later on.
obj_cols = (
    loan_info
    .select_dtypes(include='object')
    .describe()
)

In [6]:
# Build a CategoricalDtype for every non-date object column, using the non-null
# values actually observed in that column as its categories.
# Original author's note: Inquiries6m has strings, nans, and ints.
date_cols = [
    'debt_settlement_flag_date',
    'earliest_cr_line',
    'hardship_end_date',
    'hardship_start_date',
    'issue_d',
    'last_credit_pull_d',
    'last_pymnt_d',
    'next_pymnt_d',
    'payment_plan_start_date',
    'sec_app_earliest_cr_line',
    'settlement_date',
]
# Placeholder for columns that cannot be converted; it is never filled here
# but is still consulted when the float32 dtypes are assigned.
stupid_cols = []
dtype = {
    col: CategoricalDtype(
        [val for val in loan_info[col].unique() if pd.notnull(val)])
    for col in obj_cols.columns
    if col not in date_cols
}

In [7]:
# Every column that has no dtype yet, and is not a date column, the id, or a
# flagged column, is stored as float32.
# NOTE(review): float32 represents integers exactly only up to 2**24; confirm
# no remaining column holds larger integer-valued data.
dtype.update({
    col: np.float32
    for col in loan_info.columns
    if col not in dtype
    and col not in date_cols
    and col != 'id'
    and col not in stupid_cols
})

In [8]:
# Apply the per-column dtype map in one pass. Columns absent from the map
# (the date columns and id) keep their current dtype.
loan_info = loan_info.astype(dtype)

In [9]:
# Re-run the uniqueness check after the dtype conversion: list any id that
# occurs more than once (an empty Series means none).
loan_info.loc[loan_info.duplicated(subset='id'), 'id']

Series([], Name: id, dtype: int64)

In [15]:
# Persist the cleaned frame to an HDF5 store in table format.
# NOTE(review): `datapath` is not assigned in any cell visible in this notebook
# (only `data_path` is, and that points at the csv directory). Define it before
# running this cell on a fresh kernel.
store_path = f'{datapath}lendingclub/lendingclub.h5'
# `key` is passed by keyword: pandas 2.1 deprecates positional arguments to
# to_hdf other than the path, and pandas 3.0 makes them keyword-only.
loan_info.to_hdf(store_path, key='loan_info_merged', format='table')

In [None]:
# # trying out feather data format
# PATH = '/home/justin/justin_tinkering/data_science/lendingclub/data/'
# loan_info.reset_index()
# loan_info.to_feather(f'{PATH}loan_info')

In [3]:
# os.mkdir('/home/justin/justin_tinkering/data_science/lendingclub/data')