In [6]:
%%writefile merge_loan_info.py
import time
import os
import pandas as pd
import dir_constants as dc

# Run-level constants _________________________________________________________
platform = 'lendingclub'
# Timestamp taken once at start-up, formatted like 2018_01_31_13h_05m_09s
now = time.strftime("%Y_%m_%d_%Hh_%Mm_%Ss")

# Directory that is scanned for the loan_info csvs ____________________________
dpath = f'{dc.data_path}lendingclub/csvs/working_csvs'

# Get the loan_info csvs to iterate over ______________________________________
# Every entry of dpath is treated as a loan_info csv unless its name starts
# with one of the prefixes below (hidden files and the lc_/PMTHIST/LCData
# files that live in the same directory).
_SKIP_PREFIXES = ('.', 'lc_', 'PMTHIST', 'LCData')

files = os.listdir(dpath)
print(files)
# str.startswith accepts a tuple of prefixes, so a single call replaces the
# chain of bitwise-or'ed checks. os.listdir returns names in arbitrary order,
# so sort them to make the order in which files are read and concatenated
# reproducible from run to run.
loan_info_files = sorted(
    file_ for file_ in files if not file_.startswith(_SKIP_PREFIXES))

# Parse every selected csv and stack the resulting frames _____________________
# header=1 takes the column names from the second line of each file;
# skipfooter=2 ignores the last two lines, which requires the python engine.
to_concat = [
    pd.read_csv(
        f'{dpath}/{csv_name}', header=1, skipfooter=2, engine='python')
    for csv_name in loan_info_files
]

# Row labels of the individual files are kept here; they are renumbered later.
loan_info = pd.concat(to_concat)

# Drop rows that are not actual loans _________________________________________
# A row survives only if term is present and int_rate, term and funded_amnt
# are all strictly positive once converted to numbers.
has_term = loan_info['term'].notna()
loan_info = loan_info[has_term]

# int_rate: strip the '%' characters from both ends, then parse as float.
loan_info['int_rate'] = loan_info['int_rate'].str.strip('%').astype(float)
# term: only the first three characters are parsed as the integer term
# (presumably values look like ' 36 months' -- confirm against the csvs).
loan_info['term'] = loan_info['term'].str.slice(stop=3).astype(int)

is_loan = (
    loan_info['int_rate'].gt(0)
    & loan_info['term'].gt(0)
    & loan_info['funded_amnt'].gt(0)
)
loan_info = loan_info[is_loan]


# Renumber the rows 0..n-1 and store id as an integer _________________________
loan_info = loan_info.reset_index(drop=True)
loan_info['id'] = loan_info['id'].astype(int)

# Write the merged frame out in feather format (being tried out here)
PATH = '/home/justin/all_data/lendingclub/'
loan_info.to_feather(PATH + 'loan_info.fth')

Overwriting merge_loan_info.py


In [None]:
# SQL export is disabled because it was slow and clunky; kept below for reference
# from sqlalchemy import create_engine
# engine = create_engine('mysql://root:{0}@localhost/lcdb'.format(acc_info.db_pw))
# loan_info.to_sql('loan_info_merged', con=engine, index=False, if_exists='replace')

# Save in HDFStore ____________________________________________________________
# store = pd.HDFStore(
#     dc.home_path + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
#     format(platform),
#     append=True)
# store['loan_info_merged'] = loan_info
# print("{:,}".format(len(loan_info)) + " loans saved " +
#       'for {0}'.format(platform))
# print(store.keys())
# store.close()