# script version

In [3]:
%%writefile ../../scripts/csv_preparation/clean_loan_info.py

'''
maturity time and maturity paid are floats from 0 to 1 that express how "done"
a loan is either time-wise, or money wise. There are loan-status adjusted versions as well.
I use these because I want to include as much data for my models as possible while recognizing that
there is uncertainty in knowing whether a loan was good or bad if it is ongoing.

For example, if a loan is 120 days late, we know that loan is likely a very bad investment that our model should not be choosing. Is it possible that the loan all of a sudden becomes current and you get a massive return due to accumulated late fees? Yes, but not likely. In any case, I would rather incorporate that likely-to-be-bad loan into the model now instead of wait 2-ish months for that loan to truly go through the charged-off process.

maturity_time is how close to original maturity the loan is, regardless of how much the loan has paid back and/or followed the expected payment schedule.
maturity_paid is how close the loan is to completing all its payments (
total_payments_received/(total_expected_payments at point in time, with adjustments for lateness))

status adjusted are adjusting the maturity calculations knowing that if the loan does go the charge-off route, it has x months left or will recover .1 percent of remaining outstanding principal on avg.

Some examples of loans:
1) A loan is issued last month and almost pays off all the outstanding principal this month (maybe a borrower found better loan terms elsewhere, and took out that new loan to almost completely pay down the ) would have maturity_time near 0 and maturity_paid near 1
2) A 3 year loan that is 8 months in and is 120 days late has a low maturity_time and fairly high maturity_paid, as there is an adjustment for denominator (aside form what was already paid to date by the loan, only expecting a 10% recovery on remaining outstanding principal)
'''

import sys
import os
import pandas as pd
from pandas.api.types import is_string_dtype
import numpy as np
import math
import re
from tqdm import tqdm_notebook, tqdm
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
# sys.path.append(os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', 'scripts', 'csv_preparation'))
# print(sys.path)
# print(os.listdir(os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', 'scripts', 'csv_preparation')))
import j_utils.munging as mg
import rem_to_be_paid as rtbp

# load data, turn python Nones into np.nans
dpath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', 'data')
loan_info = pd.read_feather(os.path.join(dpath, 'raw_loan_info.fth'))
loan_info.fillna(value=pd.np.nan, inplace=True)

#turn all date columns into pandas timestamp ________________________________
month_dict = {
    'jan': '1',
    'feb': '2',
    'mar': '3',
    'apr': '4',
    'may': '5',
    'jun': '6',
    'jul': '7',
    'aug': '8',
    'sep': '9',
    'oct': '10',
    'nov': '11',
    'dec': '12'
}
# date cols
date_cols = [
    'issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d',
    'next_pymnt_d', 'sec_app_earliest_cr_line', 'hardship_start_date',
    'hardship_end_date', 'payment_plan_start_date', 'debt_settlement_flag_date',
    'settlement_date',
]
for col in date_cols:
    loan_info[col] = loan_info[col].str.strip()
    loan_info[col] = loan_info[col].str.lower()
    loan_info[col] = pd.to_datetime(
        loan_info[col].str[:3].str.lower().replace(month_dict) +
        loan_info[col].str[3:],
        format='%m-%Y')
    
# Cleanups ___________________________________________________________________
# int_rate
loan_info['int_rate'] = loan_info['int_rate'] / 100
# installment funded 
rename_dict = {'installment': 'installment_currently'}
loan_info.rename(rename_dict, inplace=True, axis=1)
# emp_title
loan_info['emp_title'] = loan_info['emp_title'].str.lower()
# home_ownership
dic_home_ownership = {
    'mortgage': 'mortgage',
    'rent': 'rent',
    'own': 'own',
    'other': 'other',
    'none': 'none',
    'any': 'none'
}
loan_info['home_ownership'] = loan_info['home_ownership'].str.lower().replace(
    dic_home_ownership)
# verification_status and verification_status_joint
dic_verification_status = {
    'VERIFIED - income': 'platform',
    'VERIFIED - income source': 'source',
    'not verified': 'none',
    'Source Verified': 'source',
    'Not Verified': 'none',
    'Verified': 'platform'
}
loan_info['verification_status'] = loan_info['verification_status'].replace(
    dic_verification_status)
loan_info['verification_status_joint'] = loan_info[
    'verification_status_joint'].replace(dic_verification_status)
# status
dic_status = {
    'Current': 'current',
    'Charged Off': 'charged_off',
    'Fully Paid': 'paid',
    'Late (31-120 days)': 'late_120',
    'In Grace Period': 'grace_15',
    'Late (16-30 days)': 'late_30',
    'Default': 'defaulted',
    'Issued': 'current'
}
loan_info['loan_status'] = loan_info['loan_status'].apply(
    lambda x: re.sub('Does not meet the credit policy.  Status:', '', x))
loan_info['loan_status'] = loan_info['loan_status'].apply(
    lambda x: re.sub('Does not meet the credit policy. Status:', '', x))
loan_info['loan_status'] = loan_info['loan_status'].replace(dic_status)
loan_info['hardship_loan_status'] = loan_info['hardship_loan_status'].replace(dic_status)
#title
loan_info['title'] = loan_info['title'].str.lower()
#application_type
loan_info['application_type'] = loan_info['application_type'].str.lower()
#revol_util
loan_info['revol_util'] = loan_info['revol_util'].apply(
    lambda x: float(x.strip('%')) / 100 if pd.notnull(x) else np.nan)
#all_util
loan_info['all_util'] = loan_info['all_util'] / 100.
# pct_tl_nvr_dlq
loan_info['pct_tl_nvr_dlq'] = loan_info['pct_tl_nvr_dlq'] / 100.
# percent_bc_gt_75
loan_info['percent_bc_gt_75'] = loan_info['percent_bc_gt_75'] / 100.
# dti
loan_info['dti'] = loan_info['dti'] / 100.
# dti_joint
loan_info['dti_joint'] = loan_info['dti_joint'] / 100.
# il_util
loan_info['il_util'] = loan_info['il_util'] / 100.
# bc_util
loan_info['bc_util'] = loan_info['bc_util'] / 100.
# sec_app_revol_util
loan_info['sec_app_revol_util'] = loan_info['sec_app_revol_util'] / 100.
# settlement_percentage
loan_info['settlement_percentage'] = loan_info['settlement_percentage'] / 100.

# check that percents are between 0 and 1, not 0 and 100
pct_cols = []
for col in loan_info.columns:
    if any(x in col for x in ['pct', 'percent', 'util', 'dti', 'rate']):
        pct_cols.append(col)
        
for col in pct_cols:
    if loan_info[col].mean() > 1:
        print('this col needs to be turned into a decimal form of percent: ',col)
    if loan_info[col].median() > 1:
        print('this col needs to be turned into a decimal form of percent: ',col)
        
# Adding columns of interest _________________________________________________        
# unreceived principal, not overwriting out_prncp
loan_info['unreceived_prncp'] = loan_info['funded_amnt'] - loan_info['total_rec_prncp']
loan_info['unreceived_prncp'] = np.where(loan_info['unreceived_prncp'] <= 0.019, 0, loan_info['unreceived_prncp'])
loan_info['unreceived_prncp'] = loan_info['unreceived_prncp'].round(2)

# want to calculate what installment originally was
loan_info['installment_at_funded'] = np.pmt(loan_info['int_rate']/12, loan_info['term'], -loan_info['funded_amnt'])

# have a max_date for reference in making end_d
max_date = loan_info['last_pymnt_d'].max()

# end_d to me means the date we can stop tracking things about the loan. Should be defunct
def applyEndD(status, group):
    if status == 'charged_off':
        #split the group into two groups, one which has paid something, and other which has paid nothing
        never_paid = group[group['last_pymnt_d'].isnull()]
        has_paid = group[group['last_pymnt_d'].notnull()]

        # 4 months of late (1-120) and then 1 month of chargeoff, so 5 months
        never_paid['end_d'] = never_paid['issue_d'] + pd.DateOffset(months=+5)
        has_paid['end_d'] = has_paid['last_pymnt_d'] + pd.DateOffset(months=+5)

        group.ix[never_paid.index.values, 'end_d'] = never_paid['end_d']
        group.ix[has_paid.index.values, 'end_d'] = has_paid['end_d']
        return group['end_d']
    elif status == 'paid':
        return group['last_pymnt_d']
    else:
        return pd.Series([max_date] * len(group), index=group.index.values)
    
# make end_d
status_grouped = loan_info.groupby('loan_status')
end_d_series = pd.Series([])
for status, group in status_grouped:
    end_d_series = end_d_series.append(
        applyEndD(status, group), verify_integrity=True)
loan_info['end_d'] = end_d_series
loan_info.loc[loan_info['end_d'] > max_date, 'end_d'] = max_date

# adding line_history in days, months, and years using pandas .dt functions
loan_info['line_history_d'] = (loan_info['issue_d'] - loan_info['earliest_cr_line']).dt.days
loan_info['line_history_m'] = (loan_info['issue_d'].dt.year - loan_info['earliest_cr_line'].dt.year)*12 + (loan_info['issue_d'].dt.month - loan_info['earliest_cr_line'].dt.month)
loan_info['line_history_y'] = (loan_info['issue_d'].dt.year - loan_info['earliest_cr_line'].dt.year) + (loan_info['issue_d'].dt.month - loan_info['earliest_cr_line'].dt.month)/12
#credit_score
loan_info['fico'] = (
    loan_info['fico_range_high'] + loan_info['fico_range_low']) / 2

# maturity_time
loan_info['months_passed'] = ((
    max_date - loan_info['issue_d']).dt.days *
                            (12 / 365.25)).round()
loan_info['maturity_time'] = loan_info['months_passed'] / loan_info['term']
loan_info['maturity_time'] = np.where(loan_info['maturity_time'] >= 1, 1,
                                      loan_info['maturity_time'])

# make rem_to_be_paid
loan_info['rem_to_be_paid'] = rtbp.apply_rem_to_be_paid(
    loan_info['unreceived_prncp'].values, loan_info['installment_currently'].values,
    loan_info['int_rate'].values)

loan_info['maturity_paid'] = loan_info['total_pymnt'] / (
    loan_info['total_pymnt'] + loan_info['rem_to_be_paid'])

# making status adjusted versions of mat_time, mat_paid
# grace = 35%, late_30 = 64%, late_120 = 98%, 
# See https://www.lendingclub.com/info/demand-and-credit-profile.action for %s used
# maturity_time_stat_adj = 
# maturity_time * prob_not_def + months_passed/months_to_default * prob_def
loan_info['maturity_time_stat_adj'] = np.where(loan_info['loan_status'] == 'grace_15', loan_info['maturity_time']*(1-.35) + ((loan_info['months_passed']/(loan_info['months_passed'] + 4))*.35), 
        np.where(loan_info['loan_status'] == 'late_30', loan_info['maturity_time']*(1-.64) + ((loan_info['months_passed']/(loan_info['months_passed'] + 3))*.64), 
        np.where(loan_info['loan_status'] == 'late_120', loan_info['maturity_time']*(1-.98) + ((loan_info['months_passed']/(loan_info['months_passed'] + 1))*.98), loan_info['maturity_time']
        )))
loan_info['maturity_time_stat_adj'] = np.minimum(1, loan_info['maturity_time_stat_adj'])

# maturity_paid_stat_adj = 
# maturity_paid * prob_not_def + total_paid/total_paid_and_outstanding * prob_def
# .1 is from assuming 10% recovery on defaulted/charged_off loans
loan_info['maturity_paid_stat_adj'] = np.where(loan_info['loan_status'] == 'grace_15', loan_info['maturity_paid']*(1-.35) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.35), 
        np.where(loan_info['loan_status'] == 'late_30', loan_info['maturity_paid']*(1-.64) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.64), 
        np.where(loan_info['loan_status'] == 'late_120', loan_info['maturity_paid']*(1-.98) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.98), loan_info['maturity_paid']
        )))
loan_info['maturity_paid_stat_adj'] = np.minimum(1, loan_info['maturity_paid_stat_adj'])

# final adjustments to status_adj based on done statuses
loan_info.loc[loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted']),'maturity_paid_stat_adj'] = 1
loan_info.loc[loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted']),'maturity_time_stat_adj'] = 1

# target_loose
loan_info['target_loose'] = np.where(loan_info['loan_status'].isin(['charged_off', 'defaulted']), 1, 0)

# pull out long string columns
str_cols = loan_info.select_dtypes('object').columns
strip_cols = ['desc', 'emp_title', 'title', 'url']
strings_df = loan_info[strip_cols]
loan_info.drop(columns=strip_cols, inplace=True)
strings_df['id'] = loan_info['id']

# make target strict, anything that was ever late is marked "bad"
bad_statuses = set(['late_120', 'defaulted', 'charged_off', 'late_30'])
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
target_strict_dict = {}
id_grouped = pmt_hist.groupby('loan_id')
for ids, group in tqdm(id_grouped):
    statuses = set(group['status_period_end'])
    if len(statuses.intersection(bad_statuses)) > 0:
        target_strict_dict[ids] = 1
    else:
        target_strict_dict[ids] = 0
target_strict = pd.DataFrame.from_dict(target_strict_dict, orient='index').reset_index(drop=False)
target_strict.columns = ['id', 'target_strict']
loan_info.rename({'loan_id': 'id'}, axis=1, inplace=True)
loan_info = pd.merge(loan_info, target_strict, how='outer', on='id')

# add orig_amt_due and roi_simple
loan_info['orig_amt_due'] = loan_info['term'] * loan_info['installment_at_funded']
loan_info['roi_simple'] = loan_info['total_pymnt']/loan_info['funded_amnt']

# More Data Cleanup __________________________________________________________
# home_ownership: none should be other
loan_info['home_ownership'].replace({'none': 'other'}, inplace=True)
# annual_income has 4 nulls. Just fill with 0
loan_info['annual_inc'].replace({np.nan: 0.0}, inplace=True)
# drop the one null zip_code
loan_info = loan_info[loan_info['zip_code'].notnull()]
# drop the loans where earliest_cr_line is null
loan_info = loan_info[loan_info['earliest_cr_line'].notnull()]
# drop null chargeoff_within_12_mths
loan_info = loan_info[loan_info['chargeoff_within_12_mths'].notnull()]
# drop null tax_liens
loan_info = loan_info[loan_info['tax_liens'].notnull()]
# drop loans that have this null
loan_info = loan_info[loan_info['inq_last_6mths'].notnull()]

# Drop columns _______________________________________________________________
# Dropping these since I don't want them and they might confuse me.
# There is no reason why I care about money that went just to investors rather
# than to lending club as well when they top off loans.
loan_info.drop(['funded_amnt_inv',
                'out_prncp_inv'], axis = 1, inplace = True)

# last cleanups before storing
# if column type is string and has np.nan (a float), turn the nan into "None" for the graphing eda notebook in 
# Exploratory Data Analysis
for col in loan_info.columns:
    if is_string_dtype(loan_info[col].dtype) & (loan_info[col].isnull().sum() > 0):
        loan_info[col] = loan_info[col].fillna('None')
        
for col in strings_df.columns:
    if is_string_dtype(strings_df[col].dtype) & (strings_df[col].isnull().sum() > 0):
        strings_df[col] = strings_df[col].fillna('None')

# reduce memory and store
_, strings_df = mg.reduce_memory(strings_df)
strings_df.reset_index(drop=True, inplace=True)
_, loan_info = mg.reduce_memory(loan_info)
loan_info.reset_index(drop=True, inplace=True)
strings_df.to_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
loan_info.to_feather(os.path.join(dpath, 'loan_info.fth'))

Overwriting ../../scripts/csv_preparation/clean_loan_info.py


# Notebook version

In [1]:
%load_ext Cython
pd.options.display.max_columns = 999

In [2]:
import sys
import os
import pandas as pd
from pandas.api.types import is_string_dtype
import math
import re
from tqdm import tqdm_notebook, tqdm
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
import j_utils.munging as mg

In [3]:
dpath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', 'data')
loan_info = pd.read_feather(os.path.join(dpath, 'raw_loan_info.fth'))

In [4]:
# Turn python Nones into np.nans
loan_info.fillna(value=pd.np.nan, inplace=True)

In [5]:
# FIXED but leaving here so i can check if I desire later
# these came from dataprep_and_modeling/determining_evaluation.ipynb. these are loan ids that had no data...
# We can see that the original loan info had them. Check loan_info_clean
no_issue_d = [   70686,    71623,    71823,    72176,    72197,    72323,
          72819,    72998,    73003,    73582,    74014,    74323,
          74505,    76597,    76629,    77757,    77792,    79893,
          79924,    79967,    81085,    83185,    83489,    83979,
          84098,    84670,    84918,    85675,    85781,    85818,
          85961,    87023,    88046,    88637,    88854,    89258,
          90106,    90376,    90395,    90665,    90966,    91023,
          91126,    91175,    92187,    92402,    92440,    92507,
          92533,    92552,    92666,    92676,    93055,    93061,
          93277,    94406,    94838,    95198,    95250,    96350,
          96844,    97606,    98276,    98339,    98360,    98982,
          98984,    99009,    99100,    99634,    99982,    99987,
         100134,   100214,   101579,   102376,   102823,   103478,
         103507,   103846,   104530,   104634,   106079,   106216,
         106360,   107136,   107560,   108473,   109355,   109824,
         110627,   111227,   111307,   111564,   111868,   111917,
         112216,   112245,   112323,   112496,   112747,   112806,
         113156,   113179,   113194,   113203,   113231,   113450,
         114133,   114333,   114408,   114469,   114511,   114642,
         114838,   114943,   115363,   115602,   115606,   116040,
         116129,   116582,   117045,   117056,   117192,   117249,
         117794,   117863,   118024,   118367,   118523,   118533,
         118823,   118872,   119043,   119071,   119262,   119360,
         119948,   121535,   121568,   124624,   127213,   127606,
         131387, 69266577, 96387212]
# loan_info[loan_info['id'].isin(no_issue_d)]['issue_d']

# Cleaning the loan_info

In [6]:
#turn all date columns into pandas timestamp
month_dict = {
    'jan': '1',
    'feb': '2',
    'mar': '3',
    'apr': '4',
    'may': '5',
    'jun': '6',
    'jul': '7',
    'aug': '8',
    'sep': '9',
    'oct': '10',
    'nov': '11',
    'dec': '12'
}
# date cols
date_cols = [
    'issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d',
    'next_pymnt_d', 'sec_app_earliest_cr_line', 'hardship_start_date',
    'hardship_end_date', 'payment_plan_start_date', 'debt_settlement_flag_date',
    'settlement_date',
]

for col in date_cols:
    loan_info[col] = loan_info[col].str.strip()
    loan_info[col] = loan_info[col].str.lower()
    loan_info[col] = pd.to_datetime(
        loan_info[col].str[:3].str.lower().replace(month_dict) +
        loan_info[col].str[3:],
        format='%m-%Y')

In [7]:
# Cleanups

# int_rate
loan_info['int_rate'] = loan_info['int_rate'] / 100

# installment funded 
rename_dict = {'installment': 'installment_currently'}
loan_info.rename(rename_dict, inplace=True, axis=1)

# emp_title
loan_info['emp_title'] = loan_info['emp_title'].str.lower()

# home_ownership
dic_home_ownership = {
    'mortgage': 'mortgage',
    'rent': 'rent',
    'own': 'own',
    'other': 'other',
    'none': 'none',
    'any': 'none'
}
loan_info['home_ownership'] = loan_info['home_ownership'].str.lower().replace(
    dic_home_ownership)

# verification_status and verification_status_joint
dic_verification_status = {
    'VERIFIED - income': 'platform',
    'VERIFIED - income source': 'source',
    'not verified': 'none',
    'Source Verified': 'source',
    'Not Verified': 'none',
    'Verified': 'platform'
}
loan_info['verification_status'] = loan_info['verification_status'].replace(
    dic_verification_status)
loan_info['verification_status_joint'] = loan_info[
    'verification_status_joint'].replace(dic_verification_status)

# status
dic_status = {
    'Current': 'current',
    'Charged Off': 'charged_off',
    'Fully Paid': 'paid',
    'Late (31-120 days)': 'late_120',
    'In Grace Period': 'grace_15',
    'Late (16-30 days)': 'late_30',
    'Default': 'defaulted',
    'Issued': 'current'
}
loan_info['loan_status'] = loan_info['loan_status'].apply(
    lambda x: re.sub('Does not meet the credit policy.  Status:', '', x))
loan_info['loan_status'] = loan_info['loan_status'].apply(
    lambda x: re.sub('Does not meet the credit policy. Status:', '', x))
loan_info['loan_status'] = loan_info['loan_status'].replace(dic_status)
loan_info['hardship_loan_status'] = loan_info['hardship_loan_status'].replace(dic_status)

#title
loan_info['title'] = loan_info['title'].str.lower()

#application_type
loan_info['application_type'] = loan_info['application_type'].str.lower()

#revol_util
loan_info['revol_util'] = loan_info['revol_util'].apply(
    lambda x: float(x.strip('%')) / 100 if pd.notnull(x) else np.nan)

#all_util
loan_info['all_util'] = loan_info['all_util'] / 100.

# pct_tl_nvr_dlq
loan_info['pct_tl_nvr_dlq'] = loan_info['pct_tl_nvr_dlq'] / 100.

# percent_bc_gt_75
loan_info['percent_bc_gt_75'] = loan_info['percent_bc_gt_75'] / 100.

# dti
loan_info['dti'] = loan_info['dti'] / 100.

# dti_joint
loan_info['dti_joint'] = loan_info['dti_joint'] / 100.

# il_util
loan_info['il_util'] = loan_info['il_util'] / 100.

# bc_util
loan_info['bc_util'] = loan_info['bc_util'] / 100.

# sec_app_revol_util
loan_info['sec_app_revol_util'] = loan_info['sec_app_revol_util'] / 100.

# settlement_percentage
loan_info['settlement_percentage'] = loan_info['settlement_percentage'] / 100.

In [8]:
pct_cols = []
for col in loan_info.columns:
    if any(x in col for x in ['pct', 'percent', 'util', 'dti', 'rate']):
        pct_cols.append(col)
        
for col in pct_cols:
    if loan_info[col].mean() > 1:
        print('this col needs to be turned into a decimal form of percent: ',col)
    if loan_info[col].median() > 1:
        print('this col needs to be turned into a decimal form of percent: ',col)

In [9]:
# unreceived principal, not overwriting out_prncp
loan_info['unreceived_prncp'] = loan_info['funded_amnt'] - loan_info['total_rec_prncp']
loan_info['unreceived_prncp'] = np.where(loan_info['unreceived_prncp'] <= 0.019, 0, loan_info['unreceived_prncp'])
loan_info['unreceived_prncp'] = loan_info['unreceived_prncp'].round(2)

### add header to collapse

In [None]:
# load in pmt_history and check the meaning of out_prncp
dpath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', 'data')
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))

## outs_prncp_end from pmt_hist matches with unreceived_prncp (self-added) in loan_info

In [11]:
# check how out_prncp and installment_currently are messed up
ex_cols = ['out_prncp', 'funded_amnt', 'total_rec_prncp', 'loan_status', 'id']
done_loans = loan_info[loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted'])]
done_loans[ex_cols]

Unnamed: 0,out_prncp,funded_amnt,total_rec_prncp,loan_status,id
2,0.0,30000.0,2196.709961,charged_off,126176997
5,0.0,13000.0,13000.000000,paid,125602675
7,0.0,15000.0,15000.000000,paid,126417318
17,0.0,15750.0,15750.000000,paid,126407653
20,0.0,19800.0,19800.000000,paid,126273452
23,0.0,10000.0,10000.000000,paid,126412424
32,0.0,32600.0,32600.000000,paid,126413578
34,0.0,10000.0,10000.000000,paid,126419328
35,0.0,13000.0,13000.000000,paid,126383635
37,0.0,10000.0,1788.050049,charged_off,126343082


In [12]:
ex_cols.append('unreceived_prncp')
done_loans[ex_cols]

Unnamed: 0,out_prncp,funded_amnt,total_rec_prncp,loan_status,id,unreceived_prncp
2,0.0,30000.0,2196.709961,charged_off,126176997,27803.289062
5,0.0,13000.0,13000.000000,paid,125602675,0.000000
7,0.0,15000.0,15000.000000,paid,126417318,0.000000
17,0.0,15750.0,15750.000000,paid,126407653,0.000000
20,0.0,19800.0,19800.000000,paid,126273452,0.000000
23,0.0,10000.0,10000.000000,paid,126412424,0.000000
32,0.0,32600.0,32600.000000,paid,126413578,0.000000
34,0.0,10000.0,10000.000000,paid,126419328,0.000000
35,0.0,13000.0,13000.000000,paid,126383635,0.000000
37,0.0,10000.0,1788.050049,charged_off,126343082,8211.950195


In [13]:
pmt_hist[pmt_hist.loan_id == 126367651]

Unnamed: 0,pmt_date,status_period_end,date,issue_d,addr_state,home_ownership,first_credit_line,emp_len,grade,vintage,outs_princp_beg,princp_paid,int_paid,fee_paid,amt_due,amt_paid,outs_princp_end,m_on_books,charged_off_this_month,charged_off_amt,int_rate,monthly_pmt,dti,m_income,open_credit_lines,total_credit_lines,revol_line_util,dq_24m,m_since_dq,m_since_rec,current_policy,term,recovs,recov_fees,all_cash_to_inv,public_recs,fico_apply,fico_last,loan_id,revol_credit_bal
41816876,2018-02-01,current,2018-01-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,20000.0,207.529999,301.0,0.0,508.529999,508.529999,19792.470703,1,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,508.529999,1.0,692,687,126367651,31797.0
41816877,2018-03-01,current,2018-02-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,19792.470703,210.653,297.877014,0.0,508.529999,508.530029,19581.816406,2,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,508.530029,1.0,692,697,126367651,31797.0
41816878,2018-04-01,current,2018-03-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,19581.816406,213.824005,294.705994,0.0,508.529999,508.529999,19367.992188,3,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,508.529999,1.0,692,697,126367651,31797.0
41816879,2018-05-01,current,2018-04-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,19367.992188,217.042007,291.488007,0.0,508.529999,508.530029,19150.951172,4,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,508.530029,1.0,692,707,126367651,31797.0
41816880,2018-06-01,current,2018-05-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,19150.951172,220.307999,288.221985,0.0,508.529999,508.529968,18930.642578,5,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,508.529968,1.0,692,712,126367651,31797.0
41816881,2018-07-01,current,2018-06-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,18930.642578,223.623993,284.906006,0.0,508.529999,508.529999,18707.019531,6,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,508.529999,1.0,692,707,126367651,31797.0
41816882,2018-08-01,current,2018-07-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,18707.019531,226.988998,281.540985,0.0,508.529999,508.529968,18480.029297,7,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,508.529968,1.0,692,707,126367651,31797.0
41816883,2018-09-01,current,2018-08-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,18480.029297,230.406006,278.123993,0.0,508.529999,508.529999,18249.623047,8,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,508.529999,1.0,692,707,126367651,31797.0
41816884,NaT,late_30,2018-09-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,18249.623047,0.0,0.0,0.0,508.529999,0.0,18249.623047,9,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,0.0,1.0,692,742,126367651,31797.0
41816885,NaT,late_120,2018-10-01,2017-12-01,FL,mortgage,1998-03-01,5 years,D,17Q4,18249.623047,0.0,0.0,0.0,1042.48999,0.0,18249.623047,10,0.0,0.0,0.181,508.529999,27.74,9166.666992,15.0,25.0,0.552,0.0,,110.0,1,60,0.0,0.0,0.0,1.0,692,717,126367651,31797.0


### end collapse

In [10]:
# want to calculate what installment originally was
loan_info['installment_at_funded'] = np.pmt(loan_info['int_rate']/12, loan_info['term'], -loan_info['funded_amnt'])

In [11]:
# some date setting stuff for cleanups
max_date = loan_info['last_pymnt_d'].max()

# end_d to me means the date we can stop tracking things about the loan. Should be defunct
def applyEndD(status, group):
    if status == 'charged_off':
        #split the group into two groups, one which has paid something, and other which has paid nothing
        never_paid = group[group['last_pymnt_d'].isnull()]
        has_paid = group[group['last_pymnt_d'].notnull()]

        # 4 months of late (1-120) and then 1 month of chargeoff, so 5 months
        never_paid['end_d'] = never_paid['issue_d'] + pd.DateOffset(months=+5)
        has_paid['end_d'] = has_paid['last_pymnt_d'] + pd.DateOffset(months=+5)

        group.ix[never_paid.index.values, 'end_d'] = never_paid['end_d']
        group.ix[has_paid.index.values, 'end_d'] = has_paid['end_d']
        return group['end_d']
    elif status == 'paid':
        return group['last_pymnt_d']
    else:
        return pd.Series([max_date] * len(group), index=group.index.values)

In [12]:
# make end_d
status_grouped = loan_info.groupby('loan_status')
end_d_series = pd.Series([])
for status, group in status_grouped:
    end_d_series = end_d_series.append(
        applyEndD(status, group), verify_integrity=True)
loan_info['end_d'] = end_d_series
loan_info.loc[loan_info['end_d'] > max_date, 'end_d'] = max_date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.p

In [13]:
# ex_cols.append('end_d')
# loan_info[ex_cols]

In [14]:
# adding line_history in days, months, and years using pandas .dt functions
loan_info['line_history_d'] = (loan_info['issue_d'] - loan_info['earliest_cr_line']).dt.days
loan_info['line_history_m'] = (loan_info['issue_d'].dt.year - loan_info['earliest_cr_line'].dt.year)*12 + (loan_info['issue_d'].dt.month - loan_info['earliest_cr_line'].dt.month)
loan_info['line_history_y'] = (loan_info['issue_d'].dt.year - loan_info['earliest_cr_line'].dt.year) + (loan_info['issue_d'].dt.month - loan_info['earliest_cr_line'].dt.month)/12
#credit_score
loan_info['fico'] = (
    loan_info['fico_range_high'] + loan_info['fico_range_low']) / 2

# maturity (time and paid)

In [15]:
'''
maturity time and maturity paid are floats from 0 to 1 that express how "done"
a loan is either time-wise, or money wise. There are loan-status adjusted versions as well.
I use these because I want to include as much data for my models as possible while recognizing that
there is uncertainty in knowing whether a loan was good or bad if it is ongoing.

For example, if a loan is 120 days late, we know that loan is likely a very bad investment that our model should not be choosing. Is it possible that the loan all of a sudden becomes current and you get a massive return due to accumulated late fees? Yes, but not likely. In any case, I would rather incorporate that likely-to-be-bad loan into the model now instead of wait 2-ish months for that loan to truly go through the charged-off process.

maturity_time is how close to original maturity the loan is, regardless of how much the loan has paid back and/or followed the expected payment schedule.
maturity_paid is how close the loan is to completing all its payments (
total_payments_received/(total_expected_payments at point in time, with adjustments for lateness))

status adjusted are adjusting the maturity calculations knowing that if the loan does go the charge-off route, it has x months left or will recover .1 percent of remaining outstanding principal on avg.

Some examples of loans:
1) A loan is issued last month and almost pays off all the outstanding principal this month (maybe a borrower found better loan terms elsewhere, and took out that new loan to almost completely pay down the ) would have maturity_time near 0 and maturity_paid near 1
2) A 3 year loan that is 8 months in and is 120 days late has a low maturity_time and fairly high maturity_paid, as there is an adjustment for denominator (aside form what was already paid to date by the loan, only expecting a 10% recovery on remaining outstanding principal)
'''

'\nmaturity time and maturity paid are floats from 0 to 1 that express how "done"\na loan is either time-wise, or money wise. There are loan-status adjusted versions as well.\nI use these because I want to include as much data for my models as possible while recognizing that\nthere is uncertainty in knowing whether a loan was good or bad if it is ongoing.\n\nFor example, if a loan is 120 days late, we know that loan is likely a very bad investment that our model should not be choosing. Is it possible that the loan all of a sudden becomes current and you get a massive return due to accumulated late fees? Yes, but not likely. In any case, I would rather incorporate that likely-to-be-bad loan into the model now instead of wait 2-ish months for that loan to truly go through the charged-off process.\n\nmaturity_time is how close to original maturity the loan is, regardless of how much the loan has paid back and/or followed the expected payment schedule.\nmaturity_paid is how close the loan 

In [16]:
# maturity_time
# put in maturity time so I can look at old data for historic default int_rates
loan_info['months_passed'] = ((
    max_date - loan_info['issue_d']).dt.days *
                            (12 / 365.25)).round()
loan_info['maturity_time'] = loan_info['months_passed'] / loan_info['term']
loan_info['maturity_time'] = np.where(loan_info['maturity_time'] >= 1, 1,
                                      loan_info['maturity_time'])

# Cython section, notebook and script version?

In [17]:
# %%cython --verbose
# cimport numpy as np
# import numpy as np

# cpdef rem_to_be_paid(double out_prncp,
#                      double install,
#                      double int_rate):
#     cdef double m_rate
#     cdef double to_be_paid
#     m_rate = int_rate/12
#     to_be_paid = 0.0
#     k = 0
#     while out_prncp > 0:
#         k += 1
#         out_prncp = (1+m_rate) * out_prncp
#         out_prncp -= install
#         to_be_paid += install
#         # the break was added to figure out what was wrong with infinite while; it was due to installment funded
#         # being INCORRECTLY REPORTED by lending club
#         if k >= 100:
#             print(to_be_paid)
#             break
#         if out_prncp < 0:
#             to_be_paid -= abs(out_prncp)
#     return to_be_paid

# cpdef np.ndarray[double] apply_rem_to_be_paid(np.ndarray col_out_prncp,
#                                               np.ndarray col_install,
#                                               np.ndarray col_int_rate):
#     assert (col_out_prncp.dtype == np.float32 and col_install.dtype == np.float32 and col_int_rate.dtype == np.float32)
#     cdef Py_ssize_t i, n = len(col_out_prncp)
#     assert (len(col_out_prncp) == len(col_install) == n)
#     cdef np.ndarray[double] res = np.empty(n)
#     for i in xrange(n):
#         res[i] = rem_to_be_paid(col_out_prncp[i],
#                                 col_install[i],
#                                 col_int_rate[i])
#     return res

In [18]:
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', 'scripts', 'csv_preparation'))

In [19]:
import rem_to_be_paid as rtbp

# Somewhere, apply rem_to_be_paid stalls. Chunk it and iterate to see which chunk?

In [20]:
# add maturity paid
loan_info['rem_to_be_paid'] = rtbp.apply_rem_to_be_paid(
    loan_info['unreceived_prncp'].values, loan_info['installment_currently'].values,
    loan_info['int_rate'].values)

loan_info['maturity_paid'] = loan_info['total_pymnt'] / (
    loan_info['total_pymnt'] + loan_info['rem_to_be_paid'])

# mat_paid has nans because some charged_off loans have 0 rem_to_be_paid
# which is inconsistent with the majority treatment. quick fix is fillna

# loan_info['maturity_paid'] = loan_info['maturity_paid'].fillna(0)

# adjust maturity paid by status, with late-status notes being adjusted by loan status migration rates below
https://www.lendingclub.com/info/demand-and-credit-profile.action

In [21]:
# We have 25% of grace period notes reaching charged-off 9 months later. 
# How do I translate this into a confidence about how done-paid wise it is?
# Done paid wise = how sure we are that no more non-recovery money is coming?
# The chart depicts % outstanding principal recovered....
# Without putting much thought into it, I'll increment all percents by 10%
# Because I do know that everything in default is pretty much sold
# at 10 cents on the dollar hence the 10% recovery under the default
# column

In [22]:
# # grace = 35%, late_30 = 64%, late_120 = 98%,
# # See https://www.lendingclub.com/info/demand-and-credit-profile.action for %s used
# # maturity_time_stat_adj =
# # maturity_time * prob_not_def + months_passed/months_to_default * prob_def
# loan_info['maturity_time_stat_adj'] = np.where(loan_info['loan_status'] == 'grace_15', loan_info['maturity_time']*(1-.35) + ((loan_info['months_passed']/(loan_info['months_passed'] + 4))*.35),
#                                                np.where(loan_info['loan_status'] == 'late_30', loan_info['maturity_time']*(1-.64) + ((loan_info['months_passed']/(loan_info['months_passed'] + 3))*.64),
#                                                         np.where(loan_info['loan_status'] == 'late_120', loan_info['maturity_time']*(1-.98) + ((loan_info['months_passed']/(loan_info['months_passed'] + 1))*.98), loan_info['maturity_time']
#                                                                  )))
# loan_info['maturity_time_stat_adj'] = np.minimum(
#     1, loan_info['maturity_time_stat_adj'])

# # maturity_paid_stat_adj =
# # maturity_paid * prob_not_def + total_paid/total_paid_and_outstanding * prob_def
# # .1 is from assuming 10% recovery on defaulted/charged_off loans
# loan_info['maturity_paid_stat_adj'] = np.where(loan_info['loan_status'] == 'grace_15', loan_info['maturity_paid']*(1-.35) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.35),
#                                                np.where(loan_info['loan_status'] == 'late_30', loan_info['maturity_paid']*(1-.64) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.64),
#                                                         np.where(loan_info['loan_status'] == 'late_120', loan_info['maturity_paid']*(1-.98) + ((loan_info['total_pymnt']/(loan_info['total_pymnt'] + .1*loan_info['unreceived_prncp']))*.98), loan_info['maturity_paid']
#                                                                  )))
# loan_info['maturity_paid_stat_adj'] = np.minimum(
#     1, loan_info['maturity_paid_stat_adj'])

In [23]:
# # final adjustments to status_adj based on done statuses
# loan_info.loc[loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted']),'maturity_paid_stat_adj'] = 1
# loan_info.loc[loan_info['loan_status'].isin(['paid', 'charged_off', 'defaulted']),'maturity_time_stat_adj'] = 1

In [24]:
# # sanity check
# ex_cols = ['loan_status', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid', 'maturity_paid_stat_adj', 'out_prncp', 'unreceived_prncp', 'rem_to_be_paid', 'funded_amnt', 'total_pymnt']
# loan_info[~loan_info['loan_status'].isin(['paid', 'current'])][ex_cols]

In [26]:
# things that I always expect to be true
# 1) if loan_status is not paid,current,grace_15, then target_strict must be 1
# 2) if loan_status is a bad status, maturity_time_stat_adj and maturity_paid_stat_adj should be higher than
# unadjusted version

# loan_info = pd.read_feather(os.path.join(dpath, 'loan_info.fth'))
check_cols = ['maturity_time', 'maturity_paid', 'maturity_time_stat_adj', 'maturity_paid_stat_adj',
              'target_loose', 'target_strict', 'loan_status', 'issue_d', 'end_d', 'id']
good_statuses = ['paid', 'current']

stat_adj_dict = {'grace_15' : .35, 'late_30' : .64, 'late_120': .98, 'charged_off': 1, 'defaulted': 1}

# make maturity_time/paid stat_adj
stat_grouped = loan_info.groupby('loan_status')
to_cat = []
for stat, group in stat_grouped:
    if stat not in good_statuses:
        adj = stat_adj_dict[stat]
        group['maturity_time_stat_adj'] = (1-adj) * group['maturity_time'] + adj
        group['maturity_paid_stat_adj'] = (1-adj) * group['maturity_paid'] + adj
    to_cat.append(group)
loan_info = pd.concat(to_cat)

# resort by id
loan_info.sort_values('id', inplace=True)
loan_info.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [27]:
loan_info['target_loose'] = np.where(loan_info['loan_status'].isin(['charged_off', 'defaulted']), 1, 0)

# set aside strings for separate dataframe, maybe try nlp on it

In [28]:
str_cols = loan_info.select_dtypes('object').columns
# loan_info[str_cols]
strip_cols = ['desc', 'emp_title', 'title', 'url']
strings_df = loan_info[strip_cols]
loan_info.drop(columns=strip_cols, inplace=True)
strings_df['id'] = loan_info['id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Adding target strict, which is flag a loan that has ever been late, found via payment history

In [29]:
bad_statuses = set(['late_120', 'defaulted', 'charged_off', 'late_30'])
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
target_strict_dict = {}
id_grouped = pmt_hist.groupby('loan_id')
for ids, group in tqdm(id_grouped):
    statuses = set(group['status_period_end'])
    if len(statuses.intersection(bad_statuses)) > 0:
        target_strict_dict[ids] = 1
    else:
        target_strict_dict[ids] = 0
target_strict = pd.DataFrame.from_dict(target_strict_dict, orient='index').reset_index(drop=False)
target_strict.columns = ['id', 'target_strict']
loan_info.rename({'loan_id': 'id'}, axis=1, inplace=True)
loan_info = pd.merge(loan_info, target_strict, how='outer', on='id')
# # examine a target strict (which is marking any kind of late, in the past or most up to date from pmt history)
# pmt_hist[pmt_hist['loan_id'] == 124428135]

100%|██████████| 2376343/2376343 [12:26<00:00, 3182.28it/s]


In [30]:
# correct target strict
loan_info['target_strict'] = np.where(~loan_info['loan_status'].isin(good_statuses), 1, loan_info['target_strict'])

# Additional cleaning based off information from api, with loan dropping which should be a final/near final step
https://www.lendingclub.com/developers/listed-loans.action

In [31]:
# home_ownership: none should be other
loan_info['home_ownership'].replace({'none': 'other'}, inplace=True)

# annual_income has 4 nulls. Just fill with 0
loan_info['annual_inc'].replace({np.nan: 0.0}, inplace=True)

# drop the one null zip_code
loan_info = loan_info[loan_info['zip_code'].notnull()]

# drop the loans where earliest_cr_line is null
loan_info = loan_info[loan_info['earliest_cr_line'].notnull()]

# drop null chargeoff_within_12_mths
loan_info = loan_info[loan_info['chargeoff_within_12_mths'].notnull()]

# drop null tax_liens
loan_info = loan_info[loan_info['tax_liens'].notnull()]

# # add installment_at_funded
# loan_info['installment_at_funded'] = np.pmt(loan_info['int_rate']/12, loan_info['term'], -loan_info['loan_amnt'])

# drop loans that have this null
loan_info = loan_info[loan_info['inq_last_6mths'].notnull()]

In [32]:
# More Data Cleaning Steps ____________________________________________________

# Drop columns before cleanup
# Dropping these since I don't want them and they might confuse me.
# There is no reason why I care about money that went just to investors rather
# than to lending club as well when they top off loans.
loan_info.drop(['funded_amnt_inv',
                'out_prncp_inv'], axis = 1, inplace = True)

# Dropping these because I'm getting them from data_pmt_history
# loan_info.drop(['home_ownership'], axis = 1, inplace = True)


# loan_info.rename(columns = {'total_pymnt': 'total_pymnt_rec_recov',
#                             'total_pymnt_recov': 'total_pymnt_rec_recov'}, inplace=True)

loan_info['orig_amt_due'] = loan_info['term'] * loan_info['installment_at_funded']

loan_info['roi_simple'] = loan_info['total_pymnt']/loan_info['funded_amnt']

In [33]:
loan_info[['installment_at_funded', 'installment_at_funded', 'installment_currently']]

Unnamed: 0,installment_at_funded,installment_at_funded.1,installment_currently
0,829.046021,829.046021,829.099976
1,35.196629,35.196629,35.200001
2,139.140762,139.140762,139.149994
3,228.211746,228.211746,228.220001
4,245.157532,245.157532,245.160004
5,246.376389,246.376389,246.380005
6,354.888092,354.888092,354.890015
7,891.192200,891.192200,891.200012
8,40.496181,40.496181,40.500000
9,366.867432,366.867432,366.859985


# Examine loan_info, add target cols to strings_df, turn none into nan

In [34]:
# if column type is string and has np.nan (a float), turn the nan into "None" for the graphing eda notebook in 
# Exploratory Data Analysis
for col in loan_info.columns:
    if is_string_dtype(loan_info[col].dtype) & (loan_info[col].isnull().sum() > 0):
        loan_info[col] = loan_info[col].fillna('None')
        
for col in strings_df.columns:
    if is_string_dtype(strings_df[col].dtype) & (strings_df[col].isnull().sum() > 0):
        strings_df[col] = strings_df[col].fillna('None')        

In [35]:
strings_df[['maturity_time', 'maturity_paid', 'target_loose', 'roi_simple', 'target_strict', 'maturity_time_stat_adj', 'maturity_paid_stat_adj']] = loan_info[['maturity_time', 'maturity_paid', 'target_loose', 'roi_simple', 'target_strict', 'maturity_time_stat_adj', 'maturity_paid_stat_adj']]

# Store

In [36]:
ex = loan_info[~(loan_info['loan_status'].isin(good_statuses))][check_cols]
ex

Unnamed: 0,maturity_time,maturity_paid,maturity_time_stat_adj,maturity_paid_stat_adj,target_loose,target_strict,loan_status,issue_d,end_d,id
4,1.000000,0.583209,1.000000,1.000000,1,1,charged_off,2016-01-01,2018-04-01,56121
5,1.000000,0.138888,1.000000,1.000000,1,1,charged_off,2008-04-01,2009-03-01,56413
7,1.000000,0.171158,1.000000,1.000000,1,1,charged_off,2014-08-01,2015-05-01,57167
14,1.000000,0.943687,1.000000,1.000000,1,1,charged_off,2010-02-01,2013-03-01,61419
17,1.000000,0.244007,1.000000,1.000000,1,1,charged_off,2008-03-01,2009-05-01,62774
18,1.000000,0.695243,1.000000,1.000000,1,1,charged_off,2008-04-01,2010-10-01,64949
19,1.000000,0.639463,1.000000,1.000000,1,1,charged_off,2016-05-01,2018-01-01,65104
21,1.000000,0.583312,1.000000,1.000000,1,1,charged_off,2009-08-01,2011-11-01,65426
22,1.000000,0.243244,1.000000,1.000000,1,1,charged_off,2008-06-01,2009-07-01,65595
31,1.000000,0.499945,1.000000,1.000000,1,1,charged_off,2008-04-01,2010-03-01,67477


In [37]:
_, strings_df = mg.reduce_memory(strings_df)
strings_df.reset_index(drop=True, inplace=True)
_, loan_info = mg.reduce_memory(loan_info)
loan_info.reset_index(drop=True, inplace=True)
strings_df.to_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
loan_info.to_feather(os.path.join(dpath, 'loan_info.fth'))

trying to change columns to smaller dtypes when possible
original dataframe is 911.3728561401367 MB or 0.8900125548243523 GB


100%|██████████| 7/7 [00:00<00:00, 83.52it/s]


changed dtypes of 7 cols
reduced dataframe is 847.917652130127 MB or 0.8280445821583271 GB
trying to change columns to smaller dtypes when possible
original dataframe is 4621.357351303101 MB or 4.513044288381934 GB


100%|██████████| 22/22 [00:00<00:00, 124.88it/s]


changed dtypes of 22 cols
reduced dataframe is 4408.342344284058 MB or 4.3050218205899 GB


In [57]:
strings_df.head()

Unnamed: 0,desc,emp_title,title,url,roi_simple,id,maturity_time,maturity_paid,target_loose,target_strict,maturity_time_stat_adj,maturity_paid_stat_adj
0,,uxo technician,debt consolidation,https://lendingclub.com/browse/loanDetail.acti...,0.530701,126065224,0.5,0.444149,0.0,0.0,0.5,0.444149
1,,practice administrator,credit card refinancing,https://lendingclub.com/browse/loanDetail.acti...,0.359194,126413271,0.3,0.265663,0.0,0.0,0.3,0.265663
2,,supervsir,other,https://lendingclub.com/browse/loanDetail.acti...,0.391943,126176997,0.3,0.190153,1.0,1.0,1.0,1.0
3,,project manager,debt consolidation,https://lendingclub.com/browse/loanDetail.acti...,0.376405,126389018,0.3,0.282438,0.0,0.0,0.3,0.282438
4,,clinical program manager,medical expenses,https://lendingclub.com/browse/loanDetail.acti...,0.551107,126406926,0.5,0.479459,0.0,0.0,0.5,0.479459


In [58]:
loan_info.head()

Unnamed: 0,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,pymnt_plan,purpose,zip_code,addr_state,earliest_cr_line,initial_list_status,last_pymnt_d,next_pymnt_d,last_credit_pull_d,application_type,verification_status_joint,sec_app_earliest_cr_line,hardship_flag,hardship_type,hardship_reason,hardship_status,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_loan_status,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,loan_amnt,funded_amnt,term,int_rate,installment_currently,annual_inc,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,total_acc,out_prncp,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,annual_inc_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,open_rv_12m,open_rv_24m,max_bal_bc,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_term,unreceived_prncp,installment_at_funded,end_d,fico,orig_amt_due,roi_simple,id,revol_util,dti,dti_joint,il_util,all_util,bc_util,pct_tl_nvr_dlq,percent_bc_gt_75,sec_app_revol_util,settlement_percentage,line_history_d,line_history_m,line_history_y,months_passed,maturity_time,rem_to_be_paid,maturity_paid,maturity_time_stat_adj,maturity_paid_stat_adj,target_loose,target_strict
0,B,B5,10+ years,mortgage,none,2017-12-01,current,n,debt_consolidation,275xx,NC,1987-07-01,w,2019-05-01,2019-06-01,2019-05-01,individual,,NaT,N,,,,NaT,NaT,NaT,,N,NaT,,NaT,9600.0,9600.0,36,0.1199,318.820007,140000.0,0.0,690.0,694.0,0.0,,,15.0,0.0,80850.0,24.0,5753.459961,5094.72998,5094.72998,3846.540039,1248.189941,0.0,0.0,0.0,318.820007,684.0,680.0,0.0,,1.0,,0.0,0.0,141142.0,0.0,1.0,0.0,1.0,20.0,4119.0,0.0,3.0,27354.0,87600.0,0.0,0.0,0.0,4.0,9409.0,2250.0,0.0,0.0,196.0,365.0,15.0,15.0,2.0,19.0,,15.0,,0.0,7.0,12.0,7.0,8.0,3.0,13.0,19.0,12.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,175550.0,84969.0,64300.0,8000.0,,,,,,,,,,,,,,,,,,,,,5753.459961,318.812073,2019-06-01,692.0,11477.234375,0.530701,126065224,0.923,0.2305,,0.52,0.89,0.965,1.0,1.0,,,11111.0,365.0,30.416666,18.0,0.5,6376.026855,0.444149,0.5,0.444149,0,0.0
1,C,C1,1 year,rent,none,2017-12-01,current,n,credit_card,104xx,NY,1999-08-01,w,2019-04-01,2019-06-01,2019-05-01,individual,,NaT,N,,,,NaT,NaT,NaT,,N,NaT,,NaT,25000.0,25000.0,60,0.1262,563.97998,105000.0,0.0,725.0,729.0,1.0,,,20.0,0.0,16295.0,51.0,19788.140625,8979.860352,8979.860352,5211.859863,3768.0,0.0,0.0,0.0,563.97998,764.0,760.0,0.0,,1.0,,0.0,0.0,116607.0,1.0,9.0,1.0,2.0,4.0,100312.0,1.0,2.0,6418.0,33000.0,1.0,0.0,1.0,4.0,5830.0,7705.0,0.0,0.0,159.0,220.0,8.0,4.0,0.0,17.0,,5.0,,0.0,3.0,3.0,4.0,11.0,18.0,11.0,33.0,3.0,20.0,0.0,0.0,0.0,2.0,0.0,0.0,149133.0,116607.0,24000.0,116133.0,,,,,,,,,,,,,,,,,,,,,19788.140625,563.976379,2019-06-01,727.0,33838.582031,0.359194,126413271,0.494,0.2475,,0.86,0.78,0.679,1.0,0.75,,,6697.0,220.0,18.333334,18.0,0.3,24821.785156,0.265663,0.3,0.265663,0,0.0
2,G,G1,10+ years,rent,source,2017-12-01,charged_off,n,other,103xx,NY,2005-08-01,w,2019-02-01,NaT,2019-05-01,joint app,source,2012-09-01,N,,,,NaT,NaT,NaT,,N,NaT,,NaT,30000.0,30000.0,60,0.3079,985.210022,40000.0,1.0,700.0,704.0,0.0,4.0,,11.0,0.0,19754.0,23.0,0.0,11758.280273,11758.280273,2196.709961,9463.049805,98.519997,0.0,0.0,1000.0,579.0,575.0,0.0,,1.0,83000.0,0.0,0.0,46719.0,0.0,1.0,0.0,1.0,16.0,26965.0,0.0,1.0,6310.0,65200.0,1.0,0.0,0.0,2.0,4247.0,36563.0,0.0,0.0,115.0,148.0,23.0,16.0,0.0,38.0,,16.0,,0.0,6.0,7.0,7.0,12.0,4.0,10.0,19.0,7.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,97743.0,46719.0,44800.0,32543.0,51025.0,615.0,619.0,1.0,0.0,8.0,2.0,6.0,0.0,0.0,,,,,,,,,,,27803.289062,985.209045,2019-06-01,702.0,59112.542969,0.391943,126176997,0.303,0.3086,0.332,0.83,0.48,0.184,0.957,0.0,0.724,,4505.0,148.0,12.333333,18.0,0.3,50077.601562,0.190153,1.0,1.0,1,1.0
3,B,B5,< 1 year,rent,source,2017-12-01,current,n,debt_consolidation,926xx,CA,2005-10-01,w,2019-05-01,2019-06-01,2019-05-01,individual,,NaT,N,,,,NaT,NaT,NaT,,N,NaT,,NaT,35000.0,35000.0,60,0.1199,778.380005,65000.0,0.0,740.0,744.0,0.0,,,5.0,0.0,21026.0,16.0,27099.929688,13174.179688,13174.179688,7900.069824,5274.109863,0.0,0.0,0.0,778.380005,804.0,800.0,0.0,,1.0,,0.0,0.0,51125.0,0.0,2.0,0.0,2.0,17.0,25532.0,0.0,0.0,20936.0,33000.0,0.0,5.0,0.0,2.0,10225.0,11974.0,0.0,0.0,142.0,146.0,37.0,17.0,0.0,37.0,,,,0.0,2.0,2.0,2.0,2.0,12.0,2.0,3.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,73825.0,51125.0,33000.0,35182.0,,,,,,,,,,,,,,,,,,,,,27099.929688,778.379944,2019-06-01,742.0,46702.796875,0.376405,126389018,0.637,0.2681,,0.73,0.69,0.637,1.0,0.5,,,4444.0,146.0,12.166667,18.0,0.3,33470.246094,0.282438,0.3,0.282438,0,0.0
4,B,B1,10+ years,rent,none,2017-12-01,current,n,medical,954xx,CA,1993-01-01,w,2019-05-01,2019-06-01,2019-05-01,individual,,NaT,N,,,,NaT,NaT,NaT,,N,NaT,,NaT,6000.0,6000.0,36,0.0944,192.029999,160000.0,0.0,660.0,664.0,0.0,31.0,,10.0,0.0,3451.0,42.0,3326.199951,3306.639893,3306.639893,2673.800049,632.840027,0.0,0.0,0.0,192.029999,679.0,675.0,1.0,72.0,1.0,,0.0,2497.0,33970.0,0.0,7.0,0.0,0.0,41.0,30519.0,0.0,1.0,1285.0,7100.0,0.0,4.0,0.0,1.0,3397.0,1715.0,0.0,0.0,299.0,239.0,13.0,13.0,1.0,13.0,72.0,13.0,31.0,2.0,1.0,3.0,1.0,3.0,35.0,3.0,6.0,3.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,41876.0,33970.0,3000.0,34776.0,,,,,,,,,,,,,,,,,,,,,3326.199951,192.030411,2019-06-01,662.0,6913.094727,0.551107,126406926,0.486,0.0083,,0.88,0.81,0.428,0.738,0.0,,,9100.0,299.0,24.916666,18.0,0.5,3589.969238,0.479459,0.5,0.479459,0,0.0


# Old stuff

In [None]:
# code used for tweaking speed ups

# test = loan_info.tail(1000)

# loan_info.tail(30).apply(lambda x: rem_to_be_paid(x['out_prncp'],
#                                                   x['installment_currently'],
#                                                   x['int_rate']), axis=1)

# apply_rem_to_be_paid(test['out_prncp'].values, test['installment_currently'].values,
#                      test['int_rate'].values)

# %timeit apply_rem_to_be_paid(test['out_prncp'].values, test['installment_currently'].values,test['int_rate'].values)

# %timeit loan_info.tail(1000).apply(lambda x: rem_to_be_paid(x['out_prncp'],x['installment_currently'],x['int_rate']), axis=1)

# %prun -l 10 apply_rem_to_be_paid(test['out_prncp'].values, test['installment_currently'].values,test['int_rate'].values)

# %timeit apply_rem_to_be_paid(test['out_prncp'].values, test['installment_currently'].values,test['int_rate'].values)

In [53]:
# # fix loans with no record at all for a specific month ________________________
# store.open()
# pmt_hist_ids = store['pmt_hist_ids'].astype(int)
# max_id = pmt_hist_ids.max()
# chunksize = 800
# n_chunks = len(pmt_hist_ids)//chunksize + 1
# bad_statuses = set(['late_120', 'defaulted', 'charged_off', 'late_30'])
# # m_disc_rate = .08/12

# target_strict_dict = {}
# nvps_8_dict = {}
# for n in tqdm_notebook(np.arange(n_chunks)):
#     if n == 0:
#         left_bound = 0
#     else:
#         left_bound = pmt_hist_ids[n*chunksize]
#     if n == (n_chunks - 1):
#         right_bound = max_id
#     else:
#         right_bound = pmt_hist_ids[(n+1)*chunksize]
    
#     chunk = pd.read_hdf(
#         store,
#         'pmt_hist_intermediary_2',
#         where='(loan_id_num > left_bound) & (loan_id_num <= right_bound)')
    
#     id_grouped = chunk.groupby('loan_id')
#     for ids, group in id_grouped:
#         statuses = set(group['status_period_end'])
#         if len(statuses.intersection(bad_statuses)) > 0:
#             target_strict_dict[ids] = 1
#         else:
#             target_strict_dict[ids] = 0
        

In [None]:
# # out_prncp is messed up. Fix it
# loan_info['out_prncp'] = loan_info['funded_amnt'] - loan_info['total_rec_prncp']
# loan_info['out_prncp'] = np.where(loan_info['out_prncp'] <= 0.019, 0, loan_info['out_prncp'])
# loan_info['out_prncp'] = loan_info['out_prncp'].round(2)