In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime

# Load the loan-level data set; the path is relative to the notebook's working directory.
df = pd.read_csv("full_clean_raw.csv")

# Per-loan performance measures.
# 'return' is the ratio of total payments to the funded amount
# (1.0 means the payments equal the funded amount); 'profit' is the difference.
df['return'] = df['total_pymnt'] / df['funded_amnt']
df['profit'] = df['total_pymnt'] - df['funded_amnt']

# Region code -> state codes belonging to that region.
# NOTE(review): 'NA' is not a US state code, and read_csv's default missing-value
# markers include the literal "NA", so this key presumably never matches - confirm.
regions = {
    'NE': ['ME', 'NA', 'VT', 'MA', 'RI', 'CT', 'NJ', 'PA', 'MD', 'VA', 'WV', 'KY', 'OH', 'IN', 'NY', 'NH', 'DC', 'DE'],
    'SE': ['NC', 'SC', 'GA', 'FL', 'AL', 'TN', 'MS', 'AR', 'LA', 'OK', 'TX'],
    'NC': ['ND', 'SD', 'NE', 'KS', 'MO', 'IA', 'MN', 'WI', 'MI', 'IL'],
    'NW': ['WA', 'OR', 'ID', 'MT', 'WY', 'AK'],
    'SW': ['CA', 'NV', 'UT', 'CO', 'NM', 'AZ', 'HI']
}

# Invert the mapping to state code -> region code so it can be used with Series.map.
regionsFix = {
    state_code: region_code
    for region_code, state_codes in regions.items()
    for state_code in state_codes
}

# States absent from the lookup end up with a missing region.
df['region'] = df['addr_state'].map(regionsFix)

# Loans that are not closed (statuses "current", "in grace period", "late (16-30 days)",
# "late (31-120 days)" and "default") are deliberately kept at this point;
# they are only filtered out in later cells.

# Show which columns are available.
df.columns.tolist()

['Unnamed: 0',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'is_inc_v',
 'issue_d',
 'pymnt_plan',
 'purpose',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'not_compliant',
 'status',
 'inactive_loans',
 'bad_loans',
 'emp_length_num',
 'pub_rec_zero',
 'short_emp',
 'payment_inc_ratio',
 'final_d',
 'last_record_none',
 'last_major_derog_none',
 'home_ownership',
 'annual_inc',
 'addr_state',
 'dti',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'open_acc',
 'pub_rec',
 'total_acc',
 'inc_v_binary',
 'deliquency',
 'status_binary',
 'return',
 'profit',
 'region']

In [None]:
# Correlation matrix of the numeric columns.
# The frame also contains text columns (e.g. 'status', 'addr_state', 'issue_d').
# Since pandas 2.0 DataFrame.corr() no longer drops those silently and raises instead,
# so restrict the frame to numeric/boolean columns explicitly (works on older pandas too).
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Return by loan status, computed two ways.

# (1) Unweighted: mean of the per-loan 'return' ratio within each status
#     (pivot_table aggregates with the mean by default).
#     This is not the last expression of the cell, so it has to be displayed
#     explicitly - otherwise the result is computed and thrown away.
#     `display` is provided by the IPython/Jupyter kernel.
display(pd.pivot_table(df, index="status", values="return"))

# (2) Amount-weighted: total payments divided by total funded amount per status.
df_pre_agg = df.groupby('status').agg({'total_pymnt': 'sum', 'funded_amnt': 'sum'})
df_pre_agg['return'] = df_pre_agg['total_pymnt'] / df_pre_agg['funded_amnt']
df_pre_agg['return']

In [None]:
# Remove open loans: drop every row whose status is one of the values below.
# A single membership test replaces five separate filter-and-drop passes; rows are
# still dropped by index label, exactly as before.
# NOTE(review): this overwrites `df`, so every cell run afterwards only sees the
# remaining loans.
open_statuses = [
    "current",
    "in grace period",
    "late (16-30 days)",
    "late (31-120 days)",
    "default",
]
df = df.drop(df.index[df['status'].isin(open_statuses)])

# Mean per-loan return by employment length ('emp_length_num');
# pivot_table aggregates with the mean by default.
pd.pivot_table(df, index="emp_length_num", values="return")

In [None]:
# Loans per sub-grade: with aggfunc=len the 'return' column holds the number of
# entries in each sub-grade, not a return figure.
# TODO (original author): need to convert to percentage.
loans_per_sub_grade = df.pivot_table(
    index="sub_grade",
    values="return",
    aggfunc=len,
)
loans_per_sub_grade

In [None]:
# Share of loans with a non-negative outcome in each sub-grade and grade.
# 'pos_return' is 1 when return >= 1 (total payments at least equal the funded amount)
# and 0 otherwise, so its mean within a group is the share of such loans.
# The vectorised comparison replaces a row-by-row apply(lambda ...); a missing
# 'return' compares as False and therefore still maps to 0.
df['pos_return'] = (df['return'] >= 1).astype('int64')

# Not the last expression of the cell, so display it explicitly - otherwise the
# table is computed and discarded. `display` is provided by the IPython/Jupyter kernel.
display(pd.pivot_table(df, index="sub_grade", values="pos_return"))
# author's observation: all sub-grades between roughly 20-30%

pd.pivot_table(df, index="grade", values="pos_return")
# author's observation: all grades between roughly 20-30%

In [None]:
# Remove all loans that are not closed (one membership test instead of five
# separate filter-and-drop passes; rows are still dropped by index label).
# NOTE(review): this overwrites `df`; re-running is harmless, but cells run afterwards
# only see the remaining loans.
not_closed_statuses = [
    "current",
    "in grace period",
    "late (16-30 days)",
    "late (31-120 days)",
    "default",
]
df = df.drop(df.index[df['status'].isin(not_closed_statuses)])

#df = df.drop(df[df['term'] == 60].index) #remove 60 term loans

# Profit by status: summed profit and funded amount, plus profit / funded amount.
df_prof = df.groupby('status').agg({'profit': 'sum', 'funded_amnt': 'sum'})
df_prof['margin'] = df_prof['profit'] / df_prof['funded_amnt']
# Intermediate results are not the cell's last expression, so they must be displayed
# explicitly or they are lost. `display` is provided by the IPython/Jupyter kernel.
display(df_prof)

# Overall margin across all statuses (Series.sum instead of the Python builtin).
total_margin = df_prof['profit'].sum() / df_prof['funded_amnt'].sum()
display(total_margin)
# author's note: total profit margin is -49.2%, -1.6% if discounting all loans

# YoY breakdown.
# The year is taken as the first four characters of 'issue_d'
# (assumes issue_d is a string that starts with the year - TODO confirm).
# .str[:4] leaves missing values missing instead of raising like x[:4] would.
df['year'] = df['issue_d'].str[:4]
df_yoy_prof = df.groupby('year').agg({'profit': 'sum', 'funded_amnt': 'sum'})
# Despite its name, 'gross_profit' is profit / funded amount expressed in percent.
df_yoy_prof['gross_profit'] = (df_yoy_prof['profit'] / df_yoy_prof['funded_amnt'] * 100).round(2)
df_yoy_prof

#pd.pivot_table(df, values='profit', index='year', columns='region', aggfunc=np.sum)

In [None]:
# Profit by year, plus the yearly mix of loan statuses.

# Remove records where status == "current".
# NOTE(review): this overwrites `df`. If the earlier cells that drop the other
# not-closed statuses have already run, those statuses are absent here as well.
df = df.drop(df[df['status'] == "current"].index)

# Year = first four characters of 'issue_d'
# (assumes issue_d is a string that starts with the year - TODO confirm).
# .str[:4] leaves missing values missing instead of raising like x[:4] would.
df['year'] = df['issue_d'].str[:4]

# Per year: summed profit, summed funded amount and number of non-null member ids.
df_y = df.groupby('year').agg({'profit': 'sum', 'funded_amnt': 'sum', 'member_id': 'count'})
df_y['margin'] = df_y['profit'] / df_y['funded_amnt']
# Not the cell's last expression, so display it explicitly or it is lost.
# `display` is provided by the IPython/Jupyter kernel.
display(df_y)
# author's note: profit margin is between -4% and 10% before 2012,
# but is -33% and -74% in 2013 and 2014 respectively

# Loans per year and status, converted to a percentage of that year's loan count.
# One row-aligned division replaces the per-column Python loop.
analysis = pd.pivot_table(df, values='member_id', index='year', columns='status', aggfunc=len, fill_value=0)
analysis = analysis.div(df_y['member_id'], axis=0).mul(100).round(2)

analysis
# author's note: there is an increase in late payment and grace period status loans in 2012-2014

In [None]:
# Profit margin (in percent) by issue year and region.

# Remove records where status == "current".
# NOTE(review): this overwrites `df`.
df = df.drop(df[df['status'] == "current"].index)

# Year = first four characters of 'issue_d'
# (assumes issue_d is a string that starts with the year - TODO confirm).
# .str[:4] leaves missing values missing instead of raising like x[:4] would.
df['year'] = df['issue_d'].str[:4]

# Summed profit and summed funded amount per (year, region).
# aggfunc='sum' is used instead of np.sum: passing the NumPy callable is deprecated
# in recent pandas, and the string selects the same aggregation.
analysis = pd.pivot_table(df, values='profit', index='year', columns='region', aggfunc='sum')
diviz = pd.pivot_table(df, values='funded_amnt', index='year', columns='region', aggfunc='sum')

# Element-wise profit / funded amount, as a percentage rounded to two decimals.
(analysis.divide(diviz) * 100).round(2)
# author's note: NW doing better in the last 3 years

#a = pd.pivot_table(df, values='profit', index='region', aggfunc=np.sum)
#b = pd.pivot_table(df, values='funded_amnt', index='region', aggfunc=np.sum)
#a['profit'].divide(b['funded_amnt']) #historically NW and SW perform better

In [5]:
# Correlation matrix of the numeric columns.
# The frame also contains text columns (e.g. 'status', 'addr_state', 'issue_d').
# Since pandas 2.0 DataFrame.corr() no longer drops those silently and raises instead,
# so restrict the frame to numeric/boolean columns explicitly (works on older pandas too).
df.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0.1,Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,pymnt_plan,initial_list_status,...,inq_last_6mths,mths_since_last_delinq,open_acc,pub_rec,total_acc,inc_v_binary,deliquency,status_binary,return,profit
Unnamed: 0,1.0,0.969179,0.089192,0.094155,0.106794,0.085248,0.006507,0.065896,-0.010375,0.398269,...,-0.083535,-0.039342,0.117551,0.122146,0.091623,0.058306,0.084973,-0.216837,-0.806639,-0.593048
member_id,0.969179,1.0,0.070109,0.07384,0.083045,0.079852,-0.028283,0.042535,-0.00996,0.365043,...,-0.086367,-0.034101,0.099947,0.107657,0.074443,0.037709,0.073736,-0.194641,-0.763492,-0.560655
loan_amnt,0.089192,0.070109,1.0,0.998548,0.994378,0.412852,0.167152,0.949682,0.008177,0.075111,...,-0.020324,-0.041373,0.204174,-0.08114,0.235257,0.355983,0.016071,-0.014455,-0.1352,-0.607009
funded_amnt,0.094155,0.07384,0.998548,1.0,0.996157,0.410843,0.16789,0.951803,0.008232,0.077481,...,-0.020966,-0.041457,0.204976,-0.080597,0.23513,0.35579,0.016648,-0.016364,-0.139285,-0.610841
funded_amnt_inv,0.106794,0.083045,0.994378,0.996157,1.0,0.411975,0.169296,0.947439,0.008227,0.083232,...,-0.027552,-0.036761,0.205809,-0.078905,0.235445,0.359291,0.017337,-0.023772,-0.147586,-0.615246
term,0.085248,0.079852,0.412852,0.410843,0.411975,1.0,0.443559,0.159632,0.004227,0.101379,...,0.009987,-0.011284,0.077712,-0.022794,0.101322,0.290874,0.008931,0.025738,-0.182673,-0.360163
int_rate,0.006507,-0.028283,0.167152,0.16789,0.169296,0.443559,1.0,0.148678,0.01058,-0.009786,...,0.205697,-0.046787,0.012369,0.066677,-0.032954,0.220383,0.090684,0.098657,-0.023863,-0.086069
installment,0.065896,0.042535,0.949682,0.951803,0.947439,0.159632,0.148678,1.0,0.008867,0.043116,...,0.00225,-0.045798,0.196847,-0.070097,0.215921,0.315393,0.027217,-0.011734,-0.084897,-0.529971
pymnt_plan,-0.010375,-0.00996,0.008177,0.008232,0.008227,0.004227,0.01058,0.008867,1.0,-0.004699,...,0.001354,0.002019,0.001088,-0.002554,0.002039,0.004182,-0.000287,-0.003176,0.00558,0.000517
initial_list_status,0.398269,0.365043,0.075111,0.077481,0.083232,0.101379,-0.009786,0.043116,-0.004699,1.0,...,-0.038072,-0.01535,0.059209,0.040992,0.053057,0.032842,0.030902,-0.097386,-0.334414,-0.275876
