In [None]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime

# Load the loan-level data set.
df = pd.read_csv("full_clean_raw.csv")

# Per-loan performance measures derived from the payment columns.
df['return'] = df['total_pymnt']/df['funded_amnt'] #ratio of total payments to funded amount (1.0 = break-even)
df['profit'] = df['total_pymnt'] - df['funded_amnt'] #calculate profit per loan

#Adding regions
# Region code -> list of state abbreviations assigned to that region.
regions = {
    # 'NH' (New Hampshire) replaces the former 'NA' entry: 'NA' is not a state
    # abbreviation, and NH was otherwise missing from every region.
    'NE': ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NJ', 'PA', 'MD', 'VA', 'WV', 'KY', 'OH', 'IN', 'NY'],
    'SE': ['NC', 'SC', 'GA', 'FL', 'AL', 'TN', 'MS', 'AR', 'LA', 'OK', 'TX'],
    'NC': ['ND', 'SD', 'NE', 'KS', 'MO', 'IA', 'MN', 'WI', 'MI', 'IL'],
    'NW': ['WA', 'OR', 'ID', 'MT', 'WY', 'AK'],
    'SW': ['CA', 'NV', 'UT', 'CO', 'NM', 'AZ', 'HI']
}
# NOTE(review): 'DE' and 'DC' are not listed under any region, so loans from
# there (if the data contains any) end up with region NaN -- confirm intended.

# Invert the table to state -> region so it can be fed to Series.map.
regionsFix = {state: key for key, states in regions.items() for state in states}
df['region'] = df['addr_state'].map(regionsFix)  # states absent from the mapping become NaN

# Code to remove all loans that are not closed (disabled, kept for reference).
# Written as comments rather than a bare string literal so the cell does not
# echo the text as its output.
# df = df.drop(df[df['status'] == "current"].index)
# df = df.drop(df[df['status'] == "in grace period"].index)
# df = df.drop(df[df['status'] == "late (16-30 days)"].index)
# df = df.drop(df[df['status'] == "late (31-120 days)"].index)
# df = df.drop(df[df['status'] == "default"].index)

#df.to_csv('full_data_Vardy.csv') #write to csv
#list(df.columns.values) #see what columns we have

In [None]:
#correlations
# Keep only numeric/bool columns before correlating: the frame also carries
# text columns (e.g. 'status', 'addr_state', 'region'), and DataFrame.corr()
# raises on those in pandas >= 2.0 instead of silently skipping them.
df.select_dtypes(include=['number', 'bool']).corr() #correlation matrix

In [None]:
#return % by loan status
# Average of the per-loan return within each status (pivot_table's default
# aggregation is the mean). Not the last expression of the cell, so the
# notebook does not render this table.
df.pivot_table(index="status", values="return")

# Pooled return per status: summed payments divided by summed funded amount.
df_pre_agg = df.groupby('status')[['total_pymnt', 'funded_amnt']].sum()
df_pre_agg['return'] = df_pre_agg['total_pymnt'].div(df_pre_agg['funded_amnt'])
df_pre_agg['return']

In [None]:
#remove open loans
# Every row whose status is one of these values is dropped (by index label).
open_statuses = [
    "current",
    "in grace period",
    "late (16-30 days)",
    "late (31-120 days)",
    "default",
]
open_rows = df.index[df['status'].isin(open_statuses)]
df = df.drop(index=open_rows)

#return by employment length
df.pivot_table(index="emp_length_num", values="return") # mean return per emp_length_num value

In [None]:
#loans by sub-grade
# aggfunc=len counts the rows in each sub_grade group, so this table holds
# loan counts rather than return values.
# TODO: need to convert to percentage
df.pivot_table(index="sub_grade", values="return", aggfunc=len)

In [None]:
#share of loans with a positive return in each category
# Flag is 1 when total payments cover at least the funded amount
# (return >= 1) and 0 otherwise; a missing return compares False and gives 0.
df['pos_return'] = df['return'].ge(1).astype('int64')

# The mean of the 0/1 flag is the fraction of flagged loans per sub-grade.
# Only the last expression of a cell is rendered, so this table is not shown.
df.pivot_table(index="sub_grade", values="pos_return")
#all sub-grades between roughly 20-30%

# Same fraction at grade level; this is the table the cell displays.
df.pivot_table(index="grade", values="pos_return")
##all grades between roughly 20-30%

In [None]:
#remove all loans that are not closed
# Rows carrying any of these statuses are dropped (by index label).
not_closed = [
    "current",
    "in grace period",
    "late (16-30 days)",
    "late (31-120 days)",
    "default",
]
df = df.drop(index=df.index[df['status'].isin(not_closed)])

#profit by status
# Summed profit and funded amount per remaining status, plus their ratio.
df_prof = df.groupby('status')[['profit', 'funded_amnt']].sum()
df_prof['margin'] = df_prof['profit'].div(df_prof['funded_amnt'])

# Overall margin across all statuses: total profit over total funded amount.
total_profit = sum(df_prof['profit'])
total_funded = sum(df_prof['funded_amnt'])
total_profit/total_funded #total profit margin is -49.2%, -1.6% if discounting all loans

#YoY breakdown
#df_yoy_prof = df.groupby('year').agg({'profit' : 'sum', 'funded_amnt' : 'sum'})
#df_yoy_prof['gross_profit'] = (df_yoy_prof['profit']/df_yoy_prof['funded_amnt'] * 100).round(2)
#df_yoy_prof['gross_profit']

In [None]:
#profit by year
df = df.drop(df[df['status'] == "current"].index) #remove records where status = "current"

# First four characters of issue_d are used as the year.
# .str[:4] slices exactly like the earlier per-row lambda, but leaves a
# missing issue_d as NaN instead of raising TypeError on it.
# NOTE(review): assumes issue_d starts with a 4-digit year -- confirm format.
df['year'] = df['issue_d'].str[:4]

# Per-year totals: summed profit, summed funded amount, and the number of
# non-null member_id values (used below as the per-year loan count).
df_y = df.groupby('year').agg({'profit' : 'sum', 'funded_amnt' : 'sum', 'member_id': 'count'})
df_y['margin'] = df_y['profit']/df_y['funded_amnt']
df_y #profit margin is between -4% and 10% before 2012, but is -33% and -74% in 2013 and 2014 respectively

# Loans per (year, status), then each row divided by that year's loan count
# to give the percentage of the year's loans in every status.
analysis = pd.pivot_table(df, values='member_id', index='year', columns='status', aggfunc=len, fill_value=0)
analysis = analysis.div(df_y['member_id'], axis=0).mul(100).round(2)

# NOTE(review): the finding below mentions late-payment / grace-period loans;
# this cell itself only removes "current" rows, so those statuses are present
# only if no other step has already dropped them from df -- confirm the
# intended input when running the notebook top to bottom.
analysis #there is an increase in late payment and grace period status loans in 2012-2014

In [None]:
#profit by region
df = df.drop(df[df['status'] == "current"].index) #remove records where status = "current"

# First four characters of issue_d are used as the year; .str[:4] keeps a
# missing issue_d as NaN instead of raising TypeError like a per-row slice.
# NOTE(review): assumes issue_d starts with a 4-digit year -- confirm format.
df['year'] = df['issue_d'].str[:4]

# Summed profit and summed funded amount per (year, region). The string 'sum'
# selects the same aggregation as np.sum without the FutureWarning that
# pandas >= 2.1 emits for NumPy callables.
analysis = pd.pivot_table(df, values='profit', index='year', columns='region', aggfunc='sum')
diviz = pd.pivot_table(df, values='funded_amnt', index='year', columns='region', aggfunc='sum')

# Profit margin in percent for every (year, region) cell.
(analysis.divide(diviz) * 100).round(2) #NW doing better in the last 3 years

#a = pd.pivot_table(df, values='profit', index='region', aggfunc=np.sum)
#b = pd.pivot_table(df, values='funded_amnt', index='region', aggfunc=np.sum)
#a['profit'].divide(b['funded_amnt']) #historically NW and SW perform better