# Final Project
## Introduction to Business Analytics
Spring 2017 
***

## Approved Applications

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

# For file loading and memory monitoring
import os
import gc
import psutil
'''
    If you get error saying no module named psutil, run this in terminal :
    sudo su
    pip install psutil
'''

# collect garbage and check current memory
def collect_and_check_mem():
    proc = psutil.Process(os.getpid())
    gc.collect()
    mem = proc.memory_info().rss
    print ("Memory : %.2f MB" % (mem / (1000 * 1000)))
collect_and_check_mem()

Memory : 78.61 MB


In [2]:
# data loading util
def load_year_data(year, suffixs):
    loans_data = pd.DataFrame()
    for index in range(len(suffixs)):
        suffix = suffixs[index]
        path ="data/loans/%s/xa%s.csv.gz" % (year, suffix)
        if index == 0:
            loans_data = pd.read_csv(path, skiprows=1)
        else:
            frame = pd.read_csv(path, skiprows=0, names=loans_data.columns)
            loans_data = loans_data.append(frame, ignore_index=True)
            del frame
    return loans_data
# create suffix from start (e.g. 'a') to end (e.g. 'z')
def create_suffixs(start, end):
    return [chr(i) for i in range(ord(start), ord(end)+1)]

In [3]:

#df1 = pd.read_csv("data/loans/2007-2011/xaa.csv.gz",skiprows=1)
#df2 = pd.read_csv("data/loans/2007-2011/xab.csv.gz",skiprows=0, names=df1.columns)
#loans_2007_2011 = pd.concat([df1,df2])
#loans_2007_2011.columns

#Reason why memory overflow: 
#    When we do read and concat, concat actually create a copy with each data frame (data frame is immutable)
#    So after concat at least we need to delete the data chunks and do garbage collection:
#        del df1
#        gc.collect()
#    But never mind, we will use load_year_data helper which will load data for each year without keeping temporary data 

# 'a' to 'b' for 2007-2011
# suffixs = create_suffixs('a', 'b')
# loans_2007_2011 = load_year_data('2007-2011', suffixs)


In [4]:
# 'a' to 'g' for 2012-2013
# suffixs = create_suffixs('a', 'g')
# loans_2007_2011 = load_year_data('2012-2013', suffixs)

In [5]:
# 'a' to 'h' for 2014
# suffixs = create_suffixs('a', 'h')
# loans_2014 = load_year_data('2012-2013', suffixs)

In [6]:
# 'a' to 'o' for 2015
# suffixs = create_suffixs('a', 'o')
# loans_2015 = load_year_data('2015', suffixs)
# collect_and_check_mem()
# len(loans_2015.index)

In [7]:
# Before scaling up the instance we can try small set of data
years_info = {
    "2016Q1": create_suffixs('a', 'e'), 
    #"2016Q2": create_suffixs('a', 'd'), 
    #"2016Q3": create_suffixs('a', 'd'), 
    "2016Q4": create_suffixs('a', 'd')
}
loans_2016 = pd.DataFrame()
for year in years_info:
    frame = load_year_data(year, years_info[year])
    loans_2016 = loans_2016.append(frame, ignore_index=True)
    collect_and_check_mem()
collect_and_check_mem()
len(loans_2016.index)

  if self.run_code(code, result):
  if self.run_code(code, result):


Memory : 360.45 MB


  if self.run_code(code, result):


Memory : 425.78 MB
Memory : 425.78 MB


237437

In [None]:
# # Merge any years you want
# loans_data = pd.concat([loans_2015, loans_2016])
# collect_and_check_mem()

# # IMPORTANT: remove the useless temporary frames:
# del loans_2015
# del loans_2016
# collect_and_check_mem()
# len(loans_data.index)

# While the collected memory of deleted object will not be returned to OS but kept for python
# so the memory does not go down as expected, but actually they are available:
# reference: http://stackoverflow.com/questions/39100971/how-do-i-release-memory-used-by-a-pandas-dataframe

In [8]:
# Currenly I just use 2016:
loans_data = loans_2016

In [14]:
# since too many features, maybe we take out those we wanna keep:
features_to_key = set([
    # numerical
    'loan_amnt', 'funded_amnt', 'annual_inc', 'installment',
    'open_acc', 'total_acc',
    # Some of the following line features are duplicates? Do we need them all?
    # Say: total_pymnt = total_rec_prncp + total_rec_int
    'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'recoveries', 'collection_recovery_fee', 
    'last_pymnt_amnt',  
    
    # target ?
    'loan_status', 
    
    # categorical
        #dates
    #'last_pymnt_d', 296 NULL values for 2016Q1Q4, since we have last_pymt_amnt maybe ignore it
    #'last_credit_pull_d', # 21 NULL values for 2016Q1Q4 same as above, maybe ignore it
        #other
    'verification_status', 'purpose', 'addr_state',
    'grade', 'sub_grade', 'home_ownership', 'term',
    
    # special
    #'title', # 10629 NULLs for 2016 Q1Q4 value maybe forget it
    'issue_d', # categorical ? Dec-11
    'int_rate', 'revol_util' # trim out percentage mark: 10.65%
    'emp_length', # extracting number: 10+ years < 1 year
    
    # not sure:
    'inq_last_6mths', 'pub_rec', 'revol_bal', 'dti', 'delinq_2yrs', 
    'pymnt_plan', 'earliest_cr_line' 'initial_list_status',
    'out_prncp', 'out_prncp_inv',
    'collections_12_mths_ex_med',
    'policy_code', 'application_type',
    'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt',
    'pub_rec_bankruptcies', 'tax_liens'
])

for column in loans_data.columns:
    if column not in features_to_key:
        loans_data = loans_data.drop(column, axis=1)
loans_data.columns

Index(['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment', 'grade',
       'sub_grade', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'addr_state', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths',
       'delinq_amnt', 'pub_rec_bankruptcies', 'tax_liens'],
      dtype='object')

In [15]:
# Check NULL
loans_data.isnull().sum()

loan_amnt                     4
funded_amnt                   4
term                          4
int_rate                      4
installment                   4
grade                         4
sub_grade                     4
home_ownership                4
annual_inc                    4
verification_status           4
issue_d                       4
loan_status                   4
pymnt_plan                    4
purpose                       4
addr_state                    4
dti                           4
delinq_2yrs                   4
inq_last_6mths                5
open_acc                      4
pub_rec                       4
revol_bal                     4
total_acc                     4
out_prncp                     4
out_prncp_inv                 4
total_pymnt                   4
total_pymnt_inv               4
total_rec_prncp               4
total_rec_int                 4
total_rec_late_fee            4
recoveries                    4
collection_recovery_fee       4
last_pym

In [20]:
# Drop rows with NULL values:
loans_data = loans_data.dropna()
#loans_data.isnull().sum()
len(loans_data.index)

237432

In [19]:
loans_data.describe()

Unnamed: 0,loan_amnt,funded_amnt,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,policy_code,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
count,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,...,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0,237432.0
mean,14962.207179,14962.207179,449.053061,80583.26,21.134602,0.352627,0.568521,11.949185,0.25243,17635.85,...,6.214471,1.098292,1502.746316,0.021501,1.0,0.00673,0.00943,22.818057,0.138899,0.073941
std,9043.615916,9043.615916,270.01149,77714.21,138.919001,0.929862,0.870691,5.769793,0.668555,24779.57,...,138.462758,24.747863,4334.122513,0.166286,0.0,0.0871,0.112217,1000.792344,0.391568,0.459585
min,1000.0,1000.0,30.12,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,8000.0,8000.0,251.58,48800.0,12.56,0.0,0.0,8.0,0.0,6267.0,...,0.0,0.0,263.58,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,13000.0,13000.0,381.27,68000.0,18.54,0.0,0.0,11.0,0.0,11641.5,...,0.0,0.0,412.82,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,20000.0,20000.0,602.3,95500.0,25.2325,0.0,1.0,15.0,0.0,20862.0,...,0.0,0.0,682.44,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,40000.0,40000.0,1584.9,9550000.0,9999.0,29.0,5.0,97.0,46.0,1023940.0,...,21245.02,3824.1036,41301.78,12.0,1.0,6.0,7.0,185408.0,9.0,45.0


## Rejected applications (deprecated)

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt

reject_2007_2012 = pd.read_csv("data/rejected/2007-2012/RejectStatsA.csv.gz",skiprows=1)

#df1 = pd.read_csv("data/rejected/2013-2014/xaa.csv.gz",skiprows=1)
#df2 = pd.read_csv("data/rejected/2013-2014/xab.csv.gz",skiprows=0,names=df1.columns)
#reject_2013_2014 = pd.concat([df1,df2])

#df2 = pd.read_csv("data/rejected/2015/xaa.csv.gz",skiprows=1)
#df2 = pd.read_csv("data/rejected/2015/xab.csv.gz",skiprows=0, names=df1.columns)
#reject_2015 = pd.concat([df1,df2])

#reject_2016Q1 = pd.read_csv("data/rejected/2016Q1/RejectStats_2016Q1.csv.gz",skiprows=1)

#reject_2016Q2 = pd.read_csv("data/rejected/2016Q2/RejectStats_2016Q2.csv.gz",skiprows=1)

#reject_2016Q3 = pd.read_csv("data/rejected/2016Q3/RejectStats_2016Q3.csv.gz",skiprows=1)

#reject_2016Q4 = pd.read_csv("data/rejected/2016Q4/RejectStats_2016Q4.csv.gz",skiprows=1)




In [None]:
reject_2007_2012.columns

In [None]:
reject_2007_2012.describe()