### Import required libraries and define helper functions

In [1]:
import pandas as pd
# Render floats in pandas output with thousands separators and no decimals.
pd.set_option('display.float_format', '{:,.0f}'.format)

In [2]:
def uniformly_read_n_lines(filename, n_lines_to_read, printDetails = True):
    """Read an evenly spaced sample of roughly `n_lines_to_read` rows from a CSV file.

    The file is scanned once to count its lines (the first line is treated as
    the header and is not counted as an observation). A step size is derived
    from that count, and `pd.read_csv` then keeps the header line plus every
    line whose index is a multiple of the step.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to sample.
    n_lines_to_read : int
        Approximate number of rows wanted; must be positive. The step is
        rounded to a whole number, so the returned frame may hold somewhat
        more or fewer rows than requested. If the file has fewer rows than
        requested, every row is returned.
    printDetails : bool, default True
        Print the file name, the number of observations and the step size.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with column names taken from the first line.

    Raises
    ------
    ValueError
        If `n_lines_to_read` is not positive.
    """
    if n_lines_to_read <= 0:
        raise ValueError("n_lines_to_read must be a positive number")

    # Start at -1 so that the header line is not counted as an observation.
    n_lines_in_file = -1
    # The context manager guarantees the handle is closed once counting is done.
    with open(filename, encoding = "utf-8", errors = "ignore") as f:
        for line in f:
            n_lines_in_file = n_lines_in_file + 1

    # Clamp the step to at least 1: when far more rows are requested than the
    # file contains, the ratio rounds to 0 and the modulo in `skiprows` below
    # would raise ZeroDivisionError.
    every_other_x_lines = max(1, int(round(n_lines_in_file / n_lines_to_read)))

    if printDetails:
        print("Reading file",filename,"...")
        print("Number of observations in",filename,"is:",'{:,.0f}'.format(n_lines_in_file))
        print("To read in",'{:,.0f}'.format(n_lines_to_read),"lines, we need to read 1 line out of every",'{:,.0f}'.format(every_other_x_lines),"lines")

    # Index 0 is the header line (0 % step == 0), so it is always kept.
    df = pd.read_csv(filename, header = 0, skiprows = lambda i: i % every_other_x_lines != 0)

    return df

In [3]:
def EDA_1(df, describe = True, head_and_tail = True):
    """Print a quick overview of a dataframe and optionally return its ends.

    Prints the frame's shape and a numbered list of its columns. When
    `describe` is true, the output of `df.describe()` is printed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    describe : bool, default True
        Print the summary statistics returned by `df.describe()`.
    head_and_tail : bool, default True
        Return the first five and last five rows stacked into one frame.

    Returns
    -------
    pandas.DataFrame or None
        The first five rows followed by the last five rows when
        `head_and_tail` is true, otherwise None. For frames with fewer than
        ten rows the two slices overlap, so some rows appear twice.
    """
    print("Size of dataframe:",df.shape)
    print("")

    print("List of columns:")
    for i, column in enumerate(df.columns, start = 1):
        print(i,"-",column)

    # Only print the section header when the statistics actually follow it.
    if describe:
        print("")
        print("Quantiles of numeric variables:")
        print(df.describe())

    if head_and_tail:
        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
        # two slices the same way and keeps the original row labels.
        return pd.concat([df.head(n=5), df.tail(n=5)])

    return None

In [4]:
def EDA_2(df):
    """Print a per-column exploration report for a dataframe.

    For every column this prints a random sample of up to ten values, the
    number of missing values, and a summary: `value_counts()` for columns of
    dtype ``category``, `describe()` for all others.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reported one after another.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # Series.sample raises ValueError when asked for more rows than exist,
    # so cap the sample size at the number of rows in the frame.
    n_sample = min(10, len(df))

    for column in df.columns:
        print("#"*50)
        print("Random sample of column name (",column,")...")
        print(df[column].sample(n=n_sample))

        print("")

        print("Number of missing values:",df[column].isnull().sum())

        print("")

        if df[column].dtype.name == "category":
            print(df[column].value_counts())
        else:
            print(df[column].describe())
        print("#"*50)

        print("")
        print("")
        print("")

### Read in the "rejected loans" dataset

In [4]:
# Draw an evenly spaced sample of about 10,000 rows from the rejected-loans file.
df_rejected = uniformly_read_n_lines(
    "rejected_2007_to_2018q4.csv",
    n_lines_to_read = 10000,
)

Reading file rejected_2007_to_2018q4.csv ...
Number of observations in rejected_2007_to_2018q4.csv is: 27,648,741
To read in 10,000 lines, we need to read 1 line out of every 2,765 lines


In [19]:
# Overview of the rejected-loans sample; the returned head/tail frame is the cell's output.
EDA_1(df_rejected, describe = True, head_and_tail = True)

Size of dataframe: (9999, 9)

List of columns:
1 - Amount Requested
2 - Application Date
3 - Loan Title
4 - Risk_Score
5 - Debt-To-Income Ratio
6 - Zip Code
7 - State
8 - Employment Length
9 - Policy Code

Quantiles of numeric variables:
       Amount Requested   Risk_Score  Policy Code
count       9999.000000  3334.000000  9999.000000
mean       13326.080108   629.041092     0.006601
std        16222.286452    87.617875     0.114713
min         1000.000000     0.000000     0.000000
25%         5000.000000   592.000000     0.000000
50%        10000.000000   636.000000     0.000000
75%        20000.000000   675.000000     0.000000
max       300000.000000   931.000000     2.000000


Unnamed: 0,Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,3500.0,2007-11-16,Jessica,503.0,1.33%,461xx,IN,2 years,0.0
1,2000.0,2008-01-04,lesdmc,466.0,77.48%,759xx,TX,5 years,0.0
2,8000.0,2008-02-03,debt_consolidation,681.0,51.24%,221xx,VA,10+ years,0.0
3,1500.0,2008-03-13,debt_consolidation,652.0,13.94%,197xx,DE,< 1 year,0.0
4,1000.0,2008-04-06,Taking business & marketing courses,513.0,35.88%,945xx,CA,10+ years,0.0
9994,5000.0,2016-12-30,home_improvement,,9.45%,296xx,SC,< 1 year,0.0
9995,15000.0,2016-12-30,Debt consolidation,644.0,21.58%,852xx,AZ,< 1 year,0.0
9996,20000.0,2016-12-31,Debt consolidation,600.0,20.97%,750xx,TX,< 1 year,0.0
9997,15000.0,2016-12-31,house,,27.7%,800xx,CO,< 1 year,0.0
9998,3000.0,2016-12-31,Debt consolidation,559.0,17.04%,554xx,MN,< 1 year,0.0


### Read in the "accepted loans" dataset

In [6]:
# Draw an evenly spaced sample of about 10,000 rows from the accepted-loans file.
df_accepted = uniformly_read_n_lines(
    "../data/accepted_2007_to_2018Q4.csv",
    n_lines_to_read = 10000,
)



Reading file ../data/accepted_2007_to_2018Q4.csv ...
Number of observations in ../data/accepted_2007_to_2018Q4.csv is: 2,260,701
To read in 10,000 lines, we need to read 1 line out of every 226 lines


In [10]:
# Save the accepted-loans sample to disk as a comma-separated file with a header row.
df_accepted.to_csv("../data/joe.csv", sep = ",", header = True)

In [34]:
# Column-by-column report for the accepted-loans sample.
EDA_2(df = df_accepted)

##################################################
Random sample of column name ( id )...
1100     51766134
9399    121770770
9710     93379255
6959    132173707
7280       466423
4343    104040712
9464    121276802
5216     29294505
4664     71836861
9475    120655654
Name: id, dtype: int64

Number of missing values: 0

count        10,003
mean     80,328,956
std      44,981,772
min         121,568
25%      45,182,232
50%      84,498,420
75%     122,661,334
max     145,593,179
Name: id, dtype: float64
##################################################



##################################################
Random sample of column name ( member_id )...
8204   nan
1937   nan
960    nan
5154   nan
1928   nan
3363   nan
2267   nan
9540   nan
7555   nan
8448   nan
Name: member_id, dtype: float64

Number of missing values: 10003

count     0
mean    nan
std     nan
min     nan
25%     nan
50%     nan
75%     nan
max     nan
Name: member_id, dtype: float64
#####################################

Number of missing values: 0

count   10,003
mean     4,167
std      7,320
min          0
25%          0
50%          0
75%      6,002
max     38,451
Name: out_prncp, dtype: float64
##################################################



##################################################
Random sample of column name ( out_prncp_inv )...
7700        0
403         0
7623        0
3005        0
5710        0
1889    3,454
6687    9,468
7446   11,829
3664    1,284
2166    3,718
Name: out_prncp_inv, dtype: float64

Number of missing values: 0

count   10,003
mean     4,167
std      7,319
min          0
25%          0
50%          0
75%      6,002
max     38,451
Name: out_prncp_inv, dtype: float64
##################################################



##################################################
Random sample of column name ( total_pymnt )...
6493    1,319
4609   22,628
9216    7,947
6043      938
582    21,913
3581    2,335
1329   55,534
7215   11,335
9205   11,715
6580    2,516
Name: tot

Name: chargeoff_within_12_mths, dtype: float64
##################################################



##################################################
Random sample of column name ( delinq_amnt )...
7485   0
7349   0
3275   0
2351   0
7998   0
1162   0
591    0
9013   0
398    0
1689   0
Name: delinq_amnt, dtype: float64

Number of missing values: 0

count   10,003
mean        16
std        839
min          0
25%          0
50%          0
75%          0
max     65,000
Name: delinq_amnt, dtype: float64
##################################################



##################################################
Random sample of column name ( mo_sin_old_il_acct )...
9936    93
2156   116
5154   144
1716   154
449    165
2840   145
2914   142
8054   120
746     75
7052   nan
Name: mo_sin_old_il_acct, dtype: float64

Number of missing values: 621

count   9,382
mean      126
std        53
min         1
25%        97
50%       130
75%       153
max       443
Name: mo_sin_old_il_acct, dtype: floa

Number of missing values: 9956

count                   47
unique                   8
top       NATURAL_DISASTER
freq                    18
Name: hardship_reason, dtype: object
##################################################



##################################################
Random sample of column name ( hardship_status )...
1487    NaN
1616    NaN
4169    NaN
6799    NaN
9973    NaN
2549    NaN
5086    NaN
1854    NaN
4208    NaN
4210    NaN
Name: hardship_status, dtype: object

Number of missing values: 9956

count            47
unique            3
top       COMPLETED
freq             36
Name: hardship_status, dtype: object
##################################################



##################################################
Random sample of column name ( deferral_term )...
9926   nan
2974   nan
5054   nan
8003   nan
1117   nan
5601   nan
1634   nan
9685   nan
510    nan
2953   nan
Name: deferral_term, dtype: float64

Number of missing values: 9956

count   47
mean     3
std