In [42]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import plotly.express as px
import datetime

from shapely.geometry import Polygon, MultiPolygon
from collections import Counter

import missingno as msno
import re
from uszipcode import SearchEngine, SimpleZipcode, ComprehensiveZipcode

import seaborn as sns
sns.set(rc={'figure.figsize':(14.15,10)})
import matplotlib.pyplot as plt

import warnings
# warnings.filterwarnings('ignore')

In [3]:
%load_ext rpy2.ipython
# import rpy2's package module
import rpy2.robjects.packages as rpackages

# import R's utility package
utils = rpackages.importr('utils')

# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list

<rpy2.rinterface_lib.sexp.NULLType object at 0x7fa36525c640> [RTYPES.NILSXP]

In [4]:
# R package names
packnames = ('olsrr', 'car', 'corrplot')

# R vector of strings
from rpy2.robjects.vectors import StrVector

# Selectively install what needs to be install.
# We are fancy, just because we can.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))

In [5]:
%%R -o pairsJDRS -o myResPlots -o myResPlots2
library(olsrr)
library(car)

# Use: Best Subsets Regression
# library(leaps)
library(corrplot)

# Use: Plot CIs
# library(plotrix)

pairsJDRS <- function (R, ...) {
  panel.cor <- function(x, y, digits = 2, prefix = "", ...) {
    usr <- par("usr")
    on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- cor(x, y, use = "pairwise.complete.obs", method = "pearson")
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste(prefix, txt, sep = "")
    cex <- 0.8/strwidth(txt)
    test <- cor.test(x, y)
    Signif <- symnum(test$p.value, corr = FALSE, na = FALSE, 
                     cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("***", 
                                                                              "**", "*", ".", " "))
    text(0.5, 0.5, txt, cex = cex * (abs(r) + 0.3)/1.3)
    text(0.8, 0.8, Signif, cex = cex, col = 2)
  }
  hist.panel = function(x, ...) {
    par(new = TRUE)
    hist(x, col = "light gray", probability = TRUE, axes = FALSE, 
         main = "", breaks = "FD")
    lines(density(x, na.rm = TRUE), col = "red", lwd = 1)
    rug(x)
  }
  pairs(R, gap = 0, upper.panel = panel.smooth, lower.panel = panel.cor, 
        diag.panel = hist.panel,  ...)
}


myResPlots <- function(model, label){
  
  #Normal quantile plot of studentized residuals
  qqPlot(rstudent(model), pch=19, main=paste("NQ Plot of Studentized Residuals,",label))
  
  #plot of fitted vs. studentized residuals
  plot(rstudent(model) ~ model$fitted.values, pch=19, col='red', xlab="Fitted Values", ylab = "Studentized Residuals",
     main = paste("Fits vs. Studentized Residuals,", label))
  abline(h=0, lwd=3)
  abline(h=c(3,-3), lty=2, lwd=3, col="blue")

  #Cooks distance plot
  ols_cooksd_chart(model)
  
  #Combination outlier and influence plot
  ols_rsdlev_plot(model)
}

myResPlots2 <- function(model, label = "Residual Plots"){
  
  #Normal quantile plot of studentized residuals
  qqPlot(rstudent(model), pch=19, main=paste("NQ Plot of Studentized Residuals,",label))
  
  #plot of fitted vs. studentized residuals
  plot(rstudent(model) ~ model$fitted.values, pch=19, col='red', xlab="Fitted Values", ylab = "Studentized Residuals",
     main = paste("Fits vs. Studentized Residuals,", label))
  abline(h=0, lwd=3)
  abline(h=c(3,-3), lty=2, col="green")
  abline(h=c(2,-2), lty=2, col="blue")

}

R[write to console]: 
Attaching package: ‘olsrr’


R[write to console]: The following object is masked from ‘package:datasets’:

    rivers


R[write to console]: corrplot 0.92 loaded



In [6]:
# Reading data
accepted = pd.read_csv('data/LC/Lending_Club_Accepted_2014_2018.csv')
rejected = pd.read_csv('data/LC/Lending_Club_Rejected_2014_2018.csv')
print(accepted.shape)
display(accepted.head())
print(rejected.shape)
display(rejected.head())

  exec(code_obj, self.user_global_ns, self.user_ns)


(2029952, 151)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


(26132308, 9)


Unnamed: 0,Amount_Requested,Application_Date,Loan_Title,Risk_Score,Debt_to_Income_Ratio,Zip_Code,State,Employment_Length,Policy_Code
0,1000.0,2016-04-01,other,,2.69%,331xx,FL,< 1 year,0.0
1,4000.0,2016-04-01,debt_consolidation,,28.26%,834xx,ID,< 1 year,0.0
2,5000.0,2016-04-01,moving,,-1%,648xx,MO,,0.0
3,1000.0,2016-04-01,moving,628.0,21.43%,380xx,TN,< 1 year,0.0
4,3000.0,2016-04-01,Debt consolidation,,8.49%,895xx,NV,2 years,2.0


In [7]:
stripped_accepted = accepted[['loan_amnt', 'issue_d', 'title', 'dti', 'zip_code', 'addr_state', 'emp_length', 'policy_code']]
stripped_accepted['accepted'] = 1
rejected.rename(columns={'Amount_Requested': 'loan_amnt',
                         'Application_Date': 'date',
                         'Loan_Title': 'title',
                         'Risk_Score': 'risk_score',
                         'Debt_to_Income_Ratio': 'dti',
                         'Zip_Code': 'zip_code',
                         'State': 'state',
                         'Employment_Length': 'emp_length',
                         'Policy_Code': 'policy_code'}, inplace=True)
rejected.drop(columns='risk_score', inplace=True)
rejected['accepted'] = 0
stripped_accepted.rename(columns={'issue_d': 'date',
                                  'addr_state': 'state'}, inplace=True)
print("Accepted")
for col in stripped_accepted.columns:
    prop_missing = stripped_accepted[col].isna().mean()
    print(f'Column: {col}, Proportion missing: {prop_missing}')
print("Rejected")
for col in rejected.columns:
    prop_missing = rejected[col].isna().mean()
    print(f'Column: {col}, Proportion missing: {prop_missing}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stripped_accepted['accepted'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Accepted
Column: loan_amnt, Proportion missing: 0.0
Column: date, Proportion missing: 0.0
Column: title, Proportion missing: 0.011480567028185888
Column: dti, Proportion missing: 0.0008428770729554196
Column: zip_code, Proportion missing: 4.926224856548332e-07
Column: state, Proportion missing: 0.0
Column: emp_length, Proportion missing: 0.06793658175168674
Column: policy_code, Proportion missing: 0.0
Column: accepted, Proportion missing: 0.0
Rejected
Column: loan_amnt, Proportion missing: 0.0
Column: date, Proportion missing: 0.0
Column: title, Proportion missing: 4.932591487900724e-05
Column: dti, Proportion missing: 0.0
Column: zip_code, Proportion missing: 1.0370304834919289e-05
Column: state, Proportion missing: 3.826680750892726e-08
Column: emp_length, Proportion missing: 0.03566757287569089
Column: policy_code, Proportion missing: 3.512892929319523e-05
Column: accepted, Proportion missing: 0.0


In [8]:
loan_titles = ['Debt Consolidation', 'Credit Card', 'Home', 'Car', 'Medical', 'Business', 'Other']

In [9]:
full_data = pd.concat([stripped_accepted, rejected]).reset_index().drop(columns='index')
print(full_data.head())
print(full_data.tail())

   loan_amnt      date               title    dti zip_code state emp_length  \
0     3600.0  Dec-2015  Debt consolidation   5.91    190xx    PA  10+ years   
1    24700.0  Dec-2015            Business  16.06    577xx    SD  10+ years   
2    20000.0  Dec-2015                 NaN  10.78    605xx    IL  10+ years   
3    35000.0  Dec-2015  Debt consolidation  17.06    076xx    NJ  10+ years   
4    10400.0  Dec-2015      Major purchase  25.37    174xx    PA    3 years   

   policy_code  accepted  
0          1.0         1  
1          1.0         1  
2          1.0         1  
3          1.0         1  
4          1.0         1  
          loan_amnt        date               title     dti zip_code state  \
28162255    10000.0  2016-12-31  Debt consolidation  41.26%    441xx    OH   
28162256    10000.0  2016-12-31              moving   1.48%    207xx    MD   
28162257     1200.0  2016-12-31               Other  10.26%    914xx    CA   
28162258    25000.0  2016-12-31  debt_consolidation

In [10]:
counter = 0
def clean_titles(full_data):
    full_data['title'] = full_data['title'].fillna('')
    full_data['title'] = full_data['title'].str.lower()
    full_data.loc[full_data['title'].str.contains('credit'), 'title'] = 'Credit Card'
    full_data.loc[full_data['title'].str.contains('cc'), 'title'] = 'Credit Card'
    full_data.loc[full_data['title'].str.contains('debt'), 'title'] = 'Debt Consolidation'
    full_data.loc[full_data['title'].str.contains('consolidation'), 'title'] = 'Debt Consolidation'
    full_data.loc[full_data['title'].str.contains('consolidate'), 'title'] = 'Debt Consolidation'
    full_data.loc[full_data['title'].str.contains('payoff'), 'title'] = 'Debt Consolidation'
    full_data.loc[full_data['title'].str.contains('pay off'), 'title'] = 'Debt Consolidation'
    full_data.loc[full_data['title'].str.contains('refinance'), 'title'] = 'Debt Consolidation'
    full_data.loc[full_data['title'].str.contains('home'), 'title'] = 'Home'
    full_data.loc[full_data['title'].str.contains('house'), 'title'] = 'Home'
    full_data.loc[full_data['title'].str.contains('moving'), 'title'] = 'Home'
    full_data.loc[full_data['title'].str.contains('car'), 'title'] = 'Car'
    full_data.loc[full_data['title'].str.contains('medical'), 'title'] = 'Medical'
    full_data.loc[full_data['title'].str.contains('business'), 'title'] = 'Business'
    full_data.loc[~full_data['title'].isin(loan_titles), 'title'] = 'Other'
    return full_data

def clean_date(data):
    data['date'] = pd.to_datetime(data['date']).dt.strftime('%m-%Y')
    return data
    
def clean_dti(data):
    # Remove all rows with negative dti
    data['dti'] = data['dti'].astype(str)
    data = data[~data['dti'].str.contains('-')]
    
    # Replace %
    data['dti'] = data['dti'].str.replace('%', '').astype('float64')
    data = data[~data['dti'].isna()]
    
    return data
    
def clean_zip(data):
    # Remove na values first or else error
    data = data[~data['zip_code'].isna()]
    data = data[~data['state'].isna()]
    
    # Get first 3 digits
    data['zip_code'] = data['zip_code'].apply(lambda x: x[:3])

    # Remove the one case where zipcode does not begin with 3 digits
    data = data[data['zip_code'].apply(str.isdigit)]    
    
    # Make the mapping zip -> state code
    search = SearchEngine()
    zips = set()
    zip2state = {}
    for zip_code in data['zip_code']:
        zips.add(zip_code)
    for zip_code in zips:
        res = search.by_prefix(zip_code)
        if res == []:
            zip2state[zip_code] = ''
        else:
            zip2state[zip_code] = res[0].state
            
    # Use knowledge of correct zip_codes to make correct state -> zipcode
    def matchStateZip(zip_code, state):
        global counter
        counter += 1
        if counter % 100000 == 0:
            print(counter)
        return state == zip2state[zip_code]

    with SearchEngine() as search:
        to_remove = data.apply(lambda x: matchStateZip(x.zip_code, x.state), axis=1)

    print(f'Shape before {data.shape}')
    data = data[to_remove]
    print(f'Shape after removing incorrect state <-> zip code matchings {data.shape}')
    
    return data

def clean_full(data):
    # loan_amnt is clean
    # Clean titles first
    print('clean titles')
    data = clean_titles(data)
    
    # Clean date
    print('clean date')
    data = clean_date(data)
    
    # Clean dti
    print('clean dti')
    data = clean_dti(data)
    
    # Clean zip + state combo
    print('clean zip')
    data = clean_zip(data)
    
    # # policy code corresponds with being accepted/rejected except for special policy code 2.0
    # ALL POLICY CODE 2.0 (https://news.fintechnexus.com/policy-code-2-loans-lending-club/) LOANS WERE REJECTED APPS
    print('clean policy code')
    data = data.drop(columns='policy_code')
    
    # Keep emp_length the same (will drop na values)
    
    # Remove NA Last
    print('clean na')
    prev_rows = data.shape[0]
    data = data.dropna()
    print(f'Training examples reduced by {1-(data.shape[0]/prev_rows)}')
    return data.reset_index().drop(columns='index')

full_data = clean_full(full_data)

clean titles
clean date
clean dti


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['dti'] = data['dti'].str.replace('%', '').astype('float64')


clean zip
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
9400000
9500000
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000
10900000
11000000
11100000
11200000
11300000
11400000
11500000
11600000
11700000
11800000
11900000
12000000
12100000
12200000


In [11]:
full_data

Unnamed: 0,loan_amnt,date,title,dti,zip_code,state,emp_length,accepted
0,3600.0,12-2015,Debt Consolidation,5.91,190,PA,10+ years,1
1,24700.0,12-2015,Business,16.06,577,SD,10+ years,1
2,20000.0,12-2015,Other,10.78,605,IL,10+ years,1
3,35000.0,12-2015,Debt Consolidation,17.06,076,NJ,10+ years,1
4,10400.0,12-2015,Other,25.37,174,PA,3 years,1
...,...,...,...,...,...,...,...,...
25871613,10000.0,12-2016,Debt Consolidation,41.26,441,OH,< 1 year,0
25871614,10000.0,12-2016,Home,1.48,207,MD,5 years,0
25871615,1200.0,12-2016,Other,10.26,914,CA,< 1 year,0
25871616,25000.0,12-2016,Debt Consolidation,17.71,880,NM,< 1 year,0


In [12]:
# Make sample sizes for accepted rejected 50/50
# num_accepted = full_data[full_data.accepted == 1].shape[0]
num_accepted = 500000
print(f'{num_accepted} accepted samples')
full_data_reduced = pd.concat([full_data[full_data.accepted == 1].sample(num_accepted).reset_index().drop(columns='index'),
                               full_data[full_data.accepted == 0].sample(num_accepted).reset_index().drop(columns='index')]).reset_index().drop(columns='index')
full_data_reduced

500000 accepted samples


Unnamed: 0,loan_amnt,date,title,dti,zip_code,state,emp_length,accepted
0,1800.0,07-2015,Debt Consolidation,15.12,913,CA,10+ years,1
1,12000.0,09-2015,Debt Consolidation,26.78,850,AZ,2 years,1
2,14000.0,07-2016,Credit Card,30.53,581,ND,3 years,1
3,15000.0,02-2015,Debt Consolidation,11.70,060,CT,10+ years,1
4,6000.0,11-2018,Other,21.41,812,CO,10+ years,1
...,...,...,...,...,...,...,...,...
999995,2000.0,06-2017,Other,9.37,700,LA,< 1 year,0
999996,5000.0,10-2015,Car,10.83,949,CA,< 1 year,0
999997,6000.0,11-2018,Debt Consolidation,100.00,362,AL,< 1 year,0
999998,10000.0,04-2016,Credit Card,25.41,281,NC,< 1 year,0


In [14]:
# Append the data with zipcode information
# Rural definition (https://www.ers.usda.gov/topics/rural-economy-population/rural-classifications/what-is-rural.aspx#:~:text=According%20to%20this%20system%2C%20rural,with%20fewer%20than%202%2C500%20people.)
def get_zip_info(data):
    zips = set()
    zip2data = {}
    for zip_code in data['zip_code']:
        zips.add(zip_code)
    for zip_code in zips:
        print(zip_code)
        with SearchEngine(simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.comprehensive) as search2:
            res = search2.by_prefix(zip_code, returns=0)
        if res == []:
            zip2data[zip_code] = {'population': None, 
                                    'population_density': None, 
                                    'median_home_value': None, 
                                    'median_household_income': None, 
                                    'polygon': None, 
                                    'pop_male': None, 
                                    'pop_female': None,
                                    'pop_white': None,
                                    'pop_black': None,
                                    'pop_indian_alaska': None,
                                    'pop_asian': None,
                                    'pop_pacific': None,
                                    'pop_other': None,
                                    'pop_multi': None}
        else:
            # Get consolidate polygon, gender, race
            count = 0
            polygons = []
            pop = 0
            area = 0
            home = 0
            income = 0
            genders = Counter()
            races = Counter()
            for zipcode in res:
                count += 1
                if zipcode.population is not None:
                    pop += zipcode.population
                if zipcode.population_density is not None and zipcode.population is not None:
                    if zipcode.population_density != 0:
                        area += zipcode.population / zipcode.population_density
                    elif zipcode.land_area_in_sqmi is not None and zipcode.water_area_in_sqmi is not None:
                        area += zipcode.land_area_in_sqmi + zipcode.water_area_in_sqmi
                    else:
                        area += 0
                else:
                    if zipcode.land_area_in_sqmi is not None and zipcode.water_area_in_sqmi is not None:
                        area += zipcode.land_area_in_sqmi + zipcode.water_area_in_sqmi
                if zipcode.median_home_value is not None:
                    home += zipcode.median_home_value
                if zipcode.median_household_income is not None:
                    income += zipcode.median_household_income
                # Multipolygon
                if zipcode.polygon is not None:
                    if len(zipcode.polygon) == 1:
                        polygons.append(Polygon(zipcode.polygon[0]))
                    elif len(zipcode.polygon[0]) == 2:
                        polygons.append(Polygon(zipcode.polygon))
                    else:
                        for p in zipcode.polygon:
                            polygons.append(Polygon(p))
                # Gender
                if zipcode.population_by_gender is not None:
                    gender = zipcode.population_by_gender[0]['values']
                    gender = Counter({gender[0]['x']: gender[0]['y'], gender[1]['x']: gender[1]['y']})

                    genders += gender
                # Race
                if zipcode.population_by_race is not None:
                    race = zipcode.population_by_race[0]['values']
                    for item in race:
                        races += Counter({item['x']: item['y']})

            mp = MultiPolygon(polygons)

            if pop is not None and area is not None and area != 0:
                pop_d = pop/area
            else:
                pop_d = 90 # US average

            # Get other information
            if pop == 0:
                zip2data[zip_code] = {'population': pop, 
                                    'population_density': pop_d, 
                                    'median_home_value': home/count, 
                                    'median_household_income': income/count, 
                                    'polygon': mp, 
                                    'pop_male': 0, 
                                    'pop_female': 0,
                                    'pop_white': 0,
                                    'pop_black': 0,
                                    'pop_indian_alaska': 0,
                                    'pop_asian': 0,
                                    'pop_pacific': 0,
                                    'pop_other': 0,
                                    'pop_multi': 0}
            else:
                zip2data[zip_code] = {'population': pop, 
                                    'population_density': pop_d, 
                                    'median_home_value': home/count, 
                                    'median_household_income': income/count, 
                                    'polygon': mp, 
                                    'pop_male': genders['Male']/pop, 
                                    'pop_female': genders['Female']/pop,
                                    'pop_white': races['White']/pop,
                                    'pop_black': races['Black Or African American']/pop,
                                    'pop_indian_alaska': races['American Indian Or Alaskan Native']/pop,
                                    'pop_asian': races['Asian']/pop,
                                    'pop_pacific': races['Native Hawaiian & Other Pacific Islander']/pop,
                                    'pop_other': races['Other Race']/pop,
                                    'pop_multi': races['Two Or More Races']/pop}

    return zip2data

zip2data = get_zip_info(full_data_reduced)

259
336
583
982
081
154
971
684
370
026
448
306
573
224
837
328
584
747
580
734
783
710
826
183
863
292
894
408
253
597
860
490
013
170
315
108
773
462
806
330
816
500
606
813
970
220
219
731
815
121
933
052
629
436
830
078
832
248
890
984
439
941
831
232
656
337
322
492
264
190
895
951
956
653
283
243
017
452
369
225
836
648
874
244
616
824
460
612
323
146
044
058
125
240
636
995
045
723
426
657
317
335
019
285
902
610
785
838
898
277
011
793
086
071
440
263
333
704
474
168
478
794
498
910
717
028
949
756
324
438
347
906
804
037
674
450
857
443
059
127
758
291
167
482
789
678
469
770
138
364
258
226
985
926
828
752
155
367
313
947
705
130
687
989
379
457
042
784
344
599
433
928
967
484
479
943
189
255
238
638
728
014
730
801
744
066
920
740
142
960
027
603
200
162
218
660
036
845
658
047
074
669
404
823
237
745
453
111
222
434
923
990
217
588
124
679
944
084
598
534
912
775
538
672
417
412
406
442
076
986
567
251
402
271
267
184
721
738
202
630
940
265
223
305
349
104
786
374
198
822


In [15]:
# print(zip2data["190"])
# rows = []
# for ind, row in full_data.iloc[:5, :].iterrows():
#     rows.append(row.append(pd.Series({k:v for k,v in zip2data[row.zip_code].items() if k != 'polygon'})))
# pd.DataFrame(rows)

In [16]:
rows = []
count = 0
for ind, row in full_data_reduced.iterrows():
    count += 1
    if count % 10000 == 0:
        print(count)
    rows.append(row.append(pd.Series({k:v for k,v in zip2data[row.zip_code].items() if k != 'polygon'})))
full_data_reduced = pd.DataFrame(rows)
del rows

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000


In [59]:
# datetime.datetime(2016, 5, 1)
full_data_reduced['date'] = pd.to_datetime(full_data_reduced['date'])

In [60]:
full_data_reduced

Unnamed: 0,loan_amnt,date,title,dti,zip_code,state,emp_length,accepted,population,population_density,...,median_household_income,pop_male,pop_female,pop_white,pop_black,pop_indian_alaska,pop_asian,pop_pacific,pop_other,pop_multi
0,1800.0,2015-07-01,Debt Consolidation,15.12,913,CA,10+ years,1,1361160.0,1746.550501,...,76702.184211,0.496502,0.503498,0.627759,0.036603,0.005935,0.106151,0.001306,0.177122,0.045125
1,12000.0,2015-09-01,Debt Consolidation,26.78,850,AZ,2 years,1,1320630.0,2509.405876,...,41769.607843,0.502855,0.497145,0.652965,0.063495,0.023393,0.029759,0.001745,0.192361,0.036282
2,14000.0,2016-07-01,Credit Card,30.53,581,ND,3 years,1,105474.0,1481.397192,...,49553.666667,0.503887,0.496113,0.903351,0.026793,0.013747,0.028860,0.000408,0.006134,0.020707
3,15000.0,2015-02-01,Debt Consolidation,11.70,060,CT,10+ years,1,672732.0,582.703536,...,82044.388889,0.489501,0.510499,0.818194,0.081199,0.001981,0.038769,0.000253,0.036695,0.022910
4,6000.0,2018-11-01,Other,21.41,812,CO,10+ years,1,81252.0,11.822466,...,40065.904762,0.558669,0.442303,0.914427,0.027039,0.014978,0.006055,0.000345,0.020209,0.017920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2000.0,2017-06-01,Other,9.37,700,LA,< 1 year,0,566173.0,506.455909,...,48799.119048,0.488469,0.511531,0.610978,0.294311,0.005044,0.032089,0.000457,0.036173,0.020948
999996,5000.0,2015-10-01,Car,10.83,949,CA,< 1 year,0,368717.0,546.098281,...,73696.758621,0.484057,0.516084,0.803855,0.018627,0.006368,0.051931,0.002370,0.072804,0.044186
999997,6000.0,2018-11-01,Debt Consolidation,100.00,362,AL,< 1 year,0,185623.0,74.676121,...,39556.814815,0.485064,0.514936,0.777797,0.182057,0.004202,0.005689,0.000609,0.014319,0.015327
999998,10000.0,2016-04-01,Credit Card,25.41,281,NC,< 1 year,0,586988.0,232.661800,...,47695.526316,0.493835,0.506165,0.781064,0.148352,0.003671,0.015939,0.000412,0.033604,0.016958


In [18]:
from sklearn.linear_model import LogisticRegression

In [20]:
# import pickle
# full_data_reduced.to_csv("full_data_reduced.csv", index=False)
# with open('./zip2data', 'wb') as fout:
#     pickle.dump(zip2data, fout, pickle.HIGHEST_PROTOCOL)

In [64]:
rng = 50000
subset = full_data_reduced.iloc[500000 - rng: 500000 + rng, :]
subset.columns

Index(['loan_amnt', 'date', 'title', 'dti', 'zip_code', 'state', 'emp_length',
       'accepted', 'population', 'population_density', 'median_home_value',
       'median_household_income', 'pop_male', 'pop_female', 'pop_white',
       'pop_black', 'pop_indian_alaska', 'pop_asian', 'pop_pacific',
       'pop_other', 'pop_multi'],
      dtype='object')

In [62]:
pre_2016 = full_data_reduced[full_data_reduced['date'] <= datetime.datetime(2016, 5, 9)]
post_2016 = full_data_reduced[full_data_reduced['date'] > datetime.datetime(2016, 5, 9)]
print(pre_2016.shape)
print(post_2016.shape)


(331429, 21)
(668571, 21)


In [None]:
%%R -i subset
plot(probs ~ names(probs), col = 'blue', pch = 19, main = "Probability of Search by Age", xlab = "Age in Years", ylab = "Prob of Being Searched")

In [65]:
%%R -i subset -o m1
m1 <- glm(accepted ~ . - zip_code - state - date, data = subset, family = binomial)
summary(m1)


Call:
glm(formula = accepted ~ . - zip_code - state - date, family = binomial, 
    data = subset)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.1444  -0.4479   0.0521   0.2953   7.0508  

Coefficients: (1 not defined because of singularities)
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)             -3.727e+00  8.301e-01  -4.489 7.14e-06 ***
loan_amnt                1.984e-05  1.147e-06  17.306  < 2e-16 ***
titleCar                -6.489e-01  1.207e-01  -5.377 7.56e-08 ***
titleCredit Card         1.729e+00  9.932e-02  17.412  < 2e-16 ***
titleDebt Consolidation  1.361e+00  9.699e-02  14.031  < 2e-16 ***
titleHome                8.089e-01  1.030e-01   7.853 4.06e-15 ***
titleMedical             2.839e-01  1.278e-01   2.221  0.02633 *  
titleOther               2.815e-01  1.001e-01   2.812  0.00492 ** 
dti                     -2.317e-02  7.101e-04 -32.633  < 2e-16 ***
emp_length1 year         4.349e+00  5.544e-02  78.446  < 2e-

In [66]:
subset

Unnamed: 0,loan_amnt,date,title,dti,zip_code,state,emp_length,accepted,population,population_density,...,median_household_income,pop_male,pop_female,pop_white,pop_black,pop_indian_alaska,pop_asian,pop_pacific,pop_other,pop_multi
450000,12000.0,2017-09-01,Debt Consolidation,11.72,774,TX,10+ years,1,1002396.0,213.741833,...,65504.750000,0.492909,0.507091,0.615333,0.154739,0.004863,0.114502,0.000439,0.081425,0.028699
450001,22775.0,2017-07-01,Credit Card,22.48,200,DC,10+ years,1,593412.0,10037.794257,...,73992.956522,0.470953,0.529047,0.378862,0.513158,0.003451,0.034433,0.000487,0.040913,0.028697
450002,10000.0,2015-11-01,Debt Consolidation,24.13,894,NV,9 years,1,277600.0,27.224871,...,53119.520000,0.503790,0.496210,0.807172,0.016153,0.021855,0.030605,0.003775,0.084103,0.036336
450003,18000.0,2017-02-01,Debt Consolidation,24.80,441,OH,10+ years,1,1213620.0,2946.708008,...,46290.659574,0.474281,0.525719,0.620941,0.311108,0.002060,0.025760,0.000219,0.018837,0.021075
450004,14400.0,2016-11-01,Home,10.17,721,AR,1 year,1,311399.0,74.421393,...,40840.437500,0.490140,0.510217,0.776020,0.175852,0.004897,0.007717,0.000543,0.016676,0.018651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549995,30000.0,2016-09-01,Debt Consolidation,68.16,322,FL,5 years,0,899059.0,1042.239172,...,45730.857143,0.487768,0.516801,0.624358,0.285049,0.003884,0.041221,0.000896,0.020834,0.028326
549996,10000.0,2018-03-01,Debt Consolidation,16.73,069,CT,< 1 year,0,122486.0,3366.428070,...,78150.142857,0.492505,0.507495,0.649086,0.139445,0.003209,0.078931,0.000702,0.097146,0.031481
549997,5000.0,2017-12-01,Other,25.84,329,FL,< 1 year,0,614397.0,508.272386,...,48403.187500,0.489835,0.512154,0.833785,0.097312,0.003573,0.019821,0.000837,0.022041,0.024621
549998,30000.0,2018-11-01,Debt Consolidation,39.64,301,GA,< 1 year,0,1178367.0,334.533373,...,52469.148148,0.487232,0.512768,0.731820,0.188758,0.003378,0.015953,0.000608,0.037345,0.022140


In [71]:
# Graphing
mycoordslist = [list(x.exterior.coords) for x in zip2data["423"]['polygon'].geoms]
# zip2data["423"]['polygon']

35