# Linear Regression 

In [11]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from mypipes import *

In [12]:
train_file=r'../Data/loan_data_train.csv'
test_file=r'../Data/loan_data_test.csv'

ld_train=pd.read_csv(train_file)
ld_test=pd.read_csv(test_file)               


In [13]:
ld_train.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,79542.0,25000,25000.0,18.49%,60 months,debt_consolidation,27.56%,VA,MORTGAGE,8606.56,720-724,11,15210,3.0,5 years
1,75473.0,19750,19750.0,17.27%,60 months,debt_consolidation,13.39%,NY,MORTGAGE,6737.5,710-714,14,19070,3.0,4 years
2,67265.0,2100,2100.0,14.33%,36 months,major_purchase,3.50%,LA,OWN,1000.0,690-694,13,893,1.0,< 1 year
3,80167.0,28000,28000.0,16.29%,36 months,credit_card,19.62%,NV,MORTGAGE,7083.33,710-714,12,38194,1.0,10+ years
4,17240.0,24250,17431.82,12.23%,60 months,credit_card,23.79%,OH,MORTGAGE,5833.33,730-734,6,31061,2.0,10+ years


In [14]:
ld_test.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,20093,5000,5000,60 months,moving,12.59%,NY,RENT,4416.67,690-694,13,7686,0,< 1 year
1,62445,18000,18000,60 months,debt_consolidation,4.93%,CA,RENT,5258.5,710-714,6,11596,0,10+ years
2,65248,7200,7200,60 months,debt_consolidation,25.16%,LA,MORTGAGE,3750.0,750-754,13,7283,0,6 years
3,81822,7200,7200,36 months,debt_consolidation,17.27%,NY,MORTGAGE,3416.67,790-794,14,4838,0,10+ years
4,57923,22000,22000,60 months,debt_consolidation,18.28%,MI,MORTGAGE,6083.33,720-724,9,20181,0,8 years


In [15]:
ld_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              2199 non-null   float64
 1   Amount.Requested                2199 non-null   object 
 2   Amount.Funded.By.Investors      2199 non-null   object 
 3   Interest.Rate                   2200 non-null   object 
 4   Loan.Length                     2199 non-null   object 
 5   Loan.Purpose                    2199 non-null   object 
 6   Debt.To.Income.Ratio            2199 non-null   object 
 7   State                           2199 non-null   object 
 8   Home.Ownership                  2199 non-null   object 
 9   Monthly.Income                  2197 non-null   float64
 10  FICO.Range                      2200 non-null   object 
 11  Open.CREDIT.Lines               2196 non-null   object 
 12  Revolving.CREDIT.Balance        21

In [16]:
ld_train.sample(10)

# drop columns : Amount.Funded.By.Investors , ID, Interest.Rate

#1 Amount requested : convert it to numeric

#2 Loan Length : create dummies with frequency cutoff 20

#3 Loan.Purpose : dummies with freq cutoff

#4  Debt.To.Income.Ratio : remove % and then convert to numeric

#5  State: dummies with frequency cutoff

#6 Home.Ownership : dummies with frequency cutoff

#7 Monthly Income : as is

#8 FICO.Range : break a-b , in to a ,b , convert them to numeric 
# then create new column fico=0.5*(a+b) and then drop the original FICO.Range

#9 Open Credit Lines : convert to numeric

#10 Revolving Credit balance : convert it to numeric 

#11 inquiries in the last 6 months : as is

#12 Employment.Length : create dummies with frequency cutoff

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
1141,48009.0,5000,5000.0,7.62%,36 months,debt_consolidation,3.04%,NY,OWN,2500.0,735-739,5,4864,0.0,3 years
524,18362.0,15000,14950.0,16.32%,60 months,debt_consolidation,7.53%,MA,MORTGAGE,14000.0,685-689,14,5469,4.0,5 years
1872,21993.0,7450,7450.0,13.06%,36 months,debt_consolidation,3.30%,WA,MORTGAGE,5883.33,710-714,2,6416,1.0,10+ years
45,34144.0,21000,21000.0,14.27%,60 months,debt_consolidation,11.27%,GA,MORTGAGE,5333.33,710-714,8,19431,0.0,8 years
135,55803.0,14400,14400.0,10.74%,36 months,debt_consolidation,5.90%,CA,MORTGAGE,6833.33,700-704,15,10017,0.0,10+ years
599,587.0,11000,2116.76,13.87%,36 months,debt_consolidation,20.93%,NY,OWN,4167.0,670-674,14,17436,0.0,2 years
2022,30986.0,13800,13800.0,17.49%,60 months,medical,1.68%,NY,RENT,4166.67,710-714,3,2766,0.0,8 years
574,9115.0,16750,16445.93,9.88%,36 months,credit_card,10.25%,IL,MORTGAGE,3338.0,750-754,10,18696,1.0,5 years
2168,17218.0,7325,7300.0,17.80%,60 months,debt_consolidation,20.37%,NY,OWN,3000.0,680-684,6,13583,0.0,5 years
621,95827.0,5600,5600.0,11.14%,36 months,credit_card,27.96%,NC,MORTGAGE,11666.67,665-669,18,194205,0.0,10+ years


In [17]:
# Feature-preparation pipelines for the loan data. Each pdPipeline picks a group of
# columns, applies the treatment planned for that group, and finishes with the
# missing-value step (DataFrameImputer from mypipes).

# Count/amount columns that pandas read as strings (object dtype): cast to numeric.
p1=pdPipeline([
    ('var_select',VarSelector(['Amount.Requested','Open.CREDIT.Lines','Revolving.CREDIT.Balance'])),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

# Debt.To.Income.Ratio is stored like '27.56%': strip the '%' sign, then cast to numeric.
p2=pdPipeline([
    ('var_select',VarSelector(['Debt.To.Income.Ratio'])),
    ('string_clean',string_clean(replace_it='%',replace_with='')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

# Categorical columns: dummy variables with a frequency cutoff of 20.
p3=pdPipeline([
    ('var_select',VarSelector(['Loan.Length', 'Loan.Purpose','State','Home.Ownership','Employment.Length'])),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(20))
])

# Columns that are already numeric: used as they are.
p4=pdPipeline([
    ('var_select',VarSelector(['Monthly.Income','Inquiries.in.the.Last.6.Months'])),
    ('missing_trt',DataFrameImputer())
])

# FICO.Range is a string 'a-b'; custom_fico turns it into a single numeric 'fico'
# column (the transformed data shows 722.0 for the range '720-724').
p5=pdPipeline([
    ('var_select',VarSelector(['FICO.Range'])),
    ('custom_fico',custom_fico()),
    ('missing_trt',DataFrameImputer())
])

# Combine the five pipelines side by side; the names given here become the
# prefixes of the output columns (e.g. 'obj_to_num__Amount.Requested').
data_pipe=FeatureUnion([
    ('obj_to_num',p1),
    ('dtir',p2),
    ('obj_to_dum',p3),
    ('num',p4),
    ('fico',p5)
])

In [18]:
# Fit the feature pipelines on the training data and wrap the result in a DataFrame
# whose columns carry the generated feature names.
# NOTE(review): FeatureUnion.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 (successor: get_feature_names_out()). Confirm the installed version and
# whether the mypipes transformers support the newer method before upgrading.
x_train=pd.DataFrame(data=data_pipe.fit_transform(ld_train),
                     columns=data_pipe.get_feature_names())

In [19]:
x_train.shape

(2200, 60)

In [20]:
x_test=pd.DataFrame(data=data_pipe.transform(ld_test),
                     columns=data_pipe.get_feature_names())

In [21]:
x_test.shape

(300, 60)

In [22]:
x_train.head()

Unnamed: 0,obj_to_num__Amount.Requested,obj_to_num__Open.CREDIT.Lines,obj_to_num__Revolving.CREDIT.Balance,dtir__Debt.To.Income.Ratio,obj_to_dum__Loan.Length_36 months,obj_to_dum__Loan.Length_60 months,obj_to_dum__Loan.Purpose_debt_consolidation,obj_to_dum__Loan.Purpose_credit_card,obj_to_dum__Loan.Purpose_other,obj_to_dum__Loan.Purpose_home_improvement,...,obj_to_dum__Employment.Length_4 years,obj_to_dum__Employment.Length_1 year,obj_to_dum__Employment.Length_6 years,obj_to_dum__Employment.Length_7 years,obj_to_dum__Employment.Length_8 years,obj_to_dum__Employment.Length_missing,obj_to_dum__Employment.Length_9 years,num__Monthly.Income,num__Inquiries.in.the.Last.6.Months,fico__fico
0,25000.0,11.0,15210.0,27.56,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8606.56,3.0,722.0
1,19750.0,14.0,19070.0,13.39,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6737.5,3.0,712.0
2,2100.0,13.0,893.0,3.5,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1.0,692.0
3,28000.0,12.0,38194.0,19.62,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7083.33,1.0,712.0
4,24250.0,6.0,31061.0,23.79,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5833.33,2.0,732.0


In [23]:
x_test.head()

Unnamed: 0,obj_to_num__Amount.Requested,obj_to_num__Open.CREDIT.Lines,obj_to_num__Revolving.CREDIT.Balance,dtir__Debt.To.Income.Ratio,obj_to_dum__Loan.Length_36 months,obj_to_dum__Loan.Length_60 months,obj_to_dum__Loan.Purpose_debt_consolidation,obj_to_dum__Loan.Purpose_credit_card,obj_to_dum__Loan.Purpose_other,obj_to_dum__Loan.Purpose_home_improvement,...,obj_to_dum__Employment.Length_4 years,obj_to_dum__Employment.Length_1 year,obj_to_dum__Employment.Length_6 years,obj_to_dum__Employment.Length_7 years,obj_to_dum__Employment.Length_8 years,obj_to_dum__Employment.Length_missing,obj_to_dum__Employment.Length_9 years,num__Monthly.Income,num__Inquiries.in.the.Last.6.Months,fico__fico
0,5000.0,13.0,7686.0,12.59,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4416.67,0.0,692.0
1,18000.0,6.0,11596.0,4.93,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5258.5,0.0,712.0
2,7200.0,13.0,7283.0,25.16,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3750.0,0.0,752.0
3,7200.0,14.0,4838.0,17.27,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3416.67,0.0,792.0
4,22000.0,9.0,20181.0,18.28,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6083.33,0.0,722.0


In [24]:
y_train=ld_train['Interest.Rate'].str.replace('%','').astype(float)

In [26]:
# 1: Importing necessary Library
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [27]:
#2. Instantiate the object of that class
lm=LinearRegression()

In [41]:
cv_mae=-cross_val_score(lm, # model object
                        x_train,y_train, # X train, y train
                        cv=10, # number of rounds
                        scoring='neg_mean_absolute_error')

cv_mae

array([1.74273801, 1.76502624, 1.77011354, 1.65406589, 1.43720296,
       1.63540955, 1.44255408, 1.58043799, 1.52220714, 1.65823135])

In [40]:
# Same 10-fold cross-validation as for the MAE, scored with RMSE instead.
# The result is kept under its own name (cv_rmse) so that cv_mae, which the
# mean/std cells rely on, is not overwritten with values of a different metric.
cv_rmse=-cross_val_score(lm, # model object
                         x_train,y_train, # X train, y train
                         cv=10, # number of rounds
                         scoring='neg_root_mean_squared_error')

cv_rmse

array([2.18201412, 2.29075149, 2.2564944 , 2.08421269, 1.85528335,
       2.08466811, 1.86042481, 2.08053274, 2.0156708 , 2.04758367])

In [42]:
cv_mae.mean()

1.6207986760157334

In [43]:
cv_mae.std()

0.11726675023085202

In [44]:
# 3: Fitting on the Train Data
lm.fit(x_train,y_train)

LinearRegression()

In [45]:
lm.intercept_

73.16432302812701

In [46]:
lm.coef_

array([ 1.60913878e-04, -3.72164153e-02, -3.05844966e-06,  1.88427404e-04,
        1.30627284e+00,  4.47755390e+00, -7.84079703e-01, -8.91888331e-01,
        3.20936459e-02, -6.95275775e-01, -4.22744797e-01, -2.64224120e-01,
       -5.41274155e-01, -9.69555836e-01, -5.98574359e-01,  7.21470875e-01,
       -1.99766458e-01, -1.88645568e-01, -7.45527597e-02,  4.73923022e-01,
       -4.68212871e-01, -5.71694597e-01, -1.79076702e-01, -3.78950003e-01,
       -1.57191036e-01, -1.08257898e-01, -4.96139885e-01, -1.99177682e-01,
       -7.78964606e-02,  4.10456351e-02, -2.67193758e-01,  4.08575782e-01,
        3.80179542e-01, -9.81649463e-02, -6.21920499e-02, -1.77735228e-01,
       -3.50604056e-01, -2.33745724e-01, -3.92591988e-02,  1.10048502e-02,
        1.87183988e-02,  1.85782841e-01, -2.39881640e+00, -2.17035525e+00,
       -2.06526514e+00,  5.26184321e-01,  3.52218686e-01,  2.87554381e-01,
        1.51007910e-01,  5.83377246e-01,  4.22222352e-01,  2.41210867e-01,
        3.88741640e-01,  

In [51]:
list(x_train.columns)

['obj_to_num__Amount.Requested',
 'obj_to_num__Open.CREDIT.Lines',
 'obj_to_num__Revolving.CREDIT.Balance',
 'dtir__Debt.To.Income.Ratio',
 'obj_to_dum__Loan.Length_36 months',
 'obj_to_dum__Loan.Length_60 months',
 'obj_to_dum__Loan.Purpose_debt_consolidation',
 'obj_to_dum__Loan.Purpose_credit_card',
 'obj_to_dum__Loan.Purpose_other',
 'obj_to_dum__Loan.Purpose_home_improvement',
 'obj_to_dum__Loan.Purpose_major_purchase',
 'obj_to_dum__Loan.Purpose_small_business',
 'obj_to_dum__Loan.Purpose_car',
 'obj_to_dum__Loan.Purpose_wedding',
 'obj_to_dum__Loan.Purpose_medical',
 'obj_to_dum__Loan.Purpose_moving',
 'obj_to_dum__State_CA',
 'obj_to_dum__State_NY',
 'obj_to_dum__State_FL',
 'obj_to_dum__State_TX',
 'obj_to_dum__State_PA',
 'obj_to_dum__State_IL',
 'obj_to_dum__State_GA',
 'obj_to_dum__State_NJ',
 'obj_to_dum__State_VA',
 'obj_to_dum__State_MA',
 'obj_to_dum__State_NC',
 'obj_to_dum__State_OH',
 'obj_to_dum__State_MD',
 'obj_to_dum__State_CO',
 'obj_to_dum__State_WA',
 'obj_to_

In [47]:
list(zip(x_train.columns,lm.coef_))

[('obj_to_num__Amount.Requested', 0.00016091387809670106),
 ('obj_to_num__Open.CREDIT.Lines', -0.03721641529315642),
 ('obj_to_num__Revolving.CREDIT.Balance', -3.0584496637280838e-06),
 ('dtir__Debt.To.Income.Ratio', 0.0001884274038879974),
 ('obj_to_dum__Loan.Length_36 months', 1.3062728365133434),
 ('obj_to_dum__Loan.Length_60 months', 4.47755390330456),
 ('obj_to_dum__Loan.Purpose_debt_consolidation', -0.7840797027427531),
 ('obj_to_dum__Loan.Purpose_credit_card', -0.8918883314528003),
 ('obj_to_dum__Loan.Purpose_other', 0.032093645937298465),
 ('obj_to_dum__Loan.Purpose_home_improvement', -0.6952757746474031),
 ('obj_to_dum__Loan.Purpose_major_purchase', -0.42274479675892296),
 ('obj_to_dum__Loan.Purpose_small_business', -0.26422412044799104),
 ('obj_to_dum__Loan.Purpose_car', -0.5412741545888301),
 ('obj_to_dum__Loan.Purpose_wedding', -0.9695558362069793),
 ('obj_to_dum__Loan.Purpose_medical', -0.5985743591789386),
 ('obj_to_dum__Loan.Purpose_moving', 0.721470874850843),
 ('obj_to

In [48]:
# 4: Predicting on Test Data
test_pred=lm.predict(x_test)

In [50]:
x_test.head()

Unnamed: 0,obj_to_num__Amount.Requested,obj_to_num__Open.CREDIT.Lines,obj_to_num__Revolving.CREDIT.Balance,dtir__Debt.To.Income.Ratio,obj_to_dum__Loan.Length_36 months,obj_to_dum__Loan.Length_60 months,obj_to_dum__Loan.Purpose_debt_consolidation,obj_to_dum__Loan.Purpose_credit_card,obj_to_dum__Loan.Purpose_other,obj_to_dum__Loan.Purpose_home_improvement,...,obj_to_dum__Employment.Length_4 years,obj_to_dum__Employment.Length_1 year,obj_to_dum__Employment.Length_6 years,obj_to_dum__Employment.Length_7 years,obj_to_dum__Employment.Length_8 years,obj_to_dum__Employment.Length_missing,obj_to_dum__Employment.Length_9 years,num__Monthly.Income,num__Inquiries.in.the.Last.6.Months,fico__fico
0,5000.0,13.0,7686.0,12.59,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4416.67,0.0,692.0
1,18000.0,6.0,11596.0,4.93,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5258.5,0.0,712.0
2,7200.0,13.0,7283.0,25.16,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3750.0,0.0,752.0
3,7200.0,14.0,4838.0,17.27,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3416.67,0.0,792.0
4,22000.0,9.0,20181.0,18.28,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6083.33,0.0,722.0


In [49]:
test_pred

array([16.73641244, 15.9822577 , 10.41724451,  3.71534168, 15.21108834,
        7.0351977 , 15.57246453, 10.58352597, 15.94955183, 12.38355726,
        9.9366698 , 15.17209087, 11.53730446, 13.67752736, 13.40134753,
       18.33363766, 10.18969518, 15.79040256, 13.37967692, 14.03563066,
       22.50430604, 17.50946104, 12.12105201, 14.6034318 ,  9.68060525,
       11.39039568, 13.33566534, 19.0210008 , 11.80641539, 16.88384891,
       15.0680717 , 15.29064393, 12.35705772, 15.07255624, 13.78562695,
       14.07528779, 19.52106736, 11.3554855 , 12.00458203, 16.82962478,
       14.15907615, 11.12807774, 14.8967374 , 13.17699098, 15.55418833,
       17.04873439, 14.90876743, 19.10551152, 17.0897673 , 10.33940429,
       13.57931315, 19.75300247, 10.02009378, 19.49728229, 15.57850152,
       15.06801067, 17.10315371, 14.53133523, 10.93874049, 14.48472924,
       13.05230477, 17.09882657,  8.84706185, 14.56313892, 10.50916007,
       11.43247185, 12.79734112, 14.90266567, 12.05952277, 14.46

In [None]:
# 5. Evaluating Performance

In [52]:
x_test.head()

Unnamed: 0,obj_to_num__Amount.Requested,obj_to_num__Open.CREDIT.Lines,obj_to_num__Revolving.CREDIT.Balance,dtir__Debt.To.Income.Ratio,obj_to_dum__Loan.Length_36 months,obj_to_dum__Loan.Length_60 months,obj_to_dum__Loan.Purpose_debt_consolidation,obj_to_dum__Loan.Purpose_credit_card,obj_to_dum__Loan.Purpose_other,obj_to_dum__Loan.Purpose_home_improvement,...,obj_to_dum__Employment.Length_4 years,obj_to_dum__Employment.Length_1 year,obj_to_dum__Employment.Length_6 years,obj_to_dum__Employment.Length_7 years,obj_to_dum__Employment.Length_8 years,obj_to_dum__Employment.Length_missing,obj_to_dum__Employment.Length_9 years,num__Monthly.Income,num__Inquiries.in.the.Last.6.Months,fico__fico
0,5000.0,13.0,7686.0,12.59,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4416.67,0.0,692.0
1,18000.0,6.0,11596.0,4.93,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5258.5,0.0,712.0
2,7200.0,13.0,7283.0,25.16,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3750.0,0.0,752.0
3,7200.0,14.0,4838.0,17.27,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3416.67,0.0,792.0
4,22000.0,9.0,20181.0,18.28,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6083.33,0.0,722.0


In [54]:
# Show the predictions next to the test features WITHOUT adding a column to x_test.
# x_test has to keep exactly the feature columns the models were fitted on, because
# it is passed to predict() again further down (grid_search.predict(x_test)); an
# extra 'Pred_InterestRate' column would make that call fail on a feature-count mismatch.
x_test.assign(Pred_InterestRate=test_pred).head()

Unnamed: 0,obj_to_num__Amount.Requested,obj_to_num__Open.CREDIT.Lines,obj_to_num__Revolving.CREDIT.Balance,dtir__Debt.To.Income.Ratio,obj_to_dum__Loan.Length_36 months,obj_to_dum__Loan.Length_60 months,obj_to_dum__Loan.Purpose_debt_consolidation,obj_to_dum__Loan.Purpose_credit_card,obj_to_dum__Loan.Purpose_other,obj_to_dum__Loan.Purpose_home_improvement,...,obj_to_dum__Employment.Length_1 year,obj_to_dum__Employment.Length_6 years,obj_to_dum__Employment.Length_7 years,obj_to_dum__Employment.Length_8 years,obj_to_dum__Employment.Length_missing,obj_to_dum__Employment.Length_9 years,num__Monthly.Income,num__Inquiries.in.the.Last.6.Months,fico__fico,Pred_InterestRate
0,5000.0,13.0,7686.0,12.59,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4416.67,0.0,692.0,16.736412
1,18000.0,6.0,11596.0,4.93,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5258.5,0.0,712.0,15.982258
2,7200.0,13.0,7283.0,25.16,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,3750.0,0.0,752.0,10.417245
3,7200.0,14.0,4838.0,17.27,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3416.67,0.0,792.0,3.715342
4,22000.0,9.0,20181.0,18.28,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,6083.33,0.0,722.0,15.211088


We can write these predictions to a CSV file for submission like this:

In [None]:
pd.DataFrame(test_pred).to_csv("mysubmission.csv",index=False)

In [None]:
import os
os.getcwd()

In [55]:
# Predicting on the training data itself (used below to inspect the in-sample errors)
train_pred=lm.predict(x_train)
train_pred

array([16.84593129, 16.59545104, 12.6931631 , ..., 19.27772835,
       19.77684851, 10.43719438])

In [56]:
ld_train['Actual_InterestRate'] = y_train
ld_train['Predicted_InterestRate'] = train_pred

In [58]:
ld_train['Error'] = y_train - train_pred
ld_train['Error_Squared'] = ld_train['Error'] * ld_train['Error']
ld_train

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,Actual_InterestRate,Predicted_InterestRate,Error,Error_Squared
0,79542.0,25000,25000,18.49%,60 months,debt_consolidation,27.56%,VA,MORTGAGE,8606.56,720-724,11,15210,3.0,5 years,18.49,16.845931,1.644069,2.702962
1,75473.0,19750,19750,17.27%,60 months,debt_consolidation,13.39%,NY,MORTGAGE,6737.50,710-714,14,19070,3.0,4 years,17.27,16.595451,0.674549,0.455016
2,67265.0,2100,2100,14.33%,36 months,major_purchase,3.50%,LA,OWN,1000.00,690-694,13,893,1.0,< 1 year,14.33,12.693163,1.636837,2.679235
3,80167.0,28000,28000,16.29%,36 months,credit_card,19.62%,NV,MORTGAGE,7083.33,710-714,12,38194,1.0,10+ years,16.29,14.037159,2.252841,5.075292
4,17240.0,24250,17431.82,12.23%,60 months,credit_card,23.79%,OH,MORTGAGE,5833.33,730-734,6,31061,2.0,10+ years,12.23,15.526528,-3.296528,10.867098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,74047.0,30000,30000,23.28%,60 months,other,12.10%,IL,MORTGAGE,7083.33,675-679,16,17969,1.0,10+ years,23.28,21.052664,2.227336,4.961024
2196,63768.0,4200,4200,14.33%,60 months,car,14.16%,NJ,RENT,3850.00,700-704,4,5718,0.0,2 years,14.33,14.581808,-0.251808,0.063407
2197,94545.0,19800,19775,15.31%,60 months,debt_consolidation,15.03%,IL,MORTGAGE,6666.67,675-679,10,46879,3.0,6 years,15.31,19.277728,-3.967728,15.742868
2198,53635.0,18000,18000,20.99%,60 months,credit_card,11.63%,CA,RENT,9051.83,670-674,5,32394,2.0,4 years,20.99,19.776849,1.213151,1.471737


In [60]:
#RMSE:
np.sqrt(np.mean(ld_train['Error_Squared']))

2.014365842694063

In [57]:
ld_train.head(10)

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,Actual_InterestRate,Predicted_InterestRate
0,79542.0,25000,25000.0,18.49%,60 months,debt_consolidation,27.56%,VA,MORTGAGE,8606.56,720-724,11,15210,3.0,5 years,18.49,16.845931
1,75473.0,19750,19750.0,17.27%,60 months,debt_consolidation,13.39%,NY,MORTGAGE,6737.5,710-714,14,19070,3.0,4 years,17.27,16.595451
2,67265.0,2100,2100.0,14.33%,36 months,major_purchase,3.50%,LA,OWN,1000.0,690-694,13,893,1.0,< 1 year,14.33,12.693163
3,80167.0,28000,28000.0,16.29%,36 months,credit_card,19.62%,NV,MORTGAGE,7083.33,710-714,12,38194,1.0,10+ years,16.29,14.037159
4,17240.0,24250,17431.82,12.23%,60 months,credit_card,23.79%,OH,MORTGAGE,5833.33,730-734,6,31061,2.0,10+ years,12.23,15.526528
5,32737.0,5400,5375.0,8.90%,36 months,small_business,6.27%,OH,RENT,3000.0,785-789,2,90,2.0,,8.9,5.83553
6,71685.0,16000,16000.0,22.47%,60 months,credit_card,11.27%,CA,RENT,4791.67,665-669,5,8474,0.0,2 years,22.47,19.26316
7,38028.0,6400,6400.0,7.51%,36 months,major_purchase,9.58%,IL,RENT,6400.0,735-739,11,12306,1.0,< 1 year,7.51,8.718899
8,32119.0,24000,23772.15,12.99%,60 months,home_improvement,15.54%,NC,MORTGAGE,8333.33,740-744,24,56984,0.0,,12.99,13.128315
9,84563.0,8000,8000.0,7.62%,36 months,other,0%,NC,MORTGAGE,3500.0,765-769,7,0,1.0,,7.62,7.216358


# Ridge  Regression

In [None]:
from sklearn.linear_model import Ridge,Lasso

from sklearn.model_selection import GridSearchCV



In [None]:
lambdas=np.linspace(1,100,100)

In [None]:
lambdas

In [None]:
params={'alpha':lambdas}

In [None]:
model=Ridge(fit_intercept=True)

In [None]:
grid_search=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring='neg_mean_absolute_error',
                        verbose=20,n_jobs=-1)

In [None]:
grid_search.fit(x_train,y_train)

In [None]:
grid_search.best_estimator_

In [None]:
grid_search.cv_results_

If you want, you can now fit a ridge regression model with the obtained value of alpha, although there is no need: grid search automatically refits the best estimator on the entire training data, so you can use it directly to make predictions on the test data. If you want to look at the coefficients, however, it is much more convenient to fit the model directly.

Using the report function given below, you can also see the cross-validated performance of the top few models; this gives a tentative estimate of the performance to expect.

In [None]:
def report(results, n_top=3):
    """Print a short summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (for example the ``cv_results_``
        attribute of a fitted GridSearchCV). 'rank_test_score' has to support
        element-wise comparison with an integer (e.g. a NumPy array).
    n_top : int, default 3
        Ranks 1..n_top are reported. Candidates sharing a rank are all printed.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate holding this rank (more than one on ties,
        # none if the rank was skipped because of an earlier tie).
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][pos]))
            print("")

In [None]:
report(grid_search.cv_results_,5)

In [None]:
test_pred=grid_search.predict(x_test)

In [None]:
pd.DataFrame(test_pred).to_csv("mysubmission.csv",index=False)

## For looking at coefficients

In [None]:
grid_search.best_estimator_

In [None]:
ridge_model=grid_search.best_estimator_

In [None]:
ridge_model.fit(x_train,y_train)

In [None]:
list(zip(data_pipe.get_feature_names(),ridge_model.coef_))

In [None]:
lm.coef_/ridge_model.coef_

## Lasso Regression

In [None]:
lambdas=np.linspace(1,10,100)

model=Lasso(fit_intercept=True)

params={'alpha':lambdas}

In [None]:
grid_search=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring='neg_mean_absolute_error',
                        verbose=20,n_jobs=-1)

In [None]:
grid_search.fit(x_train,y_train)

In [None]:
grid_search.best_estimator_

You can see that the best value of alpha lies at the edge of the range that we tried; we should expand the trial range on that side and run the search again.

In [None]:
lambdas=np.linspace(.001,2,100)

params={'alpha':lambdas}

In [None]:
grid_search=GridSearchCV(model,param_grid=params,cv=10,scoring='neg_mean_absolute_error')
grid_search.fit(x_train,y_train)

In [None]:
grid_search.best_estimator_

In [None]:
report(grid_search.cv_results_,5)

In [None]:
lasso_model=grid_search.best_estimator_

In [None]:
lasso_model.fit(x_train,y_train)

In [None]:
lasso_model.intercept_

In [None]:
list(zip(data_pipe.get_feature_names(),lasso_model.coef_))


In [None]:
(lasso_model.coef_==0).sum()


# Logistic Regression

In [None]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from mypipes import *

In [None]:
train_file=r'~/Dropbox/0.0 Data/rg_train.csv'
test_file=r'~/Dropbox/0.0 Data/rg_test.csv'
bd_train=pd.read_csv(train_file)

bd_test=pd.read_csv(test_file)

In [None]:
bd_train.head()

In [None]:
bd_train['family_income'].value_counts(dropna=False)

In [None]:
# drop : REF_NO, post_area , post_code,Revenue.Grid 
# children : convert zero:0 and 4+: 4 and then convert to numeric 
# age_band : 71+ : 71, Unknown: NA, rest: split and average
# status, occupation, occupation_partner,home_status: create dummies with freq cutoff
# family_income : remove [,>=], 35000:35000, 4000: 4000, unknown:NA, rest : split then avg
# self_employed, self_employed_partner : dummies 
# year_last_moved : keep as is 
# TVarea : dummies 
# 'Average.Credit.Card.Transaction', 'Balance.Transfer',
#       'Term.Deposit', 'Life.Insurance', 'Medical.Insurance',
#       'Average.A.C.Balance', 'Personal.Loan', 'Investment.in.Mutual.Fund',
#       'Investment.Tax.Saving.Bond', 'Home.Loan', 'Online.Purchase.Amount'
# 'Investment.in.Commudity',
#       'Investment.in.Equity', 'Investment.in.Derivative',
#      'Portfolio.Balance' : as is 
# gender , region : dummies 
    

In [None]:
bd_train.info()

In [None]:
num_vars=list(bd_train.select_dtypes(exclude=['object']).columns)

In [None]:
num_vars

In [None]:
num_vars=[_ for _ in num_vars if _ not in ['REF_NO','Revenue.Grid']]

In [None]:
num_vars

In [None]:
cat_vars=list(bd_train.select_dtypes(include=['object']).columns)

In [None]:
cat_vars

In [None]:
cat_vars=[_ for _ in cat_vars if _ not in 
          ['children','age_band', 'post_code','post_area','family_income']]

In [None]:
cat_vars

In [None]:
p1=pdPipeline([
    ('var_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

In [None]:
p2=pdPipeline([
    ('var_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(70))
])

In [None]:
# age_band: custom_age_band() converts the band strings to a numeric value
# (plan: '71+' -> 71, 'Unknown' -> missing, other bands -> average of the two ends).
# NOTE(review): the step is labelled 'custom_fico' although it runs custom_age_band();
# this looks like a copy-paste leftover - confirm nothing depends on the label before renaming.
p3=pdPipeline([
    ('var_select',VarSelector(['age_band'])),
    ('custom_fico',custom_age_band()),
    ('missing_trt',DataFrameImputer())
])

# family_income: custom_family_income() converts the income-range strings to a number.
# NOTE(review): same 'custom_fico' step label as above - presumably a leftover; confirm.
p4=pdPipeline([
    ('var_select',VarSelector(['family_income'])),
    ('custom_fico',custom_family_income()),
    ('missing_trt',DataFrameImputer())
])

# children: map 'Zero' -> '0' and '4+' -> '4', then cast the column to numeric.
# NOTE(review): if string_clean performs a regex-based replace, '4+' would be read as
# a pattern ("one or more 4s") rather than the literal text - confirm it is literal.
p5=pdPipeline([
    ('var_select',VarSelector(['children'])),
    ('string_clean1',string_clean(replace_it='Zero',replace_with='0')),
    ('string_clean2',string_clean(replace_it='4+',replace_with='4')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

In [None]:
data_pipe=FeatureUnion([
    ('num',p1),
    ('obj_to_dum',p2),
    ('age_band',p3),
    ('family_income',p4),
    ('children',p5)
])

In [None]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(bd_train),
                     columns=data_pipe.get_feature_names())


In [None]:
x_test=pd.DataFrame(data=data_pipe.transform(bd_test),
                     columns=data_pipe.get_feature_names())

In [None]:
bd_train['Revenue.Grid'].value_counts(dropna=False)

In [None]:
y_train=(bd_train['Revenue.Grid']==1).astype(int)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
x_train.head()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
params={'class_weight':['balanced',None],
        'penalty':['l1','l2'],
        'C':[.0001,.0005,.001,.005,.01,.05,.1,1,2,5]}

In [None]:
# The parameter grid searches over both 'l1' and 'l2' penalties. The liblinear solver
# supports both; the default solver (lbfgs in recent scikit-learn) rejects 'l1', so
# every 'l1' candidate would fail to fit during the search and, with warnings
# suppressed, silently drop out of the comparison.
model=LogisticRegression(fit_intercept=True,solver='liblinear')

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid_search=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring="roc_auc",
                         n_jobs=-1,
                         verbose=20)

In [None]:
grid_search.fit(x_train,y_train)

In [None]:
grid_search.best_estimator_

In [None]:
logr=grid_search.best_estimator_

In [None]:
report(grid_search.cv_results_,5)

In [None]:
LogisticRegression?

In [None]:
logr=LogisticRegression(fit_intercept=True,
                        **{'C': 0.005, 'class_weight': 'balanced', 'penalty': 'l1'},solver='liblinear')
# default solver lbfgs does not support l1 penalty for some versions of sklearn
# if you get an error like that , simply use solver='liblinear', it supports both l1 & l2 penalty


In [None]:
logr.fit(x_train,y_train)

In [None]:
(logr.coef_[0]==0).sum()

In [None]:
list(zip(x_train.columns,logr.coef_[0]))

In [None]:
logr.predict_proba(x_test)

In [None]:
logr.classes_

In [None]:
cutoffs=np.linspace(0.01,0.99,99)

cutoffs

In [None]:
logr.predict_proba(x_train)

In [None]:
logr.classes_

In [None]:
# predict_proba returns one column per class, in the order given by logr.classes_.
# Column index 1 is taken here as the score for outcome 1 (y_train is coded 0/1);
# check logr.classes_ to confirm which column belongs to which outcome.
train_score=logr.predict_proba(x_train)[:,1]
real=y_train


In [None]:
(train_score>0.2).astype(int)

In [None]:
# KS statistic on the training scores for every candidate cutoff:
# KS = (share of actual 1s flagged as 1) - (share of actual 0s flagged as 1).
actual_pos=(real==1)
actual_neg=(real==0)

# Class sizes do not depend on the cutoff (P = TP+FN, N = TN+FP).
n_pos=actual_pos.sum()
n_neg=actual_neg.sum()

KS_all=[]

for cutoff in cutoffs:
    # Observations scored strictly above the cutoff are predicted as class 1.
    flagged=train_score>cutoff

    true_pos_rate=(flagged & actual_pos).sum()/n_pos
    false_pos_rate=(flagged & actual_neg).sum()/n_neg

    KS_all.append(true_pos_rate-false_pos_rate)
    


In [None]:
list(zip(cutoffs,KS_all))

In [None]:
# Cutoff with the highest KS. np.argmax gives the position of the (first) maximum,
# so mycutoff is always a single scalar. Comparing the Python list KS_all with a
# scalar is fragile: it relies on NumPy's reflected == to become element-wise, and on
# ties it selects several cutoffs, which cannot be compared against the score vector.
mycutoff=cutoffs[np.argmax(KS_all)]
mycutoff

In [None]:
logr.intercept_

In [None]:
list(zip(x_train.columns,logr.coef_[0]))

If you simply had to submit probability scores, you could do this:

In [None]:
logr.predict_proba(x_test)

In [None]:
test_score=logr.predict_proba(x_test)[:,1]
test_score

In [None]:
(test_score>mycutoff).astype(int)

In [None]:
pd.DataFrame(test_score).to_csv("mysubmission.csv",index=False)

If you had to submit hard classes, you can apply the cutoff obtained above and then submit:

In [None]:
test_classes=(test_score>mycutoff).astype(int)

In [None]:
pd.DataFrame(test_classes).to_csv("mysubmission.csv",index=False)