## Regularized Linear Regression

In [2]:
# import libraries
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # apply seaborn's default theme to subsequent matplotlib/seaborn plots
%matplotlib inline

In [3]:
# Load the cleaned loans dataset.
# Cleaning steps are described at https://github.com/jeffCabrera0321/Simple_linear_regression
# NOTE(review): `cols` is not passed to read_csv in this cell — confirm whether later cells use it.
cols = [
    "ID", "IntRate", "LoanAmt", "LoanTerm", "Debt2Inc", "Home_OWN",
    "Home_RENT", "MonthlyInc", "RevCredBal", "FICO", "EmpLen",
]
# Read the CSV and discard the ID column in one step.
loan_data = pd.read_csv("../data/loansDataClean.csv").drop(columns=["ID"])

In [4]:
# Preview the first five rows of the loaded dataset
loan_data.head(n=5)

Unnamed: 0,IntRate,LoanAmt,LoanTerm,Debt2Inc,Home_OWN,Home_RENT,MonthlyInc,RevCredBal,FICO,EmpLen
0,8.9,20000,36,14.9,False,False,6541.67,14272.0,737,1
1,12.12,19200,36,28.36,False,False,4583.33,11140.0,717,2
2,21.98,35000,60,23.81,False,False,11500.0,21977.0,692,2
3,11.71,12000,36,18.78,False,True,3195.0,14469.0,697,9
4,15.31,6000,36,20.05,True,False,4891.67,10391.0,672,3


In [5]:
# Encode LoanTerm as a 0/1 indicator: 36 -> 0, every other term (i.e. 60) -> 1.
# An already-encoded 0 is also kept as 0, so re-running this cell leaves the
# column unchanged instead of turning every row into 1.
loan_data["LoanTerm"] = np.where(loan_data["LoanTerm"].isin([36, 0]), 0, 1)
loan_data["LoanTerm"].head()

0    0
1    0
2    1
3    0
4    0
Name: LoanTerm, dtype: int64

In [6]:
# Build the response as a one-column DataFrame (IntRate) and preview it
Y = loan_data.loc[:, ["IntRate"]]
Y.head(n=5)

Unnamed: 0,IntRate
0,8.9
1,12.12
2,21.98
3,11.71
4,15.31


In [7]:
# Select the continuous predictors that will be standardized, then preview them.
# The choice of these independent features is explained at
# https://github.com/jeffCabrera0321/Simple_linear_regression
X_std = loan_data.loc[:, ["LoanAmt", "Debt2Inc", "MonthlyInc", "FICO"]]
X_std.head(n=5)

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO
0,20000,14.9,6541.67,737
1,19200,28.36,4583.33,717
2,35000,23.81,11500.0,692
3,12000,18.78,3195.0,697
4,6000,20.05,4891.67,672


In [8]:
# Standardize the predictors and show the first five rows.
# The scaler is fitted first and then applied to the same data.
sc = StandardScaler()
sc.fit(X_std)
X_std = sc.transform(X_std)
X_std[0:5]

array([[ 1.1276208 , -0.1056352 ,  0.32047414,  0.97751917],
       [ 1.02010707,  1.65831149, -0.29431579,  0.37469848],
       [ 3.14350321,  1.06202936,  1.87706358, -0.37882739],
       [ 0.05248351,  0.40284275, -0.73016008, -0.22812221],
       [-0.75386945,  0.56927754, -0.19751732, -0.98164808]])

In [9]:
# Wrap the standardized array back into a labelled DataFrame and preview it
X_std = pd.DataFrame(
    data=X_std,
    columns=["LoanAmt", "Debt2Inc", "MonthlyInc", "FICO"],
)
X_std.head(n=5)

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO
0,1.127621,-0.105635,0.320474,0.977519
1,1.020107,1.658311,-0.294316,0.374698
2,3.143503,1.062029,1.877064,-0.378827
3,0.052484,0.402843,-0.73016,-0.228122
4,-0.753869,0.569278,-0.197517,-0.981648


In [12]:
# Assemble the full predictor frame: the standardized columns plus the two
# indicator columns, joined one after the other on the row index.
# NOTE(review): Home_RENT is joined as a boolean column (see the preview below) —
# confirm the downstream estimators accept it as-is.
X = X_std.join(loan_data["LoanTerm"]).join(loan_data["Home_RENT"])
X.head(n=5)

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO,LoanTerm,Home_RENT
0,1.127621,-0.105635,0.320474,0.977519,0,False
1,1.020107,1.658311,-0.294316,0.374698,0,False
2,3.143503,1.062029,1.877064,-0.378827,1,False
3,0.052484,0.402843,-0.73016,-0.228122,0,True
4,-0.753869,0.569278,-0.197517,-0.981648,0,False
