## Regularized Linear Regression

In [1]:
# import libraries
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns; sns,set()
%matplotlib inline

In [2]:
# Import Data
cols = ["ID", "IntRate", "LoanAmt", "LoanTerm", "Debt2Inc", "Home_OWN", "Home_RENT", "MonthlyInc", "RevCredBal", "FICO", "EmpLen"]
loan_data = pd.read_csv("../data/loansDataClean.csv")
# see https://github.com/jeffCabrera0321/Simple_linear_regression on how this was cleaned
loan_data = loan_data.drop(["ID"], axis=1)

In [3]:
# Display the dataset
loan_data.head()

Unnamed: 0,IntRate,LoanAmt,LoanTerm,Debt2Inc,Home_OWN,Home_RENT,MonthlyInc,RevCredBal,FICO,EmpLen
0,8.9,20000,36,14.9,False,False,6541.67,14272.0,737,1
1,12.12,19200,36,28.36,False,False,4583.33,11140.0,717,2
2,21.98,35000,60,23.81,False,False,11500.0,21977.0,692,2
3,11.71,12000,36,18.78,False,True,3195.0,14469.0,697,9
4,15.31,6000,36,20.05,True,False,4891.67,10391.0,672,3


In [4]:
# Convert LoanTerm from 36 to 0 and 60 to 1
loan_data["LoanTerm"] = np.where(loan_data["LoanTerm"] == 36, 0, 1)
loan_data["Home_OWN"] = np.where(loan_data["Home_OWN"] == False, 0, 1)
loan_data["Home_RENT"] = np.where(loan_data["Home_RENT"] == False, 0, 1)
loan_data["LoanTerm"].head()

0    0
1    0
2    1
3    0
4    0
Name: LoanTerm, dtype: int64

In [5]:
# Create the response DataFrame and display the first five records
Y = loan_data[["IntRate"]]
Y.head()

Unnamed: 0,IntRate
0,8.9
1,12.12
2,21.98
3,11.71
4,15.31


In [6]:
# Create the predictors DataFrame for the predictors to be standardized and display the first records
# see https://github.com/jeffCabrera0321/Simple_linear_regression on why these independents features were chosen
X_std = loan_data[["LoanAmt", "Debt2Inc", "MonthlyInc", "FICO"]]
X_std.head()

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO
0,20000,14.9,6541.67,737
1,19200,28.36,4583.33,717
2,35000,23.81,11500.0,692
3,12000,18.78,3195.0,697
4,6000,20.05,4891.67,672


In [7]:
# Standardize teh predictors and display the first five rows
sc = StandardScaler()
X_std = sc.fit_transform(X_std)
X_std[:5]

array([[ 1.1276208 , -0.1056352 ,  0.32047414,  0.97751917],
       [ 1.02010707,  1.65831149, -0.29431579,  0.37469848],
       [ 3.14350321,  1.06202936,  1.87706358, -0.37882739],
       [ 0.05248351,  0.40284275, -0.73016008, -0.22812221],
       [-0.75386945,  0.56927754, -0.19751732, -0.98164808]])

In [8]:
# Create a DataFrame from these standardized predictors and display the first five records
X_std = DataFrame(X_std, columns=["LoanAmt", "Debt2Inc", "MonthlyInc", "FICO"])
X_std.head()

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO
0,1.127621,-0.105635,0.320474,0.977519
1,1.020107,1.658311,-0.294316,0.374698
2,3.143503,1.062029,1.877064,-0.378827
3,0.052484,0.402843,-0.73016,-0.228122
4,-0.753869,0.569278,-0.197517,-0.981648


In [9]:
# Join together the standardized and indicator columns into the predictors DataFrame
X = X_std.join(loan_data["LoanTerm"])
X = X.join(loan_data["Home_RENT"])
X.head()

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO,LoanTerm,Home_RENT
0,1.127621,-0.105635,0.320474,0.977519,0,0
1,1.020107,1.658311,-0.294316,0.374698,0,0
2,3.143503,1.062029,1.877064,-0.378827,1,0
3,0.052484,0.402843,-0.73016,-0.228122,0,1
4,-0.753869,0.569278,-0.197517,-0.981648,0,0


In [10]:
# Produce summary report using OLS
X_ols = sm.add_constant(X)
loan_model = sm.OLS(Y, X_ols)
loan_model = loan_model.fit()
print(loan_model.summary())

                            OLS Regression Results                            
Dep. Variable:                IntRate   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.752
Method:                 Least Squares   F-statistic:                     887.7
Date:                Wed, 11 Dec 2024   Prob (F-statistic):               0.00
Time:                        23:24:50   Log-Likelihood:                -3774.3
No. Observations:                1752   AIC:                             7563.
Df Residuals:                    1745   BIC:                             7601.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.3818      0.075    164.891      0.0

In [11]:
# Create coefficient array from the report
coeff = loan_model.params
coeff

const         12.381839
LoanAmt        1.086789
Debt2Inc      -0.132790
MonthlyInc    -0.149863
FICO          -3.127157
LoanTerm       3.412037
Home_RENT      0.232231
dtype: float64

In [12]:
# Display the means of the standardized variables
sc.mean_

array([11609.47488584,    15.70606164,  5520.83785388,   704.56849315])

In [13]:
# Base case sample
LoanAmt = 10000
Debt2Inc = 20
MonthlyInc = 5000
FICO = 700
LoanTerm = 0 # 36 month = 0
Home_RENT = 0 # Home_MORT or Home_OWN = 0

In [14]:
# Create a sample array for standardization
sample = [LoanAmt, Debt2Inc, MonthlyInc, FICO]
sample

[10000, 20, 5000, 700]

In [15]:
# Standardize the sample
std_sample = (sample - sc.mean_) / sc.scale_
std_sample

array([-0.21630081,  0.56272499, -0.16350882, -0.13769911])

In [16]:
# Predict interest rate for base case sample
LoanAmt = std_sample[0]
Debt2Inc = std_sample[1]
MonthlyInc = std_sample[2]
FICO = std_sample[3]
base_int_rate = (coeff.iloc[0] +
                 coeff.iloc[1]*LoanAmt +
                 coeff.iloc[2]*Debt2Inc +
                 coeff.iloc[3]*MonthlyInc +
                 coeff.iloc[4]*FICO +
                 coeff.iloc[5]*LoanTerm +
                 coeff.iloc[6]*Home_RENT)
base_int_rate

np.float64(12.527151788581584)

In [17]:
# New sample: LoanAmt changed by one std dev unit
LoanAmt = 10000 + sc.scale_[0] # 17440.91
Debt2Inc = 20
MonthlyInc = 5000
FICO = 700

In [18]:
# Standardize the new sample
sample = [LoanAmt, Debt2Inc, MonthlyInc, FICO]
std_sample = (sample - sc.mean_) / sc.scale_
std_sample

array([ 0.78369919,  0.56272499, -0.16350882, -0.13769911])

In [19]:
# Predict interest rate for new sample
LoanAmt = std_sample[0]
Debt2Inc = std_sample[1]
MonthlyInc = std_sample[2]
FICO = std_sample[3]
int_rate = (coeff.iloc[0] +
            coeff.iloc[1]*LoanAmt +
            coeff.iloc[2]*Debt2Inc +
            coeff.iloc[3]*MonthlyInc +
            coeff.iloc[4]*FICO +
            coeff.iloc[5]*LoanTerm +
            coeff.iloc[6]*Home_RENT)
int_rate

np.float64(13.613940693821647)

In [20]:
# Check the model interpretation
base_int_rate + coeff.iloc[1]

np.float64(13.613940693821647)

In [21]:
# New sample: Debt2Inc changed by one std dev
LoanAmt = 10000
Debt2Inc = 20 + sc.scale_[1] # 27.630616
MonthlyInc = 5000
FICO = 700

In [22]:
# Standardize the new sample
sample = [LoanAmt, Debt2Inc, MonthlyInc, FICO]
std_sample = (sample - sc.mean_) / sc.scale_
std_sample

array([-0.21630081,  1.56272499, -0.16350882, -0.13769911])

In [23]:
# Predict interest rate for new sample
LoanAmt = std_sample[0]
Debt2Inc = std_sample[1]
MonthlyInc = std_sample[2]
FICO = std_sample[3]
int_rate = (coeff.iloc[0] +
            coeff.iloc[1]*LoanAmt +
            coeff.iloc[2]*Debt2Inc +
            coeff.iloc[3]*MonthlyInc +
            coeff.iloc[4]*FICO +
            coeff.iloc[5]*LoanTerm +
            coeff.iloc[6]*Home_RENT)
int_rate

np.float64(12.394362053524517)

In [24]:
# Check the model interpretation
base_int_rate + coeff.iloc[2]

np.float64(12.394362053524517)

## Linear Regression Analysis Using OLS – No Scaling

In [25]:
# Create new DataFrames for new predictions
Y = loan_data[["IntRate"]]
X = loan_data.drop(["IntRate"], axis=1)

In [26]:
# Produce summary report using OLS:
x_ols = sm.add_constant(X)
loan_model = sm.OLS(Y, X_ols)
loan_model = loan_model.fit()
print(loan_model.summary())

                            OLS Regression Results                            
Dep. Variable:                IntRate   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.752
Method:                 Least Squares   F-statistic:                     887.7
Date:                Wed, 11 Dec 2024   Prob (F-statistic):               0.00
Time:                        23:24:50   Log-Likelihood:                -3774.3
No. Observations:                1752   AIC:                             7563.
Df Residuals:                    1745   BIC:                             7601.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.3818      0.075    164.891      0.0

In [27]:
# Let's create a function to reduce repetitive actions
def viewOLS(X_variable):
    # Add a constant to the independent variable matrix to hold the y_intercept
    X_ss = sm.add_constant(X_variable)
    # Create the linear regression model using Y and X_std0
    loan_model = sm.OLS(Y, X_ss)
    # Fit the model to the data
    loan_model = loan_model.fit()
    return loan_model

In [28]:
# Dropping the highest p-value variable EmpLen
X = loan_data.drop(["IntRate", "EmpLen"], axis=1)
X.head()

Unnamed: 0,LoanAmt,LoanTerm,Debt2Inc,Home_OWN,Home_RENT,MonthlyInc,RevCredBal,FICO
0,20000,0,14.9,0,0,6541.67,14272.0,737
1,19200,0,28.36,0,0,4583.33,11140.0,717
2,35000,1,23.81,0,0,11500.0,21977.0,692
3,12000,0,18.78,0,1,3195.0,14469.0,697
4,6000,0,20.05,1,0,4891.67,10391.0,672


In [29]:
loan_model1 = viewOLS(X)
print(loan_model1.summary())

                            OLS Regression Results                            
Dep. Variable:                IntRate   R-squared:                       0.754
Model:                            OLS   Adj. R-squared:                  0.753
Method:                 Least Squares   F-statistic:                     667.8
Date:                Wed, 11 Dec 2024   Prob (F-statistic):               0.00
Time:                        23:24:50   Log-Likelihood:                -3771.6
No. Observations:                1752   AIC:                             7561.
Df Residuals:                    1743   BIC:                             7610.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         77.4697      1.135     68.227      0.0

In [30]:
# Dropping the highest p-value variable RevCredBal
X = loan_data.drop(["IntRate", "EmpLen", "RevCredBal"], axis=1)
X.head()

Unnamed: 0,LoanAmt,LoanTerm,Debt2Inc,Home_OWN,Home_RENT,MonthlyInc,FICO
0,20000,0,14.9,0,0,6541.67,737
1,19200,0,28.36,0,0,4583.33,717
2,35000,1,23.81,0,0,11500.0,692
3,12000,0,18.78,0,1,3195.0,697
4,6000,0,20.05,1,0,4891.67,672


In [31]:
loan_model2 = viewOLS(X)
print(loan_model2.summary())

                            OLS Regression Results                            
Dep. Variable:                IntRate   R-squared:                       0.754
Model:                            OLS   Adj. R-squared:                  0.753
Method:                 Least Squares   F-statistic:                     762.4
Date:                Wed, 11 Dec 2024   Prob (F-statistic):               0.00
Time:                        23:24:51   Log-Likelihood:                -3772.7
No. Observations:                1752   AIC:                             7561.
Df Residuals:                    1744   BIC:                             7605.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         77.4918      1.136     68.229      0.0

In [32]:
# Dropping the highest p-value variable RevCredBal
X = loan_data.drop(["IntRate", "EmpLen", "RevCredBal", "Home_OWN"], axis=1)
X.head()

Unnamed: 0,LoanAmt,LoanTerm,Debt2Inc,Home_RENT,MonthlyInc,FICO
0,20000,0,14.9,0,6541.67,737
1,19200,0,28.36,0,4583.33,717
2,35000,1,23.81,0,11500.0,692
3,12000,0,18.78,1,3195.0,697
4,6000,0,20.05,0,4891.67,672


In [33]:
loan_model3 = viewOLS(X)
print(loan_model3.summary())

                            OLS Regression Results                            
Dep. Variable:                IntRate   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.752
Method:                 Least Squares   F-statistic:                     887.7
Date:                Wed, 11 Dec 2024   Prob (F-statistic):               0.00
Time:                        23:24:51   Log-Likelihood:                -3774.3
No. Observations:                1752   AIC:                             7563.
Df Residuals:                    1745   BIC:                             7601.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         77.6289      1.134     68.457      0.0

In [34]:
# Create coefficient array from teh report
coeff = loan_model3.params
coeff

const         77.628889
LoanAmt        0.000146
LoanTerm       3.412037
Debt2Inc      -0.017402
Home_RENT      0.232231
MonthlyInc    -0.000047
FICO          -0.094256
dtype: float64

In [35]:
# Base case sample
LoanAmt = 10000
LoanTerm = 0 # 36 month = 0
Debt2Inc = 20
Home_RENT = 0 # Home_MORT or Home_OWN = 0
MonthlyInc = 5000
FICO = 700
base_int_rate = (coeff.iloc[0] +
                 coeff.iloc[1]*LoanAmt +
                 coeff.iloc[2]*LoanTerm +
                 coeff.iloc[3]*Debt2Inc +
                 coeff.iloc[4]*Home_RENT +
                 coeff.iloc[5]*MonthlyInc +
                 coeff.iloc[6]*FICO)
base_int_rate

np.float64(12.527151788581264)

In [36]:
# New sample: Add 1000 to LoanAmt
LoanAmt = 10000 + 1000
LoanTerm = 0 # 36 month = 0
Debt2Inc = 20
Home_RENT = 0 # Home_MORT or Home_OWN = 0
MonthlyInc = 5000
FICO = 700
int_rate = (coeff.iloc[0] +
            coeff.iloc[1]*LoanAmt +
            coeff.iloc[2]*LoanTerm +
            coeff.iloc[3]*Debt2Inc +
            coeff.iloc[4]*Home_RENT +
            coeff.iloc[5]*MonthlyInc +
            coeff.iloc[6]*FICO)
int_rate

np.float64(12.673207697597647)

## Linear Regression Analysis Using OLS - With Centering

In [37]:
# Create the predictors DataFrame for the predictors to be centered and display the first five records
X_cent = loan_data[["LoanAmt", "Debt2Inc", "MonthlyInc", "FICO"]]
X_cent.head()

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO
0,20000,14.9,6541.67,737
1,19200,28.36,4583.33,717
2,35000,23.81,11500.0,692
3,12000,18.78,3195.0,697
4,6000,20.05,4891.67,672


In [38]:
# Center the data and display the first five rows
# Set the with_std param to False, for only centering
sc = StandardScaler(with_std=False)
X_cent = sc.fit_transform(X_cent)
X_cent[:5]

array([[ 8.39052511e+03, -8.06061644e-01,  1.02083215e+03,
         3.24315068e+01],
       [ 7.59052511e+03,  1.26539384e+01, -9.37507854e+02,
         1.24315068e+01],
       [ 2.33905251e+04,  8.10393836e+00,  5.97916215e+03,
        -1.25684932e+01],
       [ 3.90525114e+02,  3.07393836e+00, -2.32583785e+03,
        -7.56849315e+00],
       [-5.60947489e+03,  4.34393836e+00, -6.29167854e+02,
        -3.25684932e+01]])

In [39]:
# Create a DataFrame from these centered predictors and display the first five records
X_cent = DataFrame(X_cent, columns=["LoanAmt", "Debt2Inc", "MonthlyInc", "FICO"])
X_cent.head()

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO
0,8390.525114,-0.806062,1020.832146,32.431507
1,7590.525114,12.653938,-937.507854,12.431507
2,23390.525114,8.103938,5979.162146,-12.568493
3,390.525114,3.073938,-2325.837854,-7.568493
4,-5609.474886,4.343938,-629.167854,-32.568493


In [40]:
# Join together the centered and indicator columns into the predictors DataFrame
X = X_cent.join (loan_data["LoanTerm"])
X = X.join (loan_data["Home_RENT"])
X.head()

Unnamed: 0,LoanAmt,Debt2Inc,MonthlyInc,FICO,LoanTerm,Home_RENT
0,8390.525114,-0.806062,1020.832146,32.431507,0,0
1,7590.525114,12.653938,-937.507854,12.431507,0,0
2,23390.525114,8.103938,5979.162146,-12.568493,1,0
3,390.525114,3.073938,-2325.837854,-7.568493,0,1
4,-5609.474886,4.343938,-629.167854,-32.568493,0,0


In [41]:
centered_model = viewOLS(X)
print(centered_model.summary())

                            OLS Regression Results                            
Dep. Variable:                IntRate   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.752
Method:                 Least Squares   F-statistic:                     887.7
Date:                Wed, 11 Dec 2024   Prob (F-statistic):               0.00
Time:                        23:24:51   Log-Likelihood:                -3774.3
No. Observations:                1752   AIC:                             7563.
Df Residuals:                    1745   BIC:                             7601.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.3818      0.075    164.891      0.0

In [42]:
# Create coefficient array from the report
coeff = centered_model.params
coeff

const         12.381839
LoanAmt        0.000146
Debt2Inc      -0.017402
MonthlyInc    -0.000047
FICO          -0.094256
LoanTerm       3.412037
Home_RENT      0.232231
dtype: float64

In [43]:
# Create mean sample
LoanAmt = sc.mean_[0]
Debt2Inc = sc.mean_[1]
MonthlyInc = sc.mean_[2]
FICO = sc.mean_[3]
LoanTerm = 0 # 36 month = 0
Home_RENT = 0 # Home_MORT or Home_OWN = 0

In [44]:
# Create centered sample array
sample = [LoanAmt, Debt2Inc, MonthlyInc, FICO]
cent_sample = (sample - sc.mean_)
cent_sample

array([0., 0., 0., 0.])

In [45]:
# Predict interest rate for base case sample
LoanAmt = cent_sample[0]
Debt2Inc = cent_sample[1]
MonthlyInc = cent_sample[2]
FICO = cent_sample[3]
base_int_rate = (coeff.iloc[0] +
                 coeff.iloc[1]*LoanAmt +
                 coeff.iloc[2]*Debt2Inc +
                 coeff.iloc[3]*MonthlyInc +
                 coeff.iloc[4]*FICO +
                 coeff.iloc[5]*LoanTerm +
                 coeff.iloc[6]*Home_RENT)
base_int_rate

np.float64(12.381838602190395)

## Evaluating the Centered Linear Regression Model

In [46]:
# Split the DataFrame into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=33, test_size=0.25)

In [47]:
# Run Linear Regression on the train set
lr = LinearRegression()
model = lr.fit(X_train, Y_train)

In [48]:
# Report the R-squared score
model.score(X_test, Y_test)

0.780690401391094

In [49]:
Y_pred = lr.predict(X_test)
metrics.mean_squared_error(Y_test, Y_pred)

4.185137896883718

In [50]:
# Run at various random states
r_squ = []
mse = []
rand_state = [1,3,5,7,11,15,21,33,35]
for rs in rand_state:
    # Split the DataFrames into train and test
    X_train, X_test, Y_train, Y_test = train_test_split (X, Y, random_state = rs, test_size = 0.25)
    # Run linear regression on the train set
    lr = LinearRegression()
    model = lr.fit (X_train, Y_train)
    # Save the R-squared and MSE scores
    r_squ.append (model.score (X_test, Y_test))
    mse.append (metrics.mean_squared_error (Y_test,Y_pred))
    r_squ_array = np.array (r_squ)
    mse_array = np.array (mse)
print ("Minimum R-squared:", r_squ_array.min(),
       "Rand State:", rand_state[r_squ_array.argmin()])
print ("Maximum R-squared:", r_squ_array.max(),
       "Rand State:", rand_state[r_squ_array.argmax()])
print ("Minimum MSE:", mse_array.min(),
       "Rand State:", rand_state[mse_array.argmin()])
print ("Maximum MSE:", mse_array.max(),
       "Rand State:", rand_state[mse_array.argmax()])

Minimum R-squared: 0.7064262336718382 Rand State: 11
Maximum R-squared: 0.780690401391094 Rand State: 33
Minimum MSE: 4.185137896883718 Rand State: 33
Maximum MSE: 35.92159455963931 Rand State: 3


In [51]:
# Use cross validation to find the mean R-squared score
lr = LinearRegression()
scores = cross_val_score(lr, X, Y, cv=10)
print("Mean R-Squared Score: ", sum(scores)/10)

Mean R-Squared Score:  0.749889629297859


## Ridge Regression on Centered Model, alpha = 0

In [53]:
# Create Response DataFrame
Y = loan_data[["IntRate"]]
Y = Y.values
Y = Y.ravel()
Y[:5]

array([ 8.9 , 12.12, 21.98, 11.71, 15.31])

In [54]:
# Split the DataFrames into train and test using random_state and test_size = 0.25
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=33, test_size=0.25)

In [57]:
# Fit the Ridge mode with alpha=0 to produce a set of predictions
ridgemod = Ridge(alpha=0)
ridgemod.fit(X_train, Y_train)
Y_pred = ridgemod.predict(X_test)

In [58]:
# Display the ridge coefficients
ridgemod.coef_

array([ 1.47000317e-04, -1.22104638e-02, -5.41948484e-05, -9.35859533e-02,
        3.31168242e+00,  2.05338866e-01])

In [59]:
# Display the Ridge intercept
ridgemod.intercept_

np.float64(12.430343488633653)

In [60]:
print("R-Squared Score: ", ridgemod.score(X_test, Y_test))

R-Squared Score:  0.7806904013910905


In [61]:
# Display the MSE score
print("Ridge alpha=0 MSE",
      metrics.mean_squared_error(Y_test, Y_pred))

Ridge alpha=0 MSE 4.185137896883786


In [62]:
# Create Predictor DataFrame
X_Full = loan_data.drop(["IntRate"], axis=1)

In [63]:
# Split the DataFrame into train and test using random_state=33
X_train, X_test, Y_train, Y_test = train_test_split(X_Full, Y, random_state=33, test_size=0.25)

In [64]:
# Create a list of alpha values for RidgeCV
alpha_range = [0.01, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5]
print("RidgeCV alpha list: ", alpha_range)

RidgeCV alpha list:  [0.01, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5]


In [65]:
# Run RidgeCV Regression
# Select the best alpha with RidgeCV
ridcvmod = RidgeCV(alphas=alpha_range, scoring="neg_mean_squared_error")
ridcvmod.fit(X_train, Y_train)
Y_pred = ridcvmod.predict(X_test)

In [66]:
# Display the RidgeCV best alpha
print("RidgeCV Regression Best Alpha: ", ridcvmod.alpha_)

RidgeCV Regression Best Alpha:  1.0


In [67]:
# Display the RidgeCV coefficients
print("RidgeCV Regression Coefficients: ", ridcvmod.coef_)

RidgeCV Regression Coefficients:  [ 1.47697632e-04  3.30368950e+00 -1.07931122e-02  4.90475469e-01
  2.89121905e-01 -4.25684702e-05 -2.68110307e-06 -9.34558680e-02
  2.51311928e-03]


In [68]:
# Display the RidgeCV Regression R-Squared
print("RidgeCV Regression Test R-Squared: ", ridcvmod.score(X_test, Y_test))

RidgeCV Regression Test R-Squared:  0.780079490105551


In [69]:
# Display the RidgeCV MSE
print("RidgeCV Regression MSE: ", metrics.mean_squared_error(Y_pred, Y_test))

RidgeCV Regression MSE:  4.196796064100191


## Lasso Regression on Full Model

In [70]:
# Run  LassoCV Regression
# Select the best alpha with LassoCV
lascvmod = LassoCV(n_alphas=100, random_state=33)
lascvmod.fit(X_train, Y_train)
Y_pred = lascvmod.predict(X_test)

In [71]:
# Display the LassoCV best alpha
print("LassoCV Regression Best Alpha: ", lascvmod.alpha_)

LassoCV Regression Best Alpha:  9.054246228127488


In [72]:
# Display the LassoCV coefficients
print("LassoCV Regression Coefficients: ", lascvmod.coef_)

LassoCV Regression Coefficients:  [ 2.15170508e-04  0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -1.00809240e-04 -4.60023078e-06 -8.61109782e-02
 -0.00000000e+00]


In [73]:
# Display the LassoCV intercept
print("LassoCV Regression Intercept: ", lascvmod.intercept_)

LassoCV Regression Intercept:  71.9225125495878


In [74]:
# Display the R-Squared
print("LassoCV Regression Test R-Squared: ", lascvmod.score(X_test, Y_test))

LassoCV Regression Test R-Squared:  0.6778815827022546


In [75]:
# Display the LassoCV RMSE
print("LassoCV Regression RMSE: ", metrics.mean_squared_error(Y_pred, Y_test))

LassoCV Regression RMSE:  6.147063348198807


### Lasso regression prefers setting many model coefficients to exactly zero. This is a type of feature selection. It is the same as removing the predictor from the model. Using this full dataset, the LassoCV algorithm zeroed out the EmpLen variable coefficient. This was the first feature dropped during backward elimination with ordinary linear regression. Its p-value was much larger than any of the others.