# Multiple linear regression

Note that we don't use the variable "credit_checks" in our model. Therefore, the coefficients are different from the example in the lectures.

## Setup

In [47]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Data

### Import data

In [48]:
# Load the loans data set from the kirenz/datasets GitHub repository
loans_url = 'https://raw.githubusercontent.com/kirenz/datasets/master/loans.csv'
df = pd.read_csv(loans_url)

In [50]:
# Keep only the response (interest_rate) and the candidate predictors
selected_columns = [
    'interest_rate',
    'verified_income',
    'debt_to_income',
    'total_credit_utilized',
    'public_record_bankrupt',
    'term',
    'issue_month',
]
df = df[selected_columns]

### Data structure

In [51]:
# Display the frame; pandas renders large frames as a truncated preview
df

Unnamed: 0,interest_rate,verified_income,debt_to_income,credit_util,bankruptcy,term,issue_month
0,14.07,Verified,18.01,38767,0,60,Mar-2018
1,12.61,Not Verified,5.04,4321,1,36,Feb-2018
2,17.09,Source Verified,21.15,16000,0,36,Feb-2018
3,6.72,Not Verified,10.16,4997,0,36,Jan-2018
4,14.07,Verified,57.96,52722,0,36,Mar-2018
...,...,...,...,...,...,...,...
9995,7.35,Source Verified,22.28,77963,1,36,Jan-2018
9996,19.03,Verified,32.38,101571,0,36,Feb-2018
9997,23.88,Verified,45.26,95421,0,36,Feb-2018
9998,5.32,Source Verified,11.99,27641,0,36,Feb-2018


In [52]:
# Summarise column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   interest_rate    10000 non-null  float64
 1   verified_income  10000 non-null  object 
 2   debt_to_income   9976 non-null   float64
 3   credit_util      10000 non-null  int64  
 4   bankruptcy       10000 non-null  int64  
 5   term             10000 non-null  int64  
 6   issue_month      10000 non-null  object 
dtypes: float64(2), int64(3), object(2)
memory usage: 547.0+ KB


### Data corrections

In [49]:
# Shorten two verbose column names; the feature list further down refers to
# the new names. The result is reassigned instead of using inplace=True so the
# cell does not mutate an existing frame in place.
df = df.rename(columns={'total_credit_utilized': 'credit_util',
                        'public_record_bankrupt': 'bankruptcy'})

Prepare the data for the scikit-learn model by converting the categorical variables into dummy variables:

In [53]:
# One-hot encode the two categorical columns.
# - `columns` names them explicitly, so each entry of `prefix` is tied to its
#   column instead of relying on dtype-based auto-detection.
# - `dtype=int` keeps 0/1 indicator columns regardless of the pandas version
#   (pandas >= 2.0 produces boolean dummies by default).
df = pd.get_dummies(
    df,
    columns=['verified_income', 'issue_month'],
    prefix=['verified_income', 'issue'],
    dtype=int,
)

df

Unnamed: 0,interest_rate,debt_to_income,credit_util,bankruptcy,term,verified_income_Not Verified,verified_income_Source Verified,verified_income_Verified,issue_Feb-2018,issue_Jan-2018,issue_Mar-2018
0,14.07,18.01,38767,0,60,0,0,1,0,0,1
1,12.61,5.04,4321,1,36,1,0,0,1,0,0
2,17.09,21.15,16000,0,36,0,1,0,1,0,0
3,6.72,10.16,4997,0,36,1,0,0,0,1,0
4,14.07,57.96,52722,0,36,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
9995,7.35,22.28,77963,1,36,0,1,0,0,1,0
9996,19.03,32.38,101571,0,36,0,0,1,1,0,0
9997,23.88,45.26,95421,0,36,0,0,1,1,0,0
9998,5.32,11.99,27641,0,36,0,1,0,1,0,0


In [54]:
# Drop rows with missing values (see debt_to_income); reassign the result
# rather than mutating the frame in place
df = df.dropna()

### Variable lists

In [55]:
# Response variable
y_label = "interest_rate"

# Predictors. One level of each dummy-encoded variable is left out
# ("verified_income_Not Verified" and "issue_Feb-2018"), so those levels
# serve as the reference categories.
features = [
    "verified_income_Source Verified",
    "verified_income_Verified",
    "debt_to_income",
    "credit_util",
    "bankruptcy",
    "term",
    "issue_Jan-2018",
    "issue_Mar-2018",
]

# Response vector and feature matrix
y = df[y_label]
X = df[features]

## Model

### Select model

In [56]:
# Linear regression model with an estimated intercept
# (fit_intercept=True is the scikit-learn default, spelled out here)
reg = LinearRegression(fit_intercept=True)

### Fit model

In [None]:

# Estimate the coefficients from the feature matrix X and the response y
reg.fit(X=X, y=y)

### Coefficients

We collect the intercept and the slope coefficients in pandas DataFrames and combine them into one table:

In [68]:
# Intercept as a one-row table with the columns "Name" and "Coefficient"
intercept = pd.DataFrame(
    [{"Name": "Intercept", "Coefficient": reg.intercept_}]
)
intercept

Unnamed: 0,Name,Coefficient
0,Intercept,4.254818


In [70]:
# Slope coefficients, each paired with its feature name (same order as `features`)
slope = pd.DataFrame(
    list(zip(features, reg.coef_)),
    columns=["Name", "Coefficient"],
)
slope

Unnamed: 0,Name,Coefficient
0,verified_income_Source Verified,1.175864
1,verified_income_Verified,2.580503
2,debt_to_income,0.03764
3,credit_util,-3e-06
4,bankruptcy,0.61484
5,term,0.149919
6,issue_Jan-2018,-0.015715
7,issue_Mar-2018,-0.096337


In [73]:
# Stack the intercept row on top of the slope rows and renumber the index
coefficient_frames = [intercept, slope]
table = pd.concat(coefficient_frames, ignore_index=True, sort=False)

# Show the coefficients rounded to three decimals
table.round(3)

Unnamed: 0,Name,Coefficient
0,Intercept,4.255
1,verified_income_Source Verified,1.176
2,verified_income_Verified,2.581
3,debt_to_income,0.038
4,credit_util,-0.0
5,bankruptcy,0.615
6,term,0.15
7,issue_Jan-2018,-0.016
8,issue_Mar-2018,-0.096


### Make predictions

In [74]:
# In-sample predictions: the model is applied to the same X it was fitted on
y_pred = reg.predict(X=X)

### Mean squared error

In [79]:
# Training MSE: y_pred was computed from the same data the model was fitted on.
# Built-in round() works whether the metric comes back as a plain Python float
# or a NumPy scalar; the .round() method exists only on the latter.
round(mean_squared_error(y, y_pred), 3)

20.403