# Multiple linear regression

Note that we don't use the variable "credit_checks" in our model. Therefore, the coefficients are different from the example in the lectures.

## Setup

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Data

### Import data

In [3]:
# Read the loans data from the remote CSV file into a DataFrame
loans_url = 'https://raw.githubusercontent.com/kirenz/datasets/master/loans.csv'
df = pd.read_csv(loans_url)

In [4]:
# Keep only the columns used in this notebook
selected_columns = [
    'interest_rate',
    'verified_income',
    'debt_to_income',
    'total_credit_utilized',
    'public_record_bankrupt',
    'term',
    'issue_month',
]
df = df[selected_columns]

###  Data structure

In [5]:
# Preview the first five rows of the selected columns
df.head(n=5)

Unnamed: 0,interest_rate,verified_income,debt_to_income,total_credit_utilized,public_record_bankrupt,term,issue_month
0,9.93,Source Verified,19.28,68631,0,60,Feb-2018
1,6.08,Not Verified,29.71,67294,0,36,Jan-2018
2,10.42,Source Verified,15.57,22270,0,60,Feb-2018
3,15.05,Verified,24.31,110739,0,60,Feb-2018
4,19.42,Verified,9.44,26309,0,60,Mar-2018


In [7]:
# Print each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   interest_rate           200 non-null    float64
 1   verified_income         200 non-null    object 
 2   debt_to_income          200 non-null    float64
 3   total_credit_utilized   200 non-null    int64  
 4   public_record_bankrupt  200 non-null    int64  
 5   term                    200 non-null    int64  
 6   issue_month             200 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 11.1+ KB


### Data corrections

In [8]:
# Shorten two verbose column names
new_names = {
    'total_credit_utilized': 'credit_util',
    'public_record_bankrupt': 'bankruptcy',
}
df = df.rename(columns=new_names)

Prepare data for scikit-learn model. 

Make dummy variables for the two categorical columns, verified_income and issue_month (the issue_month dummies get the shorter prefix "issue").

In [9]:
# One-hot encode the two categorical columns.
# The columns are named explicitly so that the two-element `prefix` list is
# always paired with exactly these columns; without `columns=`, pandas encodes
# every object/category column it finds and raises if their number does not
# match the length of `prefix`.
df = pd.get_dummies(
    df,
    columns=['verified_income', 'issue_month'],
    prefix=['verified_income', 'issue'],
)


In [10]:
# Display the encoded frame to check the new dummy columns
df

Unnamed: 0,interest_rate,debt_to_income,credit_util,bankruptcy,term,verified_income_Not Verified,verified_income_Source Verified,verified_income_Verified,issue_Feb-2018,issue_Jan-2018,issue_Mar-2018
0,9.93,19.28,68631,0,60,False,True,False,True,False,False
1,6.08,29.71,67294,0,36,True,False,False,False,True,False
2,10.42,15.57,22270,0,60,False,True,False,True,False,False
3,15.05,24.31,110739,0,60,False,False,True,True,False,False
4,19.42,9.44,26309,0,60,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
195,18.06,27.36,101437,0,60,True,False,False,False,True,False
196,5.32,22.52,87190,0,36,True,False,False,False,True,False
197,7.34,23.87,78614,0,36,True,False,False,False,False,True
198,20.39,14.04,34273,0,36,False,True,False,False,False,True


In [11]:
# Discard every row that contains at least one missing value.
# NOTE(review): the df.info() output above reports 200 non-null values in
# every selected column, so this may be a no-op for this dataset - confirm.
df = df.dropna(axis=0, how='any')

### Variable lists

In [12]:
# Name of the response column
y_label = "interest_rate"

# Predictors, grouped by kind. One dummy from each encoded column
# ("verified_income_Not Verified" and "issue_Feb-2018") is left out of the list,
# presumably to serve as the reference category.
income_dummies = ["verified_income_Source Verified", "verified_income_Verified"]
numeric_features = ["debt_to_income", "credit_util", "bankruptcy", "term"]
month_dummies = ["issue_Jan-2018", "issue_Mar-2018"]

features = income_dummies + numeric_features + month_dummies

# Feature matrix and response vector
X = df[features]
y = df[y_label]


## Model

### Select model

In [13]:
# Ordinary least squares regression; the intercept is estimated
# (fit_intercept=True is the scikit-learn default, spelled out here)
reg = LinearRegression(fit_intercept=True)

### Fit model

In [14]:

# Estimate the coefficients from the feature matrix and the response
reg.fit(X=X, y=y)

### Coefficients

We create a pandas dataframe:

In [20]:
# Wrap the fitted intercept in a one-row table
intercept_row = {"Name": ["Intercept"], "Coefficient": [reg.intercept_]}
intercept = pd.DataFrame(intercept_row)

intercept

Unnamed: 0,Name,Coefficient
0,Intercept,1.528375


In [18]:
# One row per feature, paired with its estimated slope coefficient
slope_columns = {"Name": features, "Coefficient": reg.coef_}
slope = pd.DataFrame(slope_columns)

slope

Unnamed: 0,Name,Coefficient
0,verified_income_Source Verified,2.319008
1,verified_income_Verified,2.794431
2,debt_to_income,0.065521
3,credit_util,-8e-06
4,bankruptcy,0.384196
5,term,0.170228
6,issue_Jan-2018,1.465442
7,issue_Mar-2018,0.720255


In [21]:
# Stack the intercept row on top of the slope rows, renumbering the index
table = pd.concat(objs=[intercept, slope], sort=False, ignore_index=True)

# Show the coefficients rounded to three decimals
table.round(3)

Unnamed: 0,Name,Coefficient
0,Intercept,1.528
1,verified_income_Source Verified,2.319
2,verified_income_Verified,2.794
3,debt_to_income,0.066
4,credit_util,-0.0
5,bankruptcy,0.384
6,term,0.17
7,issue_Jan-2018,1.465
8,issue_Mar-2018,0.72


### Make predictions

In [22]:
# Make predictions on the data
# (X is the same feature matrix the model was fitted on, so these are in-sample predictions)
y_pred = reg.predict(X)

### Mean squared error

In [23]:
# Mean squared error between the observed and the predicted response
mean_squared_error(y_true=y, y_pred=y_pred)

17.13094535012257

### Root mean squared error

In [24]:
# RMSE is the square root of the MSE.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly here;
# this form works on old and new scikit-learn versions alike.
mean_squared_error(y, y_pred) ** 0.5

4.138954620447362