In [2]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.linear_model import LinearRegression

# Multivariate Linear Regression
## Dataset: Boston housing prices
### 설명변수(독립변수), $\mathbf{X}$
- CRIM
- ZN
- INDUS
- CHAS
- NOX
- RM
- AGE
- DIS
- RAD
- TAX
- PTRATIO
- B
- LSTAT
### 종속변수, $\mathbf{y}$
- MEDV

In [3]:
# Download the Boston housing table and label its 14 columns.
data_url = "https://www.openml.org/data/download/22102290/dataset"

# Features first, target (MEDV) last; later cells rely on the target being the last column.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

# NOTE(review): skiprows=43 presumably skips a metadata header in the download — confirm against the file.
data = pd.read_csv(data_url, sep=",", skiprows=43, header=None)
data.columns = column_names
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [76]:
# Training split: the first 400 rows in file order (no shuffling).
train_rows = data.iloc[:400]
X = train_rows.iloc[:, :-1]  # every column except the last -> features
y = train_rows.iloc[:, -1]   # last column -> target
X, y

(         CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS   RAD    TAX  \
 0     0.00632  18.0   2.31   0.0  0.538  6.575   65.2  4.0900   1.0  296.0   
 1     0.02731   0.0   7.07   0.0  0.469  6.421   78.9  4.9671   2.0  242.0   
 2     0.02729   0.0   7.07   0.0  0.469  7.185   61.1  4.9671   2.0  242.0   
 3     0.03237   0.0   2.18   0.0  0.458  6.998   45.8  6.0622   3.0  222.0   
 4     0.06905   0.0   2.18   0.0  0.458  7.147   54.2  6.0622   3.0  222.0   
 ..        ...   ...    ...   ...    ...    ...    ...     ...   ...    ...   
 395   8.71675   0.0  18.10   0.0  0.693  6.471   98.8  1.7257  24.0  666.0   
 396   5.87205   0.0  18.10   0.0  0.693  6.405   96.0  1.6768  24.0  666.0   
 397   7.67202   0.0  18.10   0.0  0.693  5.747   98.9  1.6334  24.0  666.0   
 398  38.35180   0.0  18.10   0.0  0.693  5.453  100.0  1.4896  24.0  666.0   
 399   9.91655   0.0  18.10   0.0  0.693  5.852   77.8  1.5004  24.0  666.0   
 
      PTRATIO       B  LSTAT  
 0       15.3  396.

In [77]:
# Held-out split: every row from position 400 onward, in file order.
test_rows = data.iloc[400:]
X_test = test_rows.iloc[:, :-1]  # every column except the last -> features
y_test = test_rows.iloc[:, -1]   # last column -> target
X_test, y_test

(         CRIM   ZN  INDUS  CHAS    NOX     RM    AGE     DIS   RAD    TAX  \
 400  25.04610  0.0  18.10   0.0  0.693  5.987  100.0  1.5888  24.0  666.0   
 401  14.23620  0.0  18.10   0.0  0.693  6.343  100.0  1.5741  24.0  666.0   
 402   9.59571  0.0  18.10   0.0  0.693  6.404  100.0  1.6390  24.0  666.0   
 403  24.80170  0.0  18.10   0.0  0.693  5.349   96.0  1.7028  24.0  666.0   
 404  41.52920  0.0  18.10   0.0  0.693  5.531   85.4  1.6074  24.0  666.0   
 ..        ...  ...    ...   ...    ...    ...    ...     ...   ...    ...   
 501   0.06263  0.0  11.93   0.0  0.573  6.593   69.1  2.4786   1.0  273.0   
 502   0.04527  0.0  11.93   0.0  0.573  6.120   76.7  2.2875   1.0  273.0   
 503   0.06076  0.0  11.93   0.0  0.573  6.976   91.0  2.1675   1.0  273.0   
 504   0.10959  0.0  11.93   0.0  0.573  6.794   89.3  2.3889   1.0  273.0   
 505   0.04741  0.0  11.93   0.0  0.573  6.030   80.8  2.5050   1.0  273.0   
 
      PTRATIO       B  LSTAT  
 400     20.2  396.90  26.77  


In [78]:
# Fit a scikit-learn LinearRegression on the training features X and target y.
model = LinearRegression()
model.fit(X, y)

In [79]:
# Intercept term of the fitted model (added to the weighted feature sum in `predict`).
model.intercept_

28.67259959085591

In [80]:
# Fitted coefficients of the model; `predict` multiplies them element-wise with a feature row.
model.coef_

array([-1.91246374e-01,  4.42289967e-02,  5.52207977e-02,  1.71631351e+00,
       -1.49957220e+01,  4.88773025e+00,  2.60921031e-03, -1.29480799e+00,
        4.84787214e-01, -1.54006673e-02, -8.08795026e-01, -1.29230427e-03,
       -5.17953791e-01])

In [81]:
def predict(x, model=None):
    """Compute a linear-model prediction for a single sample by hand.

    Multiplies the features element-wise with ``model.coef_``, adds the
    products up with Python's built-in ``sum`` and then adds
    ``model.intercept_``.

    Parameters
    ----------
    x : one-dimensional sequence of feature values
        Expected to be in the same order as ``model.coef_``.
    model : object exposing ``coef_`` and ``intercept_``
        Required in practice; the ``None`` default only exists so the
        argument can be passed by keyword.

    Returns
    -------
    The scalar ``sum(model.coef_ * x) + model.intercept_``.

    Raises
    ------
    TypeError
        If ``model`` is omitted or ``None``.
    """
    # Fail early with a clear message instead of an AttributeError on None.coef_.
    if model is None:
        raise TypeError("predict() requires a fitted model; pass it as model=...")
    return sum(model.coef_ * x) + model.intercept_

In [84]:
# Hand-computed prediction for the row at position 500 (all columns but the last are features).
row_500_features = data.iloc[500, :-1]
y_pred = predict(row_500_features, model=model)
y_pred

20.24689934306219

In [83]:
# Same row predicted by scikit-learn; `[[500]]` keeps the selection as a one-row frame
# rather than a Series, and `[0]` unwraps the single prediction.
model.predict(data.iloc[[500], :-1])[0]

20.24689934306219

In [85]:
# Check that the hand-rolled prediction agrees with scikit-learn's for row 500.
# A small tolerance is used instead of exact `==`: the two values come from
# different floating-point computation paths, so exact equality is not guaranteed
# even when both are correct.
sklearn_pred = model.predict(data.iloc[[500], :-1])[0]
abs(y_pred - sklearn_pred) <= 1e-9

True

In [86]:
# Prediction for row 500 next to the actual target value (last column) of that row.
actual_500 = data.iloc[[500], -1].values[0]
y_pred, actual_500

(20.24689934306219, 16.8)

In [87]:
# Print hand-computed prediction vs. actual target for rows 500..505 (rows from the held-out range).
for i in range(500, 506):
    features = data.iloc[i, :-1]            # feature values of row i
    actual = data.iloc[[i], -1].values[0]   # target value of row i
    print(f'index: {i}', predict(features, model=model), actual)

index: 500 20.24689934306219 16.8
index: 501 23.70317412720544 22.4
index: 502 21.96111308386388 20.6
index: 503 28.11649747957258 23.9
index: 504 26.495863132567546 22.0
index: 505 21.8714296685737 11.9


In [89]:
# R^2 on the training rows vs. the held-out rows.
# NOTE(review): the recorded held-out R^2 is negative while the training R^2 is about 0.73.
# The split is by row position (first 400 rows / the remainder), not random — worth
# checking whether a shuffled split changes this.
train_r2 = model.score(X, y)
test_r2 = model.score(X_test, y_test)
train_r2, test_r2

(0.7338501200472262, -0.3410244010040857)

In [90]:
# Print hand-computed prediction vs. actual target for rows 200..205 (rows from the training range).
for i in range(200, 206):
    features = data.iloc[i, :-1]            # feature values of row i
    actual = data.iloc[[i], -1].values[0]   # target value of row i
    print(f'index: {i}', predict(features, model=model), actual)

index: 200 30.621643483751114 32.9
index: 201 27.667306380121797 24.1
index: 202 36.923415480610416 42.3
index: 203 42.73817479176718 48.5
index: 204 44.10691146611522 50.0
index: 205 22.12816207223552 22.6
