회귀(Regression) 예측


In [6]:
import numpy as np
import pandas as pd

In [7]:
from sklearn.datasets import load_boston

In [8]:
# Load the Boston housing dataset object from scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it.
data = load_boston()

In [9]:
# Print the description text stored under the dataset's "DESCR" key.
print(data["DESCR"])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [10]:
# Build the feature table: values from data['data'], column labels from
# data['feature_names'].
df = pd.DataFrame(data=data['data'], columns=data['feature_names'])

In [11]:
# Append the regression target as the 'MEDV' column, then preview the first
# rows (the trailing expression is what the notebook displays).
df['MEDV'] = data['target']
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


# 데이터 분할

In [13]:
from sklearn.model_selection import train_test_split

# Split features (every column except the 'MEDV' target) and target into
# train/validation sets. test_size=0.25 is the library default made explicit;
# random_state pins the shuffle so the split, and every metric computed from
# it, is reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    df.drop("MEDV", axis=1),
    df["MEDV"],
    test_size=0.25,
    random_state=42,
)

# Display the resulting shapes as the cell output.
X_train.shape, X_valid.shape

((379, 13), (127, 13))

- X 변수에는 모든 특성(feature)이 포함되고, y 변수에는 레이블(타깃 값)이 포함됨

- model(LinearRegression) 객체의 coef_ 속성 : 기울기 파라미터(W)
- model(LinearRegression) 객체의 intercept_ 속성 : 편향 또는 절편(b)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


### 평가지표 만들기

#### MSE (Mean Squared Error)
예측값과 실제값의 차이를 제곱한 뒤 평균을 낸 값
#### MAE (Mean Absolute Error)
예측값과 실제값의 차이의 절대값에 대해 평균을 낸 값
#### RMSE (Root Mean Squared Error)
MSE에 제곱근(root)을 씌운 값으로, 오차를 타깃과 같은 단위로 해석할 수 있음 (단, 큰 오차에 더 큰 가중치가 실리므로 MAE와 같은 값은 아님)

In [17]:
# Small hand-made prediction/ground-truth pair used to try out the metric
# functions.
pred, actual = np.array([3.0, 4.0, 5.3]), np.array([1.0, 3.8, 6.8])

- MSE 구현

In [18]:
def my_mean_squared_error(y, t):
    """Return the mean squared error between predictions and targets.

    Both inputs are converted to float NumPy arrays before subtracting, so
    plain Python lists are accepted, unsigned-integer inputs cannot wrap
    around, and pandas objects are compared by position rather than being
    aligned on their index labels.

    Args:
        y: Predicted values (array-like).
        t: Actual values (array-like), broadcastable against ``y``.

    Returns:
        The mean of the squared element-wise differences.
    """
    diff = np.asarray(y, dtype=float) - np.asarray(t, dtype=float)
    return (diff ** 2).mean()

In [19]:
my_mean_squared_error(pred, actual)

2.0966666666666667

- MAE 구현 (절대값: np.abs())

In [22]:
def my_mean_absolute_error(y, t):
    """Return the mean absolute error between predictions and targets.

    Both inputs are converted to float NumPy arrays before subtracting, so
    plain Python lists are accepted, unsigned-integer inputs cannot wrap
    around, and pandas objects are compared by position rather than being
    aligned on their index labels.

    Args:
        y: Predicted values (array-like).
        t: Actual values (array-like), broadcastable against ``y``.

    Returns:
        The mean of the absolute element-wise differences.
    """
    diff = np.asarray(y, dtype=float) - np.asarray(t, dtype=float)
    return np.abs(diff).mean()


In [23]:
my_mean_absolute_error(pred, actual)

1.2333333333333334

- RMSE 구현

In [24]:
def my_root_mean_squared_error(y, t):
    """Return the root mean squared error between ``y`` and ``t``.

    Computed as the square root of ``my_mean_squared_error(y, t)``.
    """
    mse = my_mean_squared_error(y, t)
    return np.sqrt(mse)

In [25]:
my_root_mean_squared_error(pred, actual)

1.4479871085982314

In [26]:
import matplotlib.pyplot as plt
import seaborn as sns

# Empty registry for per-model results.
# NOTE(review): nothing in the visible notebook fills or reads it — presumably
# an evaluation helper keys it by model name; confirm.
my_predictions = dict()

# Palette of matplotlib colour names.
# NOTE(review): not referenced by the visible code — presumably one colour per
# plotted model; confirm.
colors = [
    'r', 'c', 'm', 'y', 'k', 'khaki',
    'teal', 'orchid', 'sandybrown', 'greenyellow', 'dodgerblue',
    'deepskyblue', 'rosybrown', 'firebrick', 'deeppink', 'crimson',
    'salmon', 'darkred', 'olivedrab', 'olive', 'forestgreen',
    'royalblue', 'indigo', 'navy', 'mediumpurple', 'chocolate',
    'gold', 'darkorange', 'seagreen', 'turquoise', 'steelblue',
    'slategray', 'peru', 'midnightblue', 'slateblue', 'dimgray',
    'cadetblue', 'tomato',
]

In [None]:
def plot_predictions(name_, pred, actual):
    """Scatter-plot predicted and actual values, ordered by actual value.

    Samples are sorted by their actual value and plotted against their
    rank, with predictions drawn as red crosses and actual values as black
    dots.

    Args:
        name_: Text used as the plot title.
        pred: Predicted values (array-like).
        actual: Actual values (array-like) with the same number of elements.
    """
    # Flatten both inputs to plain 1-D arrays so the columns are paired by
    # position; handing pandas objects straight to DataFrame would align them
    # on index labels and could mispair rows or introduce NaN.
    df = pd.DataFrame({
        'prediction': np.asarray(pred).ravel(),
        'actual': np.asarray(actual).ravel(),
    })
    df = df.sort_values(by='actual').reset_index(drop=True)

    plt.figure(figsize=(12, 9))
    # Label each series explicitly so the legend does not depend on the order
    # of the scatter calls.
    plt.scatter(df.index, df['prediction'], marker='x', color='r',
                label='prediction')
    plt.scatter(df.index, df['actual'], alpha=0.7, marker='o', color='black',
                label='actual')
    plt.title(name_, fontsize=15)
    plt.legend(fontsize=12)
    plt.show()

In [27]:
## LinearRegression

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
# Create the linear regression model and fit it on the training split.
model = LinearRegression(n_jobs= -1) # n_jobs = number of CPU cores to use (-1 uses all cores)
model.fit(X_train, y_train)

LinearRegression(n_jobs=-1)

In [30]:
# Predict on the validation features. This rebinds `pred`, replacing the toy
# array that was used earlier to try out the metric functions.
pred = model.predict(X_valid)

In [31]:
# Check the predictions visually


In [32]:
def mse_eval(name_, pred, actual):
    """Plot a model's predictions, record its MSE, and print the ranking.

    Args:
        name_: Model label; used as the plot title and as the key under
            which the score is stored in ``my_predictions``.
        pred: Predicted values.
        actual: Actual values.
    """
    plot_predictions(name_, pred, actual)

    # Evaluating the same name again overwrites its previous score.
    my_predictions[name_] = my_mean_squared_error(pred, actual)

    # Lowest error first.
    ranking = sorted(my_predictions.items(), key=lambda item: item[1])
    print(pd.DataFrame(ranking, columns=['model', 'mse']))


mse_eval("LinearRegression", pred, y_valid)