# Scikit-learn으로 Linear Regression 구현하기
---
* 모든 데이터를 8:2로 나누고 8 부분을 다시 8:2로 나누어, training set, validation set, test set을 구성. 보유하고 있는 데이터는 총 21,613개.
* training set만으로 여러 종류의 모델을 학습하고 validation set으로 각 모델의 성능을 측정. test set은 건드리지 않음.
* 최종적으로 21,613개 중 64%를 training set으로, 16%를 validation set으로, 남은 20%를 test set으로 구성. training set 13,832개, validation set 3,458개, test set 4,323개.

In [1]:
import pandas as pd

# Read house.csv into a DataFrame.
csv_path = "../../COALA_DS_DATA/COALA_DS_DATA/week5/data/house.csv"
df = pd.read_csv(csv_path)

# A bare expression on the last line of a notebook cell renders the DataFrame.
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [2]:
# Remove the 'id' and 'date' columns; every other column is kept.
house_data = df.drop(columns=['id', 'date'])

In [11]:
from sklearn.model_selection import train_test_split

# Fixed seed so both splits, and every score computed from them, are
# reproducible across notebook runs.
RANDOM_STATE = 42

# Features: every column except 'price'. Target: the 'price' column.
train_data = house_data.drop(['price'], axis = 1)
target_data = house_data['price']

# First split: (train + validation) : test = 8 : 2
x_train, x_test, y_train, y_test = train_test_split(
    train_data, target_data, test_size=0.2, random_state=RANDOM_STATE
)

# Second split: divide the remaining 80% again 8 : 2 into train and validation.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=RANDOM_STATE
)

print("모든 데이터", train_data.shape) # (rows, columns)
print("Train set", x_train.shape, y_train.shape)
print("Valid set", x_valid.shape, y_valid.shape)
print("Test set", x_test.shape,  y_test.shape)

모든 데이터 (21613, 18)
Train set (13832, 18) (13832,)
Valid set (3458, 18) (3458,)
Test set (4323, 18) (4323,)


## LR은 연속적인 값을 예측하므로 scoring 방식으로 결정계수(R²)를 사용!
### 상관계수(관련성)와 연관된 결정계수(R²)를 바탕으로 score를 보여줌

In [15]:
from sklearn.linear_model import LinearRegression

# Fit a plain linear regression on the training split only.
lr = LinearRegression()
lr.fit(x_train, y_train)

# Report the score on the training split and on the validation split.
for split_name, split_x, split_y in (
    ('train set score', x_train, y_train),
    ('valid set score', x_valid, y_valid),
):
    print(split_name, lr.score(split_x, split_y))

train set score 0.7042940670380808
valid set score 0.6886792810184212


## PolynomialFeatures, pipeline
* scikit-learn의 파이프라인(pipeline) 기능을 이용하여 전처리 단계를 회귀 모형과 하나로 합칠 수 있다.
* PolynomialFeatures는 입력값 x를 다항식 특성으로 변환한다.

In [16]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Degree-2 polynomial feature expansion followed by linear regression,
# chained into a single estimator.
model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
model.fit(x_train, y_train)

# Report the score on the training split and on the validation split.
for split_name, split_x, split_y in (
    ('train set score', x_train, y_train),
    ('valid set score', x_valid, y_valid),
):
    print(split_name, model.score(split_x, split_y))

train set score 0.8286893107020755
valid set score 0.798791281654934


In [17]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Degree-3 polynomial feature expansion followed by linear regression,
# chained into a single estimator.
model3 = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
model3.fit(x_train, y_train)

# Report both scores; a large gap between train and validation suggests
# the higher-degree model is overfitting.
for split_name, split_x, split_y in (
    ('train set score', x_train, y_train),
    ('valid set score', x_valid, y_valid),
):
    print(split_name, model3.score(split_x, split_y))

train set score 0.8295003474304721
valid set score 0.3212952436284976
