In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from types import MethodType
import numpy
import pandas
import cloudpickle
import os

# Load the house-price dataset from CSV; the path is relative to the
# notebook's working directory.
data = pandas.read_csv('../../data/kc_house_data.csv')
# Show the first rows as the cell output for a quick sanity check.
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [3]:
# Explanatory variables used by the model, and the target to predict.
X_columns = [
    'grade',
    'bedrooms',
    'floors',
    'condition',
    'yr_built',
    'yr_renovated',
]
y_column = 'price'

X, y = data[X_columns], data[y_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [4]:
class YrRenovatedTransformer(BaseEstimator, TransformerMixin):
    """Replace a renovation-year column with a binary ``renovated_flg`` column.

    The flag is 1 where the source column is greater than 0 and 0 otherwise.
    The source column is dropped from the output.

    Parameters
    ----------
    key : str, default 'yr_renovated'
        Name of the column to convert. Exposed as a constructor parameter so
        that ``get_params`` / ``set_params`` / ``clone`` can see it.
    """
    def __init__(self, key='yr_renovated'):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``renovated_flg`` appended and ``key`` removed.

        ``X`` must be a pandas DataFrame containing the ``key`` column.
        """
        # Vectorized comparison instead of a per-row Python lambda; the
        # explicit int64 cast keeps the dtype the row-wise version produced.
        renovated_flg = (X[self.key] > 0).astype('int64')
        return X.assign(renovated_flg=renovated_flg).drop([self.key], axis=1)

In [5]:
# Base pipeline: renovation flag -> scaling -> PCA -> regression.
# The 'scaler' and 'regressor' steps are swapped out by the grid search below.
pipe = Pipeline(steps=[
    ('yr_renovated_transformer', YrRenovatedTransformer()),
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regressor', LinearRegression()),
])

# Two sub-grids: plain linear regression (no alpha), and the regularized
# regressors crossed with alpha. The transformer swaps one column for another,
# so the feature count stays len(X_columns) and PCA may keep 1..len(X_columns)
# components. 2*6*1 + 2*6*3*5 = 192 candidates in total.
param_grid = [
    dict(
        scaler=[StandardScaler(), MinMaxScaler()],
        reduce_dim__n_components=range(1, len(X_columns) + 1),
        regressor=[LinearRegression()],
    ),
    dict(
        scaler=[StandardScaler(), MinMaxScaler()],
        reduce_dim__n_components=range(1, len(X_columns) + 1),
        regressor=[Ridge(), Lasso(), ElasticNet()],
        regressor__alpha=[0.5, 0.8, 1.0, 1.2, 1.5],
    ),
]

# 5-fold cross-validated search over every candidate, using all CPU cores.
regr = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

regr.fit(X_train, y_train)
regr.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:   47.4s finished


Pipeline(memory=None,
     steps=[('yr_renovated_transformer', YrRenovatedTransformer()), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [6]:
# Predictions on both splits.
y_train_pred = regr.predict(X_train)
y_test_pred = regr.predict(X_test)

# Mean absolute error, train then test.
print('train MAE', mean_absolute_error(y_true=y_train, y_pred=y_train_pred))
print('test MAE', mean_absolute_error(y_true=y_test, y_pred=y_test_pred))

# Root mean squared error (square root of the MSE), train then test.
print('train RMSE', numpy.sqrt(mean_squared_error(y_true=y_train, y_pred=y_train_pred)))
print('test RMSE', numpy.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_pred)))

train MAE 154045.78961067938
test MAE 158677.3669716433
train RMSE 240853.74726452815
test RMSE 289451.26083429233


In [7]:
# define format_predict function
def format_predict(self, feature, columns=None):
    """Predict from a single observation given as a mapping of feature values.

    Parameters
    ----------
    self : object with a ``predict`` method
        The estimator this function is bound to.
    feature : mapping
        Feature name -> value for one observation. Keys that are not in
        ``columns`` are ignored.
    columns : sequence of str, optional
        Feature names, in the order the estimator expects them. Defaults to
        the module-level ``X_columns``.

    Returns
    -------
    Whatever ``self.predict`` returns for the one-row DataFrame.

    Raises
    ------
    ValueError
        If ``feature`` lacks any of the required columns.
    """
    if columns is None:
        columns = X_columns

    # Fail early with a clear message instead of letting a missing key become
    # None and surface as an unrelated error deep inside the estimator.
    missing = [col for col in columns if col not in feature]
    if missing:
        raise ValueError('feature is missing required keys: {}'.format(missing))

    # Pin the column order explicitly: the frame's column order must match the
    # order used for training, regardless of how pandas orders dict keys.
    feature_df = pandas.DataFrame(
        {col: [feature[col]] for col in columns},
        columns=list(columns),
    )
    return self.predict(feature_df)

# Bind format_predict to `regr` as an instance method so it can be called as
# regr.format_predict(feature).
regr.format_predict = MethodType(format_predict, regr)

In [8]:
# Directory the serialized model is written to (relative to the notebook's
# working directory).
MODEL_DIR = '../../models/sample/'

# exist_ok=True creates the directory when needed and is a no-op when it is
# already there, avoiding the check-then-create race.
os.makedirs(MODEL_DIR, exist_ok=True)

# Serialize before opening the file: opening with 'wb' truncates it, so a
# failure during pickling would otherwise leave an empty or clobbered model.pkl.
pickled_regr = cloudpickle.dumps(regr)

with open(os.path.join(MODEL_DIR, 'model.pkl'), 'wb') as f:
    f.write(pickled_regr)