## IMPORT LIBRARY

In [None]:
import numpy as np
import os
import platform
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPRegressor
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import xgboost as xgb
import lightgbm as lgbm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

Error: Session cannot generate requests

## CONSTANTS

In [None]:
# Debug switch. Its only visible use is in the commented-out grid-search setup
# of the XGBoost cell, where it selects a smaller parameter grid.
DEBUG = True

In [None]:
# Base directory for the input CSVs and the submission file:
# the current directory on macOS ('Darwin'), a Google Drive folder on any other platform.
PATH = (
    './'
    if platform.system() == 'Darwin'
    else '/content/drive/MyDrive/Colab Notebooks/Kaggle/House-Prices/'
)

## SET SEED

In [None]:
# Seed NumPy's legacy global random generator so NumPy-driven randomness is repeatable.
np.random.seed(1)

## READ DATA

In [None]:
# Load the training and test tables from the directory configured in PATH.
df_train, df_test = (pd.read_csv(PATH + csv_name) for csv_name in ('train.csv', 'test.csv'))

# Keep the test Ids aside; they are needed again when the submission is built.
test_ID = df_test['Id']


## REMOVE OUTLIERS

In [None]:
# Drop the identifier column from both frames (the test ids were saved in test_ID).
# Re-assigning instead of using inplace=True, together with errors="ignore",
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because "Id" is already gone.
df_train = df_train.drop(columns="Id", errors="ignore")
df_test = df_test.drop(columns="Id", errors="ignore")

# Total floor area = first floor + second floor + basement.
# NOTE: this early value is overwritten in the MAKE DATA cell, where TotalSF is
# recomputed on the combined frame after missing values have been filled.
df_train["TotalSF"] = df_train["1stFlrSF"] + df_train["2ndFlrSF"] + df_train["TotalBsmtSF"]
df_test["TotalSF"] = df_test["1stFlrSF"] + df_test["2ndFlrSF"] + df_test["TotalBsmtSF"]

# Remove training rows that combine a very large living area (> 4000) with a
# low sale price (< 300000).
is_outlier = (df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)
df_train = df_train.drop(df_train[is_outlier].index)

## MAKE DATA

In [None]:

# Combine train and test rows so both go through the same preprocessing.
df = pd.concat([df_train, df_test], sort=False)

# Columns whose missing entries are replaced by the literal category "None".
NONE_FILLED = (
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "MasVnrType", "MSSubClass",
)
# Numeric columns whose missing entries are replaced by 0.
ZERO_FILLED = (
    "GarageYrBlt", "GarageArea", "GarageCars",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath",
    "MasVnrArea",
)
# Columns filled with their most frequent value. For MSZoning that is 'RL';
# Electrical and KitchenQual use the mode because a house without electricity
# or without a kitchen is not expected.
MODE_FILLED = (
    "MSZoning", "Electrical", "KitchenQual",
    "Exterior1st", "Exterior2nd", "SaleType",
)

for col in NONE_FILLED:
    df[col] = df[col].fillna("None")

for col in ZERO_FILLED:
    df[col] = df[col].fillna(0)

for col in MODE_FILLED:
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

# Missing Functional entries are filled with "Typ".
df["Functional"] = df["Functional"].fillna("Typ")

# LotFrontage: fill gaps with the median LotFrontage of the same Neighborhood.
frontage_by_neighborhood = df.groupby("Neighborhood")["LotFrontage"]
df["LotFrontage"] = frontage_by_neighborhood.transform(
    lambda group: group.fillna(group.median()))

# Utilities is removed from the feature set.
df = df.drop(columns=["Utilities"])

## Numeric columns that should be treated as categories are converted to strings.
# MSSubClass is the building class; OverallCond, the year sold and the month
# sold are likewise turned into categorical (string) features.
df['MSSubClass'] = df['MSSubClass'].map(str)
for as_text in ('OverallCond', 'YrSold', 'MoSold'):
    df[as_text] = df[as_text].astype(str)

from sklearn.preprocessing import LabelEncoder
cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley',
    'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
)
# Replace each listed column by integer codes from a LabelEncoder fitted on
# that column's own values (one fresh encoder per column).
for c in cols:
    lbl = LabelEncoder()
    df[c] = lbl.fit_transform(list(df[c].values))

# Total floor area (basement + first floor + second floor), computed on the
# combined frame.
df['TotalSF'] = df['TotalBsmtSF'].add(df['1stFlrSF']).add(df['2ndFlrSF'])

# Normalisation of skewed features.
# Every column whose dtype is not "object" is treated as numeric.
is_numeric = df.dtypes != "object"
numeric_features = df.dtypes[is_numeric].index

# Quantify the skewness of each numeric column, largest first.
skewness = df[numeric_features].apply(lambda column: skew(column))
skewness = skewness.sort_values(ascending=False)
high_skewness = skewness[skewness.abs() > 0.9]
skewed_features = high_skewness.index

# Apply a Box-Cox(1 + x) transform with lambda = 0.15 to every column whose
# absolute skewness exceeds 0.9; the target SalePrice is never transformed here.
for feature in skewed_features:
    if feature == 'SalePrice':
        continue
    df[feature] = boxcox1p(df[feature], 0.15)

# One-hot encode the combined frame with pd.get_dummies.
df = pd.get_dummies(df)

# Build the training data: the first len(df_train) rows of the combined frame
# are the training rows (train was concatenated before test).
n_train = df_train.shape[0]
train_rows = df.iloc[:n_train]
y = np.log1p(train_rows['SalePrice'])          # target is modelled as log(1 + price)
X = train_rows.drop(columns=['SalePrice'])

# Validation: hold out 25% of the training rows with a fixed random_state.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1)

In [None]:
# Preview the first rows of the training feature matrix built in the previous cell.
X.head()

## RMSE

In [None]:
def rmse(y_true,y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The value is the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

## MAKE MODEL AND FIT (XGBOOST)

In [None]:
import copy

import optuna

# XGBoost settings that stay fixed for every trial.
base_params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
}

# xgb.train() and Booster.predict() only accept DMatrix inputs, so wrap the
# train/validation split once, up front.
trains = xgb.DMatrix(X_train, label=y_train)
tests = xgb.DMatrix(X_test, label=y_test)

# Datasets evaluated after each boosting round (silenced via verbose_eval=False).
watchlist = [(trains, 'train'), (tests, 'eval')]

N_BOOST_ROUNDS = 50   # boosting rounds per trial
N_TRIALS = 500        # number of Optuna trials


def optimizer(trial):
    """Optuna objective: train XGBoost with sampled parameters, return validation RMSE.

    Sampled per trial:
      * ``eta``       - float in [0.01, 0.3]
      * ``max_depth`` - int in [4, 15]
      * ``lambda``    - float in [0.7, 2]

    The booster type stays fixed at the value in ``base_params``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object supplied by ``study.optimize``.

    Returns
    -------
    float
        RMSE of the trained booster on the held-out split (``X_test`` / ``y_test``).
    """
    # Start from a fresh copy for every trial so trials never share mutable state.
    trial_params = copy.deepcopy(base_params)
    trial_params['eta'] = trial.suggest_float('eta', 0.01, 0.3)
    trial_params['max_depth'] = trial.suggest_int('max_depth', 4, 15)
    trial_params['lambda'] = trial.suggest_float('lambda', 0.7, 2)

    model = xgb.train(
        trial_params,
        trains,
        num_boost_round=N_BOOST_ROUNDS,
        evals=watchlist,
        verbose_eval=False,
    )
    predicts = model.predict(tests)

    return rmse(y_test, predicts)


# Lower RMSE is better; Optuna expects 'minimize' / 'maximize' here.
study = optuna.create_study(direction='minimize')
study.optimize(optimizer, n_trials=N_TRIALS)

In [None]:
# XGBoost regressor with fixed hyper-parameters, placed behind a RobustScaler.
xgb_regressor = xgb.XGBRegressor(
    gamma=0.001,
    learning_rate=0.01,
    max_depth=2,
    n_estimators=8000,
)
pipeline = make_pipeline(RobustScaler(), xgb_regressor)
#pipeline = make_pipeline(RobustScaler(), xgb.XGBRegressor())

# Disabled grid search over the pipeline's XGBoost parameters, kept for reference.
#params = {'xgbregressor__gamma':[0.001, 0.1, 1, 10, 100],
#          'xgbregressor__max_depth':[2,4,6,8,10],
#          'xgbregressor__learning_rate':[0.0001, 0.001, 0.01],
#          'xgbregressor__n_estimators':[10, 100, 1000]}

#if DEBUG:
#    params = {'xgbregressor__gamma':[0.001, 0.1, 1, 10, 100]}

#gd = GridSearchCV(estimator=pipeline, param_grid=params, cv=5, scoring=make_scorer(rmse,greater_is_better=False))

# Fit on the training split. Pipeline.fit returns the pipeline itself, so
# `xgboost` and `pipeline` refer to the same fitted object.
xgboost = pipeline.fit(X_train, y_train)

## BEST PARAMS (XGBOOST)

In [None]:
#gd.best_params_

## MAKE MODEL AND FIT (LIGHTGBM)

In [None]:
import optuna.integration.lightgbm as lgbo

# Objective and metric intended for the Optuna LightGBM tuner call that is
# currently commented out at the end of this cell.
opt_params = dict(objective="regression", metric="rmse")

# LightGBM datasets for the training split and the held-out split; the
# held-out dataset is created with the training dataset as its reference.
reg_train = lgbm.Dataset(X_train, label=y_train)
reg_eval = lgbm.Dataset(X_test, label=y_test, reference=reg_train)

#opt=lgbo.train(opt_params, reg_train, valid_sets = reg_eval, verbose_eval=False, num_boost_round = 5)

In [None]:
# `opt` only exists when the lgbo.train(...) call in the previous cell is enabled.
# That call is commented out, so a bare `opt.params` raises NameError on
# Restart & Run All. Show the tuned parameters when available, nothing otherwise.
opt.params if 'opt' in globals() else None

In [None]:
# Reference copy of a parameter set (presumably the output of `opt.params` from
# an earlier tuning run - TODO confirm); the constructor below reuses these values.
"""
{'objective': 'regression',
 'metric': 'rmse',
 'feature_pre_filter': False,
 'lambda_l1': 0.0,
 'lambda_l2': 0.0,
 'num_leaves': 85,
 'feature_fraction': 0.6,
 'bagging_fraction': 0.5374366835020357,
 'bagging_freq': 3,
 'min_child_samples': 5}
"""
# LightGBM regressor settings; max_depth=2 is set here in addition to the
# values listed above.
lgbm_settings = dict(
    objective='regression',
    num_leaves=85,
    feature_fraction=0.6,
    bagging_fraction=0.5374366835020357,
    bagging_freq=3,
    min_child_samples=5,
    max_depth=2,
)
pipeline = make_pipeline(RobustScaler(), lgbm.LGBMRegressor(**lgbm_settings))

# Pipeline.fit returns the pipeline itself, so `lightbgm` is the fitted pipeline.
lightbgm = pipeline.fit(X_train, y_train)

## BEST PARAMS (LIGHTGBM)

In [None]:
#gd.best_params_

## BEST PARAMS

In [None]:
#gd.best_params_

## STACKING

In [None]:

def _print_rmse(model_label, split_label, y_true, y_hat):
    """Print one RMSE line that names both the model and the data split."""
    print("{} Model RMSE ({}): {:.6f}".format(model_label, split_label, rmse(y_true, y_hat)))


# Base-model predictions on the training split ...
xgboost_pred1 = xgboost.predict(X_train)
lightbgm_pred1 = lightbgm.predict(X_train)

# ... and on the held-out split (validation of the results).
xgboost_pred = xgboost.predict(X_test)
lightbgm_pred = lightbgm.predict(X_test)

# Each line states the split explicitly; previously the train and validation
# lines carried identical labels and could not be told apart in the output.
_print_rmse("xgboost", "train", y_train, xgboost_pred1)
_print_rmse("lightgbm", "train", y_train, lightbgm_pred1)
_print_rmse("xgboost", "validation", y_test, xgboost_pred)
_print_rmse("lightgbm", "validation", y_test, lightbgm_pred)

# Stack both pipelines; an MLP regressor (SGD solver, fixed random_state)
# combines their cross-validated (cv=5) predictions.
final_estimator = MLPRegressor(solver="sgd", random_state=1)
st_model = StackingRegressor(
    estimators=[('xgboost', xgboost), ('lightbgm', lightbgm)],
    final_estimator=final_estimator,
    cv=5,
)

st_model.fit(X_train, y_train)

y_pred_1 = st_model.predict(X_train)
y_pred = st_model.predict(X_test)
_print_rmse("Stacking", "train", y_train, y_pred_1)
_print_rmse("Stacking", "validation", y_test, y_pred)

## PREDICT AND SUBMIT

In [None]:
# Rows after the training rows in the combined frame are the test rows;
# SalePrice is the target column and is not passed to the model.
n_train = df_train.shape[0]
test = df.iloc[n_train:].drop(columns=['SalePrice'])

# The model was fitted on log1p(SalePrice), so expm1 maps predictions back to prices.
predicted_price = np.expm1(st_model.predict(test))

submission = pd.DataFrame({"Id": test_ID, "SalePrice": predicted_price})
submission.to_csv(PATH + 'submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table written in the previous cell.
submission.head()