## IMPORT LIBRARY

In [909]:
import os
import platform

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import xgboost as xgb
import lightgbm as lgbm

## CONSTANTS

In [910]:
# Debug switch: when True, the LightGBM cell replaces its full hyper-parameter
# grid with a smaller one.
DEBUG = True

In [911]:
# Data directory: the current directory on macOS ('Darwin'), otherwise the
# Google Drive folder path used on Colab.
PATH = (
    './'
    if platform.system() == 'Darwin'
    else '/content/drive/MyDrive/Colab Notebooks/Kaggle/House-Prices/'
)

## SET SEED

In [912]:
# Fix NumPy's global random seed so that repeated runs draw the same numbers.
np.random.seed(seed=1)

## READ DATA

In [913]:
# Load the train / test tables from the data directory.
train_csv = f'{PATH}train.csv'
test_csv = f'{PATH}test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Keep the test Ids aside; the submission file needs them after the column
# has been dropped from the feature table.
test_ID = df_test['Id']


## REMOVE OUTLIERS

In [914]:
# The Id column is removed from both tables before feature engineering.
df_train = df_train.drop(columns="Id")
df_test = df_test.drop(columns="Id")

# Total floor area = first floor + second floor + basement.
for frame in (df_train, df_test):
    frame["TotalSF"] = frame["1stFlrSF"] + frame["2ndFlrSF"] + frame["TotalBsmtSF"]

# Remove training rows with a very large living area (> 4000) that sold
# for a low price (< 300000).
is_outlier = (df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)
df_train = df_train.drop(df_train.loc[is_outlier].index)

## MAKE DATA

In [915]:

# Stack train and test (train rows first) so that the same imputation and
# encoding steps are applied to both.
df = pd.concat([df_train, df_test], sort=False)

# Columns whose missing values are replaced by the literal category "None".
none_fill_cols = (
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "MasVnrType", "MSSubClass",
)
for name in none_fill_cols:
    df[name] = df[name].fillna("None")

# Columns whose missing values are replaced by 0.
zero_fill_cols = (
    "GarageYrBlt", "GarageArea", "GarageCars",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath",
    "MasVnrArea",
)
for name in zero_fill_cols:
    df[name] = df[name].fillna(0)

# Columns filled with their most frequent value. Original author's notes:
# 'RL' is the most common MSZoning; a house is never without electricity or
# a kitchen, so the mode is used for Electrical and KitchenQual as well.
mode_fill_cols = (
    "MSZoning", "Electrical", "KitchenQual",
    "Exterior1st", "Exterior2nd", "SaleType",
)
for name in mode_fill_cols:
    most_frequent = df[name].mode()[0]
    df[name] = df[name].fillna(most_frequent)

# A missing Functional entry is filled with "Typ".
df["Functional"] = df["Functional"].fillna("Typ")

# LotFrontage: fill with the median LotFrontage of the same Neighborhood.
df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(
    lambda frontage: frontage.fillna(frontage.median()))

# Utilities is dropped entirely.
df = df.drop(columns="Utilities")

# Numeric columns that should be treated as categories are converted to
# strings: the building class (MSSubClass), the overall condition
# (OverallCond) and the year / month of sale (YrSold, MoSold).
for name in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    df[name] = df[name].astype(str)

# Categorical columns that are replaced by integer codes (the remaining
# object columns are one-hot encoded later).
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')

# A fresh encoder per column, fitted on the combined train + test frame so
# both parts share the same value -> code mapping. fit_transform replaces the
# former separate fit / transform calls (and their two list conversions).
for c in cols:
    lbl = LabelEncoder()
    df[c] = lbl.fit_transform(df[c].values)
    
# Total floor area, computed on the combined frame.
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

# Skewness of every non-object column, largest first.
numeric_features = df.dtypes[df.dtypes != "object"].index
skewed_features = df[numeric_features].apply(skew).sort_values(ascending=False)

# Keep only the features whose absolute skewness exceeds 0.9.
high_skewness = skewed_features[skewed_features.abs() > 0.9]
skewed_features = high_skewness.index

# Apply the Box-Cox transform of 1 + x with lambda = 0.15 to those features;
# the target column SalePrice is never transformed here.
for feature in skewed_features:
    if feature == 'SalePrice':
        continue
    df[feature] = boxcox1p(df[feature], 0.15)

# One-hot encode the remaining object columns.
df = pd.get_dummies(df)

# Training rows come first in the combined frame, so they are selected by
# position. The target is modelled as log1p(SalePrice).
n_train = df_train.shape[0]
train_rows = df.iloc[:n_train]
y = np.log1p(train_rows['SalePrice'])
X = train_rows.drop(columns='SalePrice')

# Hold-out evaluation: 25% of the rows become the test part, then 25% of the
# remainder becomes the validation part (used later to fit the stacking model).
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.25, random_state=1)

In [916]:
# Preview the first rows of the training feature matrix.
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,10,5.831328,19.212182,0.730463,1,3,0.0,7,4,2003,...,0,0,0,1,0,0,0,0,1,0
1,5,6.221214,19.712205,0.730463,1,3,0.0,6,7,1976,...,0,0,0,1,0,0,0,0,1,0
2,10,5.91494,20.347241,0.730463,1,0,0.0,7,4,2001,...,0,0,0,1,0,0,0,0,1,0
3,11,5.684507,19.691553,0.730463,1,0,0.0,7,4,1915,...,0,0,0,1,1,0,0,0,0,0
4,10,6.314735,21.32516,0.730463,1,0,0.0,8,4,2000,...,0,0,0,1,0,0,0,0,1,0


## RMSE

In [917]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

## MAKE MODEL AND FIT (XGBOOST)

In [918]:
# XGBoost regressor behind a RobustScaler, trained with fixed hyper-parameters.
xgb_regressor = xgb.XGBRegressor(gamma=0.001, learning_rate=0.01, max_depth=2, n_estimators=8000)
pipeline = make_pipeline(RobustScaler(), xgb_regressor)

# The grid search for this model is disabled. Grid kept for reference
# (5-fold CV, scored with the negated rmse):
#   xgbregressor__gamma:         0.001, 0.1, 1, 10, 100  (only this one when DEBUG)
#   xgbregressor__max_depth:     2, 4, 6, 8, 10
#   xgbregressor__learning_rate: 0.0001, 0.001, 0.01
#   xgbregressor__n_estimators:  10, 100, 1000

# Pipeline.fit returns the fitted pipeline itself.
xgboost = pipeline.fit(X_train, y_train)

## BEST PARAMS (XGBOOST)

In [919]:
#gd.best_params_

## MAKE MODEL AND FIT (LIGHTGBM)

In [920]:
# Opt-in switch for the (slow) cross-validated grid search. Previously the
# GridSearchCV object was built but never fitted, so the grid was dead code.
# With the default False the cell behaves exactly as before: the pipeline is
# trained with the fixed hyper-parameters below.
TUNE_LIGHTGBM = False

# LightGBM regressor behind a RobustScaler. The fixed values are the ones the
# original notebook recorded in a comment:
#   learning_rate=0.01, max_depth=2, min_child_samples=3, n_estimators=5000
pipeline = make_pipeline(RobustScaler(), lgbm.LGBMRegressor(learning_rate=0.01, max_depth=2, min_child_samples=3, n_estimators=5000))

# Search space; a reduced grid is used when DEBUG is set.
params = {'lgbmregressor__min_child_samples':[10, 20, 50, 100],
          'lgbmregressor__max_depth':[2,4,6,8,10],
          'lgbmregressor__learning_rate':[0.0001, 0.001, 0.01],
          'lgbmregressor__n_estimators':[10, 100, 1000]}

if DEBUG:
    params = {'lgbmregressor__learning_rate':[0.0001, 0.001, 0.01],
              'lgbmregressor__min_child_samples':[2, 3, 5]}

# rmse is an error, hence greater_is_better=False (the scorer negates it).
gd = GridSearchCV(estimator=pipeline, param_grid=params, cv=5, scoring=make_scorer(rmse,greater_is_better=False))

if TUNE_LIGHTGBM:
    # Fit the search; best_estimator_ is the pipeline refitted on X_train
    # with the best parameter combination.
    gd.fit(X_train, y_train)
    lightbgm = gd.best_estimator_
else:
    pipeline.fit(X_train, y_train)
    lightbgm = pipeline

## BEST PARAMS (LIGHTGBM)

In [921]:
#gd.best_params_

## RANDOM FOREST

In [922]:
# Opt-in switch for the (slow) cross-validated grid search. Previously the
# GridSearchCV object was built but never fitted, so the grid was dead code.
# With the default False the cell behaves exactly as before: the pipeline is
# trained with the fixed hyper-parameters below.
TUNE_RANDOM_FOREST = False

# Random forest behind a StandardScaler.
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(max_depth=2, n_estimators=5000))

# Search space for the forest.
params = {'randomforestregressor__max_depth':[2,4,6],
          'randomforestregressor__n_estimators':[1000, 5000, 8000]}

# rmse is an error, hence greater_is_better=False (the scorer negates it).
gd = GridSearchCV(estimator=pipeline, param_grid=params, cv=5, scoring=make_scorer(rmse,greater_is_better=False))

if TUNE_RANDOM_FOREST:
    # Fit the search; best_estimator_ is the pipeline refitted on X_train
    # with the best parameter combination.
    gd.fit(X_train, y_train)
    randf = gd.best_estimator_
else:
    pipeline.fit(X_train, y_train)
    randf = pipeline

# Hold-out predictions of the forest (log1p price scale). NOTE(review): no
# later cell visible in this notebook reads y_pred or randf.
y_pred = randf.predict(X_test)

## FEATURE IMPORTANCE

In [923]:
"""
rf = RandomForestRegressor(max_depth=2, n_estimators=5000)
rf.fit(X_train, y_train)

# 重要度が高い順に特徴量を出力
features = X_train.columns
importances = rf.feature_importances_

importances_features = sorted(zip(map(lambda x: round(x, 10), importances), features), reverse=True)
print(importances_features[:30])
"""

'\nrf = RandomForestRegressor(max_depth=2, n_estimators=5000)\nrf.fit(X_train, y_train)\n\n# 重要度が高い順に特徴量を出力\nfeatures = X_train.columns\nimportances = rf.feature_importances_\n\nimportances_features = sorted(zip(map(lambda x: round(x, 10), importances), features), reverse=True)\nprint(importances_features[:30])\n'

## BEST PARAMS

In [924]:
#gd.best_params_

## STACKING

In [925]:

# Evaluate the base models on the hold-out test split. All predictions are on
# the log1p(SalePrice) scale, like y.
test_pred_1 = xgboost.predict(X_test)
test_pred_2 = lightbgm.predict(X_test)

# RMSE of each base model on its own (the random forest is not part of the stack).
print ("xgboost rmse: {:.4f}".format(rmse(y_test, test_pred_1)))
print ("lightgbm rmse: {:.4f}".format(rmse(y_test, test_pred_2)))

# Level-1 training data: base-model predictions on the validation split,
# one column per base model.
xgb_pred1 = xgboost.predict(X_valid)
lightbgm_pred1 = lightbgm.predict(X_valid)

stack_pred = np.column_stack((xgb_pred1, lightbgm_pred1))

# Meta-model: a small MLP that maps the two base predictions to the target.
model = MLPRegressor(solver="sgd", random_state=1, max_iter=1000, alpha=0.0001,  epsilon=1e-08, learning_rate_init=0.001)
model.fit(stack_pred, y_valid)
print(stack_pred.shape)

# Compare the stacked prediction with a plain average of the base models.
stack_test_pred = np.column_stack((test_pred_1, test_pred_2))

meta_test_pred = model.predict(stack_test_pred)
just_mean = (test_pred_1 + test_pred_2) / 2
print ("Stacking Model rmse: {:.6f}".format(rmse(y_test, meta_test_pred)))
print ("Just Mean rmse: {:.6f}".format(rmse(y_test, just_mean)))

xgboost rmse: 0.1248
lightbgm rmse: 0.1262
(274, 2)
Stacking Model rsme: 0.123334
Just Mean rsme: 0.123908


## PREDICT AND SUBMIT

In [926]:
# Rows after the training rows in the combined frame are the test rows;
# their SalePrice column is dropped.
n_train_rows = df_train.shape[0]
test = df.iloc[n_train_rows:].drop(columns='SalePrice')

# Base-model predictions for the test rows, stacked column-wise for the meta-model.
test_pred_1 = xgboost.predict(test)
test_pred_2 = lightbgm.predict(test)
stack_test_pred = np.column_stack([test_pred_1, test_pred_2])

# The models predict log1p(SalePrice); expm1 converts back to prices.
final_price = np.expm1(model.predict(stack_test_pred))

submission = pd.DataFrame({"Id": test_ID, "SalePrice": final_price})
submission.to_csv(f'{PATH}submission.csv', index=False)

In [927]:
# Sanity check: one XGBoost prediction per test row. (The former name
# `xgb_pred` is not defined anywhere in this notebook and raised a NameError
# on a fresh kernel; `test_pred_1` holds the XGBoost test predictions.)
test_pred_1.shape

(1459,)