## IMPORT LIBRARY

In [729]:
import numpy as np
import os
import platform
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import xgboost as xgb
import lightgbm as lgbm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

## CONSTRAINT

In [730]:
DEBUG = True

In [731]:
if platform.system() == 'Darwin':
    PATH = './'
else:
    PATH = '/content/drive/MyDrive/Colab Notebooks/Kaggle/House-Prices/'

## SET SEED

In [732]:
np.random.seed(1)

## READ DATA

In [733]:
df_train = pd.read_csv(f'{PATH}train.csv')
df_test = pd.read_csv(f'{PATH}test.csv')

# テストIDを別途保存しておく
test_ID = df_test['Id']


## REMOVE OUTER VALUE

In [734]:
#df_train.drop("Id", axis = 1, inplace = True)
df_test.drop("Id", axis = 1, inplace = True)

df_train["TotalSF"] = df_train["1stFlrSF"] + df_train["2ndFlrSF"] + df_train["TotalBsmtSF"]
df_test["TotalSF"] = df_test["1stFlrSF"] + df_test["2ndFlrSF"] + df_test["TotalBsmtSF"]

df_train = df_train.drop(df_train[(df_train['TotalSF']>7500) & (df_train['SalePrice']<300000)].index)
df_train = df_train.drop(df_train[(df_train['YearBuilt']<2000) & (df_train['SalePrice']>600000)].index)
df_train = df_train.drop(df_train[(df_train['OverallQual']<5) & (df_train['SalePrice']>200000)].index)
df_train = df_train.drop(df_train[(df_train['OverallQual']<10) & (df_train['SalePrice']>500000)].index)

## MAKE DATA

In [735]:
# データを合体
df = pd.concat([df_train, df_test], sort=False)

# 数値の欠損値に平均値を埋めておく
num_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
for col in num_cols:
    df[col].fillna(np.mean(df[col]), inplace=True)

num2str_list = ['MSSubClass','YrSold','MoSold']
for column in num2str_list:
    df[column] = df[column].astype(str)
    
df.loc[df['Fireplaces']==0,'FireplaceQu']='Nothing'
df['LotFrontage'] = df['LotFrontage'].fillna(df.groupby('1stFlrSF')['LotFrontage'].transform('mean'))
df['LotFrontage'].interpolate(method='linear',inplace=True)
df['LotFrontage']=df['LotFrontage'].astype(int)
df['MasVnrArea'] = df['MasVnrArea'].fillna(df.groupby('MasVnrType')['MasVnrArea'].transform('mean'))
df['MasVnrArea'].interpolate(method='linear',inplace=True)
df['MasVnrArea']=df['MasVnrArea'].astype(int)
df["Fence"] = df["Fence"].fillna("None")
df["FireplaceQu"] = df["FireplaceQu"].fillna("None")
df["Alley"] = df["Alley"].fillna("None")
df["PoolQC"] = df["PoolQC"].fillna("None")
df["MiscFeature"] = df["MiscFeature"].fillna("None")
df.loc[df['BsmtFinSF1']==0,'BsmtFinType1']='Unf'
df.loc[df['BsmtFinSF2']==0,'BsmtQual']='TA'
df['YrBltRmd']=df['YearBuilt']+df['YearRemodAdd']
df['Total_Square_Feet'] = (df['BsmtFinSF1'] + df['BsmtFinSF2'] + df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF'])
df['Total_Bath'] = (df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))
df['Total_Porch_Area'] = (df['OpenPorchSF'] + df['3SsnPorch'] + df['EnclosedPorch'] + df['ScreenPorch'] + df['WoodDeckSF'])
df['exists_pool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
df['exists_garage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
df['exists_fireplace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
df['exists_bsmt'] = df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
df['old_house'] = df['YearBuilt'].apply(lambda x: 1 if x <1990 else 0)

for i in df.columns:
    if 'SalePrice' not in i:
        if 'object' in str(df[str(i)].dtype):
            df[str(i)]=df[str(i)].fillna(method='ffill')

# 正規化    
numeric_features = df.dtypes[df.dtypes != "object"].index
# 歪みを数値化
skewed_features = df[numeric_features].apply(lambda x: skew(x)).sort_values(ascending=False)
high_skewness = skewed_features[abs(skewed_features) > 0.9]
skewed_features = high_skewness.index

# 0.9以上のものを補正をかける
for feature in skewed_features:
    if feature != 'SalePrice': 
        df[feature] = boxcox1p(df[feature], boxcox_normmax(df[feature] + 1))

# one-hot-Encoding
df = pd.get_dummies(df)

# 訓練データを作成
y = np.log(df[:df_train.shape[0]]['SalePrice'])
X = df[:df_train.shape[0]].drop(['Id','SalePrice'], axis=1)

# 検証
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## RSME

In [736]:
def rmse(y_true,y_pred):
    #RMSEを算出
    rmse = np.sqrt(mean_squared_error(y_true,y_pred))
    print('rmse',rmse)
    return rmse

## MAKE MODEL AND FIT (XGBOOST)

In [737]:
pipeline = make_pipeline(StandardScaler(), xgb.XGBRegressor(gamma=0.001, learning_rate=0.01, max_depth=2, n_estimators=8000))


#params = {'xgbregressor__gamma':[0.001, 0.1, 1, 10, 100],
#          'xgbregressor__max_depth':[2,4,6,8,10],
#          'xgbregressor__learning_rate':[0.0001, 0.001, 0.01],
#          'xgbregressor__n_estimators':[10, 100, 1000]}

#if DEBUG:
#    params = {'xgbregressor__gamma':[0.001, 0.1, 1, 10, 100]}

#gd = GridSearchCV(estimator=pipeline, param_grid=params, cv=5, scoring=make_scorer(rmse,greater_is_better=False))
pipeline.fit(X_train, y_train)
xgboost = pipeline
y_pred = xgboost.predict(X_test)

## BEST PARAM(XGBOOST)

In [738]:
#gd.best_params_

## PREDICT (XGBOOST)

In [739]:
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False))

0.3589954545554082

## MAKE MODEL AND FIT (LIGHT BGM)

In [740]:
pipeline = make_pipeline(StandardScaler(), lgbm.LGBMRegressor())
params = {'lgbmregressor__min_child_samples':[10, 20, 50, 100],
          'lgbmregressor__max_depth':[2,4,6,8,10],
          'lgbmregressor__learning_rate':[0.0001, 0.001, 0.01],
          'lgbmregressor__n_estimators':[10, 100, 1000]}

if DEBUG:
    params = {'lgbmregressor__learning_rate':[0.0001, 0.001, 0.01],
              'lgbmregressor__min_child_samples':[2, 3, 5]}

gd = GridSearchCV(estimator=pipeline, param_grid=params, cv=5, scoring=make_scorer(rmse,greater_is_better=False))
gd.fit(X_train, y_train)
lightbgm = gd.best_estimator_
y_pred = lightbgm.predict(X_test)

rmse 0.40122162421664453
rmse 0.42720033811965286
rmse 0.3540793321060869
rmse 0.38366968562422454
rmse 0.36434102806106566
rmse 0.4012353632790429
rmse 0.4271955515742684
rmse 0.3540073156629852
rmse 0.38364956640918085
rmse 0.364328778006568
rmse 0.40120260444361605
rmse 0.42719076810381357
rmse 0.35401480840806476
rmse 0.38369149481214126
rmse 0.3643443024657898
rmse 0.3717764475868815
rmse 0.4010894417172063
rmse 0.32695327369872346
rmse 0.3583882967867607
rmse 0.3375835111462156
rmse 0.3719220172520329
rmse 0.40099785891526485
rmse 0.32676729121343356
rmse 0.358134467342223
rmse 0.3374842568475042
rmse 0.37134171536582283
rmse 0.40104167555508285
rmse 0.32677061305141103
rmse 0.35802615139901334
rmse 0.3375203300825635
rmse 0.19308070677296008
rmse 0.2357657875731462
rmse 0.1812499186300832
rmse 0.21038344107489687
rmse 0.18748914524434243
rmse 0.19400007520084495
rmse 0.23506638242077166
rmse 0.1805136892021905
rmse 0.2101870187538976
rmse 0.1876756376590744
rmse 0.19284796055081

## BEST PARAMS(LIGHTBGM)

In [741]:
gd.best_params_

{'lgbmregressor__learning_rate': 0.01, 'lgbmregressor__min_child_samples': 5}

## SCORE (LIGHTBGM)

In [742]:
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False))

0.4534298690369743

## PREDICT AND SUBMIT

In [743]:
test = df[df_train.shape[0]:].drop(['Id','SalePrice'], axis=1)
xgb_pred = np.exp(xgboost.predict(test))
lightbgm_pred = np.exp(lightbgm.predict(test))
test_pred = (xgb_pred + lightbgm_pred) / 2
submission = pd.DataFrame({
    "Id": test_ID,
    "SalePrice": test_pred
})
submission.to_csv(f'{PATH}submission.csv', index=False)