In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# def score_dataset(X, y, model=XGBRegressor()):
#     # Label encoding for categoricals
#     for colname in X.select_dtypes(["category", "object"]):
#         X[colname], _ = X[colname].factorize()
#     # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
#     score = cross_val_score(
#         model, X, y, cv=5, scoring="neg_mean_squared_log_error",
#     )
#     score = -1 * score.mean()
#     score = np.sqrt(score)
#     return score

In [11]:
# データを取得
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
all_df = pd.concat([train, test], sort=False).reset_index(drop=True)
all_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,


In [12]:
# 数値に変換していく
object_cols = []
# 文字列の特徴量を抽出
for col in all_df.columns:
    if(all_df[col].dtype == 'object'):
        object_cols.append(col)
# 変換
for col in object_cols:
    le = LabelEncoder()
    all_df[col].fillna('missing', inplace=True)
    all_df[col] = le.fit_transform(all_df[col])
    all_df[col] = all_df[col].astype('category')
    all_df[col] = all_df[col].astype(int)


all_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,2,3,3,0,...,0,3,4,4,0,2,2008,8,4,208500.0
1,2,20,3,80.0,9600,1,2,3,3,0,...,0,3,4,4,0,5,2007,8,4,181500.0
2,3,60,3,68.0,11250,1,2,0,3,0,...,0,3,4,4,0,9,2008,8,4,223500.0
3,4,70,3,60.0,9550,1,2,0,3,0,...,0,3,4,4,0,2,2006,8,0,140000.0
4,5,60,3,84.0,14260,1,2,0,3,0,...,0,3,4,4,0,12,2008,8,4,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,4,21.0,1936,1,2,3,3,0,...,0,3,4,4,0,6,2006,8,4,
2915,2916,160,4,21.0,1894,1,2,3,3,0,...,0,3,4,4,0,4,2006,8,0,
2916,2917,20,3,160.0,20000,1,2,3,3,0,...,0,3,4,4,0,9,2006,8,0,
2917,2918,85,3,62.0,10441,1,2,3,3,0,...,0,3,2,2,700,7,2006,8,4,


In [17]:
# 学習用データと計算用データに分ける
train_df_le = all_df.loc[all_df['SalePrice'].notnull()]
test_df_le = all_df.loc[all_df['SalePrice'].isnull()]

# 目的変数のlogをとる
train_df_le['SalePrice_log'] = np.log(train_df_le['SalePrice'])

# x = train_df.drop(['Id', 'SalePrice'], axis=1).values
# y = train_df['SalePrice'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_le['SalePrice_log'] = np.log(train_df_le['SalePrice'])


In [18]:
# K分割の定義
folds = 3
kf = KFold(n_splits=folds)

In [19]:
# データの分割
train_x = train_df_le.drop(['SalePrice', 'SalePrice_log', 'Id'], axis=1)
train_y = train_df_le['SalePrice_log']

In [56]:
xgb_params = {
    'learning_rate':0.05,
    'seed':42,
    'max_depth':6,
    'colsample_bytree':0.36,
    'subsample':0.56,
}

In [57]:
models_xgb = []
rmses_xgb = []
oof_xgb = np.zeros(len(train_x))

for train_index, val_index in kf.split(train_x):
    x_train = train_x.iloc[train_index]
    x_valid = train_x.iloc[val_index]
    y_train = train_y.iloc[train_index]
    y_valid = train_y.iloc[val_index]

    xgb_train = xgb.DMatrix(x_train, label=y_train)
    xgb_eval = xgb.DMatrix(x_valid, label=y_valid)
    evals = [(xgb_train,'train'), (xgb_eval,'eval')]

    # 予測
    my_model_xgb = xgb.train(xgb_params,
                             xgb_train,
                             evals=evals,
                             num_boost_round=1000,
                             early_stopping_rounds=20,
                             verbose_eval=False,
                            )


    # スコアの計算
    y_pred = my_model_xgb.predict(xgb_eval)

    tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print('tmp_rmse:',tmp_rmse)
    models_xgb.append(my_model_xgb)
    rmses_xgb.append(tmp_rmse)
    oof_xgb[val_index] = y_pred

print(np.mean(rmses_xgb))
    

tmp_rmse: 0.11289883571088243
tmp_rmse: 0.14219972175538131
tmp_rmse: 0.11381983223932775
0.1229727965685305


In [58]:
# 学習の関数化
def learn(i):
    models_xgb = []
    rmses_xgb = []
    oof_xgb = np.zeros(len(train_x))
    xgb_params['colsample_bytree'] = 0.355+i/1000

    for train_index, val_index in kf.split(train_x):
        x_train = train_x.iloc[train_index]
        x_valid = train_x.iloc[val_index]
        y_train = train_y.iloc[train_index]
        y_valid = train_y.iloc[val_index]

        xgb_train = xgb.DMatrix(x_train, label=y_train)
        xgb_eval = xgb.DMatrix(x_valid, label=y_valid)
        evals = [(xgb_train,'train'), (xgb_eval,'eval')]

        # 予測
        my_model_xgb = xgb.train(xgb_params,
                                xgb_train,
                                evals=evals,
                                num_boost_round=1000,
                                early_stopping_rounds=20,
                                verbose_eval=False,
                                )


        # スコアの計算
        y_pred = my_model_xgb.predict(xgb_eval)

        tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        # print('tmp_rmse:',tmp_rmse)
        models_xgb.append(my_model_xgb)
        rmses_xgb.append(tmp_rmse)
        oof_xgb[val_index] = y_pred

    print(str(i)+': '+str(np.mean(rmses_xgb)))

In [55]:
for i in range(1, 10):
    learn(i)

1: 0.1229727965685305
2: 0.1229727965685305
3: 0.1229727965685305
4: 0.1229727965685305
5: 0.1229727965685305
6: 0.1229727965685305


KeyboardInterrupt: 

In [59]:
np.mean(rmses_xgb)

0.1229727965685305

In [61]:
# 予測
test_x = test_df_le.drop(['SalePrice','Id'], axis=1)

xgb_test = xgb.DMatrix(test_x)
preds_xgb = []
for model in models_xgb:
    pred = model.predict(xgb_test)
    preds_xgb.append(pred)

In [62]:
preds_array_xgb = np.array(preds_xgb)
preds_mean_xgb = np.mean(preds_array_xgb, axis=0)
preds_exp_xgb = np.exp(preds_mean_xgb)

sub4 = pd.Series(data=preds_exp_xgb, index=test_df_le['Id'], name='SalePrice')
sub4.to_csv('./output/submission4.csv')