In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from category_encoders.target_encoder import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, mean_squared_error
import lightgbm as lgb

In [2]:
#データ取り込み
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submission.csv")

In [3]:
#特徴量のラベルの日本語訳データ
label = pd.read_csv("column_labels.csv")

In [4]:
#特徴量ラベルを英語から日本語に変更(視覚的にわかりやすくするため)
eng_to_jpn = dict(zip(label["English"], label["Japanese"]))

train_jpn = train.rename(columns=eng_to_jpn)
test_jpn = test.rename(columns=eng_to_jpn)

In [5]:
train_jpn.head()

Unnamed: 0,ID,建物クラス,ゾーニング（用途地域）,道路に面する距離,敷地面積,接道の種類,路地の種類,区画形状,地勢,インフラ整備状況,...,プール面積,プールの品質,フェンスの種類,その他の特徴,その他特徴の価値,販売月,販売年,販売タイプ,販売条件,販売価格
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
train_jpn.describe()

Unnamed: 0,ID,建物クラス,道路に面する距離,敷地面積,全体的な品質評価,全体的な状態評価,建築年,改築年,石材仕上げの面積,地下室仕上げ面積1,...,ウッドデッキ面積,オープンポーチ面積,囲いポーチ面積,3シーズンポーチ面積,スクリーンポーチ面積,プール面積,その他特徴の価値,販売月,販売年,販売価格
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [7]:
trainX = train_jpn.drop(["販売価格"],axis=1)
y = train_jpn["販売価格"]
testX = test_jpn.copy()

In [8]:
#欠損値補完　[数値→中央値、文字列やカテゴリ→"missing"]　それぞれ補完 (train の中央値で補完)
numeric_cols = trainX.select_dtypes(include="number").columns
imputer = SimpleImputer(strategy='median')
trainX[numeric_cols] = imputer.fit_transform(trainX[numeric_cols])
testX[numeric_cols] = imputer.transform(testX[numeric_cols])

cat_cols = train_jpn.select_dtypes(include="object").columns
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
trainX[cat_cols] = cat_imputer.fit_transform(trainX[cat_cols])
testX[cat_cols] = cat_imputer.transform(testX[cat_cols])

In [9]:
missing = trainX.isnull().sum()
missing_cols = missing[missing > 0]
print(missing_cols)

missing_test = testX.isnull().sum()
missing_coltest = missing_test[missing_test > 0]
print(missing_coltest)

Series([], dtype: int64)
Series([], dtype: int64)


In [10]:
#カテゴリ変数系の特徴量のリスト化
quality = ["全体的な品質評価","全体的な状態評価","外装の品質","外装の状態","地下室の高さ評価","地下室の状態評価","暖房の品質と状態"
            ,"キッチンの品質","暖炉の品質","ガレージの品質","ガレージの状態","プールの品質","その他特徴の価値"]
#カテゴリ特徴量の値の分布確認
for col in quality:
    print(f"\n★ {col} の値の分布")
    print(trainX[col].value_counts(dropna=False))


★ 全体的な品質評価 の値の分布
全体的な品質評価
5.0     397
6.0     374
7.0     319
8.0     168
4.0     116
9.0      43
3.0      20
10.0     18
2.0       3
1.0       2
Name: count, dtype: int64

★ 全体的な状態評価 の値の分布
全体的な状態評価
5.0    821
6.0    252
7.0    205
8.0     72
4.0     57
3.0     25
9.0     22
2.0      5
1.0      1
Name: count, dtype: int64

★ 外装の品質 の値の分布
外装の品質
TA    906
Gd    488
Ex     52
Fa     14
Name: count, dtype: int64

★ 外装の状態 の値の分布
外装の状態
TA    1282
Gd     146
Fa      28
Ex       3
Po       1
Name: count, dtype: int64

★ 地下室の高さ評価 の値の分布
地下室の高さ評価
TA         649
Gd         618
Ex         121
missing     37
Fa          35
Name: count, dtype: int64

★ 地下室の状態評価 の値の分布
地下室の状態評価
TA         1311
Gd           65
Fa           45
missing      37
Po            2
Name: count, dtype: int64

★ 暖房の品質と状態 の値の分布
暖房の品質と状態
Ex    741
TA    428
Gd    241
Fa     49
Po      1
Name: count, dtype: int64

★ キッチンの品質 の値の分布
キッチンの品質
TA    735
Gd    586
Ex    100
Fa     39
Name: count, dtype: int64

★ 暖炉の品質 の値の分布
暖炉の品質
missing   

In [11]:
#数値の特徴量名と大半が欠損値の特徴量名を除いてリスト化する
quality_cols = ["外装の品質","外装の状態","地下室の高さ評価","地下室の状態評価","暖房の品質と状態"
                ,"キッチンの品質","暖炉の品質","ガレージの品質","ガレージの状態"]

#大半が欠損値の"プールの品質"特徴量を削除
trainX = trainX.drop(["プールの品質"],axis=1)
testX = testX.drop(["プールの品質"],axis=1)

In [12]:
#カテゴリ変数特徴量を順序エンコーディング
for col in quality_cols:
    trainX[col] = trainX[col].map({"Ex":5,"Gd":4,"TA":3,"Fa":2,"Po":1,"NA":0,"missing":0})

for col in quality_cols:
    testX[col] = testX[col].map({"Ex":5,"Gd":4,"TA":3,"Fa":2,"Po":1,"NA":0,"missing":0})

In [13]:
#object特徴量名をリスト化
cat_cols = trainX.select_dtypes(include=["object"]).columns

#OOFターゲットエンコーディング
kf = KFold(n_splits=5, shuffle=True, random_state=42)

train_encoded = pd.DataFrame(index=trainX.index)
test_encoded  = pd.DataFrame(index=testX.index)

for col in cat_cols:
    oof_vals   = np.zeros(len(trainX))
    test_vals  = np.zeros(len(testX))
    
    for train_idx, valid_idx in kf.split(trainX):
        te = TargetEncoder(cols=[col], smoothing=5.0)
        te.fit(trainX.loc[train_idx, [col]], y.iloc[train_idx])
        
        oof_vals[valid_idx] = te.transform(trainX.loc[valid_idx, [col]])[col].values
        test_vals += te.transform(testX[[col]])[col].values / kf.n_splits
    
    train_encoded[col + '_enc'] = oof_vals
    test_encoded[col + '_enc']  = test_vals

trainX_enc = pd.concat([trainX.reset_index(drop=True), train_encoded], axis=1)
testX_enc  = pd.concat([testX.reset_index(drop=True), test_encoded], axis=1)

In [14]:
#面積についての特徴量名をリスト化
area_cols = ['敷地面積','石材仕上げの面積','地下室仕上げ面積1','地下室仕上げ面積2','未仕上げ地下面積','地下室総面積','1階の面積','2階の面積',
'低品質仕上げ面積','地上居住面積','地上居住面積','ガレージ面積','ウッドデッキ面積','オープンポーチ面積','3シーズンポーチ面積','スクリーンポーチ面積',
 'プール面積']

In [15]:
#0の割合を出力
for col in area_cols:
    zero_ratio = (trainX_enc[col] == 0).mean()
    print(f'{col}: {zero_ratio:.2%} が 0')

敷地面積: 0.00% が 0
石材仕上げの面積: 59.52% が 0
地下室仕上げ面積1: 31.99% が 0
地下室仕上げ面積2: 88.56% が 0
未仕上げ地下面積: 8.08% が 0
地下室総面積: 2.53% が 0
1階の面積: 0.00% が 0
2階の面積: 56.78% が 0
低品質仕上げ面積: 98.22% が 0
地上居住面積: 0.00% が 0
地上居住面積: 0.00% が 0
ガレージ面積: 5.55% が 0
ウッドデッキ面積: 52.12% が 0
オープンポーチ面積: 44.93% が 0
3シーズンポーチ面積: 98.36% が 0
スクリーンポーチ面積: 92.05% が 0
プール面積: 99.52% が 0


In [16]:
#0が80％以上の特徴量について0,1に変えて、あるかないかという特徴量に変換（バイナリ化）
sparse_cols = []
for col in area_cols:
    zero_ratio = (trainX_enc[col] == 0).mean()
    if zero_ratio >= 0.8:
        new_col = f'{col}_flag'
        trainX_enc[new_col] = trainX_enc[col].apply(lambda x: 1 if x > 0 else 0)
        testX_enc[new_col] = testX_enc[col].apply(lambda x: 1 if x > 0 else 0)
        sparse_cols.append(col)

In [17]:
#改築済みかどうかを特徴量化
trainX_enc['改築済み'] = (trainX_enc['建築年'] != trainX_enc['改築年']).astype(int)
testX_enc['改築済み'] = (testX_enc['建築年'] != testX_enc['改築年']).astype(int)

In [18]:
#建築年数、改築年数、改築が10年以内のものをそれぞれ特徴量化
trainX_enc['建築年数'] = trainX_enc['販売年'] - trainX_enc['建築年']

trainX_enc['改築年数'] = trainX_enc['販売年'] - trainX_enc['改築年']

trainX_enc['最近改築した'] = (trainX_enc['改築年数'] <= 10).astype(int)

testX_enc['建築年数'] = testX_enc['販売年'] - testX_enc['建築年']

testX_enc['改築年数'] = testX_enc['販売年'] - testX_enc['改築年']

testX_enc['最近改築した'] = (testX_enc['改築年数'] <= 10).astype(int)

#建築年、改築年特徴量を削除
trainX_enc_plus = trainX_enc.drop(["建築年","改築年"],axis=1)
testX_enc_plus = testX_enc.drop(["建築年","改築年"],axis=1)

In [19]:
#不要な特徴量を削除
cols_to_drop = sparse_cols + list(cat_cols)

trainX_eng = trainX_enc.drop(cols_to_drop, axis=1)
testX_eng  = testX_enc.drop(cols_to_drop, axis=1)

In [20]:
y.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: 販売価格, dtype: float64

In [21]:
#交差検証+GradientBoostingRegressor
kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(trainX_eng)):
    X_train, X_valid = trainX_eng.iloc[train_idx], trainX_eng.iloc[val_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]

    model_GBR = GradientBoostingRegressor()
    model_GBR.fit(X_train, y_train)

    y_pred = model_GBR.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    rmse_scores.append(rmse)

    print(f"Fold {fold + 1} - RMSE: {rmse:.4f}")

print(f"\nAverage RMSE: {np.mean(rmse_scores):.4f}")

Fold 1 - RMSE: 27184.5146
Fold 2 - RMSE: 22479.8979
Fold 3 - RMSE: 45776.0312
Fold 4 - RMSE: 25434.0773
Fold 5 - RMSE: 22229.1880

Average RMSE: 28620.7418


In [22]:
#GradientBoostingRegressor+グリッドサーチ

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmse, greater_is_better=False)

model = GradientBoostingRegressor()
param_grid =  {'max_depth': [3, 5,7],'learning_rate': [0.05,0.1, 0.3],'n_estimators': [50, 100,200]}

gcv = GridSearchCV(estimator=model,param_grid=param_grid,scoring=rmse_scorer,cv=5,n_jobs=-1,verbose=1)

gcv.fit(trainX_eng, y)

best_model = gcv.best_estimator_
y_pred = best_model.predict(trainX_eng)
rmse_train = rmse(y, y_pred)

print(f"\nBest parameters: {gcv.best_params_}")
print(f"Train RMSE: {rmse_train:.4f}")


Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Train RMSE: 14287.4764


In [23]:
pred_gcv_GBR = gcv.predict(testX_eng)

In [24]:
#GradientBoostingRegressor+グリッドサーチ+交差検証
rmse_scores = []
for fold, (train_idx, val_idx) in enumerate(kf.split(trainX_eng)):
    X_train, X_valid = trainX_eng.iloc[train_idx], trainX_eng.iloc[val_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]

    model = GradientBoostingRegressor()
    param_grid =  {'max_depth': [3,5,7],'learning_rate': [0.05,0.1,0.3],'n_estimators': [50,100,200]}
    
    gcv_GBR = GridSearchCV(estimator=model,param_grid=param_grid,scoring=rmse_scorer,cv=kf,n_jobs=-1,verbose=1)
    
    gcv_GBR.fit(X_train, y_train)
    
    best_model = gcv_GBR.best_estimator_
    y_pred = best_model.predict(X_valid)
    rmse_fold = rmse(y_valid, y_pred)

    rmse_scores.append(rmse_fold)
    print(f"Fold {fold + 1} - RMSE: {rmse_fold:.4f}")
    print(f"\nBest parameters: {gcv_GBR.best_params_}")

print(f"\nAverage RMSE: {np.mean(rmse_scores):.4f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fold 1 - RMSE: 26442.5641

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fold 2 - RMSE: 24539.7064

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fold 3 - RMSE: 46844.7838

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fold 4 - RMSE: 25430.3160

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fold 5 - RMSE: 21875.9563

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}

Average RMSE: 29026.6653


In [25]:
pred_kfold = gcv_GBR.predict(testX_eng)

In [26]:
#XGBoost+グリッドサーチ
model_xgb = XGBRegressor()

gcv_xgb = GridSearchCV(estimator=model_xgb,param_grid=param_grid,scoring=rmse_scorer,cv=5,n_jobs=-1,verbose=1)

gcv_xgb.fit(trainX_eng, y)

best_xgb = gcv_xgb.best_estimator_
y_pred = best_xgb.predict(trainX_eng)
rmse_xgb = rmse(y, y_pred)

print(f"\nBest parameters: {gcv_xgb.best_params_}")
print(f"Train RMSE: {rmse_xgb:.4f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Train RMSE: 11687.6516


In [27]:
pred_xgb = best_xgb.predict(testX_eng)

In [28]:
#CatBoostRegressor+グリッドサーチ
model_cat = CatBoostRegressor(silent=True)

gcv_cat = GridSearchCV(estimator=model_cat,param_grid=param_grid,scoring=rmse_scorer,cv=5,n_jobs=-1,verbose=1)

gcv_cat.fit(trainX_eng,y)

best_cat = gcv_cat.best_estimator_
y_pred = best_cat.predict(trainX_eng)
rmse_cat = rmse(y, y_pred)

print(f"\nBest parameters: {gcv_cat.best_params_}")
print(f"Train RMSE: {rmse_cat:.4f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Train RMSE: 12689.8644


In [29]:
pred_cat = best_cat.predict(testX_eng)

In [30]:
#lgb.LGBMRegressor+グリッドサーチ
model_lgb = lgb.LGBMRegressor()

gcv_lgb = GridSearchCV(estimator=model_lgb,param_grid=param_grid,scoring=rmse_scorer,cv=5,n_jobs=-1,verbose=1)

gcv_lgb.fit(trainX_eng,y)

best_lgb = gcv_lgb.best_estimator_
y_pred = best_lgb.predict(trainX_eng)
rmse_lgb = rmse(y, y_pred)

print(f"\nBest parameters: {gcv_lgb.best_params_}")
print(f"Train RMSE: {rmse_lgb:.4f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001722 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4354
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 82
[LightGBM] [Info] Start training from score 180921.195890

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Train RMSE: 14237.4142


In [31]:
pred_lgb = best_lgb.predict(testX_eng)

In [32]:
# LBに投稿、各モデルのスコアの結果からアンサンブル実施のための重み計算
scores = {
    'LGB': 0.13166,
    'CAT': 0.12762,
    'XGB': 0.13109,
    'GBR': 0.13224
}

inv_scores = {model: 1 / score for model, score in scores.items()}
total = sum(inv_scores.values())
weights = {model: round(val / total, 3) for model, val in inv_scores.items()}

print(weights)

{'LGB': 0.248, 'CAT': 0.256, 'XGB': 0.249, 'GBR': 0.247}


In [33]:
ensemble_pred = (pred_gcv_GBR * 0.247 + pred_lgb * 0.248 + pred_cat * 0.256 + pred_xgb * 0.249)

In [34]:
#LassoCV
model_lasso = LassoCV(cv=5,max_iter=10000,random_state=42)

model_lasso.fit(trainX_eng,y)

pred_lasso = model_lasso.predict(testX_eng)

In [36]:
y_pred_la = model_lasso.predict(trainX_eng)
rmse_la = rmse(y, y_pred_la)

print(f"Train RMSE: {rmse_la:.4f}")

Train RMSE: 36276.7635


In [37]:
from sklearn.linear_model import Ridge

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ベースモデル
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

base_models = [
    LGBMRegressor(),
    CatBoostRegressor(verbose=0),
    XGBRegressor(verbosity=0),
    GradientBoostingRegressor()
]

meta_model = Ridge()

# 各ベースモデルの out-of-fold 予測値
oof_preds = np.zeros((trainX_eng.shape[0], len(base_models)))
test_preds = np.zeros((testX_eng.shape[0], len(base_models)))

for i, model in enumerate(base_models):
    test_preds_i = np.zeros((testX_eng.shape[0], kf.n_splits))
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(trainX_eng)):
        X_train, X_val = trainX_eng.iloc[train_idx], trainX_eng.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        oof_preds[val_idx, i] = model.predict(X_val)
        test_preds_i[:, fold] = model.predict(testX_eng)
    
    # 各foldの平均予測
    test_preds[:, i] = test_preds_i.mean(axis=1)

# メタモデルを学習（oof_predsを特徴量として）
meta_model.fit(oof_preds, y)

# 学習データのRMSE
train_rmse = np.sqrt(mean_squared_error(y, meta_model.predict(oof_preds)))
print(f"Stacking Train RMSE: {train_rmse:.4f}")

# テストデータに対する最終予測
final_test_preds = meta_model.predict(test_preds)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000943 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3960
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 82
[LightGBM] [Info] Start training from score 181441.541952
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3957
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 82
[LightGBM] [Info] Start training from score 179651.292808
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3943
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 81
[LightGBM] [Info] St

In [38]:
param_meta = {'alpha': [0.1,1.0,10.0,100.0]}
meta_model_gcv = GridSearchCV(Ridge(), param_grid=param_meta, scoring='neg_root_mean_squared_error', cv=5)

# メタモデルを学習+グリッドサーチ（oof_predsを特徴量として）
meta_model_gcv.fit(oof_preds, y)

# 学習データのRMSE
train_rmse_gcv = np.sqrt(mean_squared_error(y, meta_model_gcv.predict(oof_preds)))
print(f"Stacking Train RMSE: {train_rmse_gcv:.4f}")

# テストデータに対する最終予測
gcv_test_preds = meta_model_gcv.predict(test_preds)

Stacking Train RMSE: 27583.6638


In [39]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
#LinearRegressionとRidge比較
meta_lr = make_pipeline(StandardScaler(with_mean=False),
                        LinearRegression())

cv_rmse = -cross_val_score(meta_lr, oof_preds, y,
                           scoring='neg_root_mean_squared_error',
                           cv=5).mean()

print(f"Meta LinearRegression CV RMSE: {cv_rmse:.5f}")

Meta LinearRegression CV RMSE: 27839.67741


In [40]:
meta_R = make_pipeline(StandardScaler(with_mean=False),
                        Ridge(alpha=1.0))

cv_rmse = -cross_val_score(meta_R, oof_preds, y,
                           scoring='neg_root_mean_squared_error',
                           cv=5).mean()

print(f"Meta Ridge CV RMSE: {cv_rmse:.5f}")

Meta Ridge CV RMSE: 27805.15624


In [86]:
meta_R.fit(oof_preds, y)
train_rmse_R = np.sqrt(mean_squared_error(y, meta_R.predict(oof_preds)))
print(f"Stacking Train RMSE: {train_rmse_R:.4f}")

# テストデータに対する最終予測
R_test_preds = meta_R.predict(test_preds)

Stacking Train RMSE: 27585.3713


In [88]:
sample["SalePrice"] = R_test_preds
sample.to_csv("submission_stacking_R.csv",index=None)

In [42]:
trainX_eng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 83 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1460 non-null   float64
 1   建物クラス             1460 non-null   float64
 2   道路に面する距離          1460 non-null   float64
 3   敷地面積              1460 non-null   float64
 4   全体的な品質評価          1460 non-null   float64
 5   全体的な状態評価          1460 non-null   float64
 6   建築年               1460 non-null   float64
 7   改築年               1460 non-null   float64
 8   石材仕上げの面積          1460 non-null   float64
 9   外装の品質             1460 non-null   int64  
 10  外装の状態             1460 non-null   int64  
 11  地下室の高さ評価          1460 non-null   int64  
 12  地下室の状態評価          1460 non-null   int64  
 13  地下室仕上げ面積1         1460 non-null   float64
 14  未仕上げ地下面積          1460 non-null   float64
 15  地下室総面積            1460 non-null   float64
 16  暖房の品質と状態          1460 non-null   int64  
