In [None]:
import pandas as pd
# Directory holding the competition CSV files.
DATA_DIR = "../input/house-prices-advanced-regression-techniques"

# Read the training and test sets into data frames.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Report the (rows, columns) shape of each frame.
print('train shape:', train.shape)
print('test shape:', test.shape)


In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
train.info()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew
%matplotlib inline
# Build a frame holding the raw SalePrice next to its natural-log
# transform log(1 + SalePrice) so the two distributions can be compared.
prices = pd.DataFrame({'price':train['SalePrice'],
    'log(price + 1)':np.log1p(train['SalePrice'])})
# Finish with a real newline escape. The previous literal '¥n' was a
# mis-encoded backslash sequence and printed the characters "¥n" verbatim.
print(prices, '\n')
# Report the skewness of the price before and after the log transform.
print('price skew :', skew(prices['price']))
print('log(price+1) skew:', skew(prices['log(price + 1)']))
# Draw histograms of both columns.
# Set the default figure size used for the plots.
plt.rcParams['figure.figsize'] = (12.0, 6.0)
prices.hist()


In [None]:
# Overwrite SalePrice with log(1 + SalePrice); later cells use this column as the target.
# NOTE(review): the column is replaced in place, so re-running this cell applies the log again.
train["SalePrice"] = np.log1p(train["SalePrice"])

In [None]:
# Column range shared by both frames: every column from 'MSSubClass'
# through 'SaleCondition' (label-based, both ends inclusive).
feature_span = slice('MSSubClass', 'SaleCondition')

# Stack the training rows on top of the test rows.
all_data = pd.concat([train.loc[:, feature_span],
                      test.loc[:, feature_span]])

# Show the shape and contents of the combined frame.
print(all_data.shape)
print(all_data)


In [None]:
from scipy.stats import skew
# Names of the columns whose dtype is not object.
non_object = all_data.dtypes != "object"
numeric_feats = all_data.dtypes[non_object].index
print('-----Column of non-object type-----')
print(numeric_feats)


def _skew_without_nan(column):
    """Return the skewness of one column after dropping missing values."""
    return skew(column.dropna())


# Skewness of each of those columns, measured on the training frame.
feat_skewness = train[numeric_feats].apply(_skew_without_nan)
print('-----Skewness of non-object type column-----')
print(feat_skewness)

# Keep only the columns whose skewness exceeds 0.75.
high_skew = feat_skewness[feat_skewness > 0.75]
print('-----Skewness greater than 0.75-----')
print(high_skew)

# Apply log(1 + x) to the selected columns of the combined frame.
skewed_feats = high_skew.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data[skewed_feats]  # display the transformed columns


In [None]:
# One-hot encode 'LotShape' and append the original column next to the
# indicator columns so the encoding can be checked by eye.
lot_shape = train['LotShape']
cc_data = pd.get_dummies(lot_shape).assign(LotShape=lot_shape)

# Display the first 20 rows.
cc_data.head(20)


In [None]:
# Convert the categorical columns of the combined frame into indicator (dummy) columns.
all_data = pd.get_dummies(all_data)

In [None]:
# Fill missing values with per-column means taken from the first
# train.shape[0] rows only, i.e. the training part of the combined frame.
all_data = all_data.fillna(all_data[:train.shape[0]].mean())

In [None]:
# The combined frame holds the training rows first, then the test rows.
n_train = train.shape[0]
X_train = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]

# Target: the SalePrice column of the training frame.
y = train["SalePrice"]


In [None]:
from sklearn.model_selection import cross_val_score
def rmse_cv(model):
    """Cross-validated root mean squared error of a model.

    Evaluates ``model`` on the module-level ``X_train`` / ``y`` with
    5-fold cross-validation.

    Parameters:
        model(obj): Model object
    Returns:
        Array with one RMSE value per fold.
    """
    # The scorer returns the *negative* mean squared error for each fold.
    neg_mse = cross_val_score(model, X_train, y,
                              scoring="neg_mean_squared_error",
                              cv=5)
    # Flip the sign and take the square root to obtain RMSE.
    return np.sqrt(-neg_mse)


In [None]:
from sklearn.linear_model import Ridge

# Ridge regression model with default settings.
model_ridge = Ridge()

# Ten candidate L2 regularization strengths.
alphas = [0.05, 0.1, 0.5, 1, 5, 10, 15, 30, 50, 75]

# For every strength, cross-validate a Ridge model with rmse_cv and keep
# the mean of the per-fold RMSE values.
cv_ridge = []
for alpha in alphas:
    fold_rmse = rmse_cv(Ridge(alpha=alpha))
    cv_ridge.append(fold_rmse.mean())

# Turn the scores into a Series indexed by alpha.
cv_ridge = pd.Series(cv_ridge, index=alphas)

# Print the score for each alpha.
print('Ridge RMSE loss:')
print(cv_ridge, '\n')
# Print the mean of those scores.
print('Ridge RMSE loss Mean:')
print(cv_ridge.mean())

# Plot RMSE against regularization strength.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(cv_ridge)
ax.grid()
ax.set_title('Validation - by regularization strength')
ax.set_xlabel('Alpha')
ax.set_ylabel('RMSE')
plt.show()


In [None]:
from sklearn.linear_model import LassoCV

# Fit a Lasso regression model; LassoCV chooses the regularization
# strength from the four candidate alphas below.
model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005]).fit(X_train, y)

print('Lasso regression RMSE loss:')
# Cross-validate once and reuse the scores. The earlier version called
# rmse_cv(model_lasso) three times, refitting the model for every fold on
# each call only to print the same numbers.
lasso_rmse = rmse_cv(model_lasso)
print(lasso_rmse)                                   # per-fold RMSE

print('Average loss:', lasso_rmse.mean())           # mean RMSE over folds
print('Minimum loss:', lasso_rmse.min())            # smallest per-fold RMSE
print('Best alpha :', model_lasso.alpha_)           # alpha selected by LassoCV


In [None]:
import xgboost as xgb

# Booster settings: maximum tree depth 3, learning rate (eta) 0.1.
params = dict(max_depth=3, eta=0.1)

# Wrap the training features and target in xgboost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y)

# Cross-validate with up to 1000 boosting rounds and an early-stopping
# patience of 50 rounds.
cross_val = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
)
cross_val


In [None]:
# Plot the mean train / validation RMSE from row label 30 onward.
fig, ax = plt.subplots(figsize=(8, 6))
train_curve = cross_val.loc[30:, ["train-rmse-mean"]]
valid_curve = cross_val.loc[30:, ["test-rmse-mean"]]
ax.plot(train_curve, linestyle='--', label='Train')
ax.plot(valid_curve, label='Validation')
ax.grid()
ax.set_xlabel('num_boost_round')
ax.set_ylabel('RMSE')
ax.legend()
plt.show()


In [None]:
# Regressor settings: 410 trees, depth 3, learning rate 0.1.
xgb_settings = {
    "n_estimators": 410,
    "max_depth": 3,
    "learning_rate": 0.1,
}
model_xgb = xgb.XGBRegressor(**xgb_settings)
model_xgb.fit(X_train, y)

# Print the mean cross-validated RMSE.
print('xgboost RMSE loss:')
xgb_rmse = rmse_cv(model_xgb)
print(xgb_rmse.mean())


In [None]:
# Raw model outputs for the test rows.
lasso_log_preds = model_lasso.predict(X_test)
xgb_log_preds = model_xgb.predict(X_test)

# Undo the log1p applied to the target by taking exp(x) - 1.
lasso_preds = np.expm1(lasso_log_preds)
xgb_preds = np.expm1(xgb_log_preds)


In [None]:
# Blend the two prediction arrays with fixed weights: 0.7 for Lasso, 0.3 for XGBoost.
preds = lasso_preds * 0.7 + xgb_preds * 0.3

In [None]:
# Pair each test Id with its blended prediction and write the result
# to a CSV file without the frame's index column.
submission_columns = {"id": test.Id, "SalePrice": preds}
solution = pd.DataFrame(submission_columns)
solution.to_csv("ensemble_sol.csv", index=False)
