# 範例 : (Kaggle)房價預測
***
- 分數以網站評分結果為準, 請同學實際將提交檔(*.csv)上傳試試看  
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/submit

# [教學目標]
- 以下用房價預測資料, 觀查堆疊泛化 (Stacking) 的寫法與效果

# [範例重點]
- 觀察堆疊泛化的準確度 (In[14]), 是否比單一模型準確度為高 (In[11~13])
- 與前一日的混合泛化結果相比呢?  

In [14]:
import os
import pandas as pd
import numpy as np
import copy, time
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder

# Set data directory
dir_data = 'D:\Document\AI\Marathon100D\Assignment\Day_050\data'
# Set the full data file name
f_app_train = os.path.join(dir_data, 'house_train.csv.gz')
f_app_test = os.path.join(dir_data, 'house_test.csv.gz')

# Read CSV files to data frame
df_train = pd.read_csv(f_app_train)
df_test = pd.read_csv(f_app_test)

# Extract target data from training data frame and convert it to natural logarithm value
train_Y = np.log1p(df_train['SalePrice'])

# Create a data frame with primary key
ids = df_test['Id']
                       
# Create training data by dropping primary key and target column
df_train = df_train.drop(['Id', 'SalePrice'] , axis=1)
                       
# Create testing data by dropping primary key          
df_test = df_test.drop(['Id'] , axis=1)
                       
# Combining two data frame
df = pd.concat([df_train, df_test])
                       
# Show top few rows
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [15]:
# 檢查 DataFrame 空缺值的狀態
# Create a function to check ratio of missing data in a data frame
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
   
# Check missing data ratio in each column
na_check(df)

Unnamed: 0,Missing Ratio
PoolQC,99.657417
MiscFeature,96.402878
Alley,93.216855
Fence,80.438506
FireplaceQu,48.646797
LotFrontage,16.649538
GarageFinish,5.447071
GarageYrBlt,5.447071
GarageQual,5.447071
GarageCond,5.447071


In [16]:
# 以下 In[3]~In[7] 只是摘錄房產預測中的一組特徵工程, 並以此組特徵工程跑參數, 若更換其他特徵工程, In[8]的參數需要重新跑
# 部分欄位缺值補 'None'
# Create an array to list all columns with string data type
none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'FireplaceQu', 'FireplaceQu', 'FireplaceQu', 
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
             'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'Functional', 'MSSubClass']
# Loop through all elements of the array
for col in none_cols:
    # Fill null value with "None"
    df[col] = df[col].fillna("None")
    
# 部分欄位缺值填補 0
# Create an array to list all columns with numberic data type
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 
             'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']

# Loop through all elements of the array
for col in zero_cols:
     # Fill null value with 0
    df[col] = df[col].fillna(0)

In [17]:
# 部分欄位缺值補眾數
# Create an array of columns that will be using mode to fill up null value
mode_cols = ['MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType']
# Loop through all element in the array
for col in mode_cols:
     # Fill up empty cell with mode value
    df[col] = df[col].fillna(df[col].mode()[0])
    
# 'LotFrontage' 有空缺時, 以同一區 (Neighborhood) 的 LotFrontage 中位數填補 (可以視為填補一種群聚編碼 )
# Fill up the missing data using median value
df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

# Utilities 參考資訊很少, 所以直接捨棄
# Drop column
df = df.drop(['Utilities'], axis=1)

In [18]:
# 做完各種補缺值, 確認一下有沒有遺漏
# Check missing data again, you should not see any data returned any more.
na_check(df)

Unnamed: 0,Missing Ratio


In [19]:
# 四個數值欄位, 因為相異值有限, 轉成文字

# Create an array with columns of numeric data type and limited distinct values
label_cols = ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
# Loop through all element in the array
for col in mode_cols:
    # Convert data type to string
    df[col] = df[col].astype(str)
    
# 相異值不太具有代表性的, 做標籤編碼
# Create an array with column names
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# Loop through all columns
for c in cols:
    # Create label encoder
    lbl = LabelEncoder() 
    # Fit the label encorder
    lbl.fit(list(df[c].values)) 
    # Transform the value
    df[c] = lbl.transform(list(df[c].values))
    
# 由地下室面積 + 1樓面積 + 2樓面積, 計算總坪數特徵 
# Create a new column based on add-up of 3 columns
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']    
    
# 看看目前特徵工程的結果
# Verify the data frames rows and columns
print('Shape df: {}'.format(df.shape))

# Show top N rows
df.head()

Shape df: (2919, 79)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotalSF
0,5,RL,65.0,8450,1,1,3,Lvl,Inside,0,...,0,3,4,,0,1,2,WD,Normal,2566.0
1,0,RL,80.0,9600,1,1,3,Lvl,FR2,0,...,0,3,4,,0,4,1,WD,Normal,2524.0
2,5,RL,68.0,11250,1,1,0,Lvl,Inside,0,...,0,3,4,,0,8,2,WD,Normal,2706.0
3,6,RL,60.0,9550,1,1,0,Lvl,Corner,0,...,0,3,4,,0,1,0,WD,Abnorml,2473.0
4,5,RL,84.0,14260,1,1,0,Lvl,FR2,0,...,0,3,4,,0,11,2,WD,Normal,3343.0


In [20]:
# 把剩下少數重要的類別型欄位, 做獨熱編碼 (已變成數字的欄位, 會自動跳過)
# Convert categorical variable into dummy/indicator variables
df = pd.get_dummies(df)
print(df.shape)

(2919, 221)


In [21]:
# 將前述轉換完畢資料 df , 重新切成 train_X, test_X
# Get the row count of data frame
train_num = train_Y.shape[0]

# Create training data from first row till last row (1460)
train_X = df[:train_num]

# Create test data from last row till first row ( 1460 )
test_X = df[train_num:]


# 使用三種模型 : 線性迴歸 / 梯度提升機 / 隨機森林, 參數使用 Random Search 尋找
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Create a linear regression model
linear = LinearRegression(normalize=False, fit_intercept=True, copy_X=True)

# Create a gradient boosting regression model
gdbt = GradientBoostingRegressor(tol=0.1, subsample=0.37, n_estimators=200, max_features=20, 
                                 max_depth=6, learning_rate=0.03)

# Create a random forest regressor model
rf = RandomForestRegressor(n_estimators=300, min_samples_split=9, min_samples_leaf=10, 
                           max_features='sqrt', max_depth=8, bootstrap=False)

In [22]:
# 線性迴歸預測檔 (結果有部分隨機, 請以 Kaggle 計算的得分為準, 以下模型同理)
# Train linear regression model
linear.fit(train_X, train_Y)

# Test linear regresson model
linear_pred = linear.predict(test_X)

# Create a data frame based on primary key and the predicted target value
sub = pd.DataFrame({'Id': ids, 'SalePrice': np.expm1(linear_pred)})

# Export data to CSV.
sub.to_csv('house_linear.csv', index=False) 

In [23]:
# 梯度提升機預測檔 
# Train the gradient boosting model
gdbt.fit(train_X, train_Y)

# Test the gradient boosting model
gdbt_pred = gdbt.predict(test_X)

# Create a data frame based on primary key with the predicted data from testing
sub = pd.DataFrame({'Id': ids, 'SalePrice': np.expm1(gdbt_pred)})

# Export data frame to CSV file
sub.to_csv('house_gdbt.csv', index=False)

In [24]:
# 隨機森林預測檔 
# Train the random forest classifier model
rf.fit(train_X, train_Y)

# Test the random forest classifier model
rf_pred = rf.predict(test_X)

# Create a data frame based on primary key with the predicted data from testing
sub = pd.DataFrame({'Id': ids, 'SalePrice': np.expm1(rf_pred)})

# Export data frame to CSV file
sub.to_csv('house_rf.csv', index=False)

In [26]:
# 堆疊泛化套件 mlxtend, 需要先行安裝(使用 pip 安裝即可)在執行環境下
# pip install mlxtend
# conda install mlxtend --channel conda-forge ( If you get SSL error, reinstall Anaconda)
from mlxtend.regressor import StackingRegressor

# 因為 Stacking 需要以模型作為第一層的特徵來源, 因此在 StackingRegressor 中,
# 除了要設本身(第二層)的判定模型 - meta_regressor, 也必須填入第一層的單模作為編碼器 - regressors
# 這裡第二層模型(meta_regressor)的參數, 一樣也需要用 Grid/Random Search, 請參閱講義中的 mlxtrend 網頁
# Create a meta esitmator
meta_estimator = GradientBoostingRegressor(tol=10, subsample=0.44, n_estimators=100, 
                                           max_features='log2', max_depth=4, learning_rate=0.1)
# Create a stacking model based on 3 models
stacking = StackingRegressor(regressors=[linear, gdbt, rf], meta_regressor=meta_estimator)

In [27]:
# 堆疊泛化預測檔 : 分數會依每次執行略有出入, 但通常 Public Score(競賽中的提交分數) 會再比單模好一些
# 雖然 Public Score 有可能比 Blending 分數略差, 但是因為不用依賴仔細調整的權重參數, 競賽結束時的 Private Score, 通常會比 Blending 好
# (因為權重依賴於 Public 的分數表現), 這種在未知 / 未曝光資料的預測力提升, 就是我們所謂 "泛化能力比較好" 在競賽/專案中的含意
# Train the stacking model
stacking.fit(train_X, train_Y)

# Test the stacking model
stacking_pred = stacking.predict(test_X)

# Extract predicted data
sub = pd.DataFrame({'Id': ids, 'SalePrice': np.expm1(stacking_pred)})

# Export the predicted data to CSV file
sub.to_csv('house_stacking.csv', index=False)