資料來源：https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview 

# Import packages

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the competition training set from the working directory and print a
# per-column summary (non-null counts and dtypes).
raw_data = pd.read_csv("train.csv")
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
# Drop columns with too many missing values: a column is kept only when at
# least half of the rows hold a non-null value.
min_non_null = int(round(len(raw_data) * 0.5))
raw_data = raw_data.dropna(axis="columns", thresh=min_non_null)

In [None]:
# Define the key column, the target, and the two feature groups.
PK = "Id"
target = "SalePrice"

# Every column except the key and the target is a candidate feature.
candidate_columns = [column for column in raw_data.columns if column not in (PK, target)]

# Non-object dtypes are treated as numerical; object dtypes as categorical.
numerical_features = [column for column in candidate_columns if raw_data[column].dtype != "object"]
classical_features = [column for column in candidate_columns if raw_data[column].dtype == "object"]

In [None]:
# Split into training and test sets (80% / 20%). No separate validation set
# is created here.
# A fixed random_state keeps the split -- and therefore every feature-selection
# result computed from it -- reproducible from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    raw_data[numerical_features + classical_features],
    raw_data[target],
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Show the shapes of the four pieces produced by the split.
print(*(part.shape for part in (xtrain, xtest, ytrain, ytest)))

In [None]:
def generate_one_hot_encoding_features(one_train_Series,
                                       one_test_Series):
    """One-hot encode one categorical column, fitting on the training data only.

    Parameters
    ----------
    one_train_Series : pandas.Series
        Training values of the column; the encoder learns its categories here.
    one_test_Series : pandas.Series
        Test values of the same column; encoded with the already-fitted
        encoder (constructed with ``handle_unknown="ignore"``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Dense indicator frames for train and test. Both use a fresh default
        index and one column per category learned from the training data.

    NOTE(review): columns are labelled with the raw category value only, so
    two different features that share a category label produce identically
    named columns once the frames are concatenated.
    """
    # The encoder expects 2-D input, so view each series as a single column.
    train_matrix = one_train_Series.values.reshape((-1, 1))
    test_matrix = one_test_Series.values.reshape((-1, 1))

    encoder = OneHotEncoder(handle_unknown = "ignore").fit(train_matrix)

    def _to_frame(matrix):
        # Densify the sparse encoder output and label it by category.
        return pd.DataFrame(encoder.transform(matrix).toarray(),
                            columns = encoder.categories_[0].tolist())

    return _to_frame(train_matrix), _to_frame(test_matrix)

In [None]:
# One-hot encode every categorical column; each entry is a (train, test) pair.
OneHotEncoding_data = [
    generate_one_hot_encoding_features(one_train_Series = xtrain[column_name],
                                       one_test_Series = xtest[column_name])
    for column_name in classical_features
]

# Training matrix: original columns (index reset so rows line up with the
# encoded frames) plus all encoded frames, minus the raw categorical columns.
train_frames = [xtrain.reset_index(drop = True)]
train_frames.extend(encoded_pair[0] for encoded_pair in OneHotEncoding_data)
preprocessed_xtrain = pd.concat(train_frames, axis = 1).drop(columns = classical_features)

# Test matrix: built the same way from the test halves of each pair.
test_frames = [xtest.reset_index(drop = True)]
test_frames.extend(encoded_pair[1] for encoded_pair in OneHotEncoding_data)
preprocessed_xtest = pd.concat(test_frames, axis = 1).drop(columns = classical_features)

In [None]:
# Show the shapes of the encoded training and test matrices.
print(*(frame.shape for frame in (preprocessed_xtrain, preprocessed_xtest)))

In [None]:
# Imputation: fill the remaining missing values with a distance-weighted
# KNN imputer.
KNNimputation = KNNImputer(weights = "distance")

# Fit on the training data only, then reuse the fitted imputer on the test
# data. Calling fit_transform on the test set would refit the imputer on test
# rows, so train and test would be imputed from different neighbours.
# The imputer returns plain arrays, so the column labels are re-attached.
preprocessed_xtrain = pd.DataFrame(KNNimputation.fit_transform(preprocessed_xtrain), columns = preprocessed_xtrain.columns.tolist())
preprocessed_xtest = pd.DataFrame(KNNimputation.transform(preprocessed_xtest), columns = preprocessed_xtest.columns.tolist())

# ANOVA

In [None]:
from scipy.stats import f_oneway

In [None]:
# Use the first categorical feature as the worked example for the ANOVA steps.
one_class_column = classical_features[0]

In [None]:
# Number of categorical features.
len(classical_features)

In [None]:
# Echo the name of the example column.
one_class_column

In [None]:
# Distinct values of the example column. The variable name (sic: "calss") is
# kept as-is because the following cells refer to it.
unique_calss = raw_data[one_class_column].unique()

In [None]:
# Echo the distinct values of the example column.
unique_calss

In [None]:
# Target values of the rows belonging to each of the first five categories of
# the example column, one series per category.
example_values = raw_data[one_class_column]
target1 = raw_data.loc[example_values == unique_calss[0], target]
target2 = raw_data.loc[example_values == unique_calss[1], target]
target3 = raw_data.loc[example_values == unique_calss[2], target]
target4 = raw_data.loc[example_values == unique_calss[3], target]
target5 = raw_data.loc[example_values == unique_calss[4], target]

In [None]:
# One-way ANOVA of the target across the five category groups; the cell
# echoes the returned test result.
f_oneway(target1,target2,target3,target4,target5)

改用 tuple 搭配 `*` 解包的方式，把各組資料一次傳入 `f_oneway`

In [None]:
# Bundle the five groups into one tuple so they can be unpacked into a call.
target_one_class = (target1,target2,target3,target4,target5)

In [None]:
# Same ANOVA as before, passing the groups by unpacking the tuple.
f_oneway(*target_one_class)

In [None]:
# Check whether the target differs significantly across the levels of a
# categorical column.

def identify_difference_from_anova(data, column_name, target_name, alpha = 0.05):
    """Return ``column_name`` if a one-way ANOVA finds a significant difference.

    The target values are grouped by the levels of ``column_name`` and passed
    to ``scipy.stats.f_oneway``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both ``column_name`` and ``target_name``.
    column_name : str
        Categorical column whose levels define the groups.
    target_name : str
        Numeric column compared across the groups.
    alpha : float, default 0.05
        Significance level; the column is reported when the p-value is
        strictly below it.

    Returns
    -------
    str or None
        ``column_name`` when the p-value is below ``alpha``; otherwise
        ``None`` (also when fewer than two groups exist or the test yields
        NaN).
    """
    # Rows with a missing category or target are excluded. Comparing against
    # NaN never matches, so a missing category would otherwise become an empty
    # group and make the whole test return NaN.
    complete_rows = data[[column_name, target_name]].dropna()

    groups = [
        values
        for _, values in complete_rows.groupby(column_name)[target_name]
        if len(values) > 0
    ]

    # ANOVA needs at least two groups to compare.
    if len(groups) < 2:
        return None

    _, f_pvalue = f_oneway(*groups)

    # A NaN p-value compares False here, so it is reported as not significant.
    return column_name if f_pvalue < alpha else None
    
# Run the ANOVA screen over every categorical feature and keep only the
# column names that were flagged (the helper yields None for the others).
anova_outcomes = (
    identify_difference_from_anova(data = raw_data, column_name = feature_name, target_name = target)
    for feature_name in classical_features
)
significant_column = [outcome for outcome in anova_outcomes if outcome is not None]

# Exhaustive Feature Selection
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector

In [None]:
# Settings for the exhaustive search: subsets of 10 to 11 features, scored by
# negative mean squared error.
# NOTE(review): cv = 0 appears to disable cross-validation in mlxtend, so the
# score would be computed on the training data itself -- confirm intended.
# NOTE(review): an exhaustive search over every 10- and 11-feature subset of
# the encoded matrix is combinatorially large -- confirm it is feasible.
efs_settings = dict(min_features = 10,
                    max_features = 11,
                    scoring = 'neg_mean_squared_error',
                    cv = 0)

model = XGBRegressor()

# Build the feature-selection object
efs = ExhaustiveFeatureSelector(model, **efs_settings)

# Run the feature selection
efs.fit(preprocessed_xtrain,ytrain)

In [None]:
# Report the best feature subset found by the exhaustive search.
# The fitted selector is named `efs`; the name `efs1` is never defined in this
# notebook, so referring to it raises NameError.
# The score is the negative MSE requested via `scoring`, not an accuracy.
print('Best score (neg. MSE): %.2f' % efs.best_score_)
print('Best subset (indices):', efs.best_idx_)
print('Best subset (corresponding names):', efs.best_feature_names_)

# Sequential Forward Selection

程式碼參考連結：http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#overview    
評估指標參考連結：https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 

In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
model = XGBRegressor()

# Build the forward (non-floating) sequential selector targeting 10 features
# and fit it in one chained call; fit() hands back the selector itself.
sfs1 = SequentialFeatureSelector(
    model,
    k_features = 10,
    forward = True,
    floating = False,
    cv = 0,
).fit(preprocessed_xtrain, ytrain)

In [None]:
# Show the selection state recorded for each round.
sfs1.subsets_

In [None]:
# Show the names of the selected features.
sfs1.k_feature_names_

# Sequential Backward Selection

In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
model = XGBRegressor()

# Build the backward (non-floating) sequential selector targeting 200
# features and fit it in one chained call; fit() hands back the selector.
bfs1 = SequentialFeatureSelector(
    model,
    k_features = 200,
    forward = False,
    floating = False,
    cv = 0,
).fit(preprocessed_xtrain, ytrain)

In [None]:
# Show the recorded feature-selection process.
bfs1.subsets_

In [None]:
# Show the names of the selected features.
bfs1.k_feature_names_

# Sequential Floating Forward Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
model = XGBRegressor()

# Build the floating forward selector (50 features, scored by negative MSE)
# and fit it in one chained call; fit() hands back the selector itself.
sffs = SequentialFeatureSelector(
    estimator = model,
    k_features = 50,
    forward = True,
    floating = True,
    scoring = "neg_mean_squared_error",
    cv = 0,
).fit(preprocessed_xtrain, ytrain)

In [None]:
# Show the recorded feature-selection process.
sffs.subsets_

In [None]:
# Show the names of the selected features.
sffs.k_feature_names_

# Sequential Floating Backward Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
model = XGBRegressor()

# Build the floating backward selector (50 features, scored by negative MSE)
# and fit it in one chained call; fit() hands back the selector itself.
sfbs = SequentialFeatureSelector(
    estimator = model,
    k_features = 50,
    forward = False,
    floating = True,
    scoring = "neg_mean_squared_error",
    cv = 0,
).fit(preprocessed_xtrain, ytrain)

In [None]:
# Show the recorded feature-selection process.
sfbs.subsets_

In [None]:
# Show the names of the features that were selected.
sfbs.k_feature_names_

# Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE

In [None]:
model = XGBRegressor()

# Build the RFE selector: eliminate 5 features per iteration until 200 remain.
rfe = RFE(model, n_features_to_select = 200, step = 5)

# Run RFE
rfe.fit(preprocessed_xtrain, ytrain)

In [None]:
# Integer positions of the columns RFE kept. get_support(indices=True) reads
# the selector's own mask, so no feature-name strings have to be parsed or
# eval'ed (parsing only works when the names happen to look like "x<N>").
select_index = rfe.get_support(indices = True).tolist()
print(select_index)
print(np.array(preprocessed_xtrain.columns)[select_index])

# Recursive Feature Elimination with Cross-Validation

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
model = XGBRegressor()

# Build the RFECV selector: 5-fold cross-validation scored by negative MSE,
# removing 5 features per step and never going below 200 features.
rfecv = RFECV(
    model,
    step = 5,
    min_features_to_select = 200,
    cv = 5,
    scoring = 'neg_mean_squared_error',
)

# Run RFECV
rfecv.fit(preprocessed_xtrain, ytrain)

In [None]:
# Integer positions of the columns RFECV kept. get_support(indices=True) reads
# the selector's own mask, so no feature-name strings have to be parsed or
# eval'ed (parsing only works when the names happen to look like "x<N>").
select_index = rfecv.get_support(indices = True).tolist()
print(select_index)
print(np.array(preprocessed_xtrain.columns)[select_index])

In [None]:
# Echo the cross-validation results recorded by the fitted RFECV selector.
rfecv.cv_results_