In [71]:
# https://www.kaggle.com/code/jesucristo/1-house-prices-solution-top-1#EDA

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import os
print(os.listdir("./data"))

import warnings
warnings.filterwarnings('ignore')

['kaggle_house_pred_test.csv', 'kaggle_house_pred_train.csv', 'sample_submission.csv']


In [72]:
train = pd.read_csv('./data/kaggle_house_pred_train.csv')
test = pd.read_csv('./data/kaggle_house_pred_test.csv')

## Feature Engineering

In [73]:
train.drop(['Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# Dropping outliers after detecting them by eye(删除异常值并重置索引)
train = train.drop(train[(train['OverallQual'] < 5)
                                  & (train['SalePrice'] > 200000)].index)
train = train.drop(train[(train['GrLivArea'] > 4000)
                                  & (train['SalePrice'] < 200000)].index)
train = train.drop(train[(train['GarageArea'] > 1200)
                                  & (train['SalePrice'] < 200000)].index)
train = train.drop(train[(train['TotalBsmtSF'] > 3000)
                                  & (train['SalePrice'] > 320000)].index)
train = train.drop(train[(train['1stFlrSF'] < 3000)
                                  & (train['SalePrice'] > 600000)].index)
train = train.drop(train[(train['1stFlrSF'] > 3000)
                                  & (train['SalePrice'] < 200000)].index)
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)

#feature merge(合并训练集和测试集的特征)
train_features = train.drop(['SalePrice'], axis=1)
test_features = test
features = pd.concat([train_features, test_features]).reset_index(drop=True)

# target保存训练集的目标变量
y = train['SalePrice'].reset_index(drop=True)

In [74]:
#Transforming rare values(less than 10) into one group - dimensionality reduction 
# 稀有值处理，将频率小于10的值归为"-1"类
rare_cols = ['Condition1', 'Condition2', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Heating', 'Electrical', 'Functional', 'SaleType']

for col in rare_cols:
    rare_values = features[col].value_counts()[features[col].value_counts() < 10].index
    features[col].replace(rare_values, -1, inplace=True) 


In [75]:
# Ordinal Encoding
def encode(frame, feature):
    '''
    修改frame,需要目标变量salesprices
    '''
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    ordering = ordering['ordering'].to_dict()

    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o

quantitative = [f for f in features.columns if features.dtypes[f] != 'object']
qualitative = [f for f in features.columns if features.dtypes[f] == 'object']

qual_encoded = []
for q in qualitative:  
    encode(train, q)
    qual_encoded.append(q+'_E')

quantitative_data = features[quantitative]
qual_encoded_data = train[qual_encoded]

# 将两部分数据拼接成 DataFrame
features_encoded = pd.concat([quantitative_data, qual_encoded_data], axis=1)

# TODO
#str_columns = train.select_dtypes(include=['object']).columns
#train = train.drop(columns=str_columns)

'''手动标注
# Converting some of the categorical values to numeric ones.
# 特征映射，将分类变量转换为数值型变量
neigh_map = {'MeadowV': 1, 'IDOTRR': 1, 'BrDale': 1, 'BrkSide': 2, 'OldTown': 2, 
             'Edwards': 2, 'Sawyer': 3, 'Blueste': 3, 'SWISU': 3, 'NPkVill': 3, 
             'NAmes': 3, 'Mitchel': 4, 'SawyerW': 5, 'NWAmes': 5, 'Gilbert': 5, 
             'Blmngtn': 5, 'CollgCr': 5, 'ClearCr': 6, 'Crawfor': 6, 'Veenker': 7, 
             'Somerst': 7, 'Timber': 8, 'StoneBr': 9, 'NridgHt': 10, 'NoRidge': 10}

features_encoded['Neighborhood'] = features_encoded['Neighborhood'].map(neigh_map).astype(int)

quality_maps = {'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}, 
                'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}, 
                'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, 
                'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}}

for col, mapping in quality_maps.items():
    features_encoded[col] = features_encoded[col].map(mapping).astype(int)
'''

"手动标注\n# Converting some of the categorical values to numeric ones.\n# 特征映射，将分类变量转换为数值型变量\nneigh_map = {'MeadowV': 1, 'IDOTRR': 1, 'BrDale': 1, 'BrkSide': 2, 'OldTown': 2, \n             'Edwards': 2, 'Sawyer': 3, 'Blueste': 3, 'SWISU': 3, 'NPkVill': 3, \n             'NAmes': 3, 'Mitchel': 4, 'SawyerW': 5, 'NWAmes': 5, 'Gilbert': 5, \n             'Blmngtn': 5, 'CollgCr': 5, 'ClearCr': 6, 'Crawfor': 6, 'Veenker': 7, \n             'Somerst': 7, 'Timber': 8, 'StoneBr': 9, 'NridgHt': 10, 'NoRidge': 10}\n\nfeatures_encoded['Neighborhood'] = features_encoded['Neighborhood'].map(neigh_map).astype(int)\n\nquality_maps = {'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, \n                'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, \n                'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, \n                'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}, \n                'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ':

In [77]:
# 通用填充函数，根据不同策略填充NaN
def fill_na(df, zero_cols, freq_cols):

    # 填充0值
    df[zero_cols] = df[zero_cols].fillna(0)
    
    # 填充众数
    for col in freq_cols:
        if not df[col].mode().empty:  # 检查是否有众数
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            # 如果没有众数，退而使用其他策略
            df[col].fillna(df[col].mean(), inplace=True)


zero_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 
             'BsmtHalfBath', 'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea','Alley_E', 'PoolQC_E', 'MiscFeature_E', 'Fence_E', 'FireplaceQu_E', 'GarageType_E', 
             'GarageFinish_E', 'GarageQual_E', 'GarageCond_E', 'BsmtQual_E', 'BsmtCond_E', 
             'BsmtExposure_E', 'BsmtFinType1_E', 'BsmtFinType2_E', 'MasVnrType_E']

# List of NaN including columns where NaN's actually missing gonna replaced with mode.
# categorical variables so use the number with the most frequecy.
freq_cols = ['Electrical_E', 'Exterior1st_E', 'Exterior2nd_E', 'Functional_E', 'KitchenQual_E','SaleType_E', 'Utilities_E']

# 按照筛选规则填充NaN
fill_na(features_encoded, zero_cols, freq_cols)

# 对MSZoning_E按MSSubClass分组填充
features_encoded['MSZoning_E'] = features_encoded.groupby('MSSubClass')['MSZoning_E'].transform(
    lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x.fillna(x.mean())
)

# 对LotFrontage按Neighborhood_E分组填充
features_encoded['LotFrontage'] = features_encoded.groupby(
    ['Neighborhood_E'])['LotFrontage'].transform(lambda x: x.fillna(x.median())
)

# 按数据类型自动填充
def fill_by_dtype(df):
    # 填充所有object类型的列为'None'
    objects = [col for col in df.columns if df[col].dtype == 'object']
    df.update(df[objects].fillna('None'))
    
    # 填充所有数值类型的列为0
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerics = [col for col in df.columns if df[col].dtype in numeric_dtypes]
    df.update(df[numerics].fillna(0))

# 根据列的数据类型自动填充
fill_by_dtype(features_encoded)

In [78]:
# # fill out NAN(填充缺失值)

# ## List of NaN including columns where NaN's mean none. 
# ## exp. there is no pool in the house.
# none_cols = ['Alley_E', 'PoolQC_E', 'MiscFeature_E', 'Fence_E', 'FireplaceQu_E', 'GarageType_E', 
#              'GarageFinish_E', 'GarageQual_E', 'GarageCond_E', 'BsmtQual_E', 'BsmtCond_E', 
#              'BsmtExposure_E', 'BsmtFinType1_E', 'BsmtFinType2_E', 'MasVnrType_E']

# ## List of NaN including columns where NaN's mean 0.
# ## exp. area 
# zero_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 
#              'BsmtHalfBath', 'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea']

# ## List of NaN including columns where NaN's actually missing gonna replaced with mode.
# ## categorical variables so use the number with the most frequecy.
# freq_cols = ['Electrical_E', 'Exterior1st_E', 'Exterior2nd_E', 'Functional_E', 'KitchenQual_E', 
#              'SaleType_E', 'Utilities_E']

# # Filling the list of columns above:

# for col in none_cols:
#     features_encoded[col].replace(np.nan, 'None', inplace=True)

# for col in zero_cols:
#     features_encoded[col].replace(np.nan, 0, inplace=True)

# for col in freq_cols:
#     features_encoded[col].replace(np.nan,features_encoded[col].mode()[0], inplace=True)
    
# # Filling MSZoning according to MSSubClass
# # since different Mssubclass have different MSZoning
# # features_encoded['MSZoning_E'] = features_encoded.groupby('MSSubClass')['MSZoning_E'].transform(
# #     lambda x: x.fillna(x.mode()[0]))

# features_encoded['MSZoning_E'] = features_encoded.groupby('MSSubClass')['MSZoning_E'].transform(
#     lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x.fillna(x.mean()))


# # Filling LotFrontage according to Neighborhood
# # the house nearby tend to have the same area of lot frontage.
# features_encoded['LotFrontage'] = features_encoded.groupby(
#     ['Neighborhood_E'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# #fill according to dtypes automatically
# objects = []
# for i in features_encoded.columns:
#     if features_encoded[i].dtype == object:
#         objects.append(i)
# features_encoded.update(features_encoded[objects].fillna('None'))

# numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# numerics = []
# for i in features_encoded.columns:
#     if features_encoded[i].dtype in numeric_dtypes:
#         numerics.append(i)
# features_encoded.update(features_encoded[numerics].fillna(0))

In [82]:
# Creating new features based on previous observations
# 创建新特征
features_encoded['TotalSF'] = features_encoded['BsmtFinSF1'] + features_encoded['BsmtFinSF2'] + features_encoded['1stFlrSF'] + features_encoded['2ndFlrSF']
features_encoded['TotalBathrooms'] = features_encoded['FullBath'] + 0.5 * features_encoded['HalfBath'] + features_encoded['BsmtFullBath'] + 0.5 * features_encoded['BsmtHalfBath']
features_encoded['TotalPorchSF'] = features_encoded['OpenPorchSF'] + features_encoded['3SsnPorch'] + features_encoded['EnclosedPorch'] + features_encoded['ScreenPorch'] + features_encoded['WoodDeckSF']
features_encoded['YearBlRm'] = features_encoded['YearBuilt'] + features_encoded['YearRemodAdd']

# Merging quality and conditions
# 合并质量和条件相关的特征
features_encoded['TotalExtQual'] = (features_encoded['ExterQual_E'] + features_encoded['ExterCond_E'])
features_encoded['TotalBsmQual'] = (features_encoded['BsmtQual_E'] + features_encoded['BsmtCond_E'] +
                            features_encoded['BsmtFinType1_E'] +
                            features_encoded['BsmtFinType2_E'])
features_encoded['TotalGrgQual'] = (features_encoded['GarageQual_E'] + features_encoded['GarageCond_E'])
features_encoded['TotalQual'] = features_encoded['OverallQual'] + features_encoded[
    'TotalExtQual'] + features_encoded['TotalBsmQual'] + features_encoded[
        'TotalGrgQual'] + features_encoded['KitchenQual_E'] + features_encoded['HeatingQC_E']

## Creating new features_encoded by using new quality indicators
# 创建质量乘积特征
features_encoded['QualGr'] = features_encoded['TotalQual'] * features_encoded['GrLivArea']
features_encoded['QualBsm'] = features_encoded['TotalBsmQual'] * (features_encoded['BsmtFinSF1'] + features_encoded['BsmtFinSF2'])
features_encoded['QualPorch'] = features_encoded['TotalExtQual'] * features_encoded['TotalPorchSF']
features_encoded['QualExt'] = features_encoded['TotalExtQual'] * features_encoded['MasVnrArea']
features_encoded['QualGrg'] = features_encoded['TotalGrgQual'] * features_encoded['GarageArea']
features_encoded['QlLivArea'] = (features_encoded['GrLivArea'] - features_encoded['LowQualFinSF']) * features_encoded['TotalQual']
features_encoded['QualSFNg'] = features_encoded['QualGr'] * features_encoded['Neighborhood_E']

# Creating some simple features_encoded
# 创建简单特征
features_encoded['HasPool'] = features_encoded['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['Has2ndFloor'] = features_encoded['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['HasGarage'] = features_encoded['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['HasBsmt'] = features_encoded['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['HasFireplace'] = features_encoded['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
features_encoded['HasPorch'] = features_encoded['TotalPorchSF'].apply(lambda x: 1 if x > 0 else 0)

pd.set_option('display.max_rows', None)
features_encoded.dtypes

MSSubClass           int64
LotFrontage        float64
LotArea              int64
OverallQual          int64
OverallCond          int64
YearBuilt            int64
YearRemodAdd         int64
MasVnrArea         float64
BsmtFinSF1         float64
BsmtFinSF2         float64
BsmtUnfSF          float64
TotalBsmtSF        float64
1stFlrSF             int64
2ndFlrSF             int64
LowQualFinSF         int64
GrLivArea            int64
BsmtFullBath       float64
BsmtHalfBath       float64
FullBath             int64
HalfBath             int64
BedroomAbvGr         int64
KitchenAbvGr         int64
TotRmsAbvGrd         int64
Fireplaces           int64
GarageYrBlt        float64
GarageCars         float64
GarageArea         float64
WoodDeckSF           int64
OpenPorchSF          int64
EnclosedPorch        int64
3SsnPorch            int64
ScreenPorch          int64
PoolArea           float64
MiscVal            float64
MoSold               int64
YrSold               int64
MSZoning_E         float64
S

In [80]:
# Transforming The Skewed Features
# 处理偏态特征
possible_skewed = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'LowQualFinSF', 'MiscVal'
]

# Finding skewness of the numerical features
# 计算数值特征的偏度
skew_features = np.abs(features_encoded[possible_skewed].apply(lambda x: skew(x)).sort_values(ascending=False))

# Filtering skewed features
# 过滤偏度大于0.3的特征
high_skew = skew_features[skew_features > 0.3]

# 获取高偏度特征的索引
skew_index = high_skew.index

# # Applying boxcox transformation to fix skewness
# # 使用Box-Cox变换来修正偏度
# for i in skew_index:
#     features_encoded[i] = boxcox1p(features_encoded[i], boxcox_normmax(features_encoded[i] + 1))

# 检查高偏度特征中是否存在负值或接近零的值
for i in skew_index:
    min_value = features_encoded[i].min()
    print(f"特征 {i} 的最小值为: {min_value}")
    
    # 如果存在负值或接近零的值，平移数据，确保所有值为正数
    if min_value <= 0:
        shift_value = abs(min_value) + 1
        print(f"特征 {i} 包含负值或零，平移 {shift_value}")
        features_encoded[i] = features_encoded[i] + shift_value
    
    # 应用Box-Cox变换
    features_encoded[i] = boxcox1p(features_encoded[i], boxcox_normmax(features_encoded[i] + 1))




'''#automatically
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in features_encoded.columns:
    if features_encoded[i].dtype in numeric_dtypes:
        numerics2.append(i)
skew_features = features_encoded[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for i in skew_index:
    features_encoded[i] = boxcox1p(features_encoded[i], boxcox_normmax(features_encoded[i] + 1))
'''

特征 MiscVal 的最小值为: 0
特征 MiscVal 包含负值或零，平移 1
特征 PoolArea 的最小值为: 0
特征 PoolArea 包含负值或零，平移 1
特征 LotArea 的最小值为: 1300


BracketError: The algorithm terminated without finding a valid bracket. Consider trying different initial points.

In [12]:
# 删除无关特征
# TODO: STREET
to_drop = [
    'Utilities',
    'PoolQC',
    'YrSold',
    'MoSold',
    'ExterQual',
    'BsmtQual',
    'GarageQual',
    'KitchenQual',
    'HeatingQC',
]
# 只显示有缺失值的列及其缺失值数量
missing_values = features_encoded.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(missing_values)

Series([], dtype: int64)


In [13]:
# 对分类变量进行独热编码
#features_encoded = pd.get_dummies(data=features_encoded)

# 分离训练集和测试集
train = features_encoded.iloc[:len(y), :]
test = features_encoded.iloc[len(train):, :]

# 设置模型数据
X = train
y = np.log1p(y)

print(X.shape,test.shape,y.shape)

(1449, 100) (1459, 100) (1449,)


In [14]:
# 移除异常数据行
#outliers = [30, 88, 462, 631, 1322]
#X = X.drop(X.index[outliers])
#y = y.drop(y.index[outliers])

# 移除导致过拟合的特征
# overfit = []
# for i in X.columns:
#     counts = X[i].value_counts()
#     zeros = counts.iloc[0]
#     if zeros / len(X) * 100 > 99.94:
#         overfit.append(i)

#overfit = list(overfit)
#X = X.drop(overfit, axis=1)
#test = test.drop(overfit, axis=1)

In [15]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X=X):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)

In [16]:
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [17]:
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=10000000, alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=10000000, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))                                
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))

In [18]:
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)      

In [19]:
lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )

In [20]:
xgboost = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

In [21]:
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [22]:
score = cv_rmse(ridge)
score = cv_rmse(lasso)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(elasticnet)
print("elastic net: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(gbr)
print("gbr: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

LASSO: 0.1152 (0.0199)
 2024-10-16 13:29:41.128316
elastic net: 0.1152 (0.0199)
 2024-10-16 13:29:58.539224
SVR: 0.1677 (0.0271)
 2024-10-16 13:30:01.906664
lightgbm: 0.1139 (0.0203)
 2024-10-16 13:30:19.501920
gbr: 0.1125 (0.0207)
 2024-10-16 13:32:10.930581
xgboost: 0.1099 (0.0217)
 2024-10-16 13:32:47.676777


In [23]:
print('START Fit')

print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

print('elasticnet')
elastic_model_full_data = elasticnet.fit(X, y)

print('Lasso')
lasso_model_full_data = lasso.fit(X, y)

print('Ridge')
ridge_model_full_data = ridge.fit(X, y)

print('Svr')
svr_model_full_data = svr.fit(X, y)

print('GradientBoosting')
gbr_model_full_data = gbr.fit(X, y)

print('xgboost')
xgb_model_full_data = xgboost.fit(X, y)

print('lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)

START Fit
stack_gen
elasticnet
Lasso
Ridge
Svr
GradientBoosting
xgboost
lightgbm


In [24]:
def blend_models_predict(X):
    return ((0.1 * elastic_model_full_data.predict(X)) + \
            (0.05 * lasso_model_full_data.predict(X)) + \
            (0.1 * ridge_model_full_data.predict(X)) + \
            (0.1 * svr_model_full_data.predict(X)) + \
            (0.1 * gbr_model_full_data.predict(X)) + \
            (0.15 * xgb_model_full_data.predict(X)) + \
            (0.1 * lgb_model_full_data.predict(X)) + \
            (0.3 * stack_gen_model.predict(np.array(X))))

In [25]:
print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict(X)))

RMSLE score on train data:
0.05660247470061477


In [27]:
print('Predict submission')
submission = pd.read_csv("./data/sample_submission.csv")
submission.iloc[:,1] = np.floor(np.expm1(blend_models_predict(test)))

Predict submission


In [31]:
# print('Blend with Top Kernels submissions\n')
# sub_1 = pd.read_csv('../input/top-10-0-10943-stacking-mice-and-brutal-force/House_Prices_submit.csv')
# sub_2 = pd.read_csv('../input/hybrid-svm-benchmark-approach-0-11180-lb-top-2/hybrid_solution.csv')
# sub_3 = pd.read_csv('../input/lasso-model-for-regression-problem/lasso_sol22_Median.csv')
# submission.iloc[:,1] = np.floor((0.25 * np.floor(np.expm1(blend_models_predict(test)))) + 
#                                 (0.25 * sub_1.iloc[:,1]) + 
#                                 (0.25 * sub_2.iloc[:,1]) + 
#                                 (0.25 * sub_3.iloc[:,1]))

In [30]:
q1 = submission['SalePrice'].quantile(0.005)
q2 = submission['SalePrice'].quantile(0.995)
submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x > q1 else x*0.77)
submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x < q2 else x*1.1)
submission.to_csv("submission.csv", index=False)

In [31]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,73716.0
1,1462,82628.0
2,1463,86176.0
3,1464,92147.0
4,1465,88132.0
