In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


# Imports

In [2]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
from sklearn.linear_model import Ridge
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
# Suppress every Python warning for the rest of the session: no category or
# module filter is given, so this also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Load Data

In [3]:
# Read the competition's training and test tables.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# pop() removes the column from the frame in place and returns it, so the
# identifiers are kept aside (test_ID is needed for the submission file)
# while "Id" no longer appears among the features.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

# Target variable


In [4]:
# Detach the target from the training frame (pop drops the column in place),
# then stack train and test rows so every preprocessing step below is applied
# to both sets at once. Training rows come first; the row labels of both
# frames are kept as-is.
y = train.pop('SalePrice')
data = pd.concat((train, test), axis=0)

# Handle missing values


In [5]:
# Categorical columns whose gaps are replaced by the literal category "None".
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
    'MasVnrType',
]

# Numeric columns whose gaps are replaced by 0.
zero_fill_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea']

# Categorical columns whose gaps are replaced by the most frequent value
# observed in the combined train+test frame.
mode_fill_cols = [
    'MSZoning', 'Electrical', 'KitchenQual',
    'Exterior1st', 'Exterior2nd', 'SaleType', 'Utilities',
]

for col in none_fill_cols:
    data[col] = data[col].fillna("None")

for col in zero_fill_cols:
    data[col] = data[col].fillna(0)

for col in mode_fill_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

# Functional gets a fixed default category rather than the computed mode.
data['Functional'] = data['Functional'].fillna("Typ")

# LotFrontage gaps take the median frontage of the house's own neighborhood.
data['LotFrontage'] = (
    data.groupby("Neighborhood")["LotFrontage"]
    .transform(lambda frontage: frontage.fillna(frontage.median()))
)


# Label Encoding


In [6]:
# Integer-encode every remaining object (string) column. Each column gets its
# own encoder, fitted on the combined train+test values so both sets share one
# mapping. astype(str) turns any leftover NaN into the string "nan", which is
# then encoded as an ordinary category.
cols = data.select_dtypes(include=["object"]).columns
for col in cols:
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col].astype(str))

# Add TotalSF feature: basement + first floor + second floor area.
# A row-wise sum skips NaN, so a missing component counts as 0 instead of
# making the whole TotalSF NaN (which the later median imputation would then
# replace with a value unrelated to this house's floor areas).
data['TotalSF'] = data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].sum(axis=1)

# Handle skewness


In [7]:
# Select every non-object column.
# NOTE(review): the label-encoding cell above has already converted all object
# columns to integers, so this selects label-encoded categoricals as well as
# the originally numeric features — confirm that is intended.
numeric_feats = data.dtypes[data.dtypes != "object"].index

# Per-column sample skewness (NaN ignored), largest first.
skewed_feats = (
    data[numeric_feats]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)

# Keep the columns whose absolute skewness exceeds 0.75 and replace each with
# log(1 + x) to compress its long tail.
skewness = skewed_feats[skewed_feats.abs() > 0.75]
for skewed_col in skewness.index:
    data[skewed_col] = np.log1p(data[skewed_col])

# Split back to train/test


In [8]:
# The combined frame was built with the training rows first, so a positional
# cut at the training row count recovers the two sets.
n_train = train.shape[0]
X = data.iloc[:n_train]
X_test = data.iloc[n_train:]

# Log transform target


In [9]:
# Train on log(1 + SalePrice); predictions made on this scale must be mapped
# back to prices with np.expm1.
# NOTE: not idempotent — re-running this cell applies the log a second time.
y = np.log1p(y)


# Impute any remaining missing values with training-set medians


In [10]:
# Fill any gaps that survived the earlier imputation with per-column medians
# computed from the training rows only, so the test rows are completed with
# training statistics. Filling X with its own medians leaves those medians
# unchanged, so computing them once is enough for both frames.
train_medians = X.median()
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Define models


In [11]:
def rmsle_cv(model):
    """Return the mean 5-fold cross-validated RMSE of ``model``.

    The model is evaluated on the notebook-level training matrix ``X`` and
    target ``y``. Since ``y`` is log1p-transformed earlier in the notebook,
    the RMSE computed here is the RMSLE with respect to the original prices.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor; it is cloned and fitted
        once per fold by ``cross_val_score``.

    Returns
    -------
    float
        Average of the per-fold root mean squared errors.
    """
    # Pass the splitter object itself. Calling .get_n_splits(X) on it would
    # return the plain integer 5, which makes cross_val_score fall back to an
    # unshuffled KFold and silently discards shuffle=True / random_state=42.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # The scorer reports negated MSE (higher is better), so flip the sign
    # before taking the square root.
    fold_mse = -cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(fold_mse).mean()

# Linear baseline with L2 regularisation.
ridge = Ridge(alpha=20)

# Gradient-boosted trees (XGBoost): shallow trees, each built on a 70% row subsample.
xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.7,
)

# Gradient-boosted trees (LightGBM): many very small trees with a low learning rate.
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
)

# Gradient-boosted trees (CatBoost) with training output silenced.
cat_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=3,
    verbose=False,
)

# Fit all models


In [12]:
# Train every base model on the full training set, in the same order as before
# (ridge, XGBoost, LightGBM, CatBoost).
for estimator in (ridge, xgb_model, lgb_model, cat_model):
    estimator.fit(X, y)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3644
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 75
[LightGBM] [Info] Start training from score 12.024057


<catboost.core.CatBoostRegressor at 0x7d9cdc18c450>

# Predict on the test set with each model


In [13]:
# One prediction vector per fitted model, all on the log1p(SalePrice) scale
# the models were trained on.
pred_ridge, pred_xgb, pred_lgb, pred_cat = (
    fitted.predict(X_test)
    for fitted in (ridge, xgb_model, lgb_model, cat_model)
)


# Final blended prediction


In [14]:
# Equal-weight average of the four models, taken on the log scale ...
blended_log_preds = (pred_ridge + pred_xgb + pred_lgb + pred_cat) / 4
# ... then invert the log1p applied to the target to get prices back.
final_preds = np.expm1(blended_log_preds)

# Submission


In [15]:
# Pair each test Id with its blended price prediction and write the
# two-column submission file without the index column.
sub = pd.DataFrame({'Id': test_ID, 'SalePrice': final_preds})
sub.to_csv('submission.csv', index=False)