In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [47]:
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def pre_process(df):
    # removing nans
    cols = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
            "GarageCond", "BsmtExposure", "BsmtFinType2", "BsmtFinType1", "BsmtCond", "BsmtQual", "MasVnrType",
            "Electrical", "KitchenQual", "Functional", "SaleType", "MSZoning", "Utilities", "Exterior1st",
            "Exterior2nd"]
    df.loc[:, cols] = df.loc[:, cols].fillna("zzz")
    cols = ["GarageYrBlt", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath", 
            "BsmtHalfBath", "GarageCars", "GarageArea", "TotalBsmtSF"]
    df.loc[:, cols] = df.loc[:, cols].fillna(0)
    df.loc[:, "LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
    
    assert df.isna().sum().max() < len(df) / 100
    df = df.dropna()
    
    # features reworking
    df.loc[:, "YearBuilt"] = (df.loc[:, "YearBuilt"] - df.loc[:, "YearBuilt"].min()).astype(float)
    df.loc[:, "YearRemodAdd"] = (df.loc[:, "YearRemodAdd"] - df.loc[:, "YearRemodAdd"].min()).astype(float)
    df.loc[:, "WhenSold"] = (df.loc[:, "YrSold"] - df.loc[:, "YrSold"].min()).astype(float)
    df.loc[:, "WhenSold"] = df.loc[:, "WhenSold"] + df.loc[:, "MoSold"].astype(float) / 12
    df = df.drop(["YrSold", "MoSold"], axis=1)
    
    # new features
    df.loc[:, "totalSf"] = df.loc[:, "1stFlrSF"] + df.loc[:, "2ndFlrSF"]
    
    # skewed features
    numeric_feats = df.dtypes[df.dtypes != "object"].index.tolist()
    skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index.tolist()
    df.loc[:, skewed_feats] = np.log1p(df.loc[:, skewed_feats])
    
    # rescaling
    """
    scaler = MinMaxScaler()
    cols = df.dtypes[df.dtypes == float].index.tolist()
    cols += ["MiscVal", "LotArea", "totalSf", "1stFlrSF", "2ndFlrSF", "GrLivArea", "WoodDeckSF", "OpenPorchSF"]
    scaler.fit(df.loc[:, cols])
    df.loc[:, cols] = scaler.transform(df.loc[:, cols])
    """
    
    # dummies
    cols = ["MSSubClass"]
    df[cols] = df[cols].astype(object)
    df = pd.get_dummies(df, drop_first=True)
    
    # objects to int
    """
    cols = df.dtypes[df.dtypes == object].index.tolist()
    cols += ["MSSubClass"]
    for c in cols:
        le = LabelEncoder()
        le.fit(df.loc[:, c].values) 
        df.loc[:, c] = le.transform(list(df.loc[:, c].values))
    """
        
    return df

In [48]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
df = pre_process(pd.concat([df_train.drop("SalePrice", axis=1), df_test]))

x = df.loc[df["Id"].isin(df_train["Id"]), :]
x = x.drop("Id", axis=1).to_numpy()
y = np.log1p(df_train["SalePrice"].loc[df_train["Id"].isin(df["Id"])]).to_numpy()

In [49]:
"""
testing simple models
"""
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

def get_score(model, x, y, n_folds=5):
    kfold = KFold(n_splits=n_folds, shuffle=True)
    scores = list()
    for train_index, test_index in kfold.split(x):
        train_x, test_x = x[train_index], x[test_index]
        train_y, test_y = y[train_index], y[test_index]
        model.fit(train_x, train_y)
        scores.append(r2_score(test_y, model.predict(test_x)))
    return np.mean(scores), np.std(scores)

from sklearn.linear_model import LinearRegression
linreg = make_pipeline(RobustScaler(), LinearRegression())
print("linreg", get_score(linreg, x, y))

from sklearn.linear_model import Ridge
ridge = make_pipeline(RobustScaler(), Ridge(alpha=0.1, random_state=0))
print("ridge", get_score(ridge, x, y))

from sklearn.linear_model import LassoCV
lasso = make_pipeline(RobustScaler(), LassoCV(random_state=0))
print("lasso", get_score(lasso, x, y))

from sklearn.linear_model import ElasticNet
elasticnet = make_pipeline(RobustScaler(), ElasticNet(random_state=0))
print("elasticnet", get_score(lasso, x, y))

from sklearn.kernel_ridge import KernelRidge
krr = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
print("krr", get_score(krr, x, y))

from sklearn.svm import SVR
svm = make_pipeline(RobustScaler(), SVR(gamma=1e-2))
print("svm", get_score(svm, x, y))

from sklearn.tree import DecisionTreeRegressor
print("tree", get_score(DecisionTreeRegressor(random_state=0, max_depth=6), x, y))

linreg (0.8103072468249412, 0.0772716587343406)
ridge (0.8690616054847984, 0.04824590857246475)
lasso (0.8867335525404185, 0.03831551869610051)
elasticnet (0.8920999566999918, 0.030327518342146108)
krr (0.7954720746592994, 0.08859825431003987)
svm (0.8702718609733635, 0.04257195027352872)
tree (0.7392306223819352, 0.03412397305990792)


In [40]:
from sklearn.ensemble import GradientBoostingRegressor
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
print("gboost", get_score(GBoost, x, y))

import xgboost as xgb
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
print("xgboost", get_score(model_xgb, x, y))

import lightgbm as lgb
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
print("lgboost", get_score(model_lgb, x, y))

from sklearn.ensemble import RandomForestRegressor
print("forest", get_score(RandomForestRegressor(random_state=0, max_depth=6), x, y))

gboost (0.8932432133431281, 0.038942416408498264)
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only us

In [41]:
from sklearn.neural_network import MLPRegressor
neuralnet = make_pipeline(RobustScaler(), MLPRegressor(random_state=0, max_iter=10000,
                                                       hidden_layer_sizes=[20, 20, 10], solver='adam'))
print("neuralnet", get_score(neuralnet, x, y))

neuralnet (0.45993950679231654, 0.10501618285501939)


In [10]:
"""
averaged model
"""
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, x, y):
        self.models_ = [clone(m) for m in self.models]
        # Train cloned base models
        for model in self.models_:
            model.fit(x, y)
        return self
    
    # now we do the predictions for cloned models and average them
    def predict(self, x):
        predictions = np.column_stack([model.predict(x) for model in self.models_])
        return np.mean(predictions, axis=1)
    
averaged_models = AveragingModels(models=(elasticnet, GBoost, svm, ridge))
print(get_score(averaged_models, x, y))

(0.8514773701068874, 0.017668851941946704)


In [29]:
"""
ensemble model
"""
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    def fit(self, x, y):
        self.base_models_ = [list() for m in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        out_of_fold_predictions = np.zeros((x.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, test_index in kfold.split(x, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(x[train_index], y[train_index])
                yp = instance.predict(x[test_index])
                out_of_fold_predictions[test_index, i] = yp
                
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    def predict(self, x):
        meta_features = np.column_stack([
            np.column_stack([model.predict(x) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)
    
model = StackingAveragedModels(base_models=(elasticnet, GBoost, svm, ridge), meta_model=lasso)
l = 2 * len(x) // 3
model.fit(x[:l], y[:l])
print(r2_score(y[l:], model.predict(x[l:])))

0.9030899567861812


In [15]:
"""
getting final prediction
"""
model.fit(x, y)
model_xgb.fit(x, y)
model_lgb.fit(x, y)

x_to_predict = df.loc[df["Id"].isin(df_test["Id"]), :]
indices = x_to_predict["Id"]
x_to_predict = x_to_predict.drop("Id", axis=1).to_numpy()

yp_model = model.predict(x_to_predict)
yp_xgb = model_xgb.predict(x_to_predict)
yp_lgb = model_lgb.predict(x_to_predict)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [16]:
yp = 0.7 * yp_model + 0.15 * yp_xgb + 0.15 * yp_lgb
yp = np.expm1(yp)

dfp = pd.DataFrame({'Id': indices, 'SalePrice': yp})
dfp = pd.merge(df_test["Id"], dfp, how='left')
dfp["SalePrice"] = dfp["SalePrice"].fillna(dfp["SalePrice"].median())
dfp.reset_index(drop=True)
dfp.to_csv("submission.csv", index=False)

In [43]:
"""
sklearn stack
"""
from sklearn.ensemble import StackingRegressor

base_learners = [('elasticnet', elasticnet), ('GBoost', GBoost), ('svm', svm), ('ridge', ridge)]
final_stack = StackingRegressor(estimators=base_learners, final_estimator=lasso, passthrough=True,
                                verbose=False, cv=5)
l = 2 * len(x) // 3
final_stack.fit(x[:l], y[:l])
print(r2_score(y[l:], final_stack.predict(x[l:])))

0.9041360948112087


In [45]:
final_stack.fit(x, y)

x_to_predict = df.loc[df["Id"].isin(df_test["Id"]), :]
indices = x_to_predict["Id"]
x_to_predict = x_to_predict.drop("Id", axis=1).to_numpy()

yp_stack = final_stack.predict(x_to_predict)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [46]:
yp = np.expm1(yp_stack)

dfp = pd.DataFrame({'Id': indices, 'SalePrice': yp})
dfp = pd.merge(df_test["Id"], dfp, how='left')
dfp["SalePrice"] = dfp["SalePrice"].fillna(dfp["SalePrice"].median())
dfp.reset_index(drop=True)
dfp.to_csv("submission.csv", index=False)