In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from math import sqrt
from scipy.stats import skew

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import KFold, GridSearchCV

In [3]:
from sklearn.linear_model import BayesianRidge, Ridge, RidgeCV, LinearRegression, ElasticNet, LassoCV, Lasso
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from xgboost import XGBRegressor



# Load data 

In [4]:
with open("../input/preprocessed_data.pkl", "rb") as f:
    train_data = pickle.load(f)
    test_data = pickle.load(f)
    ids = pickle.load(f)
    log_train_lables = pickle.load(f)
    test_labels = pickle.load(f)

In [5]:
print('There are {0} instances in training data'.format(train_data.shape[0]))
print('There are {0} instances in testing data'.format(test_data.shape[0]))
print('There are {0} features'.format(test_data.shape[1]))

There are 1304 instances in training data
There are 146 instances in testing data
There are 282 features


# Build function

In [6]:
# Set-up
TARGET = 'SalePrice'
NFOLDS = 5
SEED = 3
NROWS = None
ntrain = train_data.shape[0]
ntest = test_data.shape[0]

In [7]:
# Convert from df to np array
x_train = np.array(train_data)
x_test = np.array(test_data)
y_train = log_train_lables

In [8]:
# Reset index. If not do this one, there is a bug later
y_train = y_train.reset_index()
del y_train['index']
y_train = y_train['SalePrice']

In [9]:
# Create k-fold
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED).split(x_train)

In [10]:
# Make scorer
scorer = make_scorer(mean_squared_error, greater_is_better=True)

In [11]:
def modelPredict(model):
    model.fit(x_train, y_train)
    log_pred = model.predict(x_test)
    pred = np.expm1(log_pred)
    return log_pred, pred

In [12]:
def modelEvaluate(model):
    true_label = np.log1p(test_labels)
    pred_label = modelPredict(model)[0]
    return sqrt(mean_squared_error(true_label, pred_label))

In [13]:
def paraSearch(model, parameters):
    gs = GridSearchCV(model, parameters, cv=5, scoring=scorer)
    gs.fit(x_train, y_train)
    return gs.best_params_, gs.best_score_, gs.best_estimator_

# Build model

## Parameters

In [14]:
#Linear
ln_params = {
   'normalize': False,
}

#Lasso
ls_params = {
    'alpha': [0.005, 0.05],
}

#Ridge
rd_params = {
    'alpha': [10, 15, 16],
}

In [15]:
#RandomForestRegressor
rf_params = {
    'n_jobs': [16, 20],
    'n_estimators': [100, 150, 200],
    'max_features': [1, 3, 5],
    'max_depth': [3, 5, 7, 9],
    'min_samples_leaf': [1, 3, 5],
}

In [27]:
#XGBRegressor
xgb_params = {
    'colsample_bytree': [0.5, 0.7, 0.9],
    'subsample': [0.5, 0.6, 0.7],
    'learning_rate': [0.075],
    'max_depth': [1, 3, 5, 7],
    'min_child_weight': [1, 3, 5],
}

```python

# This can be used to test a case of GridSearchCV function
ls = Lasso()
gs = GridSearchCV(ls, ls_params, scoring=scorer, cv=5)
gs.fit(x_train, y_train)
gs.grid_scores_, gs.best_params_, gs.best_score_
```

## Do cross-validation

In [17]:
ls = Lasso()

In [18]:
paraSearch(ls, ls_params)

({'alpha': 0.05},
 0.032986355051724356,
 Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False))

In [19]:
rd = Ridge()

In [20]:
paraSearch(rd, rd_params)

({'alpha': 16},
 0.012377279207184611,
 Ridge(alpha=16, copy_X=True, fit_intercept=True, max_iter=None,
    normalize=False, random_state=None, solver='auto', tol=0.001))

In [21]:
rf = RandomForestRegressor()

In [22]:
paraSearch(rf, rf_params)

({'max_depth': 3,
  'max_features': 1,
  'min_samples_leaf': 5,
  'n_estimators': 200,
  'n_jobs': 20},
 0.11534032004695714,
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
            max_features=1, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=20,
            oob_score=False, random_state=None, verbose=0, warm_start=False))

In [23]:
xgb = XGBRegressor()

In [28]:
paraSearch(xgb, xgb_params)

({'colsample_bytree': 0.5,
  'learning_rate': 0.075,
  'max_depth': 1,
  'min_child_weight': 1,
  'subsample': 0.7},
 0.02423044365148852,
 XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
        gamma=0, learning_rate=0.075, max_delta_step=0, max_depth=1,
        min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
        objective='reg:linear', reg_alpha=0, reg_lambda=1,
        scale_pos_weight=1, seed=0, silent=True, subsample=0.7))

## Predict (1st layer)

In [None]:
ls = 

modelPredict(ls)

In [None]:
rd = 

modelPredict(rd)

In [None]:
rf = 

modelPredict(rf)

In [None]:
xgb = 

modelPredict(xgb)

## Predict (2nd layer)

# Evaluate model

In [None]:
modelEvaluate(br)