In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import skew

%config InlineBackend.figure_format = 'png'

In [87]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


In [88]:
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))


In [89]:
#log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [90]:
all_data = pd.get_dummies(all_data)

In [91]:
#filling NA's with the mean of the column:
all_data = all_data.fillna(all_data[:train.shape[0]].mean())

In [92]:
#creating matrices for sklearn:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

In [93]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [94]:
model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4]).fit(X_train, y)
lasso_preds = np.expm1(model_lasso.predict(X_test))

In [95]:
elastic = ElasticNet(alpha=0.0005, l1_ratio=0.9)
elastic.fit(X_train, y)
elas_preds = np.expm1(elastic.predict(X_test))

In [96]:
import itertools
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label = y)
dtest = xgb.DMatrix(X_test)  

subsample = 0.8
colsample_bytree = 0.8
eta = 0.2
max_depth = 8

num_boost_round = 400 
early_stopping_rounds = 10
test_size = 0.2 

# start the training
print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
    "objective": "reg:linear",
    "booster" : "gbtree", 
    "eval_metric": "rmse", # this is the metric for the leardboard
    "eta": eta, # shrinking parameters to prevent overfitting
    "tree_method": 'exact',
    "max_depth": max_depth,
    "subsample": subsample, # collect 80% of the data only to prevent overfitting
    "colsample_bytree": colsample_bytree,
    "silent": 1,
    "seed": 0,
}

watchlist = [(dtrain, 'train')] # list of things to evaluate and print
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True) # find the best score
xg_pred = np.expm1(gbm.predict(dtest))

XGBoost params. ETA: 0.2, MAX_DEPTH: 8, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8
[0]	train-rmse:9.23086
Will train until train-rmse hasn't improved in 10 rounds.
[1]	train-rmse:7.3908
[2]	train-rmse:5.9189
[3]	train-rmse:4.74021
[4]	train-rmse:3.79763
[5]	train-rmse:3.0428
[6]	train-rmse:2.4399
[7]	train-rmse:1.95902
[8]	train-rmse:1.57274
[9]	train-rmse:1.26409
[10]	train-rmse:1.01763
[11]	train-rmse:0.820866
[12]	train-rmse:0.663372
[13]	train-rmse:0.537915
[14]	train-rmse:0.437979
[15]	train-rmse:0.358703
[16]	train-rmse:0.295242
[17]	train-rmse:0.244099
[18]	train-rmse:0.203533
[19]	train-rmse:0.171034
[20]	train-rmse:0.144659
[21]	train-rmse:0.12465
[22]	train-rmse:0.108988
[23]	train-rmse:0.096007
[24]	train-rmse:0.087014
[25]	train-rmse:0.079559
[26]	train-rmse:0.073485
[27]	train-rmse:0.06854
[28]	train-rmse:0.064621
[29]	train-rmse:0.061588
[30]	train-rmse:0.058223
[31]	train-rmse:0.056175
[32]	train-rmse:0.054576
[33]	train-rmse:0.052084
[34]	train-rmse:0.050064
[35]	train-rmse

In [100]:
final_result=0.7*lasso_preds+0.2*xg_pred+0.1*elas_preds

solution = pd.DataFrame({"id":test.Id, "SalePrice":final_result}, columns=['id', 'SalePrice'])
solution.to_csv("ElasticLasso5.csv", index = False)