In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
sns.set_palette("muted")

from sklearn import linear_model
from sklearn import preprocessing
from scipy import stats
import xgboost as xgb

try:
    # scikit-learn >= 0.20 ships the imputer in sklearn.impute
    from sklearn.impute import SimpleImputer
except ImportError:
    # older scikit-learn only has the original class; its default strategy is also "mean"
    from sklearn.preprocessing import Imputer as SimpleImputer

## Resources and Tutorials

- [Great feature engineering from a Kaggle kernel](https://www.kaggle.com/humananalog/house-prices-advanced-regression-techniques/xgboost-lasso/code)
- [XGBoost tuning tutorial here](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)

## Data Import

In [None]:
# read in data
# index_col = 0 makes the first CSV column the row index of each frame
train = pd.read_csv("data/train.csv", index_col = 0)
test = pd.read_csv("data/test.csv", index_col = 0)

In [None]:
# pull the target column out of the training frame so that only predictors remain in `train`
y = train["SalePrice"]
train = train.drop(labels = ["SalePrice"], axis = "columns")

In [None]:
# inspect the training set
# (by this point the SalePrice column has been dropped, so only predictors are shown)
train.head()

In [None]:
# inspect the test set
# displayed as the cell output for a side-by-side look with the training preview
test.head()

## Basic Exploratory Plots

In [None]:
# histogram of the sale prices, drawn through the explicit figure/axes interface
fig, ax = plt.subplots()
ax.hist(y)
ax.set_xlabel("Sale Price")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# print the skewness and draw a histogram for every numeric predictor
# numeric columns are selected explicitly instead of comparing dtype objects by identity,
# so non-numeric columns can never reach stats.skew
for col in train.select_dtypes(include = [np.number]).columns:
    # missing values are dropped per column before computing skewness and plotting
    col_data = train.loc[:,col].dropna()
    print("{}'s skewness: {:.2f}".format(col, stats.skew(col_data)))
    plt.hist(col_data)
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.show()

## Preliminary Data Processing

In [None]:
# log the response variable since it's skewed
# NOTE: this rebinds `y`; re-running this cell without re-running the cell that
# defines `y` would apply the log a second time
y = np.log(y)

# split the features between numeric and categorical
train_numeric = train.select_dtypes(exclude = [np.dtype("object")])
train_categorical = train.select_dtypes(include = [np.dtype("object")])

test_numeric = test.select_dtypes(exclude = [np.dtype("object")])
test_categorical = test.select_dtypes(include = [np.dtype("object")])

# re-encode the dummy variables 

# first combine the training and test sets so that the dummy variable encoding will be consistent
# remember how many rows came from the training set so the frames can be split apart again
n_train = train.shape[0]
all_categorical = pd.concat((train_categorical,test_categorical))
all_categorical = pd.get_dummies(all_categorical)

# separate the training and test sets again (categorical)
# split by position: concat keeps the training rows first, so this does not depend on
# the index labels being consecutive integers starting at 1
train_categorical = all_categorical.iloc[:n_train]
test_categorical = all_categorical.iloc[n_train:]

# filling in missing numeric values with the mean
# the training-set means are used for both frames, matching the scaler below,
# which is also fitted on the training set only
numeric_means = train_numeric.mean()
train_numeric = train_numeric.fillna(numeric_means)
test_numeric = test_numeric.fillna(numeric_means)

# scale all numeric variables by subtracting by the mean and dividing by the standard deviation 
# do not include the encoded dummy variables

# scaler function preserves the means and standard deviations of the training set to be used on the test set 
scaler = preprocessing.StandardScaler().fit(train_numeric)

# transform() returns a plain array, so wrap it back into a DataFrame with the original labels
train_numeric = pd.DataFrame(scaler.transform(train_numeric), columns = train_numeric.columns, index = train_numeric.index)
test_numeric = pd.DataFrame(scaler.transform(test_numeric), columns = test_numeric.columns, index = test_numeric.index)

In [None]:
# put the scaled numeric columns and the dummy columns back into one frame per data set

# both merges match rows on the row index of the two inputs
index_join = dict(left_index = True, right_index = True)
train_scaled = pd.merge(train_numeric, train_categorical, **index_join)
test_scaled = pd.merge(test_numeric, test_categorical, **index_join)

In [None]:
# # re-encode the dummy variables 

# # first combine the training and test sets so that the dummy variable encoding will be consistent
# all_data = pd.concat((train,test))
# all_data = pd.get_dummies(all_data)

# train = all_data.loc[:train.shape[0],]
# test = all_data.loc[(train.shape[0]+1):,]

In [None]:
# log the response variable since it's skewed

# y = np.log(y)

In [None]:
# # filling in missing values with the mean

# train = train.fillna(train.mean())
# test = test.fillna(test.mean())

In [None]:
# # scale all variables by subtracting by the mean and dividing by the standard deviation 
# # this includes the encoded dummy variables

# # scaler function preserves the means and standard deviations of the training set to be used on the test set 

# scaler = preprocessing.StandardScaler().fit(train)

# train_scaled = scaler.transform(train)
# test_scaled = scaler.transform(test)

# Model Fitting

## Linear Regression

In [None]:
# create the linear regression and train it on the processed features and the log target
lm = linear_model.LinearRegression()
lm.fit(train_scaled, y)

In [None]:
# predict and write out submission
# the model was trained on log prices, so predictions are exponentiated before saving
log_predictions = lm.predict(test_scaled)
results = pd.DataFrame({"SalePrice": np.exp(log_predictions)}, index = test.index)
results.to_csv("submission.csv")

In [None]:
# check if there are inf predictions
# sorting in descending order puts the largest predicted prices in the rows shown by head()
results.sort_values(by = "SalePrice", ascending = False).head()

##  Lasso Regression

In [None]:
# create the cross-validated lasso and train it on the processed features
# the target is flattened to a 1-D array before fitting
lassocv = linear_model.LassoCV()
lassocv.fit(train_scaled, np.asarray(y).ravel())

In [None]:
# predict and write out submission
# the model was trained on log prices, so predictions are exponentiated before saving
log_predictions = lassocv.predict(test_scaled)
results = pd.DataFrame({"SalePrice": np.exp(log_predictions)}, index = test.index)
results.to_csv("submission.csv")

## XGBoost

In [None]:
# hyperparameters for the gradient-boosted regressor, collected in one place
xgb_params = dict(
    colsample_bytree=0.2,
    gamma=0.0,
    learning_rate=0.01,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=7200,
    reg_alpha=0.9,
    reg_lambda=0.6,
    subsample=0.2,
    seed=42,
)

# build the model from those settings and train it on the processed features
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(train_scaled, y)

In [None]:
# predict and write out submission

# the model was trained on log prices, so predictions are exponentiated before saving
log_predictions = xgb_model.predict(test_scaled)
results = pd.DataFrame({"SalePrice": np.exp(log_predictions)}, index = test.index)
results.to_csv("submission.csv")

## Building an sklearn pipeline

In [None]:
from sklearn.pipeline import Pipeline
# Why use a pipeline?
# If the features are normalized on the whole training set before cross-validation, each
# validation fold has already contributed to the means and standard deviations used to
# scale the training folds, so information "leaks" from validation data into training.
# Wrapping normalization and the cross-validated model together in a pipeline keeps the
# preprocessing steps and the estimator in a single object that is fitted as a unit.

In [None]:
# read in data
# the frames are reloaded from disk so this section starts from the raw, unscaled values
train = pd.read_csv("data/train.csv", index_col = 0)
test = pd.read_csv("data/test.csv", index_col = 0)

# separate the labels from the predictors in the training set
y = train.loc[:,"SalePrice"]
train = train.drop("SalePrice", axis = 1)

# log the response variable since it's skewed
y = np.log(y)

# split the features between numeric and categorical
train_numeric = train.select_dtypes(exclude = [np.dtype("object")])
train_categorical = train.select_dtypes(include = [np.dtype("object")])

test_numeric = test.select_dtypes(exclude = [np.dtype("object")])
test_categorical = test.select_dtypes(include = [np.dtype("object")])

# re-encode the dummy variables 

# first combine the training and test sets so that the dummy variable encoding will be consistent
# remember how many rows came from the training set so the frames can be split apart again
n_train = train.shape[0]
all_categorical = pd.concat((train_categorical,test_categorical))
all_categorical = pd.get_dummies(all_categorical)

# separate the training and test sets again (categorical)
# split by position: concat keeps the training rows first, so this does not depend on
# the index labels being consecutive integers starting at 1
train_categorical = all_categorical.iloc[:n_train]
test_categorical = all_categorical.iloc[n_train:]

# merge the numeric and categorical features 
# missing numeric values are left in place here; the pipelines below impute them

train = pd.merge(left = train_numeric, right = train_categorical, left_index = True, right_index = True)
test = pd.merge(left = test_numeric, right = test_categorical, left_index = True, right_index = True)

## Lasso Pipeline

In [None]:
# input to pipeline: training set with encoded categorical variables

# 1st pipeline step: filling in NaNs with the column mean
# 2nd pipeline step: standard scaler
# 3rd pipeline step: lasso cross validation
# SimpleImputer replaces preprocessing.Imputer, which newer scikit-learn releases no longer provide
lasso_pipeline = Pipeline([("impute mean", SimpleImputer(strategy = "mean")),
                           ("preprocessing", preprocessing.StandardScaler()),
                           ("lasso", linear_model.LassoCV())])

In [None]:
# fit the imputer, scaler and lasso steps in sequence on the training features and the log target
lasso_pipeline.fit(X = train, y = y)

In [None]:
# predict and write out submission
# the pipeline was trained on log prices, so predictions are exponentiated before saving
log_predictions = lasso_pipeline.predict(test)
results = pd.DataFrame({"SalePrice": np.exp(log_predictions)}, index = test.index)
results.to_csv("submission.csv")

In [None]:
# coefficients of the fitted lasso, looked up by its step name in the pipeline
lasso_pipeline.named_steps["lasso"].coef_

## XGBoost Pipeline

> Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

$$\sqrt{\frac{1}{n}\sum_{i=1}^{n}{\left(\log(y_{\text{pred},i}) - \log(y_{\text{obs},i})\right)^2}}$$

In [None]:
# define the error function used for this competition
# a custom objective takes y_true and y_pred and outputs the gradient and hessian
# of the loss with respect to each prediction
# objective(y_true, y_pred) -> grad, hess

def rmsle(y_true, y_pred):
    """Gradient and hessian of the squared log error, 0.5 * (log(y_pred) - log(y_true))**2.

    Parameters
    ----------
    y_true : array-like
        Observed values on the original (non-log) scale.
    y_pred : array-like
        Predicted values on the original (non-log) scale.

    Returns
    -------
    grad, hess : numpy.ndarray
        Element-wise first and second derivatives of the loss with respect to y_pred.

    Notes
    -----
    Both inputs are clipped to a small positive value so the logarithm is defined.
    The hessian is floored at 1e-6 because the exact second derivative becomes
    negative once log(y_pred) - log(y_true) exceeds 1.
    NOTE(review): elsewhere in this notebook `y` is already log-transformed; this
    objective expects the untransformed target - confirm before wiring it in.
    """
    eps = 1e-6
    true = np.maximum(np.asarray(y_true, dtype=float), eps)
    pred = np.maximum(np.asarray(y_pred, dtype=float), eps)

    log_diff = np.log(pred) - np.log(true)
    # define the gradient (1st derivative): d/dp [0.5 * log_diff**2] = log_diff / p
    grad = log_diff / pred
    # define the hessian (2nd derivative): (1 - log_diff) / p**2, floored to stay positive
    hess = np.maximum((1.0 - log_diff) / pred ** 2, eps)
    return grad, hess

In [None]:
# tuned XGBoost regressor used as the final step of the pipeline below
# xgb_model is bound exactly once here, so these hyperparameters are the ones actually used;
# to try the library defaults instead, construct xgb.XGBRegressor() with no arguments
xgb_model = xgb.XGBRegressor(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=7200,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=42,
                )

In [None]:
# same preprocessing steps as the lasso pipeline, with the XGBoost regressor as the final estimator
# SimpleImputer replaces preprocessing.Imputer, which newer scikit-learn releases no longer provide
xgb_pipeline = Pipeline([("impute mean", SimpleImputer(strategy = "mean")),
                           ("preprocessing", preprocessing.StandardScaler()),
                           ("xgb", xgb_model)])

In [None]:
# fit the imputer, scaler and XGBoost steps in sequence on the training features and the log target
xgb_pipeline.fit(X = train, y = y)

In [None]:
# predict and write out submission
# the pipeline was trained on log prices, so predictions are exponentiated before saving
log_predictions = xgb_pipeline.predict(test)
results = pd.DataFrame({"SalePrice": np.exp(log_predictions)}, index = test.index)
results.to_csv("submission.csv")

In [None]:
# feature importances of the fitted booster, looked up by its step name in the pipeline
xgb.plot_importance(xgb_pipeline.named_steps["xgb"])
plt.show()

In [None]:
# train the regressor directly on the merged features, without the imputer and scaler steps
xgb_model.fit(train, y)
# predict and write out submission
# the model was trained on log prices, so predictions are exponentiated before saving
log_predictions = xgb_model.predict(test)
results = pd.DataFrame({"SalePrice": np.exp(log_predictions)}, index = test.index)
results.to_csv("submission.csv")

In [None]:
# NOTE(review): this cell does not use the house price frames; it loads separate
# 'agaricus' files through relative paths and trains a binary classifier. It looks like
# a stand-alone XGBoost usage example - confirm it is still wanted and that the
# 'demo/data/' files exist relative to the notebook.
import xgboost as xgb
# read in data
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
# specify parameters via map
# NOTE(review): the 'silent' key may not be accepted by newer xgboost releases - verify against the installed version
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# number of boosting rounds passed to xgb.train
num_round = 2
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtest)
