## The data-cleaning part of this notebook is adapted from a Kaggle kernel by Alexandru Papiu

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import sys

import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr


%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

In [2]:
# Read the competition's training and test CSV files into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./house-prices-advanced-regression-techniques/train.csv",
        "./house-prices-advanced-regression-techniques/test.csv",
    )
)

In [None]:
# Preview the first rows of the training frame to check that the CSV loaded as expected.
train.head()

In [3]:
# Stack the feature columns (MSSubClass through SaleCondition) of train and test
# into one frame so that every later transformation is applied to both splits.
all_data = pd.concat(
    [frame.loc[:, 'MSSubClass':'SaleCondition'] for frame in (train, test)]
)

In [None]:
# Compare the distribution of the sale price with that of log(1 + price).
# This cell reads train["SalePrice"] as it currently is, so it shows the raw
# prices only if it runs before the cell that log-transforms the target.
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
sale_price = train["SalePrice"]
prices = pd.DataFrame({
    "price": sale_price,
    "log(price + 1)": np.log1p(sale_price),
})
prices.hist()

In [4]:
# Log-transform the target in place: SalePrice becomes log1p(SalePrice).
# NOTE(review): this overwrites the raw column, so running the cell a second time
# without reloading `train` applies log1p twice.
train["SalePrice"] = np.log1p(train["SalePrice"])

In [5]:
# Log-transform the skewed numeric features.
# Skewness is measured on the training rows only (NaNs dropped per column), and
# every column whose skew exceeds 0.75 is replaced by log1p of itself in all_data.
# The assignment at the end modifies all_data in place, so re-running this cell
# without rebuilding all_data transforms those columns again.
is_non_object = (all_data.dtypes != "object").to_numpy()
numeric_feats = all_data.columns[is_non_object]

feature_skew = train[numeric_feats].apply(lambda column: skew(column.dropna()))
skewed_feats = feature_skew.loc[feature_skew > 0.75].index

skewed_block = all_data[skewed_feats]
all_data[skewed_feats] = np.log1p(skewed_block)

  import sys


In [6]:
# One-hot encode the combined train+test feature frame in a single call, so both
# splits end up with the same set of indicator columns.
all_data = pd.get_dummies(all_data)

In [7]:
# Impute every remaining missing value with the mean of its column. The means are
# computed over all rows of all_data, i.e. train and test together.
column_means = all_data.mean()
all_data = all_data.fillna(column_means)

In [8]:
# Split the processed frame back into its two parts by position: the first
# len(train) rows are the labelled training features, the rest are the features
# of the rows to predict for the submission. The target is the SalePrice column
# of `train`.
n_train_rows = len(train)
X = all_data.iloc[:n_train_rows]
X_val = all_data.iloc[n_train_rows:]
y = train["SalePrice"]


In [None]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 33% of the labelled rows so the model can be scored locally before a
# submission is made; random_state fixes the split so the score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

# Alternative model that was tried:
#my_model = RandomForestRegressor()

# Gradient-boosted trees with a squared-error loss.
# 'reg:squarederror' is the current name of the objective formerly spelled
# 'reg:linear' (deprecated alias of the same loss), and reg_alpha is the
# scikit-learn wrapper's name for XGBoost's L1 penalty `alpha`.
my_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.4,
    learning_rate=0.1,
    max_depth=3,
    reg_alpha=0,
    n_estimators=500,
)

my_model.fit(X_train, y_train)

# Model evaluation: RMSE on the hold-out rows. The target `y` is taken from
# train.SalePrice, so the error is in whatever scale that column is in when
# this cell runs.
y_pred = my_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("RMSE: %f" % (rmse))

 RMSE in this notebook -> Kaggle score: 0.154 -> 0.167, 0.138 -> 0.153, 0.129 -> 0.135

In [None]:
# Score the fitted model on the hold-out split again (same computation as at the
# end of the training cell), so the RMSE can be refreshed without refitting.
y_pred = my_model.predict(X_test)
holdout_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(holdout_mse)

print("RMSE: %f" % rmse)

In [None]:
## Predictions for the submission
#
# The model predicts on the X_val rows; expm1 is the inverse of the log1p that
# was applied to the target, so it maps the predictions back from the log scale.
log_scale_predictions = my_model.predict(X_val)
predicted_prices = np.expm1(log_scale_predictions)

# Print the predictions as a quick sanity check that the values look sensible.
print(predicted_prices)

In [None]:
# Pair each test-set Id with its predicted SalePrice and write the two columns
# to a CSV file. Any filename would do; 'submission.csv' is used here.
submission_columns = {'Id': test["Id"], 'SalePrice': predicted_prices}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)