# Simple XGBoost Starter (~0.0655)

In [34]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc


In [44]:
print('Loading data ...')
path = 'D:/project/study/kaggle/data/Zillow Prize Zillow’s Home Value Prediction (Zestimate)'

train = pd.read_csv(path + "/train_2016_v2.csv", parse_dates=["transactiondate"])
prop = pd.read_csv(path + '/properties_2016.csv')
sample = pd.read_csv(path +'/sample_submission.csv')


Loading data ...


In [36]:
print('Binding to float32')

for c, dtype in zip(prop.columns, prop.dtypes):
	if dtype == np.float64:
		prop[c] = prop[c].astype(np.float32)

Binding to float32


In [37]:
print('Creating training set ...')

df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values

Creating training set ...


In [38]:
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

(90275, 55) (90275,)


In [39]:
print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid; gc.collect()

Building DMatrix...


28

In [40]:
print('Training ...')

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:squarederror'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['verbosity'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid

Training ...
[0]	train-mae:0.48806	valid-mae:0.48112




[10]	train-mae:0.40222	valid-mae:0.39544
[20]	train-mae:0.33268	valid-mae:0.32610
[30]	train-mae:0.27652	valid-mae:0.27013
[40]	train-mae:0.23132	valid-mae:0.22521
[50]	train-mae:0.19506	valid-mae:0.18933
[60]	train-mae:0.16612	valid-mae:0.16073
[70]	train-mae:0.14312	valid-mae:0.13805
[80]	train-mae:0.12497	valid-mae:0.12024
[90]	train-mae:0.11078	valid-mae:0.10635
[100]	train-mae:0.09982	valid-mae:0.09569
[110]	train-mae:0.09145	valid-mae:0.08761
[120]	train-mae:0.08514	valid-mae:0.08160
[130]	train-mae:0.08045	valid-mae:0.07722
[140]	train-mae:0.07700	valid-mae:0.07408
[150]	train-mae:0.07450	valid-mae:0.07184
[160]	train-mae:0.07268	valid-mae:0.07025
[170]	train-mae:0.07136	valid-mae:0.06914
[180]	train-mae:0.07040	valid-mae:0.06839
[190]	train-mae:0.06970	valid-mae:0.06788
[200]	train-mae:0.06919	valid-mae:0.06753
[210]	train-mae:0.06882	valid-mae:0.06729
[220]	train-mae:0.06854	valid-mae:0.06714
[230]	train-mae:0.06833	valid-mae:0.06703
[240]	train-mae:0.06816	valid-mae:0.06696
[

In [41]:
print('Building test set ...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop; gc.collect()

x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

del df_test, sample; gc.collect()

d_test = xgb.DMatrix(x_test)

del x_test; gc.collect()

Building test set ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


7

In [42]:
print('Predicting on test ...')

p_test = clf.predict(d_test)

del d_test; gc.collect()

sub = pd.read_csv(path+'/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

Predicting on test ...


In [43]:
print('Writing csv ...')
sub.to_csv('xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion

Writing csv ...
