In [13]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc  #가비지 컬렉션 인터페이스

### 데이터 로딩

In [14]:
print('Loading data...')

train = pd.read_csv('train_2016_v2.csv')
prop = pd.read_csv('properties_2016.csv')
sample = pd.read_csv('sample_submission.csv')

Loading data...


  prop = pd.read_csv('properties_2016.csv')


In [42]:
sample.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712,parcelid
0,10754147,0,0,0,0,0,0,10754147
1,10759547,0,0,0,0,0,0,10759547
2,10843547,0,0,0,0,0,0,10843547
3,10859147,0,0,0,0,0,0,10859147
4,10879947,0,0,0,0,0,0,10879947


### float64 -> float32 dtype 변경

In [15]:
print('Binding to float32')
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

Binding to float32


### train data 처리

In [16]:
print('Creating training set...')
df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values

print(x_train.shape, y_train.shape)

Creating training set...
(90275, 55) (90275,)


In [17]:
# train_columns 객체에 저장
train_columns = x_train.columns

In [28]:
# dtype이 object인 컬럼값 처리  nan => False
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

In [29]:
del df_train; gc.collect()

247

In [30]:
# train, test data split
split = 80000  # split를 80000으로 하면 약 88%로 분리됨
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]


In [33]:
print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label = y_train)
d_valid = xgb.DMatrix(x_valid, label = y_valid)

Building DMatrix...


In [34]:
del x_train, x_valid; gc.collect()

99

### 학습

In [38]:
print('Training...')

#parameter setting
params = {}
params['eta'] = 0.02
params['objective'] = 'reg:squarederror'  # 버전 차이로 파라미터 변경
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['verbosity'] = 1  # 버전 차이로 파라미터 변경

Training...


In [39]:
# xgbboost 학습

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

[0]	train-mae:0.48806	valid-mae:0.48112
[10]	train-mae:0.40222	valid-mae:0.39544
[20]	train-mae:0.33268	valid-mae:0.32610
[30]	train-mae:0.27652	valid-mae:0.27013
[40]	train-mae:0.23132	valid-mae:0.22521
[50]	train-mae:0.19506	valid-mae:0.18932
[60]	train-mae:0.16612	valid-mae:0.16073
[70]	train-mae:0.14312	valid-mae:0.13805
[80]	train-mae:0.12497	valid-mae:0.12022
[90]	train-mae:0.11079	valid-mae:0.10634
[100]	train-mae:0.09982	valid-mae:0.09569
[110]	train-mae:0.09145	valid-mae:0.08760
[120]	train-mae:0.08514	valid-mae:0.08159
[130]	train-mae:0.08045	valid-mae:0.07722
[140]	train-mae:0.07701	valid-mae:0.07409
[150]	train-mae:0.07450	valid-mae:0.07185
[160]	train-mae:0.07269	valid-mae:0.07027
[170]	train-mae:0.07137	valid-mae:0.06915
[180]	train-mae:0.07041	valid-mae:0.06839
[190]	train-mae:0.06971	valid-mae:0.06788
[200]	train-mae:0.06920	valid-mae:0.06754
[210]	train-mae:0.06883	valid-mae:0.06731
[220]	train-mae:0.06854	valid-mae:0.06716
[230]	train-mae:0.06833	valid-mae:0.06705
[24

In [40]:
# 학습에 사용한 데이터 삭제
del d_train, d_valid

In [41]:
print('Building test set...')

# merge를 위한 컬럼 생성
sample['parcelid'] = sample['ParcelId']

#sample과 prop data merge
df_test = sample.merge(prop, on = 'parcelid', how = 'left')

df_test.head(10)

Building test set...


Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712,parcelid,airconditioningtypeid,architecturalstyletypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,0,0,0,0,0,0,10754147,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,0,0,0,0,0,0,10759547,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,0,0,0,0,0,0,10843547,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.369141,,,
3,10859147,0,0,0,0,0,0,10859147,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.570312,,,
4,10879947,0,0,0,0,0,0,10879947,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.169922,,,
5,10898347,0,0,0,0,0,0,10898347,,,...,1.0,,176383.0,283315.0,2015.0,106932.0,3661.280029,,,
6,10933547,0,0,0,0,0,0,10933547,,,...,,,397945.0,554573.0,2015.0,156628.0,6773.339844,,,
7,10940747,0,0,0,0,0,0,10940747,,,...,1.0,,101998.0,688486.0,2015.0,586488.0,7857.839844,,,
8,10954547,0,0,0,0,0,0,10954547,,,...,,,,9.0,2015.0,9.0,,,,
9,10976347,0,0,0,0,0,0,10976347,,,...,1.0,,218440.0,261201.0,2015.0,42761.0,4054.76001,,,


In [43]:
# test를 위한 data 생성 후 제거
del prop; gc.collect()

60

In [45]:
#df_test에서 train_columns 만 인덱싱 x_test 저장(train data와 동일하게)
x_test = df_test[train_columns]

# nan 값 처리(nan => False)
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[c] = (x_test[c] == True)


In [46]:
del df_test, sample; gc.collect()

50

In [47]:
d_test = xgb.DMatrix(x_test)

In [48]:
del x_test; gc.collect()

48

In [49]:
print('Predicting on test...')

p_test = clf.predict(d_test)

Predicting on test...


In [50]:
del d_test; gc.collect()

41

In [53]:
sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

In [54]:
# 시기와 상관없이 동일한 예측값으로
sub.head(10)

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.183125,0.183125,0.183125,0.183125,0.183125,0.183125
1,10759547,-0.023807,-0.023807,-0.023807,-0.023807,-0.023807,-0.023807
2,10843547,0.08234,0.08234,0.08234,0.08234,0.08234,0.08234
3,10859147,0.296743,0.296743,0.296743,0.296743,0.296743,0.296743
4,10879947,0.058633,0.058633,0.058633,0.058633,0.058633,0.058633
5,10898347,-0.101038,-0.101038,-0.101038,-0.101038,-0.101038,-0.101038
6,10933547,-0.007425,-0.007425,-0.007425,-0.007425,-0.007425,-0.007425
7,10940747,-0.458164,-0.458164,-0.458164,-0.458164,-0.458164,-0.458164
8,10954547,-0.200903,-0.200903,-0.200903,-0.200903,-0.200903,-0.200903
9,10976347,0.083378,0.083378,0.083378,0.083378,0.083378,0.083378


In [55]:
print('Writing csv...')
sub.to_csv('xgb_starter.csv', index = False, float_format='%.4f')

Writing csv...
