In [1]:
%load_ext autoreload
%autoreload 2
import src.data_proc as data_proc

import numpy as np
import pandas as pd
import sys
import os
import gc
import random
# Show every column when a wide DataFrame is displayed.
pd.options.display.max_columns = None
# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None
# float_format is intentionally left at the pandas default: the previous bare
# `pd.options.display.float_format` expression only read the option and set nothing.

from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor, Pool

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline
# Base matplotlib look, then seaborn's 'white' style layered on top of it.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size in inches (width, height).
mpl.rcParams['figure.figsize'] = (8, 6)

In [2]:
def catboost_drop_features(features):
    """Drop id and label columns + feature selection for CatBoost.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame that may contain any of the columns listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns. Listed columns that are not
        present in ``features`` are skipped silently (``errors='ignore'``),
        so the same function works for labelled and unlabelled frames.
    """
    unused_feature_list = [
        # id and label (not features)
        'parcelid', 'logerror',
        # too many missing values (the original note says LightGBM is robust against
        # bad/unrelated features, so this step might not be needed --
        # TODO confirm the same holds for CatBoost)
        'framing_id', 'architecture_style_id', 'story_id',
        'perimeter_area', 'basement_sqft', 'storage_sqft',
        # not useful
        'fireplace_flag', 'deck_id', 'pool_unk_1', 'construction_id', 'fips', 'county_id',
        # really hurts performance
        'county_landuse_code_id', 'zoning_description_id',
    ]
    return features.drop(columns=unused_feature_list, errors='ignore')

# Data Loading

In [3]:
%%time
# Load the pre-built DataFrames from the HDF5 stores.
# features_2016 / features_2017: all features except datetime ones, one frame per year.
features_2016, features_2017 = (
    pd.read_hdf('hdf5/features.h5', key) for key in ('features_2016', 'features_2017')
)
# train: concatenated 2016 and 2017 training data, including the label column.
train = pd.read_hdf('hdf5/train.h5', 'train')

CPU times: user 12.7 s, sys: 2.75 s, total: 15.4 s
Wall time: 15.4 s


# Training and Tuning

In [4]:
# Model inputs: the training frame minus id, label and de-selected columns.
catboost_features = catboost_drop_features(train)
print("Number of features for CatBoost: {}".format(catboost_features.shape[1]))
# Peek at the first rows to sanity-check the remaining columns.
catboost_features.head(10)

Number of features for CatBoost: 69


Unnamed: 0,cooling_id,bathroom_cnt,bedroom_cnt,quality_id,floor1_sqft,finished_area_sqft_calc,floor1_sqft_unk,base_total_area,fireplace_cnt,bathroom_full_cnt,garage_cnt,garage_sqft,spa_flag,heating_id,latitude,longitude,lot_sqft,pool_cnt,pool_total_size,pool_unk_2,pool_unk_3,landuse_type_id,census_1,city_id,neighborhood_id,region_zip,room_cnt,bathroom_small_cnt,unit_cnt,patio_sqft,year_built,story_cnt,tax_structure,tax_parcel,tax_year,tax_land,tax_property,tax_overdue_flag,tax_overdue_year,census_2,avg_garage_size,property_tax_per_sqft,location_1,location_2,location_3,location_4,missing_finished_area,missing_total_area,missing_bathroom_cnt_calc,derived_room_cnt,avg_area_per_room,derived_avg_area_per_room,region_zip-groupcnt,region_zip-lot_sqft-diff,region_zip-lot_sqft-percent,region_zip-year_built-diff,region_zip-finished_area_sqft_calc-diff,region_zip-finished_area_sqft_calc-percent,region_zip-tax_structure-diff,region_zip-tax_structure-percent,region_zip-tax_land-diff,region_zip-tax_land-percent,region_zip-tax_property-diff,region_zip-tax_property-percent,region_zip-property_tax_per_sqft-diff,region_zip-property_tax_per_sqft-percent,year,month,quarter
0,0,2.0,3.0,4.0,,1684.0,,,,2.0,,,,1,34280992.0,-118488536.0,7528.0,,,,,230,60371068.0,12447.0,31817.0,96370.0,0.0,,1.0,,1959.0,,122754.0,360170.0,2015.0,237416.0,6735.879883,,,60371070000000.0,,3.999929,-84207544.0,152769536.0,-24963276.0,93525260.0,0.0,1.0,0.0,5.0,,336.799988,14719.0,-13398.96875,-0.640273,-3.998413,-247.725464,-0.128241,-50475.015625,-0.291377,51026.42,0.273762,2047.035645,0.436576,1.521634,0.613984,0,1,1
1,-1,3.5,4.0,,,2263.0,,,,3.0,2.0,468.0,,-1,33668120.0,-117677552.0,3643.0,,,,,230,60590524.0,32380.0,,96962.0,0.0,1.0,,,2014.0,,346458.0,585529.0,2015.0,239071.0,10153.019531,,,,234.0,4.486531,-84009432.0,151345664.0,-25170656.0,92506896.0,0.0,1.0,0.0,7.5,,301.733337,17682.0,-2715.032715,-0.427024,35.535156,526.538208,0.303225,213678.171875,1.609267,16302.67,0.073182,6339.847656,1.662618,2.160548,0.928875,0,1,1
2,0,3.0,2.0,4.0,,2217.0,,,,3.0,,,,1,34136312.0,-118175032.0,11423.0,,,,,230,60374640.0,47019.0,275411.0,96293.0,0.0,,1.0,,1940.0,,61994.0,119906.0,2015.0,57912.0,11484.480469,,,60374640000000.0,,5.18019,-84038720.0,152311344.0,-24951204.0,93223828.0,0.0,1.0,0.0,5.0,,443.399994,4422.0,-14927.021484,-0.56649,-12.917847,-173.867432,-0.072721,-236757.28125,-0.79249,-427605.1,-0.880721,1845.573242,0.191471,1.178391,0.294465,0,1,1
3,0,2.0,2.0,4.0,,839.0,,,,2.0,,,,1,33755800.0,-118309000.0,70859.0,,,,,235,60372964.0,12447.0,54300.0,96222.0,0.0,,1.0,,1987.0,,171518.0,244880.0,2015.0,73362.0,3048.73999,,,60372960000000.0,,3.633778,-84553200.0,152064800.0,-25398700.0,92910300.0,0.0,1.0,0.0,4.0,,209.75,7293.0,-43346.804688,-0.37955,21.690186,-782.150757,-0.482466,30903.765625,0.219777,-129440.8,-0.638259,-1337.844971,-0.304986,0.830251,0.296145,0,1,1
4,-1,2.5,4.0,,,2283.0,,,,2.0,2.0,598.0,,-1,33485644.0,-117700232.0,6000.0,1.0,,,1.0,230,60590424.0,17686.0,,96961.0,8.0,1.0,,,1981.0,2.0,169574.0,434551.0,2015.0,264977.0,5488.959961,,,60590420000000.0,299.0,2.404275,-84214592.0,151185872.0,-25364472.0,92335760.0,0.0,1.0,0.0,6.5,285.375,351.230774,9875.0,-1155.377441,-0.16147,0.695679,244.801147,0.120107,-50359.125,-0.228975,-195977.2,-0.425156,-2742.87207,-0.333203,-1.339566,-0.357805,0,1,1
5,0,4.0,4.0,1.0,,3067.0,,,,4.0,,,,1,33870088.0,-118402768.0,2708.0,,,,,230,60376212.0,29712.0,,96109.0,0.0,,1.0,,1982.0,,880650.0,2447951.0,2015.0,1567301.0,27126.570312,,,60376210000000.0,,8.84466,-84532680.0,152272864.0,-25331296.0,93071472.0,0.0,1.0,0.0,8.0,,383.375,6257.0,-5556.242188,-0.672323,16.456299,1096.405884,0.556383,606664.5,2.214221,1006336.0,1.793937,17306.941406,1.762484,3.837239,0.766311,0,1,1
6,-1,1.0,2.0,7.0,,1297.0,,,,1.0,,,,6,33899476.0,-118212720.0,6677.0,,,,,230,60375416.0,24174.0,,96091.0,0.0,,1.0,,1939.0,,64549.0,111521.0,2015.0,46972.0,2304.969971,,,60375420000000.0,,1.777155,-84313248.0,152112192.0,-25206884.0,93005836.0,0.0,1.0,0.0,3.0,,432.333344,8157.0,337.631348,0.053259,-5.380371,-76.406006,-0.055632,-18591.453125,-0.223615,-51044.85,-0.520776,-1033.223877,-0.309516,-0.852975,-0.324309,0,1,1
7,-1,2.5,3.0,,853.0,1763.0,853.0,,1.0,2.0,2.0,0.0,,-1,34207204.0,-119165592.0,,,,,,235,61110032.0,13150.0,,97101.0,6.0,1.0,,,1994.0,2.0,107000.0,306000.0,2015.0,199000.0,3745.5,,,61110030000000.0,0.0,2.124504,-84958384.0,153372800.0,-25375592.0,93790000.0,0.0,1.0,0.0,5.5,293.833344,320.545441,9519.0,,,17.713867,-45.426758,-0.025119,-40700.34375,-0.27556,54505.03,0.377211,107.666992,0.029596,0.115993,0.057751,0,1,1
8,-1,1.0,2.0,,,796.0,,,,1.0,1.0,0.0,,-1,33549600.0,-117678000.0,,,,,,235,60590424.0,25459.0,,96987.0,0.0,,,,1984.0,,66834.0,210064.0,2015.0,143230.0,2172.879883,,,60590420000000.0,0.0,2.729748,-84128400.0,151227600.0,-25289400.0,92388600.0,0.0,1.0,0.0,3.0,,265.333344,22021.0,,,-1.618286,-1312.215088,-0.622429,-162042.0,-0.70799,-194908.1,-0.576416,-3827.224121,-0.63786,-0.038506,-0.01391,0,1,1
9,-1,2.0,2.0,,,1260.0,,,,2.0,1.0,0.0,,-1,33612700.0,-117742000.0,,,,,,235,60590628.0,46098.0,,96963.0,5.0,,,,1977.0,1.0,109977.0,190960.0,2015.0,80983.0,1940.26001,,,60590630000000.0,0.0,1.539889,-84129296.0,151354704.0,-25258300.0,92483700.0,0.0,1.0,0.0,4.0,252.0,315.0,6232.0,,,4.549438,45.907349,0.037812,14055.046875,0.146526,-55970.12,-0.408681,-568.701172,-0.226668,-0.458037,-0.229256,0,1,1


In [5]:
# Prepare training and hold-out validation data
catboost_label = train.logerror.astype(np.float32)
print(catboost_label.head())

# Transform to Numpy matrices
catboost_X = catboost_features.values
catboost_y = catboost_label.values

# Perform a shuffled 80/20 train/validation split.
# The global generators are still seeded for any code that draws from them, but the
# split itself is pinned with an explicit random_state so it no longer depends on the
# global NumPy RNG having been seeded on the line right before this call.
# NOTE(review): random_state=42 should give the same partition as the previous
# np.random.seed(42) + random_state=None -- verify if exact row membership matters.
np.random.seed(42)
random.seed(10)
X_train, X_val, y_train, y_val = train_test_split(
    catboost_X, catboost_y, test_size=0.2, random_state=42)

# Remove outlier examples (|logerror| above the threshold) from the training part only;
# they stay in X_val / y_val so the validation score reflects the full label distribution.
outlier_threshold = 0.4
mask = (abs(y_train) <= outlier_threshold)
X_train = X_train[mask, :]
y_train = y_train[mask]

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

0    0.0276
1   -0.1684
2   -0.0040
3    0.0218
4   -0.0050
Name: logerror, dtype: float32
X_train shape: (131426, 69)
y_train shape: (131426,)
X_val shape: (33578, 69)
y_val shape: (33578,)


In [6]:
# Column names in model-input order, plus the columns CatBoost should treat as categorical.
feature_names = list(catboost_features.columns)
categorical_features = [
    'cooling_id', 'heating_id', 'landuse_type_id', 'year', 'month', 'quarter',
    # Only matched if these columns are present in catboost_features.
    'fips', 'county_id',
]

# CatBoost is given positional indices, so translate names into column positions.
categorical_indices = [
    position
    for position, column in enumerate(catboost_features.columns)
    if column in categorical_features
]
print(categorical_indices)

[0, 13, 21, 66, 67, 68]


In [7]:
# CatBoost parameters (key order is kept stable because the dict is printed later)
params = {
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'nan_mode': 'Min',  # how NaN is handled (NaN is set to either Min or Max)
    'random_seed': 0,

    'iterations': 1000,  # default 1000; early stopping is used during training
    'learning_rate': 0.015,  # default 0.03

    'border_count': 254,  # default 254 (alias max_bin; suggested to keep at default for best quality)

    'max_depth': 6,  # default 6 (must be <= 16; 6 to 10 is recommended)
    'random_strength': 1,  # default 1 (used during splitting against overfitting; worth tuning)
    'l2_leaf_reg': 5,  # default 3 (used for leaf value calculation; worth tuning)
    'bagging_temperature': 1,  # default 1 (higher value -> more aggressive bagging; worth tuning)
}

In [8]:
# Fit CatBoost on the outlier-filtered training part; the hold-out pool drives
# early stopping and best-iteration selection.
val_pool = Pool(X_val, y_val, cat_features=categorical_indices)

np.random.seed(42)
random.seed(36)
model = CatBoostRegressor(**params)
model.fit(
    X_train,
    y_train,
    cat_features=categorical_indices,
    eval_set=val_pool,
    use_best_model=True,
    early_stopping_rounds=50,
    verbose=True,
)

# Evaluate model performance: mean absolute error on each part, reported x100.
train_mae = np.abs(model.predict(X_train) - y_train).mean()
val_mae = np.abs(model.predict(X_val) - y_val).mean()
print("Train score: {}".format(train_mae * 100))
print("Val score: {}".format(val_mae * 100))

0:	learn: 0.0527019	test: 0.0697997	best: 0.0697997 (0)	total: 130ms	remaining: 2m 9s
1:	learn: 0.0525798	test: 0.0696712	best: 0.0696712 (1)	total: 193ms	remaining: 1m 36s
2:	learn: 0.0524819	test: 0.0695656	best: 0.0695656 (2)	total: 264ms	remaining: 1m 27s
3:	learn: 0.0524046	test: 0.0694793	best: 0.0694793 (3)	total: 321ms	remaining: 1m 19s
4:	learn: 0.0523393	test: 0.0694083	best: 0.0694083 (4)	total: 389ms	remaining: 1m 17s
5:	learn: 0.0522903	test: 0.0693541	best: 0.0693541 (5)	total: 452ms	remaining: 1m 14s
6:	learn: 0.0522543	test: 0.0693196	best: 0.0693196 (6)	total: 522ms	remaining: 1m 13s
7:	learn: 0.0522140	test: 0.0692749	best: 0.0692749 (7)	total: 583ms	remaining: 1m 12s
8:	learn: 0.0521860	test: 0.0692477	best: 0.0692477 (8)	total: 642ms	remaining: 1m 10s
9:	learn: 0.0521638	test: 0.0692264	best: 0.0692264 (9)	total: 703ms	remaining: 1m 9s
10:	learn: 0.0521329	test: 0.0691938	best: 0.0691938 (10)	total: 773ms	remaining: 1m 9s
11:	learn: 0.0521154	test: 0.0691780	best: 0

96:	learn: 0.0514849	test: 0.0687438	best: 0.0687438 (96)	total: 6.33s	remaining: 59s
97:	learn: 0.0514790	test: 0.0687413	best: 0.0687413 (97)	total: 6.39s	remaining: 58.9s
98:	learn: 0.0514716	test: 0.0687378	best: 0.0687378 (98)	total: 6.46s	remaining: 58.8s
99:	learn: 0.0514664	test: 0.0687327	best: 0.0687327 (99)	total: 6.52s	remaining: 58.7s
100:	learn: 0.0514638	test: 0.0687312	best: 0.0687312 (100)	total: 6.59s	remaining: 58.6s
101:	learn: 0.0514578	test: 0.0687316	best: 0.0687312 (100)	total: 6.65s	remaining: 58.6s
102:	learn: 0.0514542	test: 0.0687301	best: 0.0687301 (102)	total: 6.71s	remaining: 58.4s
103:	learn: 0.0514515	test: 0.0687299	best: 0.0687299 (103)	total: 6.77s	remaining: 58.3s
104:	learn: 0.0514490	test: 0.0687295	best: 0.0687295 (104)	total: 6.83s	remaining: 58.2s
105:	learn: 0.0514436	test: 0.0687218	best: 0.0687218 (105)	total: 6.89s	remaining: 58.1s
106:	learn: 0.0514394	test: 0.0687196	best: 0.0687196 (106)	total: 6.95s	remaining: 58s
107:	learn: 0.0514318	

191:	learn: 0.0511691	test: 0.0686374	best: 0.0686365 (182)	total: 12.3s	remaining: 51.6s
192:	learn: 0.0511681	test: 0.0686375	best: 0.0686365 (182)	total: 12.3s	remaining: 51.5s
193:	learn: 0.0511679	test: 0.0686377	best: 0.0686365 (182)	total: 12.3s	remaining: 51.3s
194:	learn: 0.0511627	test: 0.0686352	best: 0.0686352 (194)	total: 12.4s	remaining: 51.2s
195:	learn: 0.0511603	test: 0.0686353	best: 0.0686352 (194)	total: 12.5s	remaining: 51.2s
196:	learn: 0.0511587	test: 0.0686356	best: 0.0686352 (194)	total: 12.5s	remaining: 51.1s
197:	learn: 0.0511557	test: 0.0686350	best: 0.0686350 (197)	total: 12.6s	remaining: 51s
198:	learn: 0.0511542	test: 0.0686346	best: 0.0686346 (198)	total: 12.7s	remaining: 51s
199:	learn: 0.0511525	test: 0.0686344	best: 0.0686344 (199)	total: 12.7s	remaining: 50.9s
200:	learn: 0.0511502	test: 0.0686336	best: 0.0686336 (200)	total: 12.8s	remaining: 50.8s
201:	learn: 0.0511461	test: 0.0686336	best: 0.0686336 (200)	total: 12.8s	remaining: 50.7s
202:	learn: 0.

283:	learn: 0.0509402	test: 0.0686041	best: 0.0686024 (282)	total: 18s	remaining: 45.5s
284:	learn: 0.0509389	test: 0.0686028	best: 0.0686024 (282)	total: 18.1s	remaining: 45.4s
285:	learn: 0.0509366	test: 0.0686027	best: 0.0686024 (282)	total: 18.1s	remaining: 45.3s
286:	learn: 0.0509350	test: 0.0686037	best: 0.0686024 (282)	total: 18.2s	remaining: 45.2s
287:	learn: 0.0509339	test: 0.0686037	best: 0.0686024 (282)	total: 18.3s	remaining: 45.1s
288:	learn: 0.0509324	test: 0.0686026	best: 0.0686024 (282)	total: 18.3s	remaining: 45.1s
289:	learn: 0.0509304	test: 0.0686014	best: 0.0686014 (289)	total: 18.4s	remaining: 45s
290:	learn: 0.0509278	test: 0.0686019	best: 0.0686014 (289)	total: 18.4s	remaining: 44.9s
291:	learn: 0.0509262	test: 0.0686014	best: 0.0686014 (291)	total: 18.5s	remaining: 44.9s
292:	learn: 0.0509229	test: 0.0686012	best: 0.0686012 (292)	total: 18.6s	remaining: 44.8s
293:	learn: 0.0509191	test: 0.0685982	best: 0.0685982 (293)	total: 18.6s	remaining: 44.7s
294:	learn: 0.

375:	learn: 0.0507284	test: 0.0685695	best: 0.0685693 (368)	total: 23.8s	remaining: 39.5s
376:	learn: 0.0507273	test: 0.0685690	best: 0.0685690 (376)	total: 23.9s	remaining: 39.4s
377:	learn: 0.0507250	test: 0.0685687	best: 0.0685687 (377)	total: 23.9s	remaining: 39.4s
378:	learn: 0.0507223	test: 0.0685691	best: 0.0685687 (377)	total: 24s	remaining: 39.3s
379:	learn: 0.0507204	test: 0.0685696	best: 0.0685687 (377)	total: 24.1s	remaining: 39.3s
380:	learn: 0.0507196	test: 0.0685692	best: 0.0685687 (377)	total: 24.1s	remaining: 39.2s
381:	learn: 0.0507175	test: 0.0685686	best: 0.0685686 (381)	total: 24.2s	remaining: 39.1s
382:	learn: 0.0507142	test: 0.0685681	best: 0.0685681 (382)	total: 24.3s	remaining: 39.1s
383:	learn: 0.0507106	test: 0.0685662	best: 0.0685662 (383)	total: 24.3s	remaining: 39s
384:	learn: 0.0507082	test: 0.0685663	best: 0.0685662 (383)	total: 24.4s	remaining: 39s
385:	learn: 0.0507053	test: 0.0685669	best: 0.0685662 (383)	total: 24.5s	remaining: 38.9s
386:	learn: 0.05

469:	learn: 0.0505341	test: 0.0685352	best: 0.0685352 (469)	total: 29.8s	remaining: 33.6s
470:	learn: 0.0505323	test: 0.0685360	best: 0.0685352 (469)	total: 29.9s	remaining: 33.5s
471:	learn: 0.0505313	test: 0.0685358	best: 0.0685352 (469)	total: 29.9s	remaining: 33.5s
472:	learn: 0.0505291	test: 0.0685361	best: 0.0685352 (469)	total: 30s	remaining: 33.4s
473:	learn: 0.0505258	test: 0.0685354	best: 0.0685352 (469)	total: 30s	remaining: 33.3s
474:	learn: 0.0505233	test: 0.0685355	best: 0.0685352 (469)	total: 30.1s	remaining: 33.3s
475:	learn: 0.0505227	test: 0.0685357	best: 0.0685352 (469)	total: 30.2s	remaining: 33.2s
476:	learn: 0.0505202	test: 0.0685365	best: 0.0685352 (469)	total: 30.2s	remaining: 33.1s
477:	learn: 0.0505186	test: 0.0685374	best: 0.0685352 (469)	total: 30.3s	remaining: 33.1s
478:	learn: 0.0505168	test: 0.0685367	best: 0.0685352 (469)	total: 30.4s	remaining: 33s
479:	learn: 0.0505141	test: 0.0685346	best: 0.0685346 (479)	total: 30.4s	remaining: 32.9s
480:	learn: 0.05

561:	learn: 0.0503704	test: 0.0685212	best: 0.0685207 (556)	total: 35.5s	remaining: 27.7s
562:	learn: 0.0503690	test: 0.0685218	best: 0.0685207 (556)	total: 35.6s	remaining: 27.6s
563:	learn: 0.0503677	test: 0.0685219	best: 0.0685207 (556)	total: 35.7s	remaining: 27.6s
564:	learn: 0.0503660	test: 0.0685225	best: 0.0685207 (556)	total: 35.7s	remaining: 27.5s
565:	learn: 0.0503642	test: 0.0685226	best: 0.0685207 (556)	total: 35.8s	remaining: 27.5s
566:	learn: 0.0503623	test: 0.0685232	best: 0.0685207 (556)	total: 35.9s	remaining: 27.4s
567:	learn: 0.0503621	test: 0.0685232	best: 0.0685207 (556)	total: 35.9s	remaining: 27.3s
568:	learn: 0.0503613	test: 0.0685225	best: 0.0685207 (556)	total: 36s	remaining: 27.2s
569:	learn: 0.0503590	test: 0.0685223	best: 0.0685207 (556)	total: 36s	remaining: 27.2s
570:	learn: 0.0503568	test: 0.0685224	best: 0.0685207 (556)	total: 36.1s	remaining: 27.1s
571:	learn: 0.0503549	test: 0.0685237	best: 0.0685207 (556)	total: 36.1s	remaining: 27s
572:	learn: 0.05

In [None]:
# List every feature with its CatBoost importance, most important first.
feature_importance = sorted(
    zip(feature_names, model.get_feature_importance()),
    key=lambda pair: pair[1],
    reverse=True,
)
for k, v in feature_importance:
    print("{}: {}".format(k, v))

In [None]:
Shrink model to first 557 iterations.
Train score: 5.043362352223646
Val score: 6.852068572689314

# Train on all data + Make predictions

In [9]:
# Final fit for submission: use every labelled row (train + validation parts),
# still excluding outliers with |logerror| above the threshold.
outlier_threshold = 0.4
mask = np.abs(catboost_y) <= outlier_threshold
catboost_X, catboost_y = catboost_X[mask], catboost_y[mask]
print("catboost_X: {}".format(catboost_X.shape))
print("catboost_y: {}".format(catboost_y.shape))

# No eval set here, so the number of trees is fixed up front.
# NOTE(review): 800 is presumably derived from the early-stopped run -- confirm.
params.update(iterations=800)
print(params)

np.random.seed(42)
random.seed(36)
model = CatBoostRegressor(**params)
model.fit(catboost_X, catboost_y, cat_features=categorical_indices, verbose=True)

catboost_X: (164299, 69)
catboost_y: (164299,)
{'loss_function': 'MAE', 'eval_metric': 'MAE', 'nan_mode': 'Min', 'random_seed': 0, 'iterations': 800, 'learning_rate': 0.015, 'border_count': 254, 'max_depth': 6, 'random_strength': 1, 'l2_leaf_reg': 5, 'bagging_temperature': 1}
0:	learn: 0.0527824	total: 88ms	remaining: 1m 10s
1:	learn: 0.0526610	total: 160ms	remaining: 1m 3s
2:	learn: 0.0525722	total: 244ms	remaining: 1m 4s
3:	learn: 0.0524973	total: 319ms	remaining: 1m 3s
4:	learn: 0.0524338	total: 396ms	remaining: 1m 2s
5:	learn: 0.0523881	total: 468ms	remaining: 1m 1s
6:	learn: 0.0523437	total: 548ms	remaining: 1m 2s
7:	learn: 0.0523133	total: 608ms	remaining: 1m
8:	learn: 0.0522813	total: 690ms	remaining: 1m
9:	learn: 0.0522489	total: 777ms	remaining: 1m 1s
10:	learn: 0.0522216	total: 848ms	remaining: 1m
11:	learn: 0.0522018	total: 918ms	remaining: 1m
12:	learn: 0.0521815	total: 998ms	remaining: 1m
13:	learn: 0.0521641	total: 1.07s	remaining: 1m
14:	learn: 0.0521483	total: 1.14s	rem

157:	learn: 0.0514087	total: 11.6s	remaining: 47s
158:	learn: 0.0514064	total: 11.6s	remaining: 46.9s
159:	learn: 0.0514028	total: 11.7s	remaining: 46.8s
160:	learn: 0.0513978	total: 11.8s	remaining: 46.7s
161:	learn: 0.0513952	total: 11.8s	remaining: 46.6s
162:	learn: 0.0513908	total: 11.9s	remaining: 46.6s
163:	learn: 0.0513869	total: 12s	remaining: 46.5s
164:	learn: 0.0513852	total: 12.1s	remaining: 46.5s
165:	learn: 0.0513816	total: 12.2s	remaining: 46.5s
166:	learn: 0.0513776	total: 12.3s	remaining: 46.5s
167:	learn: 0.0513749	total: 12.3s	remaining: 46.4s
168:	learn: 0.0513724	total: 12.4s	remaining: 46.3s
169:	learn: 0.0513678	total: 12.5s	remaining: 46.3s
170:	learn: 0.0513677	total: 12.5s	remaining: 46.1s
171:	learn: 0.0513658	total: 12.6s	remaining: 46s
172:	learn: 0.0513657	total: 12.6s	remaining: 45.7s
173:	learn: 0.0513651	total: 12.7s	remaining: 45.6s
174:	learn: 0.0513615	total: 12.8s	remaining: 45.6s
175:	learn: 0.0513584	total: 12.8s	remaining: 45.5s
176:	learn: 0.0513

317:	learn: 0.0510422	total: 22.7s	remaining: 34.4s
318:	learn: 0.0510406	total: 22.8s	remaining: 34.3s
319:	learn: 0.0510373	total: 22.8s	remaining: 34.2s
320:	learn: 0.0510364	total: 22.9s	remaining: 34.2s
321:	learn: 0.0510342	total: 23s	remaining: 34.1s
322:	learn: 0.0510316	total: 23s	remaining: 34s
323:	learn: 0.0510303	total: 23.1s	remaining: 33.9s
324:	learn: 0.0510296	total: 23.2s	remaining: 33.9s
325:	learn: 0.0510285	total: 23.3s	remaining: 33.8s
326:	learn: 0.0510267	total: 23.3s	remaining: 33.8s
327:	learn: 0.0510251	total: 23.4s	remaining: 33.7s
328:	learn: 0.0510228	total: 23.5s	remaining: 33.6s
329:	learn: 0.0510204	total: 23.6s	remaining: 33.6s
330:	learn: 0.0510194	total: 23.6s	remaining: 33.5s
331:	learn: 0.0510160	total: 23.7s	remaining: 33.4s
332:	learn: 0.0510136	total: 23.8s	remaining: 33.4s
333:	learn: 0.0510123	total: 23.9s	remaining: 33.3s
334:	learn: 0.0510111	total: 23.9s	remaining: 33.2s
335:	learn: 0.0510076	total: 24s	remaining: 33.1s
336:	learn: 0.051005

479:	learn: 0.0507594	total: 34.2s	remaining: 22.8s
480:	learn: 0.0507578	total: 34.3s	remaining: 22.7s
481:	learn: 0.0507558	total: 34.3s	remaining: 22.6s
482:	learn: 0.0507544	total: 34.4s	remaining: 22.6s
483:	learn: 0.0507521	total: 34.5s	remaining: 22.5s
484:	learn: 0.0507508	total: 34.5s	remaining: 22.4s
485:	learn: 0.0507488	total: 34.6s	remaining: 22.4s
486:	learn: 0.0507474	total: 34.7s	remaining: 22.3s
487:	learn: 0.0507459	total: 34.7s	remaining: 22.2s
488:	learn: 0.0507457	total: 34.8s	remaining: 22.1s
489:	learn: 0.0507444	total: 34.9s	remaining: 22.1s
490:	learn: 0.0507424	total: 34.9s	remaining: 22s
491:	learn: 0.0507402	total: 35s	remaining: 21.9s
492:	learn: 0.0507381	total: 35.1s	remaining: 21.8s
493:	learn: 0.0507363	total: 35.1s	remaining: 21.8s
494:	learn: 0.0507340	total: 35.2s	remaining: 21.7s
495:	learn: 0.0507332	total: 35.3s	remaining: 21.6s
496:	learn: 0.0507320	total: 35.4s	remaining: 21.6s
497:	learn: 0.0507295	total: 35.4s	remaining: 21.5s
498:	learn: 0.05

639:	learn: 0.0505086	total: 45.2s	remaining: 11.3s
640:	learn: 0.0505073	total: 45.3s	remaining: 11.2s
641:	learn: 0.0505059	total: 45.3s	remaining: 11.2s
642:	learn: 0.0505037	total: 45.4s	remaining: 11.1s
643:	learn: 0.0505026	total: 45.5s	remaining: 11s
644:	learn: 0.0505014	total: 45.5s	remaining: 10.9s
645:	learn: 0.0505011	total: 45.6s	remaining: 10.9s
646:	learn: 0.0504999	total: 45.7s	remaining: 10.8s
647:	learn: 0.0504985	total: 45.7s	remaining: 10.7s
648:	learn: 0.0504971	total: 45.8s	remaining: 10.7s
649:	learn: 0.0504960	total: 45.9s	remaining: 10.6s
650:	learn: 0.0504950	total: 45.9s	remaining: 10.5s
651:	learn: 0.0504935	total: 46s	remaining: 10.4s
652:	learn: 0.0504924	total: 46.1s	remaining: 10.4s
653:	learn: 0.0504913	total: 46.1s	remaining: 10.3s
654:	learn: 0.0504901	total: 46.2s	remaining: 10.2s
655:	learn: 0.0504898	total: 46.3s	remaining: 10.2s
656:	learn: 0.0504893	total: 46.3s	remaining: 10.1s
657:	learn: 0.0504876	total: 46.4s	remaining: 10s
658:	learn: 0.0504

799:	learn: 0.0502753	total: 56.2s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1a51b75f28>

In [10]:
%%time
def predict_and_export(model, features_2016, features_2017, file_name):
    """Predict October log-errors for both years and write the submission CSV.

    The model is queried once per year with the transaction date fixed to
    October 1st; the November and December columns of each year are copies
    of that year's October column.

    Parameters
    ----------
    model : object exposing ``predict``
        Fitted regressor used for the predictions.
    features_2016, features_2017 : pandas.DataFrame
        Per-year feature frames; each must provide a ``parcelid`` column.
    file_name : str
        Destination path of the CSV (written without the index).

    Returns
    -------
    pandas.DataFrame
        Inner join of both years' predictions on ``ParcelId``, returned so
        the caller can analyze or sanity check it.
    """
    # Per year: (submission frame seeded with ParcelId, model-input frame).
    per_year = {}
    for year, raw_features in (('2016', features_2016), ('2017', features_2017)):
        submission_frame = pd.DataFrame()
        submission_frame['ParcelId'] = raw_features.parcelid
        per_year[year] = (submission_frame, catboost_drop_features(raw_features))

    # Only the October date of each year is actually predicted.
    for date_str, column in (('2016-10-01', '201610'), ('2017-10-01', '201710')):
        print("Start predicting for {}".format(column))

        submission_frame, test_features = per_year[date_str[:4]]
        test_features['transactiondate'] = date_str
        # NOTE(review): presumably derives the datetime feature columns from
        # 'transactiondate' in place -- confirm against src.data_proc.
        data_proc.add_simple_datetime_features(test_features)
        # Round each prediction to 4 decimals via its string representation.
        submission_frame[column] = [
            float(format(value, '.4f')) for value in model.predict(test_features)
        ]

        print("Finished predicting for {}".format(column))

    submission_2016, _ = per_year['2016']
    submission_2017, _ = per_year['2017']

    # November / December reuse the October predictions of the same year.
    for later_month in ('201611', '201612'):
        submission_2016[later_month] = submission_2016['201610']
    for later_month in ('201711', '201712'):
        submission_2017[later_month] = submission_2017['201710']

    submission = submission_2016.merge(submission_2017, how='inner', on='ParcelId')

    print("Length of submission DataFrame: {}".format(len(submission)))
    print("Submission header:")
    print(submission.head())
    submission.to_csv(file_name, index=False)
    return submission

# Predict with the currently bound `model` and write the submission CSV; the returned
# frame is kept in `submission` for inspection.
submission = predict_and_export(model, features_2016, features_2017, 'data/submission_180104_catboost_v2.csv')

Start predicting for 201610
Finished predicting for 201610
Start predicting for 201710
Finished predicting for 201710
Length of submission DataFrame: 2985217
Submission header:
   ParcelId  201610  201611  201612  201710  201711  201712
0  10754147  0.0138  0.0138  0.0138  0.0093  0.0093  0.0093
1  10759547  0.0124  0.0124  0.0124  0.0146  0.0146  0.0146
2  10843547  0.0239  0.0239  0.0239  0.0443  0.0443  0.0443
3  10859147  0.0759  0.0759  0.0759  0.0760  0.0760  0.0760
4  10879947  0.0199  0.0199  0.0199  0.0237  0.0237  0.0237
CPU times: user 6min 14s, sys: 3.25 s, total: 6min 17s
Wall time: 6min 2s
