In [1]:
%load_ext autoreload
%autoreload 2
import src.data_proc as data_proc

import numpy as np
import pandas as pd
import sys
import os
import gc
import random
# Display every column when rendering wide DataFrames
pd.options.display.max_columns = None
# Take no action (neither warn nor raise) on chained assignment
pd.options.mode.chained_assignment = None
# The former bare `pd.options.display.float_format` expression only read the option and
# discarded the value, so it was dropped; assign a formatter here if one is wanted.

from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor, Pool

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline
# Plot appearance: ggplot base style, seaborn 'white' theme on top, 8x6 default figures
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = (8, 6)

In [2]:
"""
    Drop id and label columns + Feature selection for CatBoost
"""
def catboost_drop_features(features):
    """Drop the id/label columns and the features excluded from the CatBoost model.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding the candidate columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the excluded columns. Excluded names that are not
        present in `features` are skipped silently (``errors='ignore'``), so the
        same function works for frames with and without the label column.
    """
    # id and label (not features)
    unused_feature_list = ['parcelid', 'logerror']

    # too many missing
    missing_list = ['framing_id', 'architecture_style_id', 'story_id', 'perimeter_area', 'basement_sqft', 'storage_sqft']
    unused_feature_list += missing_list

    # not useful
    bad_feature_list = ['fireplace_flag', 'deck_id', 'pool_unk_1', 'construction_id', 'fips', 'county_id']
    unused_feature_list += bad_feature_list

    # hurts performance
    unused_feature_list += ['county_landuse_code_id', 'zoning_description_id']

    return features.drop(unused_feature_list, axis=1, errors='ignore')

# Data Loading

In [3]:
%%time
# Load the pre-computed DataFrames from the HDF5 stores.
# features_2016 / features_2017: all features except the datetime ones, one frame per year
features_2016 = pd.read_hdf('hdf5/features.h5', key='features_2016')
features_2017 = pd.read_hdf('hdf5/features.h5', key='features_2017')
# train: 2016 and 2017 training data concatenated, labels included
train = pd.read_hdf('hdf5/train.h5', key='train')

CPU times: user 12.7 s, sys: 2.78 s, total: 15.5 s
Wall time: 15.5 s


# Training and Tuning

In [4]:
# Model inputs: the training frame with id, label and excluded columns removed
catboost_features = catboost_drop_features(train)
n_features = catboost_features.shape[1]
print("Number of features for CatBoost: {}".format(n_features))
catboost_features.head(10)

Number of features for CatBoost: 69


Unnamed: 0,cooling_id,bathroom_cnt,bedroom_cnt,quality_id,floor1_sqft,finished_area_sqft_calc,floor1_sqft_unk,base_total_area,fireplace_cnt,bathroom_full_cnt,garage_cnt,garage_sqft,spa_flag,heating_id,latitude,longitude,lot_sqft,pool_cnt,pool_total_size,pool_unk_2,pool_unk_3,landuse_type_id,census_1,city_id,neighborhood_id,region_zip,room_cnt,bathroom_small_cnt,unit_cnt,patio_sqft,year_built,story_cnt,tax_structure,tax_parcel,tax_year,tax_land,tax_property,tax_overdue_flag,tax_overdue_year,census_2,avg_garage_size,property_tax_per_sqft,location_1,location_2,location_3,location_4,missing_finished_area,missing_total_area,missing_bathroom_cnt_calc,derived_room_cnt,avg_area_per_room,derived_avg_area_per_room,region_zip-groupcnt,region_zip-lot_sqft-diff,region_zip-lot_sqft-percent,region_zip-year_built-diff,region_zip-finished_area_sqft_calc-diff,region_zip-finished_area_sqft_calc-percent,region_zip-tax_structure-diff,region_zip-tax_structure-percent,region_zip-tax_land-diff,region_zip-tax_land-percent,region_zip-tax_property-diff,region_zip-tax_property-percent,region_zip-property_tax_per_sqft-diff,region_zip-property_tax_per_sqft-percent,year,month,quarter
0,0,2.0,3.0,4.0,,1684.0,,,,2.0,,,,1,34280992.0,-118488536.0,7528.0,,,,,230,60371068.0,12447.0,31817.0,96370.0,0.0,,1.0,,1959.0,,122754.0,360170.0,2015.0,237416.0,6735.879883,,,60371070000000.0,,3.999929,-84207544.0,152769536.0,-24963276.0,93525260.0,0.0,1.0,0.0,5.0,,336.799988,14719.0,-13398.96875,-0.640273,-3.998413,-247.725464,-0.128241,-50475.015625,-0.291377,51026.42,0.273762,2047.035645,0.436576,1.521634,0.613984,0,1,1
1,-1,3.5,4.0,,,2263.0,,,,3.0,2.0,468.0,,-1,33668120.0,-117677552.0,3643.0,,,,,230,60590524.0,32380.0,,96962.0,0.0,1.0,,,2014.0,,346458.0,585529.0,2015.0,239071.0,10153.019531,,,,234.0,4.486531,-84009432.0,151345664.0,-25170656.0,92506896.0,0.0,1.0,0.0,7.5,,301.733337,17682.0,-2715.032715,-0.427024,35.535156,526.538208,0.303225,213678.171875,1.609267,16302.67,0.073182,6339.847656,1.662618,2.160548,0.928875,0,1,1
2,0,3.0,2.0,4.0,,2217.0,,,,3.0,,,,1,34136312.0,-118175032.0,11423.0,,,,,230,60374640.0,47019.0,275411.0,96293.0,0.0,,1.0,,1940.0,,61994.0,119906.0,2015.0,57912.0,11484.480469,,,60374640000000.0,,5.18019,-84038720.0,152311344.0,-24951204.0,93223828.0,0.0,1.0,0.0,5.0,,443.399994,4422.0,-14927.021484,-0.56649,-12.917847,-173.867432,-0.072721,-236757.28125,-0.79249,-427605.1,-0.880721,1845.573242,0.191471,1.178391,0.294465,0,1,1
3,0,2.0,2.0,4.0,,839.0,,,,2.0,,,,1,33755800.0,-118309000.0,70859.0,,,,,235,60372964.0,12447.0,54300.0,96222.0,0.0,,1.0,,1987.0,,171518.0,244880.0,2015.0,73362.0,3048.73999,,,60372960000000.0,,3.633778,-84553200.0,152064800.0,-25398700.0,92910300.0,0.0,1.0,0.0,4.0,,209.75,7293.0,-43346.804688,-0.37955,21.690186,-782.150757,-0.482466,30903.765625,0.219777,-129440.8,-0.638259,-1337.844971,-0.304986,0.830251,0.296145,0,1,1
4,-1,2.5,4.0,,,2283.0,,,,2.0,2.0,598.0,,-1,33485644.0,-117700232.0,6000.0,1.0,,,1.0,230,60590424.0,17686.0,,96961.0,8.0,1.0,,,1981.0,2.0,169574.0,434551.0,2015.0,264977.0,5488.959961,,,60590420000000.0,299.0,2.404275,-84214592.0,151185872.0,-25364472.0,92335760.0,0.0,1.0,0.0,6.5,285.375,351.230774,9875.0,-1155.377441,-0.16147,0.695679,244.801147,0.120107,-50359.125,-0.228975,-195977.2,-0.425156,-2742.87207,-0.333203,-1.339566,-0.357805,0,1,1
5,0,4.0,4.0,1.0,,3067.0,,,,4.0,,,,1,33870088.0,-118402768.0,2708.0,,,,,230,60376212.0,29712.0,,96109.0,0.0,,1.0,,1982.0,,880650.0,2447951.0,2015.0,1567301.0,27126.570312,,,60376210000000.0,,8.84466,-84532680.0,152272864.0,-25331296.0,93071472.0,0.0,1.0,0.0,8.0,,383.375,6257.0,-5556.242188,-0.672323,16.456299,1096.405884,0.556383,606664.5,2.214221,1006336.0,1.793937,17306.941406,1.762484,3.837239,0.766311,0,1,1
6,-1,1.0,2.0,7.0,,1297.0,,,,1.0,,,,6,33899476.0,-118212720.0,6677.0,,,,,230,60375416.0,24174.0,,96091.0,0.0,,1.0,,1939.0,,64549.0,111521.0,2015.0,46972.0,2304.969971,,,60375420000000.0,,1.777155,-84313248.0,152112192.0,-25206884.0,93005836.0,0.0,1.0,0.0,3.0,,432.333344,8157.0,337.631348,0.053259,-5.380371,-76.406006,-0.055632,-18591.453125,-0.223615,-51044.85,-0.520776,-1033.223877,-0.309516,-0.852975,-0.324309,0,1,1
7,-1,2.5,3.0,,853.0,1763.0,853.0,,1.0,2.0,2.0,0.0,,-1,34207204.0,-119165592.0,,,,,,235,61110032.0,13150.0,,97101.0,6.0,1.0,,,1994.0,2.0,107000.0,306000.0,2015.0,199000.0,3745.5,,,61110030000000.0,0.0,2.124504,-84958384.0,153372800.0,-25375592.0,93790000.0,0.0,1.0,0.0,5.5,293.833344,320.545441,9519.0,,,17.713867,-45.426758,-0.025119,-40700.34375,-0.27556,54505.03,0.377211,107.666992,0.029596,0.115993,0.057751,0,1,1
8,-1,1.0,2.0,,,796.0,,,,1.0,1.0,0.0,,-1,33549600.0,-117678000.0,,,,,,235,60590424.0,25459.0,,96987.0,0.0,,,,1984.0,,66834.0,210064.0,2015.0,143230.0,2172.879883,,,60590420000000.0,0.0,2.729748,-84128400.0,151227600.0,-25289400.0,92388600.0,0.0,1.0,0.0,3.0,,265.333344,22021.0,,,-1.618286,-1312.215088,-0.622429,-162042.0,-0.70799,-194908.1,-0.576416,-3827.224121,-0.63786,-0.038506,-0.01391,0,1,1
9,-1,2.0,2.0,,,1260.0,,,,2.0,1.0,0.0,,-1,33612700.0,-117742000.0,,,,,,235,60590628.0,46098.0,,96963.0,5.0,,,,1977.0,1.0,109977.0,190960.0,2015.0,80983.0,1940.26001,,,60590630000000.0,0.0,1.539889,-84129296.0,151354704.0,-25258300.0,92483700.0,0.0,1.0,0.0,4.0,252.0,315.0,6232.0,,,4.549438,45.907349,0.037812,14055.046875,0.146526,-55970.12,-0.408681,-568.701172,-0.226668,-0.458037,-0.229256,0,1,1


In [5]:
# Labels for training and hold-out validation, stored as float32
catboost_label = train['logerror'].astype(np.float32)
print(catboost_label.head())

# Plain NumPy arrays for the features and the labels
catboost_X = catboost_features.values
catboost_y = catboost_label.values

# Seed the global NumPy and Python RNGs immediately before the shuffled 80/20 split
np.random.seed(42)
random.seed(10)
X_train, X_val, y_train, y_val = train_test_split(catboost_X, catboost_y, test_size=0.2)

# Outliers (|logerror| above the threshold) are removed from the training part only;
# the validation part keeps them so that the validation score reflects all examples
outlier_threshold = 0.4
mask = np.abs(y_train) <= outlier_threshold
X_train = X_train[mask]
y_train = y_train[mask]

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

0    0.0276
1   -0.1684
2   -0.0040
3    0.0218
4   -0.0050
Name: logerror, dtype: float32
X_train shape: (131426, 69)
y_train shape: (131426,)
X_val shape: (33578, 69)
y_val shape: (33578,)


In [6]:
# Column names in model order, and the columns CatBoost should treat as categorical
feature_names = list(catboost_features.columns)
categorical_features = ['cooling_id', 'heating_id', 'landuse_type_id', 'year', 'month', 'quarter']

# CatBoost takes categorical columns by position, so translate names into column indices
categorical_indices = [
    position
    for position, column in enumerate(feature_names)
    if column in categorical_features
]
print(categorical_indices)

[0, 13, 21, 66, 67, 68]


In [7]:
# CatBoost hyper-parameters, written as one literal.
# Key order matches the order in which the settings were originally assigned.
params = {
    # Objective and reported metric
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'nan_mode': 'Min',  # how NaN is handled (NaN is set to either Min or Max)
    'random_seed': 0,

    # Boosting length and step size
    'iterations': 1000,  # default 1000, use early stopping during training
    'learning_rate': 0.015,  # default 0.03

    'border_count': 254,  # default 254 (alias max_bin, suggested to keep at default for best quality)

    # Tree shape and regularisation
    'max_depth': 6,  # default 6 (must be <= 16, 6 to 10 is recommended)
    'random_strength': 1,  # default 1 (used during splitting to deal with overfitting, try different values)
    'l2_leaf_reg': 5,  # default 3 (used for leaf value calculation, try different values)
    'bagging_temperature': 1,  # default 1 (higher value -> more aggressive bagging, try different values)
}

In [8]:
# Fit a CatBoost regressor; the hold-out split is the eval set used for early stopping
val_pool = Pool(data=X_val, label=y_val, cat_features=categorical_indices)

# Re-seed the global NumPy / Python RNGs (the model's own seed is params['random_seed'])
np.random.seed(42)
random.seed(36)
model = CatBoostRegressor(**params)
model.fit(
    X_train,
    y_train,
    cat_features=categorical_indices,
    eval_set=val_pool,
    use_best_model=True,
    early_stopping_rounds=50,
    verbose=True,
)

# Mean absolute error on both parts, scaled by 100 for readability
train_mae = np.mean(np.abs(model.predict(X_train) - y_train))
val_mae = np.mean(np.abs(model.predict(X_val) - y_val))
print("Train score: {}".format(train_mae * 100))
print("Val score: {}".format(val_mae * 100))

0:	learn: 0.0527019	test: 0.0697997	best: 0.0697997 (0)	total: 131ms	remaining: 2m 11s
1:	learn: 0.0525798	test: 0.0696712	best: 0.0696712 (1)	total: 205ms	remaining: 1m 42s
2:	learn: 0.0524819	test: 0.0695656	best: 0.0695656 (2)	total: 276ms	remaining: 1m 31s
3:	learn: 0.0524046	test: 0.0694793	best: 0.0694793 (3)	total: 332ms	remaining: 1m 22s
4:	learn: 0.0523393	test: 0.0694083	best: 0.0694083 (4)	total: 395ms	remaining: 1m 18s
5:	learn: 0.0522903	test: 0.0693541	best: 0.0693541 (5)	total: 459ms	remaining: 1m 15s
6:	learn: 0.0522543	test: 0.0693196	best: 0.0693196 (6)	total: 532ms	remaining: 1m 15s
7:	learn: 0.0522140	test: 0.0692749	best: 0.0692749 (7)	total: 594ms	remaining: 1m 13s
8:	learn: 0.0521860	test: 0.0692477	best: 0.0692477 (8)	total: 658ms	remaining: 1m 12s
9:	learn: 0.0521638	test: 0.0692264	best: 0.0692264 (9)	total: 716ms	remaining: 1m 10s
10:	learn: 0.0521329	test: 0.0691938	best: 0.0691938 (10)	total: 786ms	remaining: 1m 10s
11:	learn: 0.0521154	test: 0.0691780	best

96:	learn: 0.0514849	test: 0.0687438	best: 0.0687438 (96)	total: 6.56s	remaining: 1m 1s
97:	learn: 0.0514790	test: 0.0687413	best: 0.0687413 (97)	total: 6.62s	remaining: 1m
98:	learn: 0.0514716	test: 0.0687378	best: 0.0687378 (98)	total: 6.69s	remaining: 1m
99:	learn: 0.0514664	test: 0.0687327	best: 0.0687327 (99)	total: 6.75s	remaining: 1m
100:	learn: 0.0514638	test: 0.0687312	best: 0.0687312 (100)	total: 6.82s	remaining: 1m
101:	learn: 0.0514578	test: 0.0687316	best: 0.0687312 (100)	total: 6.88s	remaining: 1m
102:	learn: 0.0514542	test: 0.0687301	best: 0.0687301 (102)	total: 6.94s	remaining: 1m
103:	learn: 0.0514515	test: 0.0687299	best: 0.0687299 (103)	total: 7s	remaining: 1m
104:	learn: 0.0514490	test: 0.0687295	best: 0.0687295 (104)	total: 7.07s	remaining: 1m
105:	learn: 0.0514436	test: 0.0687218	best: 0.0687218 (105)	total: 7.13s	remaining: 1m
106:	learn: 0.0514394	test: 0.0687196	best: 0.0687196 (106)	total: 7.19s	remaining: 1m
107:	learn: 0.0514318	test: 0.0687164	best: 0.06871

189:	learn: 0.0511739	test: 0.0686386	best: 0.0686365 (182)	total: 12.6s	remaining: 53.8s
190:	learn: 0.0511727	test: 0.0686378	best: 0.0686365 (182)	total: 12.7s	remaining: 53.6s
191:	learn: 0.0511691	test: 0.0686374	best: 0.0686365 (182)	total: 12.7s	remaining: 53.6s
192:	learn: 0.0511681	test: 0.0686375	best: 0.0686365 (182)	total: 12.8s	remaining: 53.5s
193:	learn: 0.0511679	test: 0.0686377	best: 0.0686365 (182)	total: 12.8s	remaining: 53.3s
194:	learn: 0.0511627	test: 0.0686352	best: 0.0686352 (194)	total: 12.9s	remaining: 53.2s
195:	learn: 0.0511603	test: 0.0686353	best: 0.0686352 (194)	total: 12.9s	remaining: 53.1s
196:	learn: 0.0511587	test: 0.0686356	best: 0.0686352 (194)	total: 13s	remaining: 53s
197:	learn: 0.0511557	test: 0.0686350	best: 0.0686350 (197)	total: 13.1s	remaining: 53s
198:	learn: 0.0511542	test: 0.0686346	best: 0.0686346 (198)	total: 13.1s	remaining: 52.9s
199:	learn: 0.0511525	test: 0.0686344	best: 0.0686344 (199)	total: 13.2s	remaining: 52.8s
200:	learn: 0.05

281:	learn: 0.0509471	test: 0.0686049	best: 0.0686046 (280)	total: 18.4s	remaining: 46.9s
282:	learn: 0.0509430	test: 0.0686024	best: 0.0686024 (282)	total: 18.5s	remaining: 46.8s
283:	learn: 0.0509402	test: 0.0686041	best: 0.0686024 (282)	total: 18.6s	remaining: 46.8s
284:	learn: 0.0509389	test: 0.0686028	best: 0.0686024 (282)	total: 18.7s	remaining: 46.8s
285:	learn: 0.0509366	test: 0.0686027	best: 0.0686024 (282)	total: 18.7s	remaining: 46.7s
286:	learn: 0.0509350	test: 0.0686037	best: 0.0686024 (282)	total: 18.8s	remaining: 46.7s
287:	learn: 0.0509339	test: 0.0686037	best: 0.0686024 (282)	total: 18.8s	remaining: 46.6s
288:	learn: 0.0509324	test: 0.0686026	best: 0.0686024 (282)	total: 18.9s	remaining: 46.5s
289:	learn: 0.0509304	test: 0.0686014	best: 0.0686014 (289)	total: 19s	remaining: 46.5s
290:	learn: 0.0509278	test: 0.0686019	best: 0.0686014 (289)	total: 19.1s	remaining: 46.4s
291:	learn: 0.0509262	test: 0.0686014	best: 0.0686014 (291)	total: 19.1s	remaining: 46.4s
292:	learn: 

375:	learn: 0.0507284	test: 0.0685695	best: 0.0685693 (368)	total: 24.6s	remaining: 40.9s
376:	learn: 0.0507273	test: 0.0685690	best: 0.0685690 (376)	total: 24.7s	remaining: 40.8s
377:	learn: 0.0507250	test: 0.0685687	best: 0.0685687 (377)	total: 24.8s	remaining: 40.8s
378:	learn: 0.0507223	test: 0.0685691	best: 0.0685687 (377)	total: 24.8s	remaining: 40.7s
379:	learn: 0.0507204	test: 0.0685696	best: 0.0685687 (377)	total: 24.9s	remaining: 40.6s
380:	learn: 0.0507196	test: 0.0685692	best: 0.0685687 (377)	total: 24.9s	remaining: 40.5s
381:	learn: 0.0507175	test: 0.0685686	best: 0.0685686 (381)	total: 25s	remaining: 40.5s
382:	learn: 0.0507142	test: 0.0685681	best: 0.0685681 (382)	total: 25.1s	remaining: 40.4s
383:	learn: 0.0507106	test: 0.0685662	best: 0.0685662 (383)	total: 25.2s	remaining: 40.4s
384:	learn: 0.0507082	test: 0.0685663	best: 0.0685662 (383)	total: 25.2s	remaining: 40.3s
385:	learn: 0.0507053	test: 0.0685669	best: 0.0685662 (383)	total: 25.3s	remaining: 40.2s
386:	learn: 

467:	learn: 0.0505430	test: 0.0685368	best: 0.0685368 (467)	total: 30.6s	remaining: 34.7s
468:	learn: 0.0505387	test: 0.0685368	best: 0.0685368 (467)	total: 30.6s	remaining: 34.7s
469:	learn: 0.0505341	test: 0.0685352	best: 0.0685352 (469)	total: 30.7s	remaining: 34.6s
470:	learn: 0.0505323	test: 0.0685360	best: 0.0685352 (469)	total: 30.8s	remaining: 34.6s
471:	learn: 0.0505313	test: 0.0685358	best: 0.0685352 (469)	total: 30.8s	remaining: 34.5s
472:	learn: 0.0505291	test: 0.0685361	best: 0.0685352 (469)	total: 30.9s	remaining: 34.4s
473:	learn: 0.0505258	test: 0.0685354	best: 0.0685352 (469)	total: 31s	remaining: 34.4s
474:	learn: 0.0505233	test: 0.0685355	best: 0.0685352 (469)	total: 31s	remaining: 34.3s
475:	learn: 0.0505227	test: 0.0685357	best: 0.0685352 (469)	total: 31.1s	remaining: 34.2s
476:	learn: 0.0505202	test: 0.0685365	best: 0.0685352 (469)	total: 31.2s	remaining: 34.2s
477:	learn: 0.0505186	test: 0.0685374	best: 0.0685352 (469)	total: 31.2s	remaining: 34.1s
478:	learn: 0.

562:	learn: 0.0503690	test: 0.0685218	best: 0.0685207 (556)	total: 36.3s	remaining: 28.2s
563:	learn: 0.0503677	test: 0.0685219	best: 0.0685207 (556)	total: 36.4s	remaining: 28.1s
564:	learn: 0.0503660	test: 0.0685225	best: 0.0685207 (556)	total: 36.5s	remaining: 28.1s
565:	learn: 0.0503642	test: 0.0685226	best: 0.0685207 (556)	total: 36.5s	remaining: 28s
566:	learn: 0.0503623	test: 0.0685232	best: 0.0685207 (556)	total: 36.6s	remaining: 27.9s
567:	learn: 0.0503621	test: 0.0685232	best: 0.0685207 (556)	total: 36.6s	remaining: 27.9s
568:	learn: 0.0503613	test: 0.0685225	best: 0.0685207 (556)	total: 36.7s	remaining: 27.8s
569:	learn: 0.0503590	test: 0.0685223	best: 0.0685207 (556)	total: 36.7s	remaining: 27.7s
570:	learn: 0.0503568	test: 0.0685224	best: 0.0685207 (556)	total: 36.8s	remaining: 27.7s
571:	learn: 0.0503549	test: 0.0685237	best: 0.0685207 (556)	total: 36.9s	remaining: 27.6s
572:	learn: 0.0503529	test: 0.0685236	best: 0.0685207 (556)	total: 36.9s	remaining: 27.5s
573:	learn: 

In [9]:
# List every feature with its CatBoost importance, most important first
importances = model.get_feature_importance()
feature_importance = sorted(
    ((feature_names[idx], score) for idx, score in enumerate(importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, score in feature_importance:
    print("{}: {}".format(name, score))

year_built: 3.815504943312758
month: 3.341390913786271
finished_area_sqft_calc: 2.9881990742140028
derived_avg_area_per_room: 2.9152995560505466
lot_sqft: 2.852983218430526
region_zip-tax_land-percent: 2.7838096416092517
location_1: 2.6608847459102836
region_zip-property_tax_per_sqft-diff: 2.6378702728856664
region_zip-property_tax_per_sqft-percent: 2.6353462503451452
quarter: 2.607530602408685
region_zip-finished_area_sqft_calc-diff: 2.55763656401783
region_zip-finished_area_sqft_calc-percent: 2.44560110917512
region_zip-year_built-diff: 2.403254440174032
region_zip-tax_property-diff: 2.264144884305663
city_id: 2.14410787807871
region_zip: 2.0865081473324323
tax_parcel: 2.080794749977784
region_zip-lot_sqft-diff: 2.0615574712579186
pool_cnt: 2.0479476103654637
location_3: 1.980962931630648
region_zip-tax_land-diff: 1.9802882395039663
landuse_type_id: 1.958203540370167
region_zip-groupcnt: 1.934593979305251
region_zip-tax_structure-diff: 1.9149878765443034
latitude: 1.8275803312825185


In [None]:
Shrink model to first 557 iterations.
Train score: 5.043362352223646
Val score: 6.852068572689314

# Train on all data + Make predictions

In [14]:
# Train CatBoost on all given training data (preparing for submission).
# The outlier-filtered arrays get their own names: catboost_X / catboost_y stay untouched, so
# the train/validation split can be re-run after this cell without the validation part
# silently losing its outliers.
outlier_threshold = 0.4
full_mask = (abs(catboost_y) <= outlier_threshold)
catboost_X_full = catboost_X[full_mask, :]
catboost_y_full = catboost_y[full_mask]
print("catboost_X: {}".format(catboost_X_full.shape))
print("catboost_y: {}".format(catboost_y_full.shape))

# 1000 works better than 800 and 1200 for current hyperparameters.
# A copy is used so the shared `params` dict is not modified by this cell.
final_params = dict(params, iterations=1000)
print(final_params)
np.random.seed(42)
random.seed(36)
model = CatBoostRegressor(**final_params)
model.fit(catboost_X_full, catboost_y_full, cat_features=categorical_indices, verbose=True)

catboost_X: (164299, 69)
catboost_y: (164299,)
{'loss_function': 'MAE', 'eval_metric': 'MAE', 'nan_mode': 'Min', 'random_seed': 0, 'iterations': 1000, 'learning_rate': 0.015, 'border_count': 254, 'max_depth': 6, 'random_strength': 1, 'l2_leaf_reg': 5, 'bagging_temperature': 1}
0:	learn: 0.0527824	total: 101ms	remaining: 1m 40s
1:	learn: 0.0526610	total: 173ms	remaining: 1m 26s
2:	learn: 0.0525722	total: 246ms	remaining: 1m 21s
3:	learn: 0.0524973	total: 326ms	remaining: 1m 21s
4:	learn: 0.0524338	total: 399ms	remaining: 1m 19s
5:	learn: 0.0523881	total: 472ms	remaining: 1m 18s
6:	learn: 0.0523437	total: 564ms	remaining: 1m 19s
7:	learn: 0.0523133	total: 624ms	remaining: 1m 17s
8:	learn: 0.0522813	total: 699ms	remaining: 1m 16s
9:	learn: 0.0522489	total: 775ms	remaining: 1m 16s
10:	learn: 0.0522216	total: 844ms	remaining: 1m 15s
11:	learn: 0.0522018	total: 902ms	remaining: 1m 14s
12:	learn: 0.0521815	total: 970ms	remaining: 1m 13s
13:	learn: 0.0521641	total: 1.05s	remaining: 1m 13s
14:	

155:	learn: 0.0514150	total: 11.5s	remaining: 1m 2s
156:	learn: 0.0514129	total: 11.6s	remaining: 1m 2s
157:	learn: 0.0514087	total: 11.7s	remaining: 1m 2s
158:	learn: 0.0514064	total: 11.8s	remaining: 1m 2s
159:	learn: 0.0514028	total: 11.8s	remaining: 1m 2s
160:	learn: 0.0513978	total: 11.9s	remaining: 1m 2s
161:	learn: 0.0513952	total: 12s	remaining: 1m 1s
162:	learn: 0.0513908	total: 12.1s	remaining: 1m 1s
163:	learn: 0.0513869	total: 12.1s	remaining: 1m 1s
164:	learn: 0.0513852	total: 12.2s	remaining: 1m 1s
165:	learn: 0.0513816	total: 12.3s	remaining: 1m 1s
166:	learn: 0.0513776	total: 12.4s	remaining: 1m 1s
167:	learn: 0.0513749	total: 12.4s	remaining: 1m 1s
168:	learn: 0.0513724	total: 12.5s	remaining: 1m 1s
169:	learn: 0.0513678	total: 12.6s	remaining: 1m 1s
170:	learn: 0.0513677	total: 12.6s	remaining: 1m 1s
171:	learn: 0.0513658	total: 12.7s	remaining: 1m 1s
172:	learn: 0.0513657	total: 12.7s	remaining: 1m
173:	learn: 0.0513651	total: 12.8s	remaining: 1m
174:	learn: 0.051361

315:	learn: 0.0510452	total: 22.7s	remaining: 49.2s
316:	learn: 0.0510438	total: 22.8s	remaining: 49.1s
317:	learn: 0.0510422	total: 22.9s	remaining: 49s
318:	learn: 0.0510406	total: 22.9s	remaining: 49s
319:	learn: 0.0510373	total: 23s	remaining: 48.9s
320:	learn: 0.0510364	total: 23.1s	remaining: 48.8s
321:	learn: 0.0510342	total: 23.1s	remaining: 48.7s
322:	learn: 0.0510316	total: 23.2s	remaining: 48.6s
323:	learn: 0.0510303	total: 23.3s	remaining: 48.5s
324:	learn: 0.0510296	total: 23.3s	remaining: 48.5s
325:	learn: 0.0510285	total: 23.4s	remaining: 48.4s
326:	learn: 0.0510267	total: 23.5s	remaining: 48.4s
327:	learn: 0.0510251	total: 23.6s	remaining: 48.3s
328:	learn: 0.0510228	total: 23.7s	remaining: 48.3s
329:	learn: 0.0510204	total: 23.8s	remaining: 48.2s
330:	learn: 0.0510194	total: 23.8s	remaining: 48.2s
331:	learn: 0.0510160	total: 23.9s	remaining: 48.2s
332:	learn: 0.0510136	total: 24s	remaining: 48.1s
333:	learn: 0.0510123	total: 24.1s	remaining: 48s
334:	learn: 0.0510111	

476:	learn: 0.0507652	total: 34.4s	remaining: 37.7s
477:	learn: 0.0507627	total: 34.5s	remaining: 37.7s
478:	learn: 0.0507619	total: 34.6s	remaining: 37.6s
479:	learn: 0.0507594	total: 34.6s	remaining: 37.5s
480:	learn: 0.0507578	total: 34.7s	remaining: 37.5s
481:	learn: 0.0507558	total: 34.8s	remaining: 37.4s
482:	learn: 0.0507544	total: 34.9s	remaining: 37.3s
483:	learn: 0.0507521	total: 34.9s	remaining: 37.2s
484:	learn: 0.0507508	total: 35s	remaining: 37.2s
485:	learn: 0.0507488	total: 35.1s	remaining: 37.1s
486:	learn: 0.0507474	total: 35.1s	remaining: 37s
487:	learn: 0.0507459	total: 35.2s	remaining: 37s
488:	learn: 0.0507457	total: 35.3s	remaining: 36.9s
489:	learn: 0.0507444	total: 35.4s	remaining: 36.8s
490:	learn: 0.0507424	total: 35.5s	remaining: 36.8s
491:	learn: 0.0507402	total: 35.5s	remaining: 36.7s
492:	learn: 0.0507381	total: 35.6s	remaining: 36.6s
493:	learn: 0.0507363	total: 35.7s	remaining: 36.5s
494:	learn: 0.0507340	total: 35.7s	remaining: 36.5s
495:	learn: 0.0507

635:	learn: 0.0505147	total: 46s	remaining: 26.3s
636:	learn: 0.0505134	total: 46s	remaining: 26.2s
637:	learn: 0.0505120	total: 46.1s	remaining: 26.2s
638:	learn: 0.0505105	total: 46.2s	remaining: 26.1s
639:	learn: 0.0505086	total: 46.3s	remaining: 26s
640:	learn: 0.0505073	total: 46.4s	remaining: 26s
641:	learn: 0.0505059	total: 46.5s	remaining: 25.9s
642:	learn: 0.0505037	total: 46.6s	remaining: 25.9s
643:	learn: 0.0505026	total: 46.7s	remaining: 25.8s
644:	learn: 0.0505014	total: 46.8s	remaining: 25.7s
645:	learn: 0.0505011	total: 46.8s	remaining: 25.7s
646:	learn: 0.0504999	total: 46.9s	remaining: 25.6s
647:	learn: 0.0504985	total: 47s	remaining: 25.5s
648:	learn: 0.0504971	total: 47.1s	remaining: 25.4s
649:	learn: 0.0504960	total: 47.1s	remaining: 25.4s
650:	learn: 0.0504950	total: 47.2s	remaining: 25.3s
651:	learn: 0.0504935	total: 47.3s	remaining: 25.2s
652:	learn: 0.0504924	total: 47.3s	remaining: 25.2s
653:	learn: 0.0504913	total: 47.4s	remaining: 25.1s
654:	learn: 0.0504901	

796:	learn: 0.0502795	total: 57.8s	remaining: 14.7s
797:	learn: 0.0502788	total: 57.8s	remaining: 14.6s
798:	learn: 0.0502770	total: 57.9s	remaining: 14.6s
799:	learn: 0.0502753	total: 58s	remaining: 14.5s
800:	learn: 0.0502726	total: 58s	remaining: 14.4s
801:	learn: 0.0502719	total: 58.1s	remaining: 14.3s
802:	learn: 0.0502710	total: 58.2s	remaining: 14.3s
803:	learn: 0.0502701	total: 58.2s	remaining: 14.2s
804:	learn: 0.0502686	total: 58.3s	remaining: 14.1s
805:	learn: 0.0502657	total: 58.4s	remaining: 14s
806:	learn: 0.0502638	total: 58.4s	remaining: 14s
807:	learn: 0.0502615	total: 58.5s	remaining: 13.9s
808:	learn: 0.0502596	total: 58.6s	remaining: 13.8s
809:	learn: 0.0502572	total: 58.7s	remaining: 13.8s
810:	learn: 0.0502557	total: 58.7s	remaining: 13.7s
811:	learn: 0.0502541	total: 58.8s	remaining: 13.6s
812:	learn: 0.0502526	total: 58.9s	remaining: 13.5s
813:	learn: 0.0502518	total: 59s	remaining: 13.5s
814:	learn: 0.0502508	total: 59s	remaining: 13.4s
815:	learn: 0.0502479	to

955:	learn: 0.0499796	total: 1m 9s	remaining: 3.19s
956:	learn: 0.0499769	total: 1m 9s	remaining: 3.12s
957:	learn: 0.0499746	total: 1m 9s	remaining: 3.04s
958:	learn: 0.0499732	total: 1m 9s	remaining: 2.97s
959:	learn: 0.0499708	total: 1m 9s	remaining: 2.9s
960:	learn: 0.0499693	total: 1m 9s	remaining: 2.83s
961:	learn: 0.0499677	total: 1m 9s	remaining: 2.75s
962:	learn: 0.0499665	total: 1m 9s	remaining: 2.68s
963:	learn: 0.0499652	total: 1m 9s	remaining: 2.61s
964:	learn: 0.0499640	total: 1m 9s	remaining: 2.54s
965:	learn: 0.0499626	total: 1m 10s	remaining: 2.46s
966:	learn: 0.0499604	total: 1m 10s	remaining: 2.39s
967:	learn: 0.0499575	total: 1m 10s	remaining: 2.32s
968:	learn: 0.0499559	total: 1m 10s	remaining: 2.25s
969:	learn: 0.0499543	total: 1m 10s	remaining: 2.17s
970:	learn: 0.0499522	total: 1m 10s	remaining: 2.1s
971:	learn: 0.0499511	total: 1m 10s	remaining: 2.03s
972:	learn: 0.0499476	total: 1m 10s	remaining: 1.96s
973:	learn: 0.0499459	total: 1m 10s	remaining: 1.88s
974:	

<catboost.core.CatBoostRegressor at 0x1a4ce3f9e8>

In [13]:
%%time
def _predict_for_date(model, features, date_str):
    """Return the model's predictions for every row of `features`, rounded to 4 decimals.

    `date_str` is written into a 'transactiondate' column before
    data_proc.add_simple_datetime_features() is applied to the frame.
    """
    # drop() inside catboost_drop_features returns a new frame, so `features` is not modified
    test_features = catboost_drop_features(features)
    test_features['transactiondate'] = date_str
    # NOTE(review): the helper's return value is not used, so it is assumed to update
    # test_features in place (and to leave only columns the model was trained on) - confirm
    # against src/data_proc.py
    data_proc.add_simple_datetime_features(test_features)
    return [float(format(x, '.4f')) for x in model.predict(test_features)]


def predict_and_export(model, features_2016, features_2017, file_name):
    """Predict October values for 2016 and 2017 and write the submission CSV.

    The October prediction of each year is copied to that year's November and
    December columns, then the two yearly frames are inner-joined on 'ParcelId'.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted model used for the predictions.
    features_2016, features_2017 : pandas.DataFrame
        Per-year frames; each must contain a ``parcelid`` column.
    file_name : str
        Path of the CSV file to write. Missing parent directories are created.

    Returns
    -------
    pandas.DataFrame
        The merged submission (ParcelId plus the six monthly columns), returned
        so that it can be analysed or sanity-checked.
    """
    # Create the output directory before predicting, so that a missing directory
    # cannot make to_csv fail after the predictions have already been computed
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    yearly_submissions = {}
    for year, features in (('2016', features_2016), ('2017', features_2017)):
        october = year + '10'
        print("Start predicting for {}".format(october))

        yearly = pd.DataFrame()
        yearly['ParcelId'] = features.parcelid
        yearly[october] = _predict_for_date(model, features, year + '-10-01')

        print("Finished predicting for {}".format(october))

        # November and December reuse the October prediction
        yearly[year + '11'] = yearly[october]
        yearly[year + '12'] = yearly[october]
        yearly_submissions[year] = yearly

    submission = yearly_submissions['2016'].merge(
        how='inner', right=yearly_submissions['2017'], on='ParcelId')

    print("Length of submission DataFrame: {}".format(len(submission)))
    print("Submission header:")
    print(submission.head())
    submission.to_csv(file_name, index=False)
    return submission  # Return the results so that we can analyze or sanity check it

# Predict for both years with the current `model`, write the submission CSV, and keep the
# returned frame in `submission` for inspection
submission = predict_and_export(model, features_2016, features_2017, 'data/submission_180104_catboost_v2s.csv')

Start predicting for 201610
Finished predicting for 201610
Start predicting for 201710
Finished predicting for 201710
Length of submission DataFrame: 2985217
Submission header:
   ParcelId  201610  201611  201612  201710  201711  201712
0  10754147  0.0133  0.0133  0.0133  0.0097  0.0097  0.0097
1  10759547  0.0164  0.0164  0.0164  0.0228  0.0228  0.0228
2  10843547  0.0180  0.0180  0.0180  0.0354  0.0354  0.0354
3  10859147  0.0756  0.0756  0.0756  0.0767  0.0767  0.0767
4  10879947  0.0257  0.0257  0.0257  0.0281  0.0281  0.0281
CPU times: user 6min 14s, sys: 7.84 s, total: 6min 22s
Wall time: 6min 5s
