In [69]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [70]:
from sklearn import decomposition, ensemble, linear_model, metrics, model_selection, pipeline, preprocessing

In [71]:
import catboost

In [118]:
train_path = '../data/geophy/train.csv'
train = pd.read_csv(train_path, low_memory=False)
test_path = '../data/geophy/test.csv'
test = pd.read_csv(test_path, low_memory=False)

### Data exploration

In [119]:
print("dataset size: {}, positive/negative fraction: {}".format(train.shape, train['target__office'].sum()/train.shape[0]))

dataset size: (687369, 64), positive/negative fraction: 0.008120820112632371


In [121]:
print("testset size: {}".format(test.shape))

testset size: (171843, 63)


In [122]:
data.columns

Index(['index', 'borough', 'block', 'lot', 'schooldist', 'council', 'zipcode',
       'firecomp', 'policeprct', 'healthcenterdistrict', 'healtharea',
       'sanitboro', 'sanitdistrict', 'sanitsub', 'zonedist1', 'zonedist2',
       'zonedist3', 'zonedist4', 'overlay1', 'overlay2', 'spdist1', 'spdist2',
       'spdist3', 'ltdheight', 'splitzone', 'landuse', 'easements', 'lotarea',
       'bldgarea', 'comarea', 'resarea', 'officearea', 'retailarea',
       'garagearea', 'strgearea', 'factryarea', 'otherarea', 'numbldgs',
       'numfloors', 'unitstotal', 'lotfront', 'lotdepth', 'bldgfront',
       'bldgdepth', 'ext', 'proxcode', 'irrlotcode', 'lottype', 'bsmtcode',
       'assessland', 'assesstot', 'exemptland', 'exempttot', 'yearbuilt',
       'yearalter1', 'yearalter2', 'histdist', 'landmark', 'builtfar',
       'tract2010', 'xcoord', 'ycoord', 'zonemap', 'target__office'],
      dtype='object')

In [79]:
len(data['index'].unique())

687369

In [73]:
for col in data.columns:
    if pd.isna(data[col]).any():
        print(col, data[col].dtype, len(data[col].unique()))

schooldist float64 33
council float64 52
zipcode float64 209
firecomp object 349
policeprct float64 78
healthcenterdistrict float64 31
healtharea float64 211
sanitboro float64 9
sanitdistrict float64 28
sanitsub object 61
zonedist1 object 163
zonedist2 object 146
zonedist3 object 59
zonedist4 object 12
overlay1 object 11
overlay2 object 11
spdist1 object 79
spdist2 object 19
spdist3 float64 1
ltdheight object 3
splitzone object 3
landuse float64 12
ext object 4
proxcode float64 5
irrlotcode object 3
lottype float64 11
bsmtcode float64 7
histdist object 138
landmark object 3
xcoord float64 129382
ycoord float64 132303
zonemap object 130


### Data cleanup

In [123]:
def cleanup(data):
    data.fillna({'zipcode': min(data['zipcode'])-1, 'lottype': max(data['lottype'])+1, 'bsmtcode': max(data['bsmtcode'])+1},
                inplace=True)
    
    for col in data.columns:
        if pd.isna(data[col]).any():
            if data[col].dtype == 'object':
                data[col].fillna('None', inplace=True)
            if data[col].dtype == 'float64':
                data[col].fillna(0., inplace=True)
    
    return data

In [124]:
train = cleanup(train)
test = cleanup(test)

In [125]:
# and check
pd.isna(train).any().any(), pd.isna(train).any().any()

(False, False)

### Make and apply LabelEncoders, make feature matrix

In [126]:
def make_label_encoders(all_data):
    label_encoders = {}
    for col in data.columns:
        if data[col].dtype == 'object':
            le = preprocessing.LabelEncoder()
            le.fit(data[col])
            label_encoders[col] = le
    return label_encoders

In [127]:
def df_to_feature_matrix(data, label_encoders):
    X = np.empty((data.shape[0], 0))
    for col in data.columns:
        if col in label_encoders:
            X = np.hstack((X, label_encoders[col].transform(data[col]).reshape(X.shape[0], 1)))
        else:
            X = np.hstack((X, data[col].values.reshape(X.shape[0], 1)))
    return X

In [132]:
label_encoders = make_label_encoders(train.iloc[:,1:-1].append(test))

In [133]:
X, y = df_to_feature_matrix(train.iloc[:,1:-1], label_encoders), train['target__office'].astype(int)

In [134]:
# sanity check
X.shape, y.shape

((687369, 62), (687369,))

### Apply logistic regression and CatBoost

In [83]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)

In [89]:
clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
                             linear_model.LogisticRegression(solver='lbfgs', max_iter=1000))

In [90]:
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9933732924043819

In [92]:
preds = clf.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = clf.predict(X_test)
print(metrics.classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00    545465
           1       0.79      0.23      0.35      4430

   micro avg       0.99      0.99      0.99    549895
   macro avg       0.89      0.61      0.68    549895
weighted avg       0.99      0.99      0.99    549895

              precision    recall  f1-score   support

           0       0.99      1.00      1.00    136322
           1       0.85      0.25      0.39      1152

   micro avg       0.99      0.99      0.99    137474
   macro avg       0.92      0.63      0.69    137474
weighted avg       0.99      0.99      0.99    137474



In [94]:
# class_weight='balanced'
clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
                             linear_model.LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000))

In [95]:
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9503615229061495

In [96]:
preds = clf.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = clf.predict(X_test)
print(metrics.classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97    545465
           1       0.14      0.97      0.24      4430

   micro avg       0.95      0.95      0.95    549895
   macro avg       0.57      0.96      0.61    549895
weighted avg       0.99      0.95      0.97    549895

              precision    recall  f1-score   support

           0       1.00      0.95      0.97    136322
           1       0.14      0.97      0.25      1152

   micro avg       0.95      0.95      0.95    137474
   macro avg       0.57      0.96      0.61    137474
weighted avg       0.99      0.95      0.97    137474



**Although the average scores look ok, the results are actually quite poor in terms of F1. Not fitting well to training data indicates high bias, therefore at least a more complex model is necessary.**

In [97]:
model = catboost.CatBoostClassifier(loss_function='Logloss', verbose=True)
model.fit(X_train, y_train)
preds = model.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = model.predict(X_test)
print(metrics.classification_report(y_test, preds))

Learning rate set to 0.099913
0:	learn: 0.3593081	total: 407ms	remaining: 6m 46s
1:	learn: 0.1572746	total: 637ms	remaining: 5m 17s
2:	learn: 0.0906157	total: 839ms	remaining: 4m 38s
3:	learn: 0.0535793	total: 1.04s	remaining: 4m 18s
4:	learn: 0.0355246	total: 1.23s	remaining: 4m 5s
5:	learn: 0.0248084	total: 1.43s	remaining: 3m 56s
6:	learn: 0.0168040	total: 1.63s	remaining: 3m 51s
7:	learn: 0.0135457	total: 1.81s	remaining: 3m 45s
8:	learn: 0.0116110	total: 2.01s	remaining: 3m 41s
9:	learn: 0.0098330	total: 2.21s	remaining: 3m 38s
10:	learn: 0.0076834	total: 2.4s	remaining: 3m 35s
11:	learn: 0.0071433	total: 2.58s	remaining: 3m 32s
12:	learn: 0.0062917	total: 2.76s	remaining: 3m 29s
13:	learn: 0.0060403	total: 2.97s	remaining: 3m 28s
14:	learn: 0.0055434	total: 3.15s	remaining: 3m 26s
15:	learn: 0.0053159	total: 3.34s	remaining: 3m 25s
16:	learn: 0.0050345	total: 3.52s	remaining: 3m 23s
17:	learn: 0.0048962	total: 3.71s	remaining: 3m 22s
18:	learn: 0.0047192	total: 3.89s	remaining: 3

159:	learn: 0.0023181	total: 32.3s	remaining: 2m 49s
160:	learn: 0.0023088	total: 32.5s	remaining: 2m 49s
161:	learn: 0.0022960	total: 32.7s	remaining: 2m 49s
162:	learn: 0.0022853	total: 32.9s	remaining: 2m 48s
163:	learn: 0.0022791	total: 33.1s	remaining: 2m 48s
164:	learn: 0.0022706	total: 33.4s	remaining: 2m 48s
165:	learn: 0.0022665	total: 33.6s	remaining: 2m 48s
166:	learn: 0.0022627	total: 33.8s	remaining: 2m 48s
167:	learn: 0.0022464	total: 34s	remaining: 2m 48s
168:	learn: 0.0022426	total: 34.2s	remaining: 2m 48s
169:	learn: 0.0022374	total: 34.4s	remaining: 2m 47s
170:	learn: 0.0022346	total: 34.6s	remaining: 2m 47s
171:	learn: 0.0022304	total: 34.8s	remaining: 2m 47s
172:	learn: 0.0022222	total: 35s	remaining: 2m 47s
173:	learn: 0.0022176	total: 35.2s	remaining: 2m 47s
174:	learn: 0.0022146	total: 35.4s	remaining: 2m 46s
175:	learn: 0.0022096	total: 35.6s	remaining: 2m 46s
176:	learn: 0.0022032	total: 35.8s	remaining: 2m 46s
177:	learn: 0.0022008	total: 36s	remaining: 2m 46s

316:	learn: 0.0016657	total: 1m 4s	remaining: 2m 19s
317:	learn: 0.0016640	total: 1m 4s	remaining: 2m 18s
318:	learn: 0.0016601	total: 1m 5s	remaining: 2m 18s
319:	learn: 0.0016588	total: 1m 5s	remaining: 2m 18s
320:	learn: 0.0016573	total: 1m 5s	remaining: 2m 18s
321:	learn: 0.0016542	total: 1m 5s	remaining: 2m 18s
322:	learn: 0.0016393	total: 1m 5s	remaining: 2m 17s
323:	learn: 0.0016363	total: 1m 5s	remaining: 2m 17s
324:	learn: 0.0016319	total: 1m 6s	remaining: 2m 17s
325:	learn: 0.0016293	total: 1m 6s	remaining: 2m 17s
326:	learn: 0.0016274	total: 1m 6s	remaining: 2m 16s
327:	learn: 0.0016245	total: 1m 6s	remaining: 2m 16s
328:	learn: 0.0016195	total: 1m 6s	remaining: 2m 16s
329:	learn: 0.0016165	total: 1m 7s	remaining: 2m 16s
330:	learn: 0.0016126	total: 1m 7s	remaining: 2m 16s
331:	learn: 0.0016054	total: 1m 7s	remaining: 2m 15s
332:	learn: 0.0015999	total: 1m 7s	remaining: 2m 15s
333:	learn: 0.0015969	total: 1m 7s	remaining: 2m 15s
334:	learn: 0.0015922	total: 1m 8s	remaining: 

471:	learn: 0.0012543	total: 1m 35s	remaining: 1m 47s
472:	learn: 0.0012524	total: 1m 35s	remaining: 1m 46s
473:	learn: 0.0012495	total: 1m 36s	remaining: 1m 46s
474:	learn: 0.0012478	total: 1m 36s	remaining: 1m 46s
475:	learn: 0.0012383	total: 1m 36s	remaining: 1m 46s
476:	learn: 0.0012376	total: 1m 36s	remaining: 1m 46s
477:	learn: 0.0012370	total: 1m 36s	remaining: 1m 45s
478:	learn: 0.0012353	total: 1m 37s	remaining: 1m 45s
479:	learn: 0.0012342	total: 1m 37s	remaining: 1m 45s
480:	learn: 0.0012318	total: 1m 37s	remaining: 1m 45s
481:	learn: 0.0012291	total: 1m 37s	remaining: 1m 45s
482:	learn: 0.0012242	total: 1m 37s	remaining: 1m 44s
483:	learn: 0.0012230	total: 1m 38s	remaining: 1m 44s
484:	learn: 0.0012204	total: 1m 38s	remaining: 1m 44s
485:	learn: 0.0012183	total: 1m 38s	remaining: 1m 44s
486:	learn: 0.0012161	total: 1m 38s	remaining: 1m 44s
487:	learn: 0.0012142	total: 1m 39s	remaining: 1m 43s
488:	learn: 0.0012128	total: 1m 39s	remaining: 1m 43s
489:	learn: 0.0012109	total:

625:	learn: 0.0009600	total: 2m 9s	remaining: 1m 17s
626:	learn: 0.0009575	total: 2m 9s	remaining: 1m 17s
627:	learn: 0.0009568	total: 2m 9s	remaining: 1m 16s
628:	learn: 0.0009559	total: 2m 10s	remaining: 1m 16s
629:	learn: 0.0009540	total: 2m 10s	remaining: 1m 16s
630:	learn: 0.0009530	total: 2m 10s	remaining: 1m 16s
631:	learn: 0.0009514	total: 2m 10s	remaining: 1m 16s
632:	learn: 0.0009495	total: 2m 10s	remaining: 1m 15s
633:	learn: 0.0009483	total: 2m 11s	remaining: 1m 15s
634:	learn: 0.0009472	total: 2m 11s	remaining: 1m 15s
635:	learn: 0.0009452	total: 2m 11s	remaining: 1m 15s
636:	learn: 0.0009442	total: 2m 11s	remaining: 1m 15s
637:	learn: 0.0009431	total: 2m 11s	remaining: 1m 14s
638:	learn: 0.0009404	total: 2m 12s	remaining: 1m 14s
639:	learn: 0.0009395	total: 2m 12s	remaining: 1m 14s
640:	learn: 0.0009372	total: 2m 12s	remaining: 1m 14s
641:	learn: 0.0009334	total: 2m 12s	remaining: 1m 13s
642:	learn: 0.0009320	total: 2m 12s	remaining: 1m 13s
643:	learn: 0.0009301	total: 2m

780:	learn: 0.0007369	total: 2m 45s	remaining: 46.3s
781:	learn: 0.0007348	total: 2m 45s	remaining: 46.1s
782:	learn: 0.0007341	total: 2m 45s	remaining: 45.9s
783:	learn: 0.0007332	total: 2m 45s	remaining: 45.7s
784:	learn: 0.0007322	total: 2m 46s	remaining: 45.5s
785:	learn: 0.0007309	total: 2m 46s	remaining: 45.3s
786:	learn: 0.0007296	total: 2m 46s	remaining: 45.1s
787:	learn: 0.0007288	total: 2m 46s	remaining: 44.9s
788:	learn: 0.0007273	total: 2m 47s	remaining: 44.7s
789:	learn: 0.0007253	total: 2m 47s	remaining: 44.5s
790:	learn: 0.0007240	total: 2m 47s	remaining: 44.3s
791:	learn: 0.0007232	total: 2m 47s	remaining: 44.1s
792:	learn: 0.0007215	total: 2m 48s	remaining: 43.9s
793:	learn: 0.0007209	total: 2m 48s	remaining: 43.7s
794:	learn: 0.0007200	total: 2m 48s	remaining: 43.5s
795:	learn: 0.0007186	total: 2m 48s	remaining: 43.2s
796:	learn: 0.0007178	total: 2m 48s	remaining: 43s
797:	learn: 0.0007166	total: 2m 49s	remaining: 42.8s
798:	learn: 0.0007158	total: 2m 49s	remaining: 4

937:	learn: 0.0005791	total: 3m 19s	remaining: 13.2s
938:	learn: 0.0005785	total: 3m 19s	remaining: 13s
939:	learn: 0.0005781	total: 3m 19s	remaining: 12.8s
940:	learn: 0.0005769	total: 3m 20s	remaining: 12.6s
941:	learn: 0.0005762	total: 3m 20s	remaining: 12.3s
942:	learn: 0.0005758	total: 3m 20s	remaining: 12.1s
943:	learn: 0.0005749	total: 3m 20s	remaining: 11.9s
944:	learn: 0.0005744	total: 3m 21s	remaining: 11.7s
945:	learn: 0.0005732	total: 3m 21s	remaining: 11.5s
946:	learn: 0.0005722	total: 3m 21s	remaining: 11.3s
947:	learn: 0.0005705	total: 3m 21s	remaining: 11.1s
948:	learn: 0.0005699	total: 3m 21s	remaining: 10.8s
949:	learn: 0.0005693	total: 3m 22s	remaining: 10.6s
950:	learn: 0.0005682	total: 3m 22s	remaining: 10.4s
951:	learn: 0.0005679	total: 3m 22s	remaining: 10.2s
952:	learn: 0.0005672	total: 3m 22s	remaining: 10s
953:	learn: 0.0005661	total: 3m 22s	remaining: 9.78s
954:	learn: 0.0005650	total: 3m 23s	remaining: 9.57s
955:	learn: 0.0005646	total: 3m 23s	remaining: 9.3

**This looks more reasonable, if not overfitting. I will try labeling some categorical features: borough, zoning district, overlay, special purpose.**

In [99]:
model = catboost.CatBoostClassifier(loss_function='Logloss', verbose=True)

In [100]:
model.fit(X_train, y_train, cat_features=[0, 13, 14, 15, 16, 17, 18, 19, 20, 21])
preds = model.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = model.predict(X_test)
print(metrics.classification_report(y_test, preds))

Learning rate set to 0.099913
0:	learn: 0.4005947	total: 985ms	remaining: 16m 24s
1:	learn: 0.2159185	total: 1.84s	remaining: 15m 20s
2:	learn: 0.1233197	total: 2.67s	remaining: 14m 47s
3:	learn: 0.0653847	total: 3.55s	remaining: 14m 43s
4:	learn: 0.0430103	total: 4.47s	remaining: 14m 50s
5:	learn: 0.0271573	total: 5.34s	remaining: 14m 45s
6:	learn: 0.0189759	total: 6.21s	remaining: 14m 40s
7:	learn: 0.0122446	total: 7.21s	remaining: 14m 54s
8:	learn: 0.0099563	total: 8.07s	remaining: 14m 49s
9:	learn: 0.0078742	total: 8.97s	remaining: 14m 47s
10:	learn: 0.0072487	total: 9.83s	remaining: 14m 43s
11:	learn: 0.0063668	total: 10.6s	remaining: 14m 36s
12:	learn: 0.0059648	total: 11.6s	remaining: 14m 39s
13:	learn: 0.0055218	total: 12.4s	remaining: 14m 35s
14:	learn: 0.0052260	total: 13.3s	remaining: 14m 34s
15:	learn: 0.0050407	total: 14.2s	remaining: 14m 32s
16:	learn: 0.0048681	total: 15.1s	remaining: 14m 32s
17:	learn: 0.0046526	total: 16s	remaining: 14m 30s
18:	learn: 0.0044676	total: 

153:	learn: 0.0023494	total: 2m 17s	remaining: 12m 34s
154:	learn: 0.0023461	total: 2m 18s	remaining: 12m 34s
155:	learn: 0.0023429	total: 2m 19s	remaining: 12m 34s
156:	learn: 0.0023360	total: 2m 20s	remaining: 12m 33s
157:	learn: 0.0023354	total: 2m 21s	remaining: 12m 33s
158:	learn: 0.0023325	total: 2m 22s	remaining: 12m 33s
159:	learn: 0.0023297	total: 2m 23s	remaining: 12m 33s
160:	learn: 0.0023239	total: 2m 24s	remaining: 12m 32s
161:	learn: 0.0023180	total: 2m 25s	remaining: 12m 32s
162:	learn: 0.0023168	total: 2m 26s	remaining: 12m 32s
163:	learn: 0.0023135	total: 2m 27s	remaining: 12m 31s
164:	learn: 0.0023096	total: 2m 28s	remaining: 12m 30s
165:	learn: 0.0023033	total: 2m 29s	remaining: 12m 29s
166:	learn: 0.0023010	total: 2m 30s	remaining: 12m 29s
167:	learn: 0.0022947	total: 2m 31s	remaining: 12m 28s
168:	learn: 0.0022905	total: 2m 32s	remaining: 12m 27s
169:	learn: 0.0022863	total: 2m 33s	remaining: 12m 27s
170:	learn: 0.0022840	total: 2m 33s	remaining: 12m 26s
171:	learn

304:	learn: 0.0017862	total: 4m 38s	remaining: 10m 34s
305:	learn: 0.0017796	total: 4m 39s	remaining: 10m 33s
306:	learn: 0.0017762	total: 4m 39s	remaining: 10m 31s
307:	learn: 0.0017721	total: 4m 40s	remaining: 10m 30s
308:	learn: 0.0017683	total: 4m 41s	remaining: 10m 29s
309:	learn: 0.0017663	total: 4m 42s	remaining: 10m 28s
310:	learn: 0.0017651	total: 4m 43s	remaining: 10m 27s
311:	learn: 0.0017628	total: 4m 44s	remaining: 10m 26s
312:	learn: 0.0017601	total: 4m 45s	remaining: 10m 25s
313:	learn: 0.0017571	total: 4m 46s	remaining: 10m 25s
314:	learn: 0.0017525	total: 4m 47s	remaining: 10m 24s
315:	learn: 0.0017513	total: 4m 47s	remaining: 10m 23s
316:	learn: 0.0017492	total: 4m 48s	remaining: 10m 22s
317:	learn: 0.0017464	total: 4m 49s	remaining: 10m 21s
318:	learn: 0.0017443	total: 4m 50s	remaining: 10m 20s
319:	learn: 0.0017411	total: 4m 51s	remaining: 10m 19s
320:	learn: 0.0017396	total: 4m 52s	remaining: 10m 18s
321:	learn: 0.0017351	total: 4m 53s	remaining: 10m 17s
322:	learn

457:	learn: 0.0014257	total: 6m 58s	remaining: 8m 14s
458:	learn: 0.0014246	total: 6m 59s	remaining: 8m 13s
459:	learn: 0.0014215	total: 6m 59s	remaining: 8m 13s
460:	learn: 0.0014205	total: 7m	remaining: 8m 11s
461:	learn: 0.0014187	total: 7m 1s	remaining: 8m 11s
462:	learn: 0.0014171	total: 7m 2s	remaining: 8m 10s
463:	learn: 0.0014158	total: 7m 3s	remaining: 8m 9s
464:	learn: 0.0014147	total: 7m 4s	remaining: 8m 8s
465:	learn: 0.0014115	total: 7m 5s	remaining: 8m 7s
466:	learn: 0.0014095	total: 7m 6s	remaining: 8m 6s
467:	learn: 0.0014064	total: 7m 6s	remaining: 8m 5s
468:	learn: 0.0014051	total: 7m 7s	remaining: 8m 4s
469:	learn: 0.0014037	total: 7m 8s	remaining: 8m 3s
470:	learn: 0.0014006	total: 7m 9s	remaining: 8m 2s
471:	learn: 0.0013965	total: 7m 10s	remaining: 8m 1s
472:	learn: 0.0013953	total: 7m 11s	remaining: 8m
473:	learn: 0.0013888	total: 7m 12s	remaining: 7m 59s
474:	learn: 0.0013878	total: 7m 13s	remaining: 7m 58s
475:	learn: 0.0013869	total: 7m 14s	remaining: 7m 57s
4

611:	learn: 0.0011337	total: 9m 14s	remaining: 5m 51s
612:	learn: 0.0011333	total: 9m 15s	remaining: 5m 50s
613:	learn: 0.0011317	total: 9m 16s	remaining: 5m 49s
614:	learn: 0.0011312	total: 9m 17s	remaining: 5m 48s
615:	learn: 0.0011305	total: 9m 18s	remaining: 5m 47s
616:	learn: 0.0011260	total: 9m 19s	remaining: 5m 47s
617:	learn: 0.0011250	total: 9m 19s	remaining: 5m 46s
618:	learn: 0.0011233	total: 9m 20s	remaining: 5m 45s
619:	learn: 0.0011220	total: 9m 21s	remaining: 5m 44s
620:	learn: 0.0011204	total: 9m 22s	remaining: 5m 43s
621:	learn: 0.0011195	total: 9m 23s	remaining: 5m 42s
622:	learn: 0.0011167	total: 9m 24s	remaining: 5m 41s
623:	learn: 0.0011161	total: 9m 25s	remaining: 5m 40s
624:	learn: 0.0011132	total: 9m 26s	remaining: 5m 39s
625:	learn: 0.0011116	total: 9m 27s	remaining: 5m 38s
626:	learn: 0.0011104	total: 9m 28s	remaining: 5m 38s
627:	learn: 0.0011087	total: 9m 29s	remaining: 5m 37s
628:	learn: 0.0011049	total: 9m 30s	remaining: 5m 36s
629:	learn: 0.0011038	total:

762:	learn: 0.0009183	total: 11m 28s	remaining: 3m 33s
763:	learn: 0.0009174	total: 11m 29s	remaining: 3m 33s
764:	learn: 0.0009164	total: 11m 30s	remaining: 3m 32s
765:	learn: 0.0009152	total: 11m 31s	remaining: 3m 31s
766:	learn: 0.0009147	total: 11m 32s	remaining: 3m 30s
767:	learn: 0.0009138	total: 11m 33s	remaining: 3m 29s
768:	learn: 0.0009135	total: 11m 33s	remaining: 3m 28s
769:	learn: 0.0009127	total: 11m 34s	remaining: 3m 27s
770:	learn: 0.0009122	total: 11m 35s	remaining: 3m 26s
771:	learn: 0.0009118	total: 11m 36s	remaining: 3m 25s
772:	learn: 0.0009102	total: 11m 37s	remaining: 3m 24s
773:	learn: 0.0009097	total: 11m 38s	remaining: 3m 23s
774:	learn: 0.0009094	total: 11m 39s	remaining: 3m 22s
775:	learn: 0.0009078	total: 11m 40s	remaining: 3m 22s
776:	learn: 0.0009067	total: 11m 41s	remaining: 3m 21s
777:	learn: 0.0009060	total: 11m 41s	remaining: 3m 20s
778:	learn: 0.0009042	total: 11m 42s	remaining: 3m 19s
779:	learn: 0.0009031	total: 11m 43s	remaining: 3m 18s
780:	learn

913:	learn: 0.0007527	total: 13m 42s	remaining: 1m 17s
914:	learn: 0.0007521	total: 13m 43s	remaining: 1m 16s
915:	learn: 0.0007509	total: 13m 44s	remaining: 1m 15s
916:	learn: 0.0007496	total: 13m 44s	remaining: 1m 14s
917:	learn: 0.0007487	total: 13m 45s	remaining: 1m 13s
918:	learn: 0.0007470	total: 13m 46s	remaining: 1m 12s
919:	learn: 0.0007453	total: 13m 47s	remaining: 1m 11s
920:	learn: 0.0007442	total: 13m 48s	remaining: 1m 11s
921:	learn: 0.0007432	total: 13m 49s	remaining: 1m 10s
922:	learn: 0.0007415	total: 13m 50s	remaining: 1m 9s
923:	learn: 0.0007399	total: 13m 51s	remaining: 1m 8s
924:	learn: 0.0007396	total: 13m 52s	remaining: 1m 7s
925:	learn: 0.0007390	total: 13m 52s	remaining: 1m 6s
926:	learn: 0.0007381	total: 13m 53s	remaining: 1m 5s
927:	learn: 0.0007372	total: 13m 54s	remaining: 1m 4s
928:	learn: 0.0007367	total: 13m 55s	remaining: 1m 3s
929:	learn: 0.0007344	total: 13m 56s	remaining: 1m 2s
930:	learn: 0.0007325	total: 13m 57s	remaining: 1m 2s
931:	learn: 0.00073

**Ideally I'd try to addressed the limitations listed in the accompanying pdf, but for now the model with categorical features specified *feels* less likely to overfit.**

### Final submission

In [135]:
test = pd.read_csv(test_path, low_memory=False)

In [136]:
clean = cleanup(test)

In [137]:
# check
pd.isna(clean).any().any()

False

In [138]:
X, y = df_to_feature_matrix(clean.iloc[:,1:-1], label_encoders), clean['target__office'].astype(int)

ValueError: y contains previously unseen labels: '4R'