In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [25]:
from sklearn import decomposition, ensemble, linear_model, metrics, \
model_selection, pipeline, preprocessing

In [5]:
import catboost

In [6]:
path = '../data/geophy/train.csv'
data = pd.read_csv(path, low_memory=False)

In [7]:
data['target__office'].sum()

5582

In [8]:
for col in data.columns:
    if pd.isna(data[col]).any():
        print(col, data[col].dtype, len(data[col].unique()))

schooldist float64 33
council float64 52
zipcode float64 209
firecomp object 349
policeprct float64 78
healthcenterdistrict float64 31
healtharea float64 211
sanitboro float64 9
sanitdistrict float64 28
sanitsub object 61
zonedist1 object 163
zonedist2 object 146
zonedist3 object 59
zonedist4 object 12
overlay1 object 11
overlay2 object 11
spdist1 object 79
spdist2 object 19
spdist3 float64 1
ltdheight object 3
splitzone object 3
landuse float64 12
ext object 4
proxcode float64 5
irrlotcode object 3
lottype float64 11
bsmtcode float64 7
histdist object 138
landmark object 3
xcoord float64 129382
ycoord float64 132303
zonemap object 130


In [9]:
data = data.fillna({'zipcode': min(data['zipcode'])-1,
                   'lottype': max(data['lottype'])+1,
                   'bsmtcode': max(data['bsmtcode'])+1})

In [10]:
for col in data.columns:
    if pd.isna(data[col]).any():
        if data[col].dtype == 'object':
            data[col].fillna('None', inplace=True)
        if data[col].dtype == 'float64':
            data[col].fillna(0., inplace=True)

In [11]:
for col in data.columns:
    if pd.isna(data[col]).any():
        print(col, data[col].dtype)

In [12]:
data['borough'].unique()

array(['BX', 'QN', 'BK', 'MN', 'SI'], dtype=object)

In [13]:
len(data['block'].unique())

13915

In [14]:
len(data['lot'].unique())

2331

In [15]:
label_encoders = {}
for col in data.columns:
    if data[col].dtype == 'object':
        le = preprocessing.LabelEncoder()
        le.fit(data[col])
        label_encoders[col] = le

In [16]:
X = np.empty((data.shape[0], 0))

In [17]:
for col in data.columns[:-1]:
    if col in label_encoders:
        X = np.hstack((X, label_encoders[col].transform(data[col]).reshape(X.shape[0], 1)))
    else:
        X = np.hstack((X, data[col].values.reshape(X.shape[0], 1)))

In [None]:
X.shape

In [18]:
y = data['target__office'].astype(int)

In [19]:
X.shape, y.shape

((687369, 63), (687369,))

In [20]:
clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
                             linear_model.LogisticRegression(C=1))

In [21]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.5,
                                                                    random_state=0)

In [22]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [23]:
clf.score(X_test, y_test)

0.9928073672112545

In [26]:
preds = clf.predict(X_test)
metrics.f1_score(y_test, preds)

0.28430804863925885

In [28]:
print(metrics.classification_report(y_test, preds))

             precision    recall  f1-score   support

          0       0.99      1.00      1.00    340857
          1       0.78      0.17      0.28      2828

avg / total       0.99      0.99      0.99    343685



In [29]:
clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
                             linear_model.LogisticRegression(C=1, class_weight='balanced'))

In [31]:
clf.fit(X_train, y_train)
preds = clf.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = clf.predict(X_test)
print(metrics.classification_report(y_test, preds))

             precision    recall  f1-score   support

          0       1.00      0.95      0.98    340930
          1       0.14      0.97      0.25      2754

avg / total       0.99      0.95      0.97    343684

             precision    recall  f1-score   support

          0       1.00      0.95      0.98    340857
          1       0.14      0.97      0.25      2828

avg / total       0.99      0.95      0.97    343685



In [32]:
clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
                             linear_model.LogisticRegression(C=3, class_weight='balanced'))
clf.fit(X_train, y_train)
preds = clf.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = clf.predict(X_test)
print(metrics.classification_report(y_test, preds))

             precision    recall  f1-score   support

          0       1.00      0.95      0.98    340930
          1       0.14      0.97      0.25      2754

avg / total       0.99      0.95      0.97    343684

             precision    recall  f1-score   support

          0       1.00      0.95      0.98    340857
          1       0.15      0.97      0.25      2828

avg / total       0.99      0.95      0.97    343685



In [33]:
clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
                             linear_model.LogisticRegression(C=5, class_weight='balanced'))
clf.fit(X_train, y_train)
preds = clf.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = clf.predict(X_test)
print(metrics.classification_report(y_test, preds))

             precision    recall  f1-score   support

          0       1.00      0.95      0.98    340930
          1       0.14      0.97      0.25      2754

avg / total       0.99      0.95      0.97    343684

             precision    recall  f1-score   support

          0       1.00      0.95      0.98    340857
          1       0.15      0.97      0.25      2828

avg / total       0.99      0.95      0.97    343685



In [47]:
for i, feat in enumerate(data.columns):
    print(i, feat),

0 index
1 borough
2 block
3 lot
4 schooldist
5 council
6 zipcode
7 firecomp
8 policeprct
9 healthcenterdistrict
10 healtharea
11 sanitboro
12 sanitdistrict
13 sanitsub
14 zonedist1
15 zonedist2
16 zonedist3
17 zonedist4
18 overlay1
19 overlay2
20 spdist1
21 spdist2
22 spdist3
23 ltdheight
24 splitzone
25 landuse
26 easements
27 lotarea
28 bldgarea
29 comarea
30 resarea
31 officearea
32 retailarea
33 garagearea
34 strgearea
35 factryarea
36 otherarea
37 numbldgs
38 numfloors
39 unitstotal
40 lotfront
41 lotdepth
42 bldgfront
43 bldgdepth
44 ext
45 proxcode
46 irrlotcode
47 lottype
48 bsmtcode
49 assessland
50 assesstot
51 exemptland
52 exempttot
53 yearbuilt
54 yearalter1
55 yearalter2
56 histdist
57 landmark
58 builtfar
59 tract2010
60 xcoord
61 ycoord
62 zonemap
63 target__office


In [34]:
model = catboost.CatBoostClassifier(iterations=2, depth=2, learning_rate=1,
                                    loss_function='Logloss', verbose=True)

In [35]:
model.fit(X_train, y_train)

0:	learn: 0.0233685	total: 103ms	remaining: 103ms
1:	learn: 0.0200384	total: 152ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f9596de3860>

In [37]:
preds = model.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = model.predict(X_test)
print(metrics.classification_report(y_test, preds))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    340930
          1       0.65      0.54      0.59      2754

avg / total       0.99      0.99      0.99    343684

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    340857
          1       0.65      0.53      0.59      2828

avg / total       0.99      0.99      0.99    343685



In [42]:
model.fit(X_train, y_train, cat_features=[1, 14, 15, 16, 17, 18, 19, 20, 21, 22])

0:	learn: 0.0186400	total: 143ms	remaining: 143ms
1:	learn: 0.0150235	total: 267ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f9596de3860>

In [43]:
preds = model.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = model.predict(X_test)
print(metrics.classification_report(y_test, preds))

             precision    recall  f1-score   support

          0       0.99      1.00      1.00    340930
          1       0.87      0.21      0.33      2754

avg / total       0.99      0.99      0.99    343684

             precision    recall  f1-score   support

          0       0.99      1.00      1.00    340857
          1       0.85      0.21      0.34      2828

avg / total       0.99      0.99      0.99    343685



In [46]:
model = catboost.CatBoostClassifier(loss_function='Logloss', verbose=True)
model.fit(X_train, y_train)
preds = model.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = model.predict(X_test)
print(metrics.classification_report(y_test, preds))

Learning rate set to 0.08747
0:	learn: 0.4476337	total: 59.1ms	remaining: 59s
1:	learn: 0.2347521	total: 149ms	remaining: 1m 14s
2:	learn: 0.1421376	total: 238ms	remaining: 1m 19s
3:	learn: 0.0864904	total: 298ms	remaining: 1m 14s
4:	learn: 0.0593827	total: 366ms	remaining: 1m 12s
5:	learn: 0.0422235	total: 459ms	remaining: 1m 16s
6:	learn: 0.0309042	total: 519ms	remaining: 1m 13s
7:	learn: 0.0226772	total: 594ms	remaining: 1m 13s
8:	learn: 0.0183245	total: 684ms	remaining: 1m 15s
9:	learn: 0.0128876	total: 742ms	remaining: 1m 13s
10:	learn: 0.0109561	total: 816ms	remaining: 1m 13s
11:	learn: 0.0089353	total: 907ms	remaining: 1m 14s
12:	learn: 0.0079841	total: 966ms	remaining: 1m 13s
13:	learn: 0.0076141	total: 1.03s	remaining: 1m 12s
14:	learn: 0.0067821	total: 1.13s	remaining: 1m 14s
15:	learn: 0.0064077	total: 1.19s	remaining: 1m 13s
16:	learn: 0.0059609	total: 1.28s	remaining: 1m 14s
17:	learn: 0.0055339	total: 1.36s	remaining: 1m 14s
18:	learn: 0.0052925	total: 1.44s	remaining: 1m

159:	learn: 0.0022050	total: 12.4s	remaining: 1m 5s
160:	learn: 0.0022012	total: 12.5s	remaining: 1m 5s
161:	learn: 0.0021974	total: 12.6s	remaining: 1m 4s
162:	learn: 0.0021925	total: 12.6s	remaining: 1m 4s
163:	learn: 0.0021899	total: 12.7s	remaining: 1m 4s
164:	learn: 0.0021861	total: 12.8s	remaining: 1m 4s
165:	learn: 0.0021835	total: 12.8s	remaining: 1m 4s
166:	learn: 0.0021797	total: 12.9s	remaining: 1m 4s
167:	learn: 0.0021747	total: 13s	remaining: 1m 4s
168:	learn: 0.0021659	total: 13.1s	remaining: 1m 4s
169:	learn: 0.0021634	total: 13.1s	remaining: 1m 4s
170:	learn: 0.0021578	total: 13.2s	remaining: 1m 4s
171:	learn: 0.0021531	total: 13.3s	remaining: 1m 4s
172:	learn: 0.0021509	total: 13.4s	remaining: 1m 3s
173:	learn: 0.0021415	total: 13.5s	remaining: 1m 3s
174:	learn: 0.0021403	total: 13.5s	remaining: 1m 3s
175:	learn: 0.0021348	total: 13.6s	remaining: 1m 3s
176:	learn: 0.0021329	total: 13.7s	remaining: 1m 3s
177:	learn: 0.0021261	total: 13.8s	remaining: 1m 3s
178:	learn: 0.

320:	learn: 0.0015080	total: 24.8s	remaining: 52.4s
321:	learn: 0.0015058	total: 24.8s	remaining: 52.3s
322:	learn: 0.0015027	total: 24.9s	remaining: 52.2s
323:	learn: 0.0015005	total: 25s	remaining: 52.1s
324:	learn: 0.0014994	total: 25.1s	remaining: 52s
325:	learn: 0.0014969	total: 25.1s	remaining: 52s
326:	learn: 0.0014951	total: 25.2s	remaining: 51.9s
327:	learn: 0.0014922	total: 25.3s	remaining: 51.8s
328:	learn: 0.0014892	total: 25.3s	remaining: 51.7s
329:	learn: 0.0014857	total: 25.4s	remaining: 51.6s
330:	learn: 0.0014830	total: 25.5s	remaining: 51.5s
331:	learn: 0.0014756	total: 25.6s	remaining: 51.4s
332:	learn: 0.0014733	total: 25.6s	remaining: 51.3s
333:	learn: 0.0014707	total: 25.7s	remaining: 51.3s
334:	learn: 0.0014693	total: 25.8s	remaining: 51.2s
335:	learn: 0.0014658	total: 25.9s	remaining: 51.1s
336:	learn: 0.0014613	total: 25.9s	remaining: 51s
337:	learn: 0.0014603	total: 26s	remaining: 51s
338:	learn: 0.0014593	total: 26.1s	remaining: 50.9s
339:	learn: 0.0014540	to

482:	learn: 0.0010859	total: 35.4s	remaining: 37.9s
483:	learn: 0.0010840	total: 35.4s	remaining: 37.8s
484:	learn: 0.0010810	total: 35.5s	remaining: 37.7s
485:	learn: 0.0010789	total: 35.6s	remaining: 37.6s
486:	learn: 0.0010770	total: 35.7s	remaining: 37.6s
487:	learn: 0.0010763	total: 35.7s	remaining: 37.5s
488:	learn: 0.0010713	total: 35.8s	remaining: 37.4s
489:	learn: 0.0010710	total: 35.9s	remaining: 37.4s
490:	learn: 0.0010677	total: 36s	remaining: 37.3s
491:	learn: 0.0010641	total: 36s	remaining: 37.2s
492:	learn: 0.0010625	total: 36.1s	remaining: 37.1s
493:	learn: 0.0010619	total: 36.2s	remaining: 37.1s
494:	learn: 0.0010592	total: 36.3s	remaining: 37s
495:	learn: 0.0010580	total: 36.3s	remaining: 36.9s
496:	learn: 0.0010566	total: 36.4s	remaining: 36.8s
497:	learn: 0.0010555	total: 36.5s	remaining: 36.8s
498:	learn: 0.0010541	total: 36.5s	remaining: 36.7s
499:	learn: 0.0010500	total: 36.6s	remaining: 36.6s
500:	learn: 0.0010486	total: 36.7s	remaining: 36.6s
501:	learn: 0.0010

642:	learn: 0.0007767	total: 45.8s	remaining: 25.4s
643:	learn: 0.0007752	total: 45.9s	remaining: 25.4s
644:	learn: 0.0007745	total: 46s	remaining: 25.3s
645:	learn: 0.0007727	total: 46s	remaining: 25.2s
646:	learn: 0.0007712	total: 46.1s	remaining: 25.2s
647:	learn: 0.0007709	total: 46.2s	remaining: 25.1s
648:	learn: 0.0007680	total: 46.2s	remaining: 25s
649:	learn: 0.0007669	total: 46.3s	remaining: 24.9s
650:	learn: 0.0007661	total: 46.4s	remaining: 24.9s
651:	learn: 0.0007651	total: 46.5s	remaining: 24.8s
652:	learn: 0.0007647	total: 46.5s	remaining: 24.7s
653:	learn: 0.0007638	total: 46.6s	remaining: 24.6s
654:	learn: 0.0007620	total: 46.7s	remaining: 24.6s
655:	learn: 0.0007616	total: 46.7s	remaining: 24.5s
656:	learn: 0.0007595	total: 46.8s	remaining: 24.4s
657:	learn: 0.0007590	total: 46.9s	remaining: 24.4s
658:	learn: 0.0007581	total: 46.9s	remaining: 24.3s
659:	learn: 0.0007568	total: 47s	remaining: 24.2s
660:	learn: 0.0007541	total: 47s	remaining: 24.1s
661:	learn: 0.0007529	

802:	learn: 0.0005801	total: 57.3s	remaining: 14.1s
803:	learn: 0.0005782	total: 57.4s	remaining: 14s
804:	learn: 0.0005774	total: 57.4s	remaining: 13.9s
805:	learn: 0.0005751	total: 57.5s	remaining: 13.8s
806:	learn: 0.0005746	total: 57.5s	remaining: 13.8s
807:	learn: 0.0005738	total: 57.6s	remaining: 13.7s
808:	learn: 0.0005731	total: 57.7s	remaining: 13.6s
809:	learn: 0.0005728	total: 57.7s	remaining: 13.5s
810:	learn: 0.0005725	total: 57.8s	remaining: 13.5s
811:	learn: 0.0005705	total: 57.8s	remaining: 13.4s
812:	learn: 0.0005685	total: 57.9s	remaining: 13.3s
813:	learn: 0.0005682	total: 58s	remaining: 13.2s
814:	learn: 0.0005667	total: 58s	remaining: 13.2s
815:	learn: 0.0005661	total: 58.1s	remaining: 13.1s
816:	learn: 0.0005655	total: 58.1s	remaining: 13s
817:	learn: 0.0005652	total: 58.2s	remaining: 12.9s
818:	learn: 0.0005646	total: 58.2s	remaining: 12.9s
819:	learn: 0.0005632	total: 58.3s	remaining: 12.8s
820:	learn: 0.0005623	total: 58.4s	remaining: 12.7s
821:	learn: 0.000561

964:	learn: 0.0004388	total: 1m 7s	remaining: 2.44s
965:	learn: 0.0004383	total: 1m 7s	remaining: 2.37s
966:	learn: 0.0004379	total: 1m 7s	remaining: 2.29s
967:	learn: 0.0004374	total: 1m 7s	remaining: 2.23s
968:	learn: 0.0004370	total: 1m 7s	remaining: 2.15s
969:	learn: 0.0004365	total: 1m 7s	remaining: 2.08s
970:	learn: 0.0004361	total: 1m 7s	remaining: 2.02s
971:	learn: 0.0004357	total: 1m 7s	remaining: 1.95s
972:	learn: 0.0004355	total: 1m 7s	remaining: 1.88s
973:	learn: 0.0004331	total: 1m 7s	remaining: 1.81s
974:	learn: 0.0004326	total: 1m 7s	remaining: 1.74s
975:	learn: 0.0004317	total: 1m 7s	remaining: 1.67s
976:	learn: 0.0004312	total: 1m 7s	remaining: 1.6s
977:	learn: 0.0004300	total: 1m 7s	remaining: 1.53s
978:	learn: 0.0004292	total: 1m 7s	remaining: 1.46s
979:	learn: 0.0004284	total: 1m 8s	remaining: 1.39s
980:	learn: 0.0004273	total: 1m 8s	remaining: 1.32s
981:	learn: 0.0004272	total: 1m 8s	remaining: 1.25s
982:	learn: 0.0004266	total: 1m 8s	remaining: 1.18s
983:	learn: 0

In [44]:
model = catboost.CatBoostClassifier(loss_function='Logloss', verbose=True)

In [45]:
model.fit(X_train, y_train, cat_features=[1, 14, 15, 16, 17, 18, 19, 20, 21, 22])
preds = model.predict(X_train)
print(metrics.classification_report(y_train, preds))
preds = model.predict(X_test)
print(metrics.classification_report(y_test, preds))

Learning rate set to 0.08747
0:	learn: 0.4015438	total: 358ms	remaining: 5m 57s
1:	learn: 0.2428545	total: 690ms	remaining: 5m 44s
2:	learn: 0.1253821	total: 1.02s	remaining: 5m 38s
3:	learn: 0.0758395	total: 1.34s	remaining: 5m 34s
4:	learn: 0.0523029	total: 1.69s	remaining: 5m 35s
5:	learn: 0.0370727	total: 2.01s	remaining: 5m 32s
6:	learn: 0.0242656	total: 2.35s	remaining: 5m 32s
7:	learn: 0.0192770	total: 2.68s	remaining: 5m 32s
8:	learn: 0.0131610	total: 3.01s	remaining: 5m 31s
9:	learn: 0.0100398	total: 3.37s	remaining: 5m 33s
10:	learn: 0.0083150	total: 3.76s	remaining: 5m 37s
11:	learn: 0.0077124	total: 4.18s	remaining: 5m 43s
12:	learn: 0.0072146	total: 4.58s	remaining: 5m 47s
13:	learn: 0.0067301	total: 4.91s	remaining: 5m 45s
14:	learn: 0.0064761	total: 5.24s	remaining: 5m 43s
15:	learn: 0.0058810	total: 5.56s	remaining: 5m 41s
16:	learn: 0.0054641	total: 5.89s	remaining: 5m 40s
17:	learn: 0.0053647	total: 6.22s	remaining: 5m 39s
18:	learn: 0.0052040	total: 6.55s	remaining: 

158:	learn: 0.0023180	total: 51.3s	remaining: 4m 31s
159:	learn: 0.0023155	total: 51.6s	remaining: 4m 31s
160:	learn: 0.0023126	total: 51.9s	remaining: 4m 30s
161:	learn: 0.0023102	total: 52.2s	remaining: 4m 30s
162:	learn: 0.0023065	total: 52.6s	remaining: 4m 29s
163:	learn: 0.0023045	total: 52.8s	remaining: 4m 29s
164:	learn: 0.0022995	total: 53.1s	remaining: 4m 28s
165:	learn: 0.0022962	total: 53.4s	remaining: 4m 28s
166:	learn: 0.0022920	total: 53.7s	remaining: 4m 28s
167:	learn: 0.0022902	total: 54s	remaining: 4m 27s
168:	learn: 0.0022882	total: 54.3s	remaining: 4m 27s
169:	learn: 0.0022854	total: 54.6s	remaining: 4m 26s
170:	learn: 0.0022827	total: 54.9s	remaining: 4m 26s
171:	learn: 0.0022789	total: 55.2s	remaining: 4m 25s
172:	learn: 0.0022701	total: 55.5s	remaining: 4m 25s
173:	learn: 0.0022688	total: 55.7s	remaining: 4m 24s
174:	learn: 0.0022663	total: 56s	remaining: 4m 24s
175:	learn: 0.0022630	total: 56.3s	remaining: 4m 23s
176:	learn: 0.0022594	total: 56.6s	remaining: 4m 2

312:	learn: 0.0016661	total: 1m 37s	remaining: 3m 33s
313:	learn: 0.0016637	total: 1m 37s	remaining: 3m 33s
314:	learn: 0.0016567	total: 1m 37s	remaining: 3m 32s
315:	learn: 0.0016554	total: 1m 38s	remaining: 3m 32s
316:	learn: 0.0016538	total: 1m 38s	remaining: 3m 32s
317:	learn: 0.0016512	total: 1m 38s	remaining: 3m 31s
318:	learn: 0.0016507	total: 1m 39s	remaining: 3m 31s
319:	learn: 0.0016479	total: 1m 39s	remaining: 3m 31s
320:	learn: 0.0016460	total: 1m 39s	remaining: 3m 30s
321:	learn: 0.0016430	total: 1m 39s	remaining: 3m 30s
322:	learn: 0.0016397	total: 1m 40s	remaining: 3m 30s
323:	learn: 0.0016313	total: 1m 40s	remaining: 3m 29s
324:	learn: 0.0016278	total: 1m 40s	remaining: 3m 29s
325:	learn: 0.0016263	total: 1m 41s	remaining: 3m 29s
326:	learn: 0.0016210	total: 1m 41s	remaining: 3m 28s
327:	learn: 0.0016194	total: 1m 41s	remaining: 3m 28s
328:	learn: 0.0016179	total: 1m 42s	remaining: 3m 28s
329:	learn: 0.0016151	total: 1m 42s	remaining: 3m 27s
330:	learn: 0.0016139	total:

466:	learn: 0.0012583	total: 2m 25s	remaining: 2m 45s
467:	learn: 0.0012571	total: 2m 25s	remaining: 2m 45s
468:	learn: 0.0012522	total: 2m 25s	remaining: 2m 45s
469:	learn: 0.0012496	total: 2m 26s	remaining: 2m 44s
470:	learn: 0.0012453	total: 2m 26s	remaining: 2m 44s
471:	learn: 0.0012443	total: 2m 26s	remaining: 2m 44s
472:	learn: 0.0012430	total: 2m 27s	remaining: 2m 44s
473:	learn: 0.0012371	total: 2m 27s	remaining: 2m 43s
474:	learn: 0.0012360	total: 2m 27s	remaining: 2m 43s
475:	learn: 0.0012337	total: 2m 28s	remaining: 2m 43s
476:	learn: 0.0012327	total: 2m 28s	remaining: 2m 42s
477:	learn: 0.0012317	total: 2m 28s	remaining: 2m 42s
478:	learn: 0.0012294	total: 2m 29s	remaining: 2m 42s
479:	learn: 0.0012277	total: 2m 29s	remaining: 2m 42s
480:	learn: 0.0012254	total: 2m 29s	remaining: 2m 41s
481:	learn: 0.0012220	total: 2m 30s	remaining: 2m 41s
482:	learn: 0.0012201	total: 2m 30s	remaining: 2m 41s
483:	learn: 0.0012190	total: 2m 31s	remaining: 2m 40s
484:	learn: 0.0012138	total:

620:	learn: 0.0009553	total: 3m 14s	remaining: 1m 58s
621:	learn: 0.0009545	total: 3m 14s	remaining: 1m 58s
622:	learn: 0.0009523	total: 3m 14s	remaining: 1m 57s
623:	learn: 0.0009503	total: 3m 14s	remaining: 1m 57s
624:	learn: 0.0009489	total: 3m 15s	remaining: 1m 57s
625:	learn: 0.0009469	total: 3m 15s	remaining: 1m 56s
626:	learn: 0.0009461	total: 3m 15s	remaining: 1m 56s
627:	learn: 0.0009451	total: 3m 16s	remaining: 1m 56s
628:	learn: 0.0009445	total: 3m 16s	remaining: 1m 55s
629:	learn: 0.0009438	total: 3m 16s	remaining: 1m 55s
630:	learn: 0.0009420	total: 3m 17s	remaining: 1m 55s
631:	learn: 0.0009410	total: 3m 17s	remaining: 1m 54s
632:	learn: 0.0009399	total: 3m 17s	remaining: 1m 54s
633:	learn: 0.0009384	total: 3m 18s	remaining: 1m 54s
634:	learn: 0.0009371	total: 3m 18s	remaining: 1m 53s
635:	learn: 0.0009362	total: 3m 18s	remaining: 1m 53s
636:	learn: 0.0009349	total: 3m 18s	remaining: 1m 53s
637:	learn: 0.0009307	total: 3m 19s	remaining: 1m 53s
638:	learn: 0.0009299	total:

772:	learn: 0.0007402	total: 3m 59s	remaining: 1m 10s
773:	learn: 0.0007395	total: 4m	remaining: 1m 10s
774:	learn: 0.0007382	total: 4m	remaining: 1m 9s
775:	learn: 0.0007373	total: 4m	remaining: 1m 9s
776:	learn: 0.0007364	total: 4m 1s	remaining: 1m 9s
777:	learn: 0.0007358	total: 4m 1s	remaining: 1m 8s
778:	learn: 0.0007350	total: 4m 1s	remaining: 1m 8s
779:	learn: 0.0007346	total: 4m 2s	remaining: 1m 8s
780:	learn: 0.0007326	total: 4m 2s	remaining: 1m 7s
781:	learn: 0.0007318	total: 4m 2s	remaining: 1m 7s
782:	learn: 0.0007310	total: 4m 2s	remaining: 1m 7s
783:	learn: 0.0007304	total: 4m 3s	remaining: 1m 7s
784:	learn: 0.0007286	total: 4m 3s	remaining: 1m 6s
785:	learn: 0.0007276	total: 4m 3s	remaining: 1m 6s
786:	learn: 0.0007239	total: 4m 4s	remaining: 1m 6s
787:	learn: 0.0007228	total: 4m 4s	remaining: 1m 5s
788:	learn: 0.0007221	total: 4m 4s	remaining: 1m 5s
789:	learn: 0.0007220	total: 4m 4s	remaining: 1m 5s
790:	learn: 0.0007215	total: 4m 5s	remaining: 1m 4s
791:	learn: 0.0007

928:	learn: 0.0005714	total: 4m 45s	remaining: 21.8s
929:	learn: 0.0005710	total: 4m 45s	remaining: 21.5s
930:	learn: 0.0005706	total: 4m 45s	remaining: 21.2s
931:	learn: 0.0005690	total: 4m 46s	remaining: 20.9s
932:	learn: 0.0005674	total: 4m 46s	remaining: 20.6s
933:	learn: 0.0005662	total: 4m 46s	remaining: 20.3s
934:	learn: 0.0005652	total: 4m 47s	remaining: 20s
935:	learn: 0.0005646	total: 4m 47s	remaining: 19.6s
936:	learn: 0.0005632	total: 4m 47s	remaining: 19.3s
937:	learn: 0.0005619	total: 4m 47s	remaining: 19s
938:	learn: 0.0005610	total: 4m 48s	remaining: 18.7s
939:	learn: 0.0005604	total: 4m 48s	remaining: 18.4s
940:	learn: 0.0005599	total: 4m 48s	remaining: 18.1s
941:	learn: 0.0005583	total: 4m 49s	remaining: 17.8s
942:	learn: 0.0005570	total: 4m 49s	remaining: 17.5s
943:	learn: 0.0005567	total: 4m 49s	remaining: 17.2s
944:	learn: 0.0005558	total: 4m 50s	remaining: 16.9s
945:	learn: 0.0005546	total: 4m 50s	remaining: 16.6s
946:	learn: 0.0005543	total: 4m 50s	remaining: 16.

In [None]:
data = data.fillna({'zipcode': min(data['zipcode'])-1,
                   'policeprct': 0,
                   'healthcenterdistrict': 0,
                   'healtharea': 0,
                   'sanitboro': 0,
                   'sanitdistrict': 0,
                   'zonedist1': 'None',
                   'zonedist1': 'None',
                   'zonedist1': 'None',
                   'zonedist1': 'None',
                   'overlay1': 'None',
                   'overlay2': 'None',
                   'spdist1': 'None',
                   'spdist2': 'None',
                   'ltdheight': 'None',
                   'splitzone': 'None',
                   'landuse': 0,
                   'ext': 'None',
                   'proxcode': 0,
                   'irrlotcode': 'None',
                   'lottype': max(data['lottype'])+1,
                   'bsmtcode': max(data['bsmtcode'])+1,
                   'histdist': 'None',
                   'landmark': 'None',
                   'xcoord': 0,
                   'ycoord': 0,
                   'zonemap': 'None'})

In [None]:
data['ltdheight'].astype('category').unique(), data['ltdheight'].dtype

In [None]:
data['ltdheight'].fillna('None', inplace=True)

In [None]:
data['ltdheight'].dtype == 'O'

In [None]:
le = preprocessing.LabelEncoder()

In [None]:
le.fit(data['ltdheight'].astype('category'))

In [None]:
test = pd.read_csv('../data/geophy/test.csv', low_memory=False)

In [None]:
pd.isna(test['xcoord']).sum()

In [None]:
pd.isna(data['xcoord']).sum()

In [None]:
data['ycoord'].min()

In [None]:
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
for i in data.dtypes:
    print(i)

In [None]:
X.head()

In [None]:
data.zipcode.unique()

In [None]:
grouped = data.groupby('target__office')

In [None]:
data[data['target__office']==True]['zonedist3'].unique()

In [None]:
names = data.columns

In [None]:
correlations = data.corr()
# plot correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0,9,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
fig_size[0] = 20.; fig_size[1] = 9.
plt.show()

In [None]:
fig_size = plt.rcParams["figure.figsize"]

In [None]:
plt.show()

In [None]:
 plt.rcParams["figure.figsize"] = fig_size.

In [None]:
data.dtypes

In [None]:
print("Current size:", fig_size)