-
Notifications
You must be signed in to change notification settings - Fork 0
/
stack_stage1_both.py
59 lines (47 loc) · 1.87 KB
/
stack_stage1_both.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, Imputer
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from zillow import modelling
import pickle as pkl
import dask.dataframe as dd
# Load both sharded HDF training sets (key "data") through dask, materialise
# each one as a pandas frame, and stack them row-wise into a single frame.
hdf_patterns = ("input/train2.*.hdf", "input/train_20172.*.hdf")
train_df = pd.concat(
    [dd.read_hdf(pattern, "data").compute() for pattern in hdf_patterns]
)

# Pull out the "yearmonth" column (used later as the CV grouping key) and the
# regression target before they are removed from the feature frame.
month = train_df["yearmonth"]
train_y = train_df['logerror']

# Columns excluded from the feature matrix: the target itself plus, going by
# their names, the date-derived and previous-period error columns.
# NOTE(review): the reason the *_last_* columns are excluded is not visible in
# this file -- confirm with the feature-generation code.
dropped_columns = [
    "logerror", "transactiondate",
    "year", "month", "yearmonth",
    "error_last_year", "high_last_year", "low_last_year",
    "error_last_month", "error_2ndlast_month", "error_3rdlast_month",
    "error_last_3months",
]
train_df = train_df.drop(dropped_columns, axis=1)
# Integer-encode every object-dtype column in place, keeping each fitted
# LabelEncoder in `encoders`, keyed by column name.
encoders = {}
for col in train_df.columns:
    if train_df[col].dtype != 'object':
        continue
    encoder = LabelEncoder()
    encoders[col] = encoder
    train_df[col] = encoder.fit_transform(list(train_df[col].values))

# Turn +inf and -inf into NaN so both are treated as missing values.
train_df = train_df.replace([np.inf, -np.inf], np.nan)
# Persist the final feature (column) names, in column order, as a pickled
# array, and print how many features there are.
feat_names = train_df.columns.values
print(len(feat_names))
with open("input/feat_names_both.pkl", "wb") as names_file:
    pkl.dump(feat_names, names_file)

# NOTE(review): the script terminates here with exit status 0, so the encoder
# dump just below -- and every statement after it in this file -- never runs.
# This looks like a leftover "only regenerate the feature names" shortcut;
# confirm whether the early exit is still intended before removing it.
sys.exit(0)

# Persist the fitted label encoders (currently unreachable, see note above).
with open("input/encoders.pkl", "wb") as encoders_file:
    pkl.dump(encoders, encoders_file)
# NOTE(review): as the file stands, the sys.exit(0) call earlier in the script
# stops execution before this section is reached.

# Target used for fitting: the raw log error. A clipped variant is kept here
# for reference only:
# tolerance = 0.1
# y = np.clip(train_y, np.median(train_y)-tolerance, np.median(train_y)+tolerance)
y = train_y

# Leave-one-group-out CV with `month` as the group label: every row is
# predicted by a model fitted on the rows belonging to all other groups.
cv = LeaveOneGroupOut()

# Only rows whose "yearmonth" value exceeds 12 are kept for scoring/stacking.
# NOTE(review): presumably "yearmonth" is a running month index, so this
# selects the second year -- confirm against the feature-generation code.
eval_mask = month > 12

preds = {}
for base_name, model in modelling.stage1_models.items():
    name = "both_" + base_name
    print(name)

    # Out-of-fold predictions for all rows, then restricted to the kept rows.
    out_of_fold = cross_val_predict(model, train_df, y, cv=cv, groups=month)
    preds[name] = out_of_fold[eval_mask]

    # Mean absolute error of the out-of-fold predictions on the kept rows.
    print(name, np.abs((train_y[eval_mask] - preds[name])).mean())

    # Refit on the complete training set and pickle the fitted model.
    # NOTE(review): the pickle is written with a ".py" extension; the path is
    # left unchanged because other scripts may load it by this exact name.
    model.fit(train_df, y)
    with open("models/{}.py".format(name), "wb") as model_file:
        pkl.dump(model, model_file)

# Release the feature frame before building the frame of stacked predictions
# (one column per model), which is then written out as CSV.
del train_df
train_df = pd.DataFrame(preds)
train_df.to_csv("stack_stage1_3.csv")