In [1]:
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import auc, f1_score, accuracy_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.impute import KNNImputer
from tqdm import tqdm_notebook
from tqdm import tqdm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [239]:
SEED = 42

In [240]:
train_df = pd.read_csv("Train.csv")

In [241]:
def encode(col, target, df):
    grouped = df.groupby(col)[target].mean()
    grouped = grouped.sort_values().index
    translator = {grouped[i] : i for i in range(len(grouped))}
    return df[col].apply(lambda x: translator[x])

In [242]:
train_df["street_id"] = encode("street_id", "price", train_df)

In [243]:
train_df["floor"] = encode("floor", "price", train_df)

In [244]:
def encode_date(data, date):
    splitted = [int(d) for d in date.split("-")]
    return (splitted[0] - 2011) * 12 + splitted[1] - 1

In [245]:
train_df["date"] = train_df["date"].apply(lambda d: encode_date(train_df, d))

In [123]:
# train_df["date"] = encode("date", "price", train_df)

In [116]:
# train_df[train_df["metro_dist"].notnull().values]["metro_dist"]

0        30.0
1        25.0
2        25.0
3        30.0
4        25.0
         ... 
99995    30.0
99996    30.0
99997     5.0
99998     5.0
99999     5.0
Name: metro_dist, Length: 95104, dtype: float64

In [117]:
# train_df[train_df["metro_dist"].notnull().values]["metro_dist"] = encode("metro_dist", "price", train_df[train_df["metro_dist"].notnull().values])

In [246]:
train_df

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,0,322,,2,59,3,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,4510000
1,1,0,650,,10,50,2,1,25.0,,...,0,0,0,0,0,0,0,0,0,13231000
2,2,0,246,0.0,0,48,2,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,2008000
3,3,0,670,1.0,3,62,3,1,30.0,,...,0,0,0,0,0,0,0,0,0,12680000
4,4,0,438,0.0,3,60,3,0,25.0,,...,0,0,0,0,0,0,0,0,0,3335000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,14,342,0.0,1,30,1,0,30.0,,...,0,0,0,0,0,0,0,0,0,1697000
99996,99996,14,465,0.0,4,34,1,0,30.0,,...,0,0,0,0,0,0,0,0,0,3620000
99997,99997,14,266,,3,52,2,0,5.0,,...,0,0,0,0,0,0,0,0,0,6712000
99998,99998,14,243,1.0,3,181,5,1,5.0,,...,0,0,0,0,0,0,0,0,0,20835000


In [74]:
train_df.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,0,322,,2,59,3,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,4510000
1,1,0,650,,10,50,2,1,25.0,,...,0,0,0,0,0,0,0,0,0,13231000
2,2,0,246,0.0,0,48,2,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,2008000
3,3,0,670,1.0,3,62,3,1,30.0,,...,0,0,0,0,0,0,0,0,0,12680000
4,4,0,438,0.0,3,60,3,0,25.0,,...,0,0,0,0,0,0,0,0,0,3335000


In [200]:
test_df = pd.read_csv("Test.csv")

In [201]:
test_df.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,100000,2012-3,459,,1,60,3,1,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,100001,2012-3,344,1.0,10,52,2,1,,,...,0,0,0,0,0,0,0,0,0,0
2,100002,2012-3,585,0.0,4,54,3,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,100003,2012-3,494,,2,52,2,1,25.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,100004,2012-3,622,1.0,9,60,3,1,15.0,,...,0,0,0,0,0,0,0,0,0,0


In [247]:
X = train_df
y = X["price"]
del X["id"]
del X["price"]
# del X["date"]


In [224]:
test_df.isnull().sum()

id                0
date              0
street_id         0
build_tech    30020
floor             0
area              0
rooms             0
balcon            0
metro_dist     5117
g_lift        29814
n_photos          0
kw1               0
kw2               0
kw3               0
kw4               0
kw5               0
kw6               0
kw7               0
kw8               0
kw9               0
kw10              0
kw11              0
kw12              0
kw13              0
dtype: int64

In [56]:
# del X["build_tech"]

In [233]:
# def gen_pairs(row, a, b):
#     return row["kw" + str(a)] * row["kw" + str(b)]

In [240]:
# for a in range(1, 14):
#     for b in range(1, 14):
#         print((a - 1) * 14 + b)
#         X["kw" + str(a) + "&" + str(b)] = X.apply(lambda row: gen_pairs(row, a, b), axis=1)

1
2
3
4
5
6
7
8
9
10
11
12
13
15
16
17
18
19
20
21
22
23
24
25
26
27
29
30
31
32
33
34
35
36
37
38
39
40
41
43
44
45
46
47
48
49
50
51
52
53
54
55
57
58
59
60
61
62
63
64
65
66
67
68
69
71
72
73
74
75
76
77
78
79
80
81
82
83
85
86
87
88
89
90
91
92
93
94
95
96
97
99
100
101
102
103
104
105
106
107
108
109
110
111
113
114
115
116
117
118
119
120
121
122
123
124
125
127
128
129
130
131
132
133
134
135
136
137
138
139
141
142
143
144
145
146
147
148
149
150
151
152
153
155
156
157
158
159
160
161
162
163
164
165
166
167
169
170
171
172
173
174
175
176
177
178
179
180
181


In [136]:
# for i in range(1, 14):
#     del X["kw" + str(i)]

In [248]:
X_train, X_val,  y_train, y_val = train_test_split(X, y, train_size=0.75, shuffle=False, random_state=SEED)

In [249]:
X_train.head()

Unnamed: 0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,0,322,,2,59,3,0,30.0,1.0,5,...,0,0,0,0,0,0,0,0,0,0
1,0,650,,10,50,2,1,25.0,,1,...,0,0,0,0,0,0,0,0,0,0
2,0,246,0.0,0,48,2,0,25.0,0.0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,670,1.0,3,62,3,1,30.0,,3,...,0,0,0,0,0,0,0,0,0,0
4,0,438,0.0,3,60,3,0,25.0,,3,...,0,0,0,0,0,0,0,0,0,0


In [251]:
imputer = IterativeImputer(estimator=DecisionTreeRegressor(random_state=SEED), max_iter=10)

In [252]:
X_imputed = imputer.fit_transform(X)

In [253]:
X = pd.DataFrame(X_imputed, columns=X.columns)

In [254]:
X_train, X_val,  y_train, y_val = train_test_split(X, y, train_size=0.75, shuffle=False, random_state=SEED)

In [256]:
X_val

Unnamed: 0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
75000,21.0,466.0,0.0,2.0,41.0,2.0,0.0,25.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75001,21.0,47.0,0.0,8.0,61.0,3.0,0.0,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75002,21.0,487.0,1.0,3.0,60.0,3.0,0.0,20.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75003,21.0,125.0,1.0,0.0,51.0,2.0,0.0,25.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75004,21.0,135.0,0.0,11.0,63.0,3.0,0.0,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,14.0,342.0,0.0,1.0,30.0,1.0,0.0,30.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,14.0,465.0,0.0,4.0,34.0,1.0,0.0,30.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,14.0,266.0,1.0,3.0,52.0,2.0,0.0,5.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,14.0,243.0,1.0,3.0,181.0,5.0,1.0,5.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [229]:
# X_train_imputed = imputer.fit_transform(X_train)

In [230]:
# X_train = pd.DataFrame(X_train_imputed, columns=X_train.columns)

In [231]:
rf = RandomForestRegressor(n_estimators=100, random_state=SEED)

In [233]:
rf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
y_pred = rf.predict(X_val)

In [None]:
mae(y_val, y_pred)

902617.6168285714

In [None]:
X_val_imputed = imputer.transform(X_val)

In [None]:
X_val = pd.DataFrame(X_val_imputed, columns=X_val.columns)

In [None]:
y_pred = rf.predict(pd.DataFrame(X_val_imputed, columns=X_val.columns))

In [None]:
mae(y_val, y_pred)

1148999.2099573014

In [234]:
X_train

Unnamed: 0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
98980,14.0,536.0,0.0,4.0,33.0,1.0,0.0,10.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69824,12.0,295.0,0.0,4.0,33.0,1.0,0.0,30.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9928,9.0,332.0,0.0,4.0,32.0,1.0,0.0,30.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75599,21.0,430.0,0.0,8.0,74.0,3.0,0.0,30.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95621,14.0,131.0,0.0,3.0,59.0,3.0,0.0,15.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,9.0,271.0,1.0,1.0,60.0,3.0,1.0,30.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54886,6.0,346.0,1.0,10.0,60.0,3.0,1.0,20.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76820,21.0,63.0,1.0,2.0,60.0,3.0,1.0,30.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.0,346.0,1.0,0.0,60.0,3.0,0.0,25.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [211]:
from sklearn.ensemble import GradientBoostingRegressor

In [257]:
gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=1)

In [258]:
gbr.fit(X_train, y_train)

In [259]:
y_pred = gbr.predict(X_val)

In [260]:
mae(y_val, y_pred)

823600.6588079466

In [185]:
# from catboost import CatBoostRegressor, Pool

In [261]:
train_pool = Pool(data=X_train, label=y_train)

In [262]:
regressor = CatBoostRegressor()

In [263]:
regressor.fit(train_pool)

Learning rate set to 0.080991
0:	learn: 3866271.2408979	total: 14.4ms	remaining: 14.4s
1:	learn: 3659887.6446438	total: 26.4ms	remaining: 13.2s
2:	learn: 3467719.5142253	total: 38.3ms	remaining: 12.7s
3:	learn: 3295978.8822858	total: 50.5ms	remaining: 12.6s
4:	learn: 3139993.8965579	total: 61.8ms	remaining: 12.3s
5:	learn: 2998696.8462951	total: 72.3ms	remaining: 12s
6:	learn: 2871633.2576322	total: 83.5ms	remaining: 11.8s
7:	learn: 2758565.1970769	total: 94.5ms	remaining: 11.7s
8:	learn: 2651887.3578827	total: 105ms	remaining: 11.5s
9:	learn: 2556391.0636527	total: 116ms	remaining: 11.5s
10:	learn: 2475169.8024472	total: 126ms	remaining: 11.3s
11:	learn: 2400114.6986336	total: 136ms	remaining: 11.2s
12:	learn: 2333197.0719753	total: 147ms	remaining: 11.2s
13:	learn: 2269744.6932806	total: 157ms	remaining: 11.1s
14:	learn: 2214829.1191947	total: 167ms	remaining: 11s
15:	learn: 2164329.1699319	total: 178ms	remaining: 10.9s
16:	learn: 2119476.2387020	total: 189ms	remaining: 10.9s
17:	lea

<catboost.core.CatBoostRegressor at 0x174656fef20>

In [264]:
mae(y_val, regressor.predict(X_val))

960632.0521375545

In [23]:
# from sklearn.ensemble import BaggingRegressor

In [187]:
# base_estimator = CatBoostRegressor()

In [221]:
# bagging = BaggingRegressor(base_estimator=CatBoostRegressor, n_estimators=20,
#                              max_features=0.2, bootstrap = True, 
#                              bootstrap_features=True, random_state=SEED, n_jobs=-1)