In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import warnings
# Suppress every Python warning for the rest of the session (this applies to
# warnings raised by any of the libraries imported above as well).
warnings.filterwarnings("ignore")

%matplotlib inline

In [271]:
# Shared random seed; passed as random_state to the imputer's decision-tree
# estimator and to the gradient-boosting model further down.
SEED = 42

In [300]:
# Read the training set and discard its "id" column in one step.
train_df = pd.read_csv("Train.csv").drop(columns="id")

In [301]:
# Read the test set, discard its "id" column, and preview the first rows.
test_df = pd.read_csv("Test.csv").drop(columns="id")
test_df.head()

Unnamed: 0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,2012-3,459,,1,60,3,1,30.0,0.0,4,...,0,0,0,0,0,0,0,0,0,0
1,2012-3,344,1.0,10,52,2,1,,,4,...,0,0,0,0,0,0,0,0,0,0
2,2012-3,585,0.0,4,54,3,0,30.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,2012-3,494,,2,52,2,1,25.0,1.0,3,...,0,0,0,0,0,0,0,0,0,0
4,2012-3,622,1.0,9,60,3,1,15.0,,1,...,0,0,0,0,0,0,0,0,0,0


In [302]:
# Drop columns kw3 ... kw13 from both frames; kw1 and kw2 are kept.
kw_to_drop = [f"kw{number}" for number in range(3, 14)]
train_df = train_df.drop(columns=kw_to_drop)
test_df = test_df.drop(columns=kw_to_drop)

In [303]:
# Remove the g_lift column from both frames.
train_df = train_df.drop(columns="g_lift")
test_df = test_df.drop(columns="g_lift")

In [304]:
# Preview the training frame after the kw3-kw13 and g_lift columns were dropped.
train_df.head()

Unnamed: 0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,n_photos,kw1,kw2,price
0,2011-1,560,,2,59,3,0,30.0,5,0,1,4510000
1,2011-1,667,,10,50,2,1,25.0,1,0,1,13231000
2,2011-1,90,0.0,1,48,2,0,25.0,1,0,1,2008000
3,2011-1,94,1.0,3,62,3,1,30.0,3,0,1,12680000
4,2011-1,232,0.0,3,60,3,0,25.0,3,0,1,3335000


In [305]:
def encode(col, target, df_from, df_to):
    """Ordinal-encode ``df_to[col]`` by ranking categories on their mean target.

    The categories of ``col`` found in ``df_from`` are sorted by the mean of
    ``target`` within each category; every category is then replaced by its
    position in that ordering (0 = lowest mean).

    Parameters
    ----------
    col : str
        Name of the categorical column, present in both frames.
    target : str
        Name of the numeric column in ``df_from`` whose per-category mean
        defines the ordering.
    df_from : pandas.DataFrame
        Frame the ordering is learned from; must contain ``col`` and ``target``.
    df_to : pandas.DataFrame
        Frame whose ``col`` values are translated.

    Returns
    -------
    pandas.Series
        The rank of each ``df_to[col]`` value, aligned to ``df_to.index``.
        Values that never occur in ``df_from[col]`` are returned as NaN
        (previously they raised ``KeyError``).
    """
    ordered = df_from.groupby(col)[target].mean().sort_values().index
    translator = {category: rank for rank, category in enumerate(ordered)}
    # Series.map with a dict yields NaN for keys the dict does not contain, so
    # categories unseen in df_from no longer abort the whole encoding.
    return df_to[col].map(translator)

In [306]:
# Encode the test frame first: encode() learns its ranking from the raw
# train_df values, and the following cell overwrites those columns.
for column in ("street_id", "floor"):
    test_df[column] = encode(column, "price", train_df, test_df)

In [307]:
# Replace the training categories with their mean-price rank.
# NOTE(review): the ranking is computed from the same rows it is applied to,
# so the target leaks into these features - confirm this is acceptable.
for column in ("street_id", "floor"):
    train_df[column] = encode(column, "price", train_df, train_df)

In [308]:
# Detach the target: pop() returns the "price" column and removes it from
# the feature frame in a single call.
y = train_df.pop("price")

In [309]:
def encode_date(data, date):
    """Convert a ``"YYYY-M"`` string into a month index counted from 2011-1.

    Parameters
    ----------
    data : object
        Not used by the function; kept so existing callers keep working.
    date : str
        Dash-separated date whose first two fields are year and month; every
        field must parse as an integer.

    Returns
    -------
    int
        ``0`` for ``"2011-1"``, ``1`` for ``"2011-2"``, ``12`` for ``"2012-1"``, ...
    """
    fields = list(map(int, date.split('-')))
    year, month = fields[0], fields[1]
    return 12 * (year - 2011) + (month - 1)

In [310]:
# Turn the "YYYY-M" strings of the training frame into month indices.
train_df["date"] = train_df["date"].map(lambda raw: encode_date(train_df, raw))

In [311]:
# Turn the "YYYY-M" strings of the test frame into month indices.
test_df["date"] = test_df["date"].map(lambda raw: encode_date(test_df, raw))

In [312]:
# Preview the test frame after street_id/floor rank-encoding and date encoding.
test_df.head()

Unnamed: 0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,n_photos,kw1,kw2
0,14,453,,0,60,3,1,30.0,4,0,0
1,14,154,1.0,10,52,2,1,,4,0,0
2,14,549,0.0,4,54,3,0,30.0,0,0,0
3,14,42,,2,52,2,1,25.0,3,0,1
4,14,242,1.0,5,60,3,1,15.0,1,1,1


In [313]:
# Preview the training frame after street_id/floor rank-encoding and date encoding.
train_df.head()

Unnamed: 0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,n_photos,kw1,kw2
0,0,322,,2,59,3,0,30.0,5,0,1
1,0,650,,10,50,2,1,25.0,1,0,1
2,0,246,0.0,0,48,2,0,25.0,1,0,1
3,0,670,1.0,3,62,3,1,30.0,3,0,1
4,0,438,0.0,3,60,3,0,25.0,3,0,1


In [314]:
# Fit the min-max scaler on the training month index only; the same fitted
# scaler is reused for the test frame.
scaler = MinMaxScaler()
scaler.fit(train_df[["date"]])

In [315]:
# transform() returns an (n, 1) array; flatten it back into a single column.
train_df["date"] = scaler.transform(train_df[["date"]]).ravel()

In [316]:
# Apply the scaler fitted on the training dates; transform() returns an
# (n, 1) array, so flatten it back into a single column.
test_df["date"] = scaler.transform(test_df[["date"]]).ravel()

In [317]:
# Count missing values per training column.
train_df.isna().sum()

date              0
street_id         0
build_tech    29735
floor             0
area              0
rooms             0
balcon            0
metro_dist     4896
n_photos          0
kw1               0
kw2               0
dtype: int64

In [318]:
# Iterative imputer that models each column with a seeded decision tree,
# running at most 20 imputation rounds.
tree_estimator = DecisionTreeRegressor(random_state=SEED)
imputer = IterativeImputer(estimator=tree_estimator, max_iter=20)

In [319]:
# Fit the imputer on the training features and fill their gaps; the result is
# a plain array, so the original column labels are re-attached afterwards.
train_columns = train_df.columns
train_imputed = imputer.fit_transform(train_df)
train_df = pd.DataFrame(data=train_imputed, columns=train_columns)

In [320]:
# Fill the test frame with the imputer that was fitted on train_df.
# Calling fit_transform here would refit the imputer on the test set, so the
# two frames would be filled by different models; transform() reuses the
# per-column estimators learned from the training data.
test_imputed = imputer.transform(test_df)
# transform() returns a plain array; re-attach the column labels.
test_df = pd.DataFrame(test_imputed, columns=test_df.columns)

In [235]:
# Preview the training frame rebuilt from the imputer output.
train_df.head()

Unnamed: 0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,n_photos,kw1,kw2
0,0.0,322.0,0.0,2.0,59.0,3.0,0.0,30.0,5.0,0.0,1.0
1,0.0,650.0,1.0,10.0,50.0,2.0,1.0,25.0,1.0,0.0,1.0
2,0.0,246.0,0.0,0.0,48.0,2.0,0.0,25.0,1.0,0.0,1.0
3,0.0,670.0,1.0,3.0,62.0,3.0,1.0,30.0,3.0,0.0,1.0
4,0.0,438.0,0.0,3.0,60.0,3.0,0.0,25.0,3.0,0.0,1.0


In [316]:
from sklearn.ensemble import GradientBoostingRegressor

In [321]:
# Gradient-boosting regressor with Huber loss.
# NOTE(review): learning_rate=0.8 combined with 1000 estimators looks very
# aggressive and no validation score is computed in this notebook - confirm
# these settings against a hold-out set.
gbr_params = dict(
    n_estimators=1000,
    learning_rate=0.8,
    loss='huber',
    random_state=SEED,
    criterion='squared_error',
)
gbr = GradientBoostingRegressor(**gbr_params)

In [322]:
# Train the model on the imputed training features against the price target.
gbr.fit(X=train_df, y=y)

In [323]:
# Predict prices for the imputed test features.
y_pred = gbr.predict(X=test_df)

In [324]:
# Build the submission table and write it without the pandas index.
# NOTE(review): ids are generated as 100000, 100001, ... in row order because
# the "id" column of Test.csv was dropped earlier - confirm this matches the
# real test ids.
first_id = 100000
submission_ids = range(first_id, first_id + len(y_pred))
res = pd.DataFrame({'id': submission_ids, 'price': y_pred})
res.to_csv('result.csv', index=False)