# Comparison of machine learning regression models

Model goal: predicting the `monthly rent price` of an apartment in the largest Polish cities.

## Preparing data

In [None]:
import warnings
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings.
warnings.filterwarnings('ignore')
# Never truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

Loading data, removing selected columns and limiting the dataset to rent prices <= 10000 PLN/month to get rid of extreme outliers.

In [None]:
# Read the listings and keep only offers priced at most 10000 (PLN/month per
# the surrounding text) to discard extreme outliers.
df = pd.read_csv("data.csv").loc[lambda listings: listings["price"] <= 10000]
print("Number of observations:", len(df))
print("Number of variables:", len(df.columns))

#### Add distance to city centre

In [None]:
# Raw district table; get_map_shapes below reads its `boundaries`, `slug`,
# `city_id` and `id` columns.
boundaries = pd.read_csv("districts_boundaries.csv")

In [None]:
from shapely import wkb

def get_map_shapes(df_shapes):
    """Explode district boundary polygons into one row per exterior vertex.

    Parameters
    ----------
    df_shapes : pandas.DataFrame
        Accessed by integer label ``0 .. len(df_shapes) - 1`` and must provide
        the columns ``boundaries`` (hex-encoded WKB geometry), ``slug``,
        ``city_id`` and ``id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``longitude``, ``latitude``, ``city_id``, ``district`` (from
        ``slug``) and ``district_id`` (from ``id``), with a fresh 0..n-1 index.
        An empty input yields an empty frame with the same columns.
    """
    columns = ["longitude", "latitude", "city_id", "district", "district_id"]
    # The empty seed frame fixes the output column order and keeps the result
    # for an empty input identical to the previous implementation.
    frames = [pd.DataFrame(columns=columns)]
    for i in range(len(df_shapes)):
        # NOTE(review): `.exterior` assumes every geometry is a single Polygon.
        shape = wkb.loads(str(df_shapes.at[i, "boundaries"]), hex=True)
        df_coord = pd.DataFrame(
            list(shape.exterior.coords), columns=["longitude", "latitude"]
        )
        df_coord["district"] = df_shapes.at[i, "slug"]
        df_coord["city_id"] = df_shapes.at[i, "city_id"]
        df_coord["district_id"] = df_shapes.at[i, "id"]
        frames.append(df_coord)
    # Concatenate once instead of re-copying the accumulated frame on every
    # iteration (which made the loop quadratic in the number of vertices).
    return pd.concat(frames, ignore_index=True)

# Replace the raw table with its per-vertex expansion. Re-running this cell
# without re-reading the CSV fails because `boundaries` is overwritten here.
boundaries = get_map_shapes(boundaries)
# One (latitude, longitude) tuple per vertex -- note the order: latitude first.
boundaries["coordinates"] = list(zip(boundaries["latitude"], boundaries["longitude"]))
boundaries.sample(3)

In [None]:
# Collapse the vertices back to one row per district; `polygon` holds the list
# of that district's (latitude, longitude) tuples.
boundaries = boundaries.groupby(['district','district_id','city_id'])['coordinates'].apply(list).reset_index(name='polygon')
boundaries.sample(3)

In [None]:
from shapely.geometry.polygon import Polygon
from shapely import wkt

# The polygon vertices are (latitude, longitude) tuples, so the shapely
# polygon's x-axis is latitude and its y-axis is longitude.
for j in range(len(boundaries)):
    polygon = Polygon(boundaries.at[j,'polygon'])
    # coords[0] is the single (x, y) pair of the centroid -> (latitude, longitude).
    centroid = polygon.centroid.coords[0]
    boundaries.at[j,'centroid_lat'] = centroid[0]
    boundaries.at[j,'centroid_lon'] = centroid[1]
    
boundaries.head(3)

In [None]:
# City id -> city name as it appears in df["city"].
cities = {
    1:"warszawa",
    6:"krakow",
    7:"poznan",
    5:"wroclaw",
    2:"gdansk",
    3:"gdynia",
    9:"szczecin",
    13:"bialystok",
    14:"katowice",
    12:"lublin",
    8:"lodz",
    10:"bydgoszcz",
}

# Invert to city name -> city id, which is how the lookup below uses it.
cities = {v: k for k, v in cities.items()}

# City id -> district_id of the district treated as the city centre.
# NOTE(review): there is no entry for city id 10 (bydgoszcz); a listing from
# that city would raise KeyError in the distance computation below.
centers = {
    1:5,
    6:139,
    7:184,
    5:130,
    2:56,
    3:59,
    8:201,
    14:278,
    9:206,
    12:208,
    13:235,
}

from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance between two points, in whole metres.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.

    Returns
    -------
    int
        Haversine distance on a sphere of radius 6371 km, rounded to the
        nearest metre.
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points; clamp so asin never raises "math domain error".
    c = 2 * asin(min(1.0, sqrt(a)))
    r = 6371  # Earth radius in km
    return round(c * r * 1000)

# Centroid of each city's central district, resolved lazily and only once per
# city instead of filtering `boundaries` twice for every listing.
_centre_cache = {}

def _city_centre(city):
    """Return (longitude, latitude) of the centroid of `city`'s central district."""
    if city not in _centre_cache:
        centre = boundaries[boundaries["district_id"] == centers[cities[city]]]
        _centre_cache[city] = (
            centre["centroid_lon"].iloc[0],
            centre["centroid_lat"].iloc[0],
        )
    return _centre_cache[city]

# Distance in metres from every listing to its city's centre.
df['distance_to_centre'] = df.apply(
    lambda row: haversine(row['longitude'], row['latitude'], *_city_centre(row['city'])),
    axis=1,
)

In [None]:
# Keep listings at most 15000 m (haversine returns metres) from the city centre.
df = df.loc[lambda listings: listings["distance_to_centre"] <= 15000]
print("Number of observations:", len(df))
print("Number of variables:", len(df.columns))

Data sample

In [None]:
# Five random rows; unseeded, so the displayed rows change between runs.
df.sample(5)

Descriptive statistics of all variables.

In [None]:
# Summary statistics for numeric and non-numeric columns alike.
df.describe(include='all')

#### Number of competitive offers within 500 m

In [None]:
# Rows were filtered above; restore a 0..n-1 index before adding positional results.
df.reset_index(inplace=True, drop=True)
    
from sklearn.neighbors import BallTree

coords = df[["latitude","longitude"]]
distance = 0.500 # km
earth_radius = 6371
# The haversine metric works on radians, so the search radius is the distance
# divided by the Earth radius (both in km).
radius = distance / earth_radius
tree = BallTree( np.radians(coords), leaf_size=10, metric='haversine')
# Number of listings within 500 m of each listing. The tree is queried with the
# same points it was built from, so each count presumably includes the listing
# itself -- TODO confirm.
df["competition500m"] = tree.query_radius( np.radians(coords), r=radius, count_only=True)

df.sample(5)

In [None]:
# Binary flags and categorical labels; every other column except `district`
# and the target is treated as numerical.
dummy_cols = ['balcony', 'elevator', 'furnished', 'internet', 'parking', 'separate_kitchen', 'storage', 'tv', 'building_material', 'building_type', 'standard', 'windows', 'city']
# target variable
Y = df["price"]
# independent variables
X = df[df.columns.difference(['price'])]
# One-hot encode the categorical columns. dtype=int keeps the indicator columns
# numeric on every pandas version (pandas >= 2 defaults to bool, which turns the
# design matrix into an object array for statsmodels OLS). The true/false flag
# columns pass through unchanged and are converted at the end of this cell.
dummies = pd.get_dummies(data=X[dummy_cols], dtype=int)
# numerical variables (everything that is neither a dummy column nor `district`)
X_numerical = X.drop(dummy_cols+["district"], axis=1).astype('float64')
numerical_cols = X_numerical.columns
# dataframe with all variables except target variable
X = pd.concat([X_numerical, dummies, X["district"]], axis=1)
# z-score the numerical variables
X[numerical_cols] = X[numerical_cols].apply(lambda x: (x-x.mean())/x.std())
# transforming true/false to 1/0 for binary variables
for col in ["balcony", "elevator", "furnished", "internet", "parking", "separate_kitchen", "storage", "tv"]:
    X[col] = X[col].astype(int)

Data sample after preparation.

In [None]:
# Last five rows of the prepared feature matrix.
X.tail(5)

Checking for missing values and data types.

In [None]:
# Per-column dtype and non-null count.
X.info()

In [None]:
from scipy.stats import pearsonr
# Pearson correlation (and its p-value) between `floor` and `floors`.
pearsonr(X["floor"], X["floors"])

In [None]:
import plotly.express as px

# Columns shown in the correlation heatmap below.
cols = ['floor','floors','latitude','longitude','rooms','size','year_built', 'distance_to_centre', 'competition500m']

def corr_plot(df, method='pearson'):
    """Return a Plotly heatmap of the pairwise correlations between the columns of `df`.

    `method` is forwarded to ``DataFrame.corr``; coefficients are rounded to two
    decimals and printed inside the cells. The colour bar is hidden.
    """
    matrix = df.corr(method=method).round(2)
    heatmap = px.imshow(
        matrix,
        color_continuous_scale='oranges',
        text_auto=True,
        aspect="auto",
    )
    heatmap.update_layout(
        coloraxis_showscale=False,
        font_family="Times New Roman",
        font_size=20,
    )
    return heatmap

# Heatmap of the selected columns, relabelled with reader-friendly names.
corr_plot(X[cols].rename(columns={
    "floors": "number of floors",
    "rooms": "number of rooms",
    "size": "living area",
    "year_built": "year of construction",
    "distance_to_centre": "distance to city centre",
    "competition500m": "competitive offers",
}))

There are no missing values in the dataset.

## Feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from collections import Counter
from numpy import set_printoptions

# Candidate features: everything except the city dummies and the district label.
cols_before = [c for c in X.columns if "city" not in c and "district" not in c]
cols_dropped = []
mi = pd.DataFrame({"variable":cols_before})
set_printoptions(precision=3)
for i in range(5):
    # mutual_info_regression is randomised; seeding each repetition with its
    # iteration number keeps the five estimates different from one another but
    # makes the ranking (and therefore the dropped columns) reproducible.
    selector = SelectKBest(
        lambda features, target, seed=i: mutual_info_regression(features, target, random_state=seed),
        k=20,
    ).fit(X[cols_before], Y)
    # positions and names of the 20 best features in this repetition
    cols = selector.get_support(indices=True)
    X_n = X[cols_before].iloc[:,cols]
    cols_after = list(X_n.columns)
    cols_dropped += (list(set(cols_before) - set(cols_after)))
    mi[f"iter_{i}"] = selector.scores_

# Average mutual information over the repetitions, best feature first.
mi['avg'] = mi[[f'iter_{i}' for  i in range(5)]].mean(axis=1)
mi = mi.sort_values(by=["avg"], ignore_index=True, ascending=False)
mi

In [None]:
# `mi` is sorted by average score in descending order, so its last ten rows
# are the ten lowest-scoring variables.
variables_to_drop = mi.tail(10)["variable"].tolist()
variables_to_drop

In [None]:
# Rebind instead of dropping in place, and tolerate already-removed columns,
# so re-running this cell does not raise KeyError.
X = X.drop(columns=variables_to_drop, errors="ignore")
print("\nDataframe shape:", X.shape)

## Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the listings; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=101)
train = X_train.merge(y_train.to_frame(), left_index=True, right_index=True)
test = X_test.merge(y_test.to_frame(), left_index=True, right_index=True)
print("train size:", len(X_train))
print("test size:", len(X_test))

# District labels of the training rows (used as CV groups during tuning).
X_train_districts = X_train["district"]

# "coord" feature set: latitude/longitude describe location, so the city,
# district, distance and competition columns are excluded.
cols_coords = [
    c for c in X_train.columns
    if not any(tag in c for tag in ("city", "district", "distance", "competition"))
]
# "spatial" feature set: raw coordinates and district are excluded instead.
cols_spatial = [
    c for c in X_train.columns
    if not any(tag in c for tag in ("latitude", "longitude", "district"))
]
# Train and test share the same columns, so one pair of lists serves both.
X_train_coord, X_test_coord = X_train[cols_coords], X_test[cols_coords]
X_train_spatial, X_test_spatial = X_train[cols_spatial], X_test[cols_spatial]

In [None]:
# Test-set coordinates and prices, captured before the re-scaling below.
d = pd.DataFrame(
    {
        "lat": X_test_coord["latitude"],
        "lon": X_test_coord["longitude"],
        "price": y_test,
    }
)


def _standardize_with_train_stats(train_part, test_part):
    """Z-score the numerical columns of both frames in place, using training statistics only.

    Scaling the test set with its own mean/std (as was done before) puts train
    and test on different scales and lets test-set information shape the
    evaluation; the test set must be transformed exactly like the training set.
    """
    scaled_cols = [col for col in numerical_cols if col in train_part.columns]
    # Compute the statistics before the training columns are overwritten.
    train_mean = train_part[scaled_cols].mean()
    train_std = train_part[scaled_cols].std()
    train_part[scaled_cols] = (train_part[scaled_cols] - train_mean) / train_std
    test_part[scaled_cols] = (test_part[scaled_cols] - train_mean) / train_std


_standardize_with_train_stats(X_train_coord, X_test_coord)
_standardize_with_train_stats(X_train_spatial, X_test_spatial)

In [None]:
from shapely.geometry import Point
import geopandas

def _export_with_geometry(features, geometry, path):
    """Write `features` plus a point `geometry` column to an ESRI Shapefile.

    Works on a copy: the model matrices must stay purely numeric, otherwise the
    estimators fitted later in the notebook cannot convert them to float.
    """
    geopandas.GeoDataFrame(
        features.assign(geometry=geometry), geometry='geometry'
    ).to_file(path, driver='ESRI Shapefile')


# NOTE(review): longitude/latitude are already z-scored at this point, so the
# exported points are in standardized units, not geographic degrees -- confirm
# that this is what the consumer of the shapefiles expects.
train_geometry = X_train_coord.apply(lambda x: Point((float(x.longitude), float(x.latitude))), axis=1)
_export_with_geometry(X_train_coord, train_geometry, 'X_train_coord.shp')
# The spatial set has no coordinate columns; it reuses the geometry by row index.
_export_with_geometry(X_train_spatial, train_geometry, 'X_train_spatial.shp')

test_geometry = X_test_coord.apply(lambda x: Point((float(x.longitude), float(x.latitude))), axis=1)
_export_with_geometry(X_test_coord, test_geometry, 'X_test_coord.shp')
_export_with_geometry(X_test_spatial, test_geometry, 'X_test_spatial.shp')

## Evaluation methods

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score

def percentage_error(y_true, y_pred, metric):
    """Absolute percentage error aggregated over all observations.

    `metric` selects the aggregate: "mean" gives MAPE, "median" gives MedAPE.
    Any other value returns None. The result is expressed in percent.
    """
    aggregate = np.mean if metric == "mean" else np.median if metric == "median" else None
    if aggregate is None:
        return None
    relative_error = np.abs((y_true - y_pred)/y_true)
    return aggregate(relative_error)*100

def eval_metrics(metrics_table, model_name, y_true, y_pred, add_to_results=True):
    """Print the regression metrics of one model and optionally record them.

    Parameters
    ----------
    metrics_table : pandas.DataFrame
        Results table; the row `model_name` is written in place when
        `add_to_results` is true.
    model_name : str
        Label printed above the metrics and used as the row key.
    y_true, y_pred : array-like
        Observed and predicted values; must have the same length.
    add_to_results : bool, default True
        When false the metrics are only printed.

    Returns
    -------
    pandas.DataFrame
        The same `metrics_table` object that was passed in.
    """
    assert len(y_true) == len(y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # Derive RMSE from MSE: the `squared=False` flag of mean_squared_error is
    # deprecated and no longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = percentage_error(y_true, y_pred, "mean")
    medae = median_absolute_error(y_true, y_pred)
    medape = percentage_error(y_true, y_pred, "median")
    r2 = r2_score(y_true, y_pred)
    print(model_name)
    print("------------------------")
    print("MSE:", round(mse, 4)) # mean squared error
    print("RMSE:", round(rmse, 4)) # root mean squared error
    print("MAE:", round(mae, 4)) # mean absolute error
    print("MAPE:", round(mape, 4)) # mean absolute percentage error
    print("MedAE:", round(medae, 4)) # median absolute error
    print("MedAPE:", round(medape, 4)) # median absolute percentage error
    print("R2:", round(r2, 4)) # R^2
    print("------------------------")
    if add_to_results:
        metrics_table.at[model_name, "MSE"] = mse
        metrics_table.at[model_name, "RMSE"] = rmse
        metrics_table.at[model_name, "MAE"] = mae
        metrics_table.at[model_name, "MAPE"] = mape
        metrics_table.at[model_name, "MedAE"] = medae
        metrics_table.at[model_name, "MedAPE"] = medape
        metrics_table.at[model_name, "R2"] = r2
    return metrics_table

In [None]:
# One row per (feature set, algorithm) pair: all "coords_*" models first, then
# all "spatial_*" models, each in the same algorithm order.
model_names = [
    f"{feature_set}_{algorithm}"
    for feature_set in ("coords", "spatial")
    for algorithm in (
        "linear_regression",
        "lasso_regression",
        "ridge_regression",
        "svm",
        "random_forest",
        "xgboost",
    )
]
metric_names = ["MSE", "RMSE", "MAE","MAPE", "MedAE", "MedAPE", "R2"]
# Empty results table, filled row by row by eval_metrics.
metrics_table = pd.DataFrame(columns=metric_names, index=model_names)

## Models performance visualization

In [None]:
import plotly.graph_objects as go
from scipy import stats

def plot_scatter(model_name, y_true, y_pred):
    """Show predicted against true values for one model.

    Draws the individual predictions, the least-squares line of `y_pred` on
    `y_true` ("Model fit") and the identity line from 0 to 10000 ("Best fit").
    `model_name` is used as the figure title. The figure is displayed, not
    returned.
    """
    fit = stats.linregress(y_true, y_pred)
    line = fit.slope * y_true + fit.intercept
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=y_true, y=y_pred, showlegend=False, mode='markers', name='markers', marker={"size":3, "color": "blue"}))
    fig.add_trace(go.Scatter(x=y_true, y=line, mode='lines', name="Model fit", marker={"color": "red"}))
    fig.add_trace(go.Scatter(x=[0,10000], y=[0,10000], mode='lines', name="Best fit", marker={"color": "black"}))
    fig.update_layout(
        title=model_name,
        xaxis_title="True values",
        yaxis_title="Predicted values",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font_size=20,
        ),
        paper_bgcolor="white",
        plot_bgcolor="white",
    )
    # Axis-title size is set through `title_font`; the legacy `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.show()
    
def plot_histogram(model_name, y_true, y_pred):
    """Show a histogram of the residuals ``y_true - y_pred`` for one model.

    `model_name` is used as the figure title. The figure is displayed, not
    returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=y_true - y_pred, nbinsx=150))
    fig.update_traces(marker_line_width=1,marker_line_color="black")
    fig.update_layout(
        title=model_name,
        xaxis_title="True values - Predicted values",
        yaxis_title="Number of observations",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
    )
    # Axis-title size is set through `title_font`; the legacy `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.show()

## Model parameters tuning method

In [None]:
from sklearn.model_selection import GridSearchCV,GroupKFold

# Metrics recorded for every grid-search candidate (scikit-learn scorer names).
scoring=[
    'neg_root_mean_squared_error', # rmse
    'neg_mean_absolute_error', # mae
    'r2',
]

def parameters_tuning(x, estimator, param_grid):
    """Grid-search `estimator` over `param_grid` on the training data.

    Uses 5-fold cross-validation grouped by district (module-level
    `X_train_districts`) against the module-level `y_train`, records every
    metric in `scoring` and refits the best candidate by mean absolute error.
    Prints the winning parameters and returns the fitted GridSearchCV.
    """
    district_folds = GroupKFold(n_splits=5).split(x, y_train, X_train_districts)
    search = GridSearchCV(
        estimator,
        param_grid,
        scoring=scoring,
        refit='neg_mean_absolute_error',
        cv=district_folds,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(x, y_train)
    print(search.best_params_)
    return search

# Modelling

## Linear Regression

#### Coord OLS

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the coordinate feature set, scored on the test set.
model_linear_regression_coord = LinearRegression(fit_intercept=True).fit(X_train_coord, y_train)
preds_linear_regression_coord = model_linear_regression_coord.predict(X_test_coord)

In [None]:
import statsmodels.api as sm

# Same regression through statsmodels to obtain the inference summary;
# statsmodels needs the intercept column added explicitly.
X_train_lr = sm.add_constant(X_train_coord)
model_linear_regression_coord1 = sm.OLS(y_train,X_train_lr).fit()
model_linear_regression_coord1.summary()

In [None]:
# Coefficient table of the scikit-learn OLS model. It is built directly from the
# fitted estimator: the `coeff_parameter_linear_regression_coord` frame used
# here before is not defined anywhere earlier in the notebook, so the cell
# failed with NameError on a fresh kernel.
lin_fi = pd.DataFrame({
    'variable': X_train_coord.columns.tolist(),
    'weight': model_linear_regression_coord.coef_,
})
lin_fi = lin_fi.dropna()
lin_fi

In [None]:
# Display labels for the coordinate feature set.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "latitude": "latitude",
    "longitude": "longitude",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    "building_material_cegla": "building material - brick",
    "building_material_beton": "building material - concrete",
    "building_material_inny": "building material - other",
    "building_material_wielka plyta": "building material - plattenbau",
    "building_type_apartamentowiec": "building type - apartmenthouse",
    "building_type_blok": "building type - block of flats",
    "building_type_inny": "building type - other",
    "building_type_kamienica": "building type - tenement",
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
}

# Fall back to the raw column name when a variable has no label: the set of
# retained features depends on the feature-selection step, and a plain
# `.map(valmap)` would turn every unlisted variable into NaN.
lin_fi['variable'] = lin_fi['variable'].map(lambda name: valmap.get(name, name))

def plot_feature_importance(df, title):
    """Show a horizontal bar chart of model coefficients.

    `df` must provide the columns ``variable`` (bar labels) and ``weight``
    (bar lengths); bars are drawn in the row order of `df`. The figure is
    displayed, not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["weight"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        title=title,
        xaxis_title="Coefficient estimates",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
    )
    # Axis-title size is set through `title_font`; the legacy `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=30))
    fig.show()
    
# Order the bars by absolute coefficient, ascending.
plot_feature_importance(lin_fi.reindex(lin_fi.weight.abs().sort_values(ascending=True).index), "Absolute location")

In [None]:
# Predicted vs. true test-set prices, coordinate-feature OLS.
plot_scatter("Absolute location", y_test, preds_linear_regression_coord)

In [None]:
# Test-set residual distribution, coordinate-feature OLS.
plot_histogram("Absolute location", y_test, preds_linear_regression_coord)

In [None]:
# Print the test-set metrics and store them in the `coords_linear_regression` row.
metrics_table = eval_metrics(metrics_table, "coords_linear_regression", y_test, preds_linear_regression_coord)

#### Spatial OLS

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the spatial feature set, scored on the test set.
model_linear_regression_spatial = LinearRegression(fit_intercept=True).fit(X_train_spatial, y_train)
preds_linear_regression_spatial = model_linear_regression_spatial.predict(X_test_spatial)

In [None]:
import statsmodels.api as sm
# Same regression through statsmodels to obtain the inference summary.
X_train_lr = sm.add_constant(X_train_spatial)
# NOTE(review): this rebinds `model_linear_regression_spatial`, replacing the
# scikit-learn estimator fitted in the previous cell with the statsmodels
# result (the coordinate section uses a separate `..._coord1` name) -- confirm
# that nothing later expects the scikit-learn object under this name.
model_linear_regression_spatial = sm.OLS(y_train,X_train_lr).fit()
model_linear_regression_spatial.summary()

In [None]:
# Predicted vs. true test-set prices, spatial-feature OLS.
plot_scatter("Relative location", y_test, preds_linear_regression_spatial)

In [None]:
# Test-set residual distribution, spatial-feature OLS.
plot_histogram("Relative location", y_test, preds_linear_regression_spatial)

In [None]:
# Print the test-set metrics and store them in the `spatial_linear_regression` row.
metrics_table = eval_metrics(metrics_table, "spatial_linear_regression", y_test, preds_linear_regression_spatial)

## Regularization

### Lasso Regression

#### Coord Lasso

In [None]:
from sklearn.linear_model import Lasso

# Candidate regularisation strengths for Lasso.
param_grid_lasso_regression_coord = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20, 100]}

# District-grouped cross-validated search on the coordinate feature set.
grid_search_lasso_regression_coord = parameters_tuning(X_train_coord, Lasso(), param_grid_lasso_regression_coord)

In [None]:
from sklearn.linear_model import Lasso
# Refit Lasso on the whole training set with the parameters chosen by the grid
# search, then predict the test set.
model_lasso_regression_coord = Lasso(
    **grid_search_lasso_regression_coord.best_params_
).fit(X_train_coord, y_train)
preds_lasso_regression_coord = model_lasso_regression_coord.predict(X_test_coord)

In [None]:
import eli5

# Weight table of the tuned Lasso model, labelled with the training column names.
eli5.show_weights(
    model_lasso_regression_coord,
    top=-1,
    feature_names = X_train_coord.columns.tolist(),
)

In [None]:
def lasso1(alphas):
    """Return a variable/weight table of Lasso coefficients on the coordinate feature set.

    A Lasso model is fitted on the module-level `X_train_coord` / `y_train` for
    each value in `alphas`; every fit writes the same ``weight`` column, so the
    table holds the coefficients of the last alpha only.
    """
    weights = pd.DataFrame({'variable': X_train_coord.columns.tolist()})
    for penalty in alphas:
        weights['weight'] = Lasso(alpha=penalty).fit(X_train_coord, y_train).coef_
    return weights

# NOTE(review): alpha is hard-coded to 10 here rather than taken from the grid
# search -- confirm it matches the tuned value.
lasso_fi = lasso1([10])

In [None]:
# Display the Lasso coefficient table.
lasso_fi

In [None]:
# Display labels for the coordinate feature set.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "latitude": "latitude",
    "longitude": "longitude",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    "building_material_cegla": "building material - brick",
    "building_material_beton": "building material - concrete",
    "building_material_inny": "building material - other",
    "building_material_wielka plyta": "building material - plattenbau",
    "building_type_apartamentowiec": "building type - apartmenthouse",
    "building_type_blok": "building type - block of flats",
    "building_type_inny": "building type - other",
    "building_type_kamienica": "building type - tenement",
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
}

# Fall back to the raw column name when a variable has no label: the set of
# retained features depends on the feature-selection step, and a plain
# `.map(valmap)` would turn every unlisted variable into NaN.
lasso_fi['variable'] = lasso_fi['variable'].map(lambda name: valmap.get(name, name))

def plot_feature_importance(df, title):
    """Show a horizontal bar chart of model coefficients.

    `df` must provide the columns ``variable`` (bar labels) and ``weight``
    (bar lengths); bars are drawn in the row order of `df`. The figure is
    displayed, not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["weight"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        title=title,
        xaxis_title="Coefficient estimates",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
    )
    # Axis-title size is set through `title_font`; the legacy `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=30))
    fig.show()

# Order the bars by absolute coefficient, ascending.
plot_feature_importance(lasso_fi.reindex(lasso_fi.weight.abs().sort_values(ascending=True).index), "Lasso regression")

In [None]:
# Predicted vs. true test-set prices, coordinate-feature Lasso.
plot_scatter("Absolute location", y_test, preds_lasso_regression_coord)

In [None]:
# Test-set residual distribution, coordinate-feature Lasso.
plot_histogram("Absolute location", y_test, preds_lasso_regression_coord)

In [None]:
# Print the test-set metrics and store them in the `coords_lasso_regression` row.
metrics_table = eval_metrics(metrics_table, "coords_lasso_regression", y_test, preds_lasso_regression_coord)

#### Spatial Lasso

In [None]:
from sklearn.linear_model import Lasso

# Candidate regularisation strengths for Lasso.
param_grid_lasso_regression_spatial = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20, 100]}

# District-grouped cross-validated search on the spatial feature set.
grid_search_lasso_regression_spatial = parameters_tuning(X_train_spatial, Lasso(), param_grid_lasso_regression_spatial)

In [None]:
from sklearn.linear_model import Lasso
# Refit Lasso on the whole training set with the parameters chosen by the grid
# search, then predict the test set.
model_lasso_regression_spatial = Lasso(
    **grid_search_lasso_regression_spatial.best_params_
).fit(X_train_spatial, y_train)
preds_lasso_regression_spatial = model_lasso_regression_spatial.predict(X_test_spatial)

In [None]:
import eli5

# Weight table of the tuned Lasso model, labelled with the training column names.
eli5.show_weights(
    model_lasso_regression_spatial,
    top=-1,
    feature_names = X_train_spatial.columns.tolist(),
)

In [None]:
def lasso1(alphas):
    """Return a variable/weight table of Lasso coefficients on the spatial feature set.

    A Lasso model is fitted on the module-level `X_train_spatial` / `y_train`
    for each value in `alphas`; every fit writes the same ``weight`` column, so
    the table holds the coefficients of the last alpha only. This replaces the
    earlier `lasso1`, which used the coordinate feature set.
    """
    weights = pd.DataFrame({'variable': X_train_spatial.columns.tolist()})
    for penalty in alphas:
        weights['weight'] = Lasso(alpha=penalty).fit(X_train_spatial, y_train).coef_
    return weights

# NOTE(review): alpha is hard-coded to 10 here rather than taken from the grid
# search -- confirm it matches the tuned value.
lasso_fi = lasso1([10])
lasso_fi

In [None]:
# Display labels for the spatial feature set.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    "building_material_cegla": "building material - brick",
    "building_material_beton": "building material - concrete",
    "building_material_inny": "building material - other",
    "building_material_wielka plyta": "building material - plattenbau",
    "building_type_apartamentowiec": "building type - apartmenthouse",
    "building_type_blok": "building type - block of flats",
    "building_type_inny": "building type - other",
    "building_type_kamienica": "building type - tenement",
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
    "city_warszawa": "city - Warsaw",
    "city_poznan": "city - Poznań",
    "city_wroclaw": "city - Wrocław",
    "city_krakow": "city - Kraków",
    "city_gdansk": "city - Gdańsk",
    "city_gdynia": "city - Gdynia",
    "city_lublin": "city - Lublin",
    "city_katowice": "city - Katowice",
    "city_szczecin": "city - Szczecin",
    "city_lodz": "city - Łódź",
    "city_bialystok": "city - Białystok",
    "competition500m": "competitive offers nearby",
    "distance_to_centre": "distance to city centre",
}

# Fall back to the raw column name when a variable has no label: the set of
# retained features depends on the feature-selection step, and a plain
# `.map(valmap)` would turn every unlisted variable into NaN.
lasso_fi['variable'] = lasso_fi['variable'].map(lambda name: valmap.get(name, name))

def plot_feature_importance(df, title):
    """Show a horizontal bar chart of model coefficients (800 px tall, every label shown).

    `df` must provide the columns ``variable`` (bar labels) and ``weight``
    (bar lengths); bars are drawn in the row order of `df`. The figure is
    displayed, not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["weight"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        height=800,
        title=title,
        xaxis_title="Coefficient estimates",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
    )
    # Axis-title size is set through `title_font`; the legacy `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', tickmode="linear", title_font=dict(size=30))
    fig.show()
    
# Order the bars by absolute coefficient, ascending.
plot_feature_importance(lasso_fi.reindex(lasso_fi.weight.abs().sort_values(ascending=True).index), "Lasso regression")

In [None]:
# Predicted vs. true test-set prices, spatial-feature Lasso.
plot_scatter("Relative location", y_test, preds_lasso_regression_spatial)

In [None]:
# Test-set residual distribution, spatial-feature Lasso.
plot_histogram("Relative location", y_test, preds_lasso_regression_spatial)

In [None]:
# Print the test-set metrics and store them in the `spatial_lasso_regression` row.
metrics_table = eval_metrics(metrics_table, "spatial_lasso_regression", y_test, preds_lasso_regression_spatial)

### Ridge Regression

#### Coord Ridge

In [None]:
from sklearn.linear_model import Ridge

# Candidate regularisation strengths for Ridge.
param_grid_ridge_regression_coord = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20, 100]}

# District-grouped cross-validated search on the coordinate feature set.
grid_search_ridge_regression_coord = parameters_tuning(X_train_coord, Ridge(), param_grid_ridge_regression_coord)

In [None]:
# Refit Ridge on the whole training set with the parameters chosen by the grid
# search, then predict the test set.
model_ridge_regression_coord = Ridge(
    **grid_search_ridge_regression_coord.best_params_
).fit(X_train_coord, y_train)
preds_ridge_regression_coord = model_ridge_regression_coord.predict(X_test_coord)

In [None]:
from sklearn.linear_model import Ridge

def ridge1(alphas):
    """Return a variable/weight table of Ridge coefficients on the coordinate feature set.

    A Ridge model is fitted on the module-level `X_train_coord` / `y_train` for
    each value in `alphas`; every fit writes the same ``weight`` column, so the
    table holds the coefficients of the last alpha only.
    """
    weights = pd.DataFrame({'variable': X_train_coord.columns.tolist()})
    for penalty in alphas:
        weights['weight'] = Ridge(alpha=penalty).fit(X_train_coord, y_train).coef_
    return weights

# NOTE(review): alpha is hard-coded to 100 here rather than taken from the grid
# search -- confirm it matches the tuned value.
ridge_fi = ridge1([100])
ridge_fi

In [None]:
# Display labels for the coordinate feature set.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "latitude": "latitude",
    "longitude": "longitude",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    "building_material_cegla": "building material - brick",
    "building_material_beton": "building material - concrete",
    "building_material_inny": "building material - other",
    "building_material_wielka plyta": "building material - plattenbau",
    "building_type_apartamentowiec": "building type - apartmenthouse",
    "building_type_blok": "building type - block of flats",
    "building_type_inny": "building type - other",
    "building_type_kamienica": "building type - tenement",
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
}

# Fall back to the raw column name when a variable has no label: the set of
# retained features depends on the feature-selection step, and a plain
# `.map(valmap)` would turn every unlisted variable into NaN.
ridge_fi['variable'] = ridge_fi['variable'].map(lambda name: valmap.get(name, name))

def plot_feature_importance(df, title):
    """Show a horizontal bar chart of model coefficients.

    `df` must provide the columns ``variable`` (bar labels) and ``weight``
    (bar lengths); bars are drawn in the row order of `df`. The figure is
    displayed, not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["weight"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        title=title,
        xaxis_title="Coefficient estimates",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
    )
    # Axis-title size is set through `title_font`; the legacy `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=30))
    fig.show()

# Order the bars by absolute coefficient, ascending.
plot_feature_importance(
    ridge_fi.reindex(ridge_fi.weight.abs().sort_values(ascending=True).index),
    "Ridge regression"
)

In [None]:
# Predicted vs. true test-set prices, coordinate-feature Ridge.
plot_scatter("Absolute location", y_test, preds_ridge_regression_coord)

In [None]:
# Test-set residual distribution, coordinate-feature Ridge.
plot_histogram("Absolute location", y_test, preds_ridge_regression_coord)

In [None]:
# Print the test-set metrics and store them in the `coords_ridge_regression` row.
metrics_table = eval_metrics(metrics_table, "coords_ridge_regression", y_test, preds_ridge_regression_coord)

#### Spatial Ridge

In [None]:
from sklearn.linear_model import Ridge

# Candidate regularisation strengths for Ridge.
param_grid_ridge_regression_spatial = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 20, 100]}

# District-grouped cross-validated search on the spatial feature set.
grid_search_ridge_regression_spatial = parameters_tuning(X_train_spatial, Ridge(), param_grid_ridge_regression_spatial)

In [None]:
# Refit Ridge on the whole training set with the parameters chosen by the grid
# search, then predict the test set.
model_ridge_regression_spatial = Ridge(
    **grid_search_ridge_regression_spatial.best_params_
).fit(X_train_spatial, y_train)
preds_ridge_regression_spatial = model_ridge_regression_spatial.predict(X_test_spatial)

In [None]:
from sklearn.linear_model import Ridge
def ridge1(alphas):
    """Return a variable/weight table of Ridge coefficients on the spatial feature set.

    A Ridge model is fitted on the module-level `X_train_spatial` / `y_train`
    for each value in `alphas`; every fit writes the same ``weight`` column, so
    the table holds the coefficients of the last alpha only. This replaces the
    earlier `ridge1`, which used the coordinate feature set.
    """
    weights = pd.DataFrame({'variable': X_train_spatial.columns.tolist()})
    for penalty in alphas:
        weights['weight'] = Ridge(alpha=penalty).fit(X_train_spatial, y_train).coef_
    return weights

# NOTE(review): alpha is hard-coded to 100 here rather than taken from the grid
# search -- confirm it matches the tuned value.
ridge_fi = ridge1([100])
ridge_fi

In [None]:
# Display labels for the spatial feature set.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    "building_material_cegla": "building material - brick",
    "building_material_beton": "building material - concrete",
    "building_material_inny": "building material - other",
    "building_material_wielka plyta": "building material - plattenbau",
    "building_type_apartamentowiec": "building type - apartmenthouse",
    "building_type_blok": "building type - block of flats",
    "building_type_inny": "building type - other",
    "building_type_kamienica": "building type - tenement",
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
    "city_warszawa": "city - Warsaw",
    "city_poznan": "city - Poznań",
    "city_wroclaw": "city - Wrocław",
    "city_krakow": "city - Kraków",
    "city_gdansk": "city - Gdańsk",
    "city_gdynia": "city - Gdynia",
    "city_lublin": "city - Lublin",
    "city_katowice": "city - Katowice",
    "city_szczecin": "city - Szczecin",
    "city_lodz": "city - Łódź",
    "city_bialystok": "city - Białystok",
    "competition500m": "competitive offers nearby",
    "distance_to_centre": "distance to city centre",
}

# Fall back to the raw column name when a variable has no label: the set of
# retained features depends on the feature-selection step, and a plain
# `.map(valmap)` would turn every unlisted variable into NaN.
ridge_fi['variable'] = ridge_fi['variable'].map(lambda name: valmap.get(name, name))

def plot_feature_importance(df, title):
    """Show a horizontal bar chart of model coefficients (800 px tall, every label shown).

    `df` must provide the columns ``variable`` (bar labels) and ``weight``
    (bar lengths); bars are drawn in the row order of `df`. The figure is
    displayed, not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["weight"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        title=title,
        height=800,
        xaxis_title="Coefficient estimates",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
    )
    # Axis-title size is set through `title_font`; the legacy `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title_font=dict(size=20))
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', tickmode="linear", title_font=dict(size=30))
    fig.show()

# Order the bars by absolute coefficient, ascending.
plot_feature_importance(ridge_fi.reindex(ridge_fi.weight.abs().sort_values(ascending=True).index), "Ridge regression")

In [None]:
# Predicted vs. true test-set prices, spatial-feature Ridge.
plot_scatter("Relative location", y_test, preds_ridge_regression_spatial)

In [None]:
# Test-set residual distribution, spatial-feature Ridge.
plot_histogram("Relative location", y_test, preds_ridge_regression_spatial)

In [None]:
# Print the test-set metrics and store them in the `spatial_ridge_regression` row.
metrics_table = eval_metrics(metrics_table, "spatial_ridge_regression", y_test, preds_ridge_regression_spatial)

## Support Vector Machine

#### Coord SVR

In [None]:
from sklearn.svm import SVR

# `degree` is only read by the polynomial kernel, so it is searched for "poly"
# alone; crossing it with "linear"/"rbf" would refit identical models three times.
# parameters_tuning is also called with a list of sub-grids for XGBoost below.
param_grid_svm_coord = [
    {
        'epsilon': [0.05, 0.1, 0.2],
        'kernel': ["linear", "rbf"],
        'C': [0.01, 0.1, 0.25, 0.5, 1, 2, 5, 10],
    },
    {
        'epsilon': [0.05, 0.1, 0.2],
        'kernel': ["poly"],
        'C': [0.01, 0.1, 0.25, 0.5, 1, 2, 5, 10],
        'degree': [2, 3, 4],
    },
]

grid_search_svm_coord = parameters_tuning(X_train_coord, SVR(), param_grid_svm_coord)

In [None]:
# Build an SVR from the selected hyperparameters, fit it on X_train_coord / y_train
# (fit returns the estimator itself), then predict for X_test_coord.
model_svm_coord = SVR(**grid_search_svm_coord.best_params_).fit(X_train_coord, y_train)
preds_svm_coord = model_svm_coord.predict(X_test_coord)

In [None]:
# Test targets vs. coordinate-based SVR predictions, labelled "Absolute location".
plot_scatter("Absolute location", y_test, preds_svm_coord)

In [None]:
# Same inputs as the scatter plot above, passed to plot_histogram.
plot_histogram("Absolute location", y_test, preds_svm_coord)

In [None]:
# Rebind metrics_table to the result of eval_metrics for the "coords_svm" entry.
# NOTE(review): presumably appends a row; re-running may duplicate it — confirm.
metrics_table = eval_metrics(metrics_table, "coords_svm", y_test, preds_svm_coord)

In [None]:
import pickle

# Persist the fitted model. The with-block closes the file handle deterministically
# (even if pickling raises) instead of leaving it to garbage collection.
with open("model_svm_coord.pkl", 'wb') as model_file:
    pickle.dump(model_svm_coord, model_file)

#### Spatial SVR

In [None]:
from sklearn.svm import SVR

# `degree` is only read by the polynomial kernel, so it is searched for "poly"
# alone; crossing it with "linear"/"rbf" would refit identical models three times.
# parameters_tuning is also called with a list of sub-grids for XGBoost below.
param_grid_svm_spatial = [
    {
        'epsilon': [0.05, 0.1, 0.2],
        'kernel': ["linear", "rbf"],
        'C': [0.01, 0.1, 0.25, 0.5, 1, 2, 5, 10],
    },
    {
        'epsilon': [0.05, 0.1, 0.2],
        'kernel': ["poly"],
        'C': [0.01, 0.1, 0.25, 0.5, 1, 2, 5, 10],
        'degree': [2, 3, 4],
    },
]

grid_search_svm_spatial = parameters_tuning(X_train_spatial, SVR(), param_grid_svm_spatial)

In [None]:
# Build an SVR from the selected hyperparameters, fit it on X_train_spatial / y_train
# (fit returns the estimator itself), then predict for X_test_spatial.
model_svm_spatial = SVR(**grid_search_svm_spatial.best_params_).fit(X_train_spatial, y_train)
preds_svm_spatial = model_svm_spatial.predict(X_test_spatial)

In [None]:
# Test targets vs. spatial-feature SVR predictions, labelled "Relative location".
plot_scatter("Relative location", y_test, preds_svm_spatial)

In [None]:
# Same inputs as the scatter plot above, passed to plot_histogram.
plot_histogram("Relative location", y_test, preds_svm_spatial)

In [None]:
# Rebind metrics_table to the result of eval_metrics for the "spatial_svm" entry.
# NOTE(review): presumably appends a row; re-running may duplicate it — confirm.
metrics_table = eval_metrics(metrics_table, "spatial_svm", y_test, preds_svm_spatial)

In [None]:
# Persist the fitted model. The with-block closes the file handle deterministically
# (even if pickling raises) instead of leaving it to garbage collection.
# `pickle` is imported in the cell that saves the coordinate SVR.
with open("model_svm_spatial.pkl", 'wb') as model_file:
    pickle.dump(model_svm_spatial, model_file)

## Random Forest

#### Coords RF

In [None]:
from sklearn.ensemble import RandomForestRegressor

param_grid_random_forest_coords = {
    'max_depth': [5, 10, 25, 50, 100],
    # 1.0 = consider every feature at each split. That is what 'auto' meant for
    # regressors; the 'auto' alias was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': ['sqrt', 1.0],
    'min_samples_leaf': [5, 10, 25],
    'min_samples_split': [10, 25, 50],
    'n_estimators': [100, 500, 1000, 1500]
}

grid_search_random_forest_coords = parameters_tuning(X_train_coord, RandomForestRegressor(), param_grid_random_forest_coords)

In [None]:
# Build a forest from the selected hyperparameters, fit it on X_train_coord / y_train
# (fit returns the estimator itself), then predict for X_test_coord.
model_random_forest_coords = RandomForestRegressor(
    **grid_search_random_forest_coords.best_params_
).fit(X_train_coord, y_train)
preds_random_forest_coords = model_random_forest_coords.predict(X_test_coord)

In [None]:
# Test targets vs. coordinate-based random-forest predictions, labelled "Absolute location".
plot_scatter("Absolute location", y_test, preds_random_forest_coords)

In [None]:
# Show the hyperparameter combination the tuning run selected (also used to build the model above).
grid_search_random_forest_coords.best_params_

In [None]:
# Same inputs as the scatter plot above, passed to plot_histogram.
plot_histogram("Absolute location", y_test, preds_random_forest_coords)

Top levels of one of the trees

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Display labels for the columns of the absolute-location feature matrix.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "latitude": "latitude",
    "longitude": "longitude",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    "building_material_cegla": "building material - brick",
    "building_material_beton": "building material - concrete",
    "building_material_inny": "building material - other",
    "building_material_wielka plyta": "building material - plattenbau",
    "building_type_apartamentowiec": "building type - apartmenthouse",
    "building_type_blok": "building type - block of flats",
    "building_type_inny": "building type - other",
    "building_type_kamienica": "building type - tenement",
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
}

# Relabelled copy of the training frame; columns without a valmap entry keep their raw name.
# No extra column is added: the frame must keep exactly one column per fitted
# feature so the labels line up with the tree's feature indices.
x_c = X_train_coord.rename(columns=valmap)

fig, ax = plt.subplots(figsize=(15, 10))
plot_tree(
    model_random_forest_coords.estimators_[0],
    max_depth=2,  # only the top levels, to keep the figure readable
    # plain list of labels, in the column order of X_train_coord
    feature_names=list(x_c.columns),
    # class_names is not passed: it only applies to classifiers, and this is a regressor
    filled=True,
    impurity=True,
    rounded=True,
    fontsize=10,
    ax=ax,
)
plt.show()

Feature importance

In [None]:
# Display labels for the columns of the absolute-location feature matrix.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "latitude": "latitude",
    "longitude": "longitude",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    **{
        f"building_material_{raw}": f"building material - {label}"
        for raw, label in [
            ("cegla", "brick"),
            ("beton", "concrete"),
            ("inny", "other"),
            ("wielka plyta", "plattenbau"),
        ]
    },
    **{
        f"building_type_{raw}": f"building type - {label}"
        for raw, label in [
            ("apartamentowiec", "apartmenthouse"),
            ("blok", "block of flats"),
            ("inny", "other"),
            ("kamienica", "tenement"),
        ]
    },
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
}

cols = X_train_coord.columns
feature_importance_coords = model_random_forest_coords.feature_importances_

# One row per feature: raw column name, the forest's importance score, display label.
fi_rf = pd.DataFrame(
    {"variable_dirty": cols, "feature importance": feature_importance_coords}
)
fi_rf['variable'] = fi_rf['variable_dirty'].map(valmap)


def plot_feature_importance(df, title):
    """Render a horizontal bar chart of feature importances.

    Parameters
    ----------
    df : DataFrame-like
        Must expose a "feature importance" column (bar lengths) and a
        "variable" column (bar labels).
    title : str
        Title placed above the chart.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["feature importance"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        title=title,
        xaxis_title="Feature importance (MDI)",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
        # `titlefont` is a deprecated alias that Plotly 6 rejects;
        # `title.font` is the supported spelling of the same setting.
        yaxis=dict(title=dict(font=dict(size=30))),
        xaxis=dict(title=dict(font=dict(size=20))),
    )
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig.show()
    
# Smallest importance first, so rows reach the chart in increasing order.
plot_feature_importance(
    fi_rf.sort_values(by="feature importance"),
    "Absolute location",
)

In [None]:
# Rebind metrics_table to the result of eval_metrics for the "coords_random_forest" entry.
# NOTE(review): presumably appends a row; re-running may duplicate it — confirm.
metrics_table = eval_metrics(metrics_table, "coords_random_forest", y_test, preds_random_forest_coords)

In [None]:
# Persist the fitted model. The with-block closes the file handle deterministically
# (even if pickling raises) instead of leaving it to garbage collection.
# `pickle` is imported in the cell that saves the coordinate SVR.
with open("model_random_forest_coords.pkl", 'wb') as model_file:
    pickle.dump(model_random_forest_coords, model_file)

#### Spatial RF

In [None]:
from sklearn.ensemble import RandomForestRegressor

param_grid_random_forest_spatial = {
    'max_depth': [5, 10, 25, 50, 100],
    # 1.0 = consider every feature at each split. That is what 'auto' meant for
    # regressors; the 'auto' alias was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': ['sqrt', 1.0],
    'min_samples_leaf': [5, 10, 25],
    'min_samples_split': [10, 25, 50],
    'n_estimators': [100, 500, 1000, 1500]
}

grid_search_random_forest_spatial = parameters_tuning(X_train_spatial, RandomForestRegressor(), param_grid_random_forest_spatial)

In [None]:
# Build a forest from the selected hyperparameters, fit it on X_train_spatial / y_train
# (fit returns the estimator itself), then predict for X_test_spatial.
model_random_forest_spatial = RandomForestRegressor(
    **grid_search_random_forest_spatial.best_params_
).fit(X_train_spatial, y_train)
preds_random_forest_spatial = model_random_forest_spatial.predict(X_test_spatial)

In [None]:
# Test targets vs. spatial-feature random-forest predictions, labelled "Relative location".
plot_scatter("Relative location", y_test, preds_random_forest_spatial)

In [None]:
# Show the hyperparameter combination the tuning run selected (also used to build the model above).
grid_search_random_forest_spatial.best_params_

In [None]:
# Same inputs as the scatter plot above, passed to plot_histogram.
plot_histogram("Relative location", y_test, preds_random_forest_spatial)

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Display labels for the columns of the relative-location feature matrix.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    "building_material_cegla": "building material - brick",
    "building_material_beton": "building material - concrete",
    "building_material_inny": "building material - other",
    "building_material_wielka plyta": "building material - plattenbau",
    "building_type_apartamentowiec": "building type - apartmenthouse",
    "building_type_blok": "building type - block of flats",
    "building_type_inny": "building type - other",
    "building_type_kamienica": "building type - tenement",
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
    "city_warszawa": "city - Warsaw",
    "city_poznan": "city - Poznań",
    "city_wroclaw": "city - Wrocław",
    "city_krakow": "city - Kraków",
    "city_gdansk": "city - Gdańsk",
    "city_gdynia": "city - Gdynia",
    "city_lublin": "city - Lublin",
    "city_katowice": "city - Katowice",
    "city_szczecin": "city - Szczecin",
    "city_lodz": "city - Łódź",
    "city_bialystok": "city - Białystok",
    "competition500m": "competitive offers nearby",
    "distance_to_centre": "distance to city centre",
}

# Relabelled copy of the training frame; columns without a valmap entry keep their raw name.
# No extra column is added: the frame must keep exactly one column per fitted
# feature so the labels line up with the tree's feature indices.
x_c = X_train_spatial.rename(columns=valmap)

fig, ax = plt.subplots(figsize=(15, 10))
plot_tree(
    model_random_forest_spatial.estimators_[0],
    max_depth=2,  # only the top levels, to keep the figure readable
    # plain list of labels, in the column order of X_train_spatial
    feature_names=list(x_c.columns),
    # class_names is not passed: it only applies to classifiers, and this is a regressor
    filled=True,
    impurity=True,
    rounded=True,
    fontsize=10,
    ax=ax,
)
plt.show()

In [None]:
# Display labels for the columns of the relative-location feature matrix.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    **{
        f"building_material_{raw}": f"building material - {label}"
        for raw, label in [
            ("cegla", "brick"),
            ("beton", "concrete"),
            ("inny", "other"),
            ("wielka plyta", "plattenbau"),
        ]
    },
    **{
        f"building_type_{raw}": f"building type - {label}"
        for raw, label in [
            ("apartamentowiec", "apartmenthouse"),
            ("blok", "block of flats"),
            ("inny", "other"),
            ("kamienica", "tenement"),
        ]
    },
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
    **{
        f"city_{slug}": f"city - {name}"
        for slug, name in [
            ("warszawa", "Warsaw"),
            ("poznan", "Poznań"),
            ("wroclaw", "Wrocław"),
            ("krakow", "Kraków"),
            ("gdansk", "Gdańsk"),
            ("gdynia", "Gdynia"),
            ("lublin", "Lublin"),
            ("katowice", "Katowice"),
            ("szczecin", "Szczecin"),
            ("lodz", "Łódź"),
            ("bialystok", "Białystok"),
        ]
    },
    "competition500m": "competitive offers nearby",
    "distance_to_centre": "distance to city centre",
}

cols = X_train_spatial.columns
feature_importance_spatial = model_random_forest_spatial.feature_importances_

# One row per feature: raw column name, the forest's importance score, display label.
fi_rf = pd.DataFrame(
    {"variable_dirty": cols, "feature importance": feature_importance_spatial}
)
fi_rf['variable'] = fi_rf['variable_dirty'].map(valmap)


def plot_feature_importance(df, title):
    """Render an 800-pixel-high horizontal bar chart of feature importances.

    Parameters
    ----------
    df : DataFrame-like
        Must expose a "feature importance" column (bar lengths) and a
        "variable" column (bar labels).
    title : str
        Title placed above the chart.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["feature importance"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        height=800,
        title=title,
        xaxis_title="Feature importance (MDI)",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
        # `titlefont` is a deprecated alias that Plotly 6 rejects;
        # `title.font` is the supported spelling of the same setting.
        yaxis=dict(title=dict(font=dict(size=30))),
        xaxis=dict(title=dict(font=dict(size=20))),
    )
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig.show()
    
# Smallest importance first, so rows reach the chart in increasing order.
plot_feature_importance(
    fi_rf.sort_values(by="feature importance"),
    "Relative location",
)

In [None]:
# Rebind metrics_table to the result of eval_metrics for the "spatial_random_forest" entry.
# NOTE(review): presumably appends a row; re-running may duplicate it — confirm.
metrics_table = eval_metrics(metrics_table, "spatial_random_forest", y_test, preds_random_forest_spatial)

In [None]:
# Persist the fitted model. The with-block closes the file handle deterministically
# (even if pickling raises) instead of leaving it to garbage collection.
# `pickle` is imported in the cell that saves the coordinate SVR.
with open("model_random_forest_spatial.pkl", 'wb') as model_file:
    pickle.dump(model_random_forest_spatial, model_file)

## XGBoost

#### Coord XGB

In [None]:
from xgboost import XGBRegressor

# Hyperparameters shared by every sub-grid. Keys must match XGBRegressor's
# parameter names exactly: a key with stray whitespace (e.g. 'subsample ') is not
# an XGBoost parameter and would leave the real one at its default.
_xgb_coord_common = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [5, 10, 25, 50, 100],
    'min_child_weight': [3, 5, 10],
    'colsample_bytree': [0.5, 0.6, 0.7],
    'subsample': [0.6, 0.75, 0.9],
}

# One sub-grid per ensemble size; early_stopping_rounds is 10% of n_estimators
# (500 -> 50, 1000 -> 100, 1500 -> 150).
# NOTE(review): early stopping needs a validation set at fit time; whether
# parameters_tuning supplies one is not visible here — confirm.
param_grid_xgboost_coord = [
    {
        **_xgb_coord_common,
        'n_estimators': [n_trees],
        "early_stopping_rounds": [n_trees // 10],
    }
    for n_trees in (500, 1000, 1500)
]

grid_search_xgboost_coord = parameters_tuning(X_train_coord, XGBRegressor(), param_grid_xgboost_coord)

In [None]:
# Build an XGBRegressor from the selected hyperparameters, fit it on X_train_coord / y_train
# (fit returns the estimator itself), then predict for X_test_coord.
model_xgboost_coord = XGBRegressor(
    **grid_search_xgboost_coord.best_params_
).fit(X_train_coord, y_train)
preds_xgboost_coord = model_xgboost_coord.predict(X_test_coord)

In [None]:
# Test targets vs. coordinate-based XGBoost predictions, labelled "Absolute location".
plot_scatter("Absolute location", y_test, preds_xgboost_coord)

In [None]:
# Same inputs as the scatter plot above, passed to plot_histogram.
plot_histogram("Absolute location", y_test, preds_xgboost_coord)

Feature importance

In [None]:
# Display labels for the columns of the absolute-location feature matrix.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "latitude": "latitude",
    "longitude": "longitude",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    **{
        f"building_material_{raw}": f"building material - {label}"
        for raw, label in [
            ("cegla", "brick"),
            ("beton", "concrete"),
            ("inny", "other"),
            ("wielka plyta", "plattenbau"),
        ]
    },
    **{
        f"building_type_{raw}": f"building type - {label}"
        for raw, label in [
            ("apartamentowiec", "apartmenthouse"),
            ("blok", "block of flats"),
            ("inny", "other"),
            ("kamienica", "tenement"),
        ]
    },
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
}

cols = X_train_coord.columns
feature_importance_coord = model_xgboost_coord.feature_importances_

# One row per feature: raw column name, the booster's importance score, display label.
fi_xgb = pd.DataFrame(
    {"variable_dirty": cols, "feature importance": feature_importance_coord}
)
fi_xgb['variable'] = fi_xgb['variable_dirty'].map(valmap)
fi_xgb

In [None]:
def plot_feature_importance(df, title):
    """Render a horizontal bar chart of feature importances.

    Parameters
    ----------
    df : DataFrame-like
        Must expose a "feature importance" column (bar lengths) and a
        "variable" column (bar labels).
    title : str
        Title placed above the chart.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["feature importance"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        title=title,
        # NOTE(review): this copy is used for XGBoost, whose feature_importances_
        # is gain-based by default rather than MDI — confirm the axis label.
        xaxis_title="Feature importance (MDI)",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
        # `titlefont` is a deprecated alias that Plotly 6 rejects;
        # `title.font` is the supported spelling of the same setting.
        yaxis=dict(title=dict(font=dict(size=30))),
        xaxis=dict(title=dict(font=dict(size=20))),
    )
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig.show()
    
# Smallest importance first, so rows reach the chart in increasing order.
plot_feature_importance(
    fi_xgb.sort_values(by="feature importance"),
    "Absolute location",
)

In [None]:
# Rebind metrics_table to the result of eval_metrics for the "coords_xgboost" entry.
# NOTE(review): presumably appends a row; re-running may duplicate it — confirm.
metrics_table = eval_metrics(metrics_table, "coords_xgboost", y_test, preds_xgboost_coord)

In [None]:
# Persist the fitted model. The with-block closes the file handle deterministically
# (even if pickling raises) instead of leaving it to garbage collection.
# `pickle` is imported in the cell that saves the coordinate SVR.
with open("model_xgboost_coord.pkl", 'wb') as model_file:
    pickle.dump(model_xgboost_coord, model_file)

#### Spatial XGB

In [None]:
from xgboost import XGBRegressor

# Hyperparameters shared by every sub-grid. Keys must match XGBRegressor's
# parameter names exactly: a key with stray whitespace (e.g. 'subsample ') is not
# an XGBoost parameter and would leave the real one at its default.
_xgb_spatial_common = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [5, 10, 25, 50, 100],
    'min_child_weight': [3, 5, 10],
    'colsample_bytree': [0.5, 0.6, 0.7],
    'subsample': [0.6, 0.75, 0.9],
}

# One sub-grid per ensemble size; early_stopping_rounds is 10% of n_estimators
# (500 -> 50, 1000 -> 100, 1500 -> 150).
# NOTE(review): early stopping needs a validation set at fit time; whether
# parameters_tuning supplies one is not visible here — confirm.
param_grid_xgboost_spatial = [
    {
        **_xgb_spatial_common,
        'n_estimators': [n_trees],
        "early_stopping_rounds": [n_trees // 10],
    }
    for n_trees in (500, 1000, 1500)
]

grid_search_xgboost_spatial = parameters_tuning(X_train_spatial, XGBRegressor(), param_grid_xgboost_spatial)

In [None]:
# Build an XGBRegressor from the selected hyperparameters, fit it on X_train_spatial / y_train
# (fit returns the estimator itself), then predict for X_test_spatial.
model_xgboost_spatial = XGBRegressor(
    **grid_search_xgboost_spatial.best_params_
).fit(X_train_spatial, y_train)
preds_xgboost_spatial = model_xgboost_spatial.predict(X_test_spatial)

In [None]:
# Test targets vs. spatial-feature XGBoost predictions, labelled "Relative location".
plot_scatter("Relative location", y_test, preds_xgboost_spatial)

In [None]:
# Same inputs as the scatter plot above, passed to plot_histogram.
plot_histogram("Relative location", y_test, preds_xgboost_spatial)

In [None]:
feature_importance_spatial = model_xgboost_spatial.feature_importances_
cols = X_train_spatial.columns

# One row per feature: raw column name and the booster's importance score.
fi_xgb = pd.DataFrame({"variable_dirty":cols,"feature importance":feature_importance_spatial})

# Display labels for the columns of the relative-location feature matrix.
valmap = {
    "rooms": "number of rooms",
    "size": "living area",
    "floor": "floor",
    "floors": "number of floors",
    "year_built": "year of construction",
    "balcony": "balcony",
    "parking": "parking space",
    "separate_kitchen": "separate kitchen",
    "storage": "storage",
    "building_material_cegla": "building material - brick",
    "building_material_beton": "building material - concrete",
    "building_material_inny": "building material - other",
    "building_material_wielka plyta": "building material - plattenbau",
    "building_type_apartamentowiec": "building type - apartmenthouse",
    "building_type_blok": "building type - block of flats",
    "building_type_inny": "building type - other",
    "building_type_kamienica": "building type - tenement",
    "windows_drewniane": "windows - wooden",
    "standard_wysoki": "standard - high",
    "windows_plastikowe": "windows - plastic",
    "city_warszawa": "city - Warsaw",
    "city_poznan": "city - Poznań",
    "city_wroclaw": "city - Wrocław",
    "city_krakow": "city - Kraków",
    "city_gdansk": "city - Gdańsk",
    "city_gdynia": "city - Gdynia",
    "city_lublin": "city - Lublin",
    "city_katowice": "city - Katowice",
    "city_szczecin": "city - Szczecin",
    "city_lodz": "city - Łódź",
    "city_bialystok": "city - Białystok",
    "competition500m": "competitive offers nearby",
    "distance_to_centre": "distance to city centre",
}

# Labels come from this table's own raw names, not from the random-forest table
# (fi_rf), so the cell does not depend on which forest cell ran last.
fi_xgb['variable'] = fi_xgb['variable_dirty'].map(valmap)
fi_xgb

In [None]:
# Show the importance table in increasing order of importance.
fi_xgb.sort_values(by="feature importance")

In [None]:
def plot_feature_importance(df, title):
    """Render a horizontal bar chart of feature importances.

    Parameters
    ----------
    df : DataFrame-like
        Must expose a "feature importance" column (bar lengths) and a
        "variable" column (bar labels).
    title : str
        Title placed above the chart.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=df["feature importance"],
        y=df["variable"],
        orientation='h',
    ))
    fig.update_layout(
        title=title,
        # NOTE(review): this copy is used for XGBoost, whose feature_importances_
        # is gain-based by default rather than MDI — confirm the axis label.
        xaxis_title="Feature importance (MDI)",
        yaxis_title="",
        font_family="Times New Roman",
        yaxis_tickformat = '000',
        xaxis_tickformat = '000',
        paper_bgcolor="white",
        plot_bgcolor="white",
        # `titlefont` is a deprecated alias that Plotly 6 rejects;
        # `title.font` is the supported spelling of the same setting.
        yaxis=dict(title=dict(font=dict(size=30))),
        xaxis=dict(title=dict(font=dict(size=20))),
    )
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')
    fig.show()

# Smallest importance first, so rows reach the chart in increasing order.
plot_feature_importance(
    fi_xgb.sort_values(by="feature importance"),
    "Relative location",
)

In [None]:
# Rebind metrics_table to the result of eval_metrics for the "spatial_xgboost" entry.
# NOTE(review): presumably appends a row; re-running may duplicate it — confirm.
metrics_table = eval_metrics(metrics_table, "spatial_xgboost", y_test, preds_xgboost_spatial)

In [None]:
# Persist the fitted model. The with-block closes the file handle deterministically
# (even if pickling raises) instead of leaving it to garbage collection.
# `pickle` is imported in the cell that saves the coordinate SVR.
with open("model_xgboost_spatial.pkl", 'wb') as model_file:
    pickle.dump(model_xgboost_spatial, model_file)

## Models comparison

In [None]:
# Collected metrics for every model, highest MAPE first.
display(metrics_table.sort_values(by="MAPE", ascending=False))

In [None]:
# Write the metrics table to metrics.csv in the working directory (to_csv also writes the index by default).
metrics_table.to_csv("metrics.csv")