In [2]:
import os

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor

In [3]:
# Let pandas render up to 100 columns before it starts eliding the middle ones.
pd.options.display.max_columns = 100

In [4]:
# Start from the current working directory, climb two levels, then descend
# into data/ to reach the prepared dataset.
project_root = os.path.dirname(os.path.dirname(os.path.abspath("")))
data_path = os.path.join(project_root, "data", "prepared_data.csv")

# The first CSV column is used as the row index.
df = pd.read_csv(data_path, index_col=0)
df.head()

Unnamed: 0,link,price,description_len,general_sq,floor,built,ceil,parking,time_to_underground,address,offer_dt,all_views,today_views,total_floors,apartments,new_building,courtyard_view,road_view,district,rooms,wc_amount,joint_wc,balcony,living_square_ratio,placed_days_ago
0,https://www.cian.ru/sale/flat/258447697/,2280000.0,1623.0,11.4,4,1961.0,3.5,наземная,15.0,"Москва, ВАО, р-н Северное Измайлово, Сиреневый...",2021-07-19 16:48:00,5391.0,116.0,4,True,False,True,False,ВАО,1,1,True,0,0.5,14
1,https://www.cian.ru/sale/flat/258452553/,2310000.0,1620.0,11.4,4,1961.0,3.5,наземная,15.0,"Москва, ВАО, р-н Северное Измайлово, Сиреневый...",2021-07-19 17:55:00,11594.0,81.0,4,True,False,True,False,ВАО,студия,1,True,0,0.5,14
3,https://www.cian.ru/sale/flat/256222498/,2700000.0,1137.0,13.0,4,1980.0,4.0,наземная,15.0,"Москва, ВАО, р-н Северное Измайлово, Сиреневый...",2021-07-24 12:53:00,12641.0,182.0,4,True,False,False,True,ВАО,студия,1,True,0,0.5,10
4,https://www.cian.ru/sale/flat/260273278/,2750000.0,380.0,18.0,2,1952.0,3.3,наземная,6.0,"Москва, НАО (Новомосковский), Щербинка, ул. Лю...",2021-07-18 15:05:00,1774.0,94.0,2,True,False,True,False,НАО (Новомосковский),1,1,True,0,0.888889,15
5,https://www.cian.ru/sale/flat/258740783/,2790000.0,1375.0,18.0,7,2021.0,2.8,наземная,12.0,"Москва, ЮВАО, р-н Нижегородский, Подъемная ул....",2021-07-18 13:29:00,7742.0,129.0,10,True,False,False,True,ЮВАО,студия,1,True,0,0.555556,16


In [5]:
# Categorical / boolean features: each becomes a set of one-hot indicator columns.
ohe_columns = [
    "parking",
    "new_building",
    "apartments",
    "courtyard_view",
    "road_view",
    "district",
    "rooms",
    "joint_wc",
]

# Numeric features: each is standardised (zero mean, unit variance).
scaler_columns = [
    "description_len",
    "general_sq",
    "floor",
    "built",
    "ceil",
    "time_to_underground",
    "all_views",
    "today_views",
    "total_floors",
    "wc_amount",
    "balcony",
    "living_square_ratio",
    "placed_days_ago",
]

# Shared feature-preparation step for every model below.
# Columns that appear in neither list (e.g. link, address, offer_dt) are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was absent from the data the
        # encoder was fitted on (possible in a CV fold or the test split) is
        # encoded as all zeros instead of raising at transform time.
        ("ohe", OneHotEncoder(handle_unknown="ignore"), ohe_columns),
        ("scaler", StandardScaler(), scaler_columns),
    ],
    remainder="drop",
)

In [6]:
# Features are every column except the target; the target itself is modelled
# as log(1 + price).
X = df.drop(columns=["price"])
y = np.log1p(df["price"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Baseline model: ordinary least squares on the preprocessed features.
# The estimator step is called "regressor" to match the other pipelines in
# this notebook (it was previously mislabelled "classifier").
linear_regressor_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
)
linear_regressor_pipeline.fit(X_train, y_train)

# Predict once on the held-out set and reuse the result for both error metrics.
linear_predictions = linear_regressor_pipeline.predict(X_test)

# The target is log1p(price), so the MAE / MSE below are in log space.
print(
    "linear_regressor_pipeline score: %.3f"
    % linear_regressor_pipeline.score(X_test, y_test)
)
print(mean_absolute_error(y_test, linear_predictions))
print(mean_squared_error(y_test, linear_predictions))

linear_regressor_pipeline score: 0.657
0.09118220722687115
0.015450518626862335


In [8]:
# k-nearest-neighbours regressor with its default hyper-parameters, fed by the
# shared preprocessing step.
knn_steps = [
    ("preprocessor", preprocessor),
    ("regressor", KNeighborsRegressor()),
]
knn_regressor_pipeline = Pipeline(steps=knn_steps)
knn_regressor_pipeline.fit(X_train, y_train)

# Held-out predictions, kept in a variable so both metrics read the same array.
knn_predictions = knn_regressor_pipeline.predict(X_test)

print("knn regressor score: %.3f" % knn_regressor_pipeline.score(X_test, y_test))
print(mean_absolute_error(y_test, knn_predictions))
print(mean_squared_error(y_test, knn_predictions))


knn regressor score: 0.702
0.07766918907139964
0.013430221052258428


In [9]:
# Ridge regression with alpha tuned by 5-fold cross-validation over ten evenly
# spaced values in [2, 3].
# NOTE(review): the recorded run selected alpha=3.0, the upper edge of this
# grid, so the optimum may lie outside the searched range - consider widening.
grid_ridge_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", Ridge())]
)
grid_ridge = GridSearchCV(
    grid_ridge_pipeline,
    param_grid={"regressor__alpha": np.linspace(2, 3, 10)},
    cv=5,
)
grid_ridge.fit(X_train, y_train)

# Predict once with the refitted best estimator and reuse for both metrics.
ridge_predictions = grid_ridge.predict(X_test)

print("grid_ridge score: %.3f" % grid_ridge.score(X_test, y_test))
# sklearn metrics take (y_true, y_pred); keep that order so the calls stay
# correct if a non-symmetric metric is substituted later.
print(mean_absolute_error(y_test, ridge_predictions))
print(mean_squared_error(y_test, ridge_predictions))
print(grid_ridge.best_params_)


grid_ridge score: 0.657
0.09108184314941824
0.015418814327497121
{'regressor__alpha': 3.0}


In [10]:
# k-nearest-neighbours regressor tuned by 5-fold cross-validation over the
# neighbourhood size, the vote weighting and the Minkowski exponent p.
grid_knn_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", KNeighborsRegressor()),
    ]
)
grid_knn = GridSearchCV(
    grid_knn_pipeline,
    param_grid={
        "regressor__n_neighbors": [3, 5, 10, 15, 20, 30, 40, 50, 60, 90, 100],
        "regressor__weights": ["uniform", "distance"],
        "regressor__p": [1, 2, 3],
    },
    cv=5,
)
grid_knn.fit(X_train, y_train)

# Predict once with the refitted best estimator and reuse for both metrics.
grid_knn_predictions = grid_knn.predict(X_test)

print("grid_knn model score: %.3f" % grid_knn.score(X_test, y_test))
# sklearn metrics take (y_true, y_pred); keep that order so the calls stay
# correct if a non-symmetric metric is substituted later.
print(mean_absolute_error(y_test, grid_knn_predictions))
print(mean_squared_error(y_test, grid_knn_predictions))
print(grid_knn.best_params_)


grid_knn model score: 0.762
0.06365677507544258
0.010703260424223253
{'regressor__n_neighbors': 5, 'regressor__p': 1, 'regressor__weights': 'distance'}


In [11]:
# Lasso regression with alpha tuned by 5-fold cross-validation.
# np.linspace(1, 5, 10) already starts at 1, so no separate literal 1 is listed;
# a duplicate candidate would only add five redundant fits.
# NOTE(review): the recorded run selected alpha=0.01, the lower edge of this
# grid, so smaller values may do better - consider extending the grid downwards.
grid_lasso_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", Lasso())]
)
grid_lasso = GridSearchCV(
    grid_lasso_pipeline,
    param_grid={"regressor__alpha": [0.01, 0.1, *np.linspace(1, 5, 10)]},
    cv=5,
)
grid_lasso.fit(X_train, y_train)

# Predict once with the refitted best estimator and reuse for both metrics.
lasso_predictions = grid_lasso.predict(X_test)

print("grid lasso score: %.3f" % grid_lasso.score(X_test, y_test))
# sklearn metrics take (y_true, y_pred); keep that order so the calls stay
# correct if a non-symmetric metric is substituted later.
print(mean_absolute_error(y_test, lasso_predictions))
print(mean_squared_error(y_test, lasso_predictions))
print(grid_lasso.best_params_)


grid lasso score: 0.567
0.10149823951667605
0.019474712007525537
{'regressor__alpha': 0.01}


In [12]:
# Linear model trained by stochastic gradient descent, tuned by 5-fold
# cross-validation over regularisation strength, loss function and penalty.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs of this cell select the same parameters and print the same metrics.
grid_sgd_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", SGDRegressor(random_state=42)),
    ]
)
grid_sgd = GridSearchCV(
    grid_sgd_pipeline,
    param_grid={
        "regressor__alpha": [
            10 ** -6,
            10 ** -5,
            10 ** -4,
            10 ** -3,
            10 ** -2,
            0.1,
            1,
            1.5,
            2,
            3,
        ],
        # NOTE(review): newer scikit-learn releases spell "squared_loss" as
        # "squared_error"; update this string when upgrading the library.
        "regressor__loss": [
            "squared_loss",
            "huber",
            "epsilon_insensitive",
            "squared_epsilon_insensitive",
        ],
        "regressor__penalty": ["l1", "l2"],
    },
    cv=5,
)
grid_sgd.fit(X_train, y_train)

# Predict once with the refitted best estimator and reuse for both metrics.
sgd_predictions = grid_sgd.predict(X_test)

print("grid sgd score: %.3f" % grid_sgd.score(X_test, y_test))
# sklearn metrics take (y_true, y_pred); keep that order so the calls stay
# correct if a non-symmetric metric is substituted later.
print(mean_absolute_error(y_test, sgd_predictions))
print(mean_squared_error(y_test, sgd_predictions))
print(grid_sgd.best_params_)


grid sgd score: 0.622
0.09323711306349952
0.017023095497541656
{'regressor__alpha': 0.1, 'regressor__loss': 'epsilon_insensitive', 'regressor__penalty': 'l2'}


In [13]:
# Snapshot of the metrics printed by the model cells of this notebook, copied
# by hand from the recorded run so they can be consulted without refitting.
# "score" is the value printed by .score() (three decimals); "MAE" and "MSE"
# are the two error values printed underneath it.
saved_results = {
    "linear_regressor": {
        "score": 0.657,
        "MAE": 0.09118220722687115,
        "MSE": 0.015450518626862335,
    },
    "knn_regressor": {
        "score": 0.702,
        "MAE": 0.07766918907139964,
        "MSE": 0.013430221052258428,
    },
    "grid_ridge": {
        "score": 0.657,
        "MAE": 0.09108184314941824,
        "MSE": 0.015418814327497121,
    },
    "grid_knn": {
        "score": 0.762,
        "MAE": 0.06365677507544258,
        "MSE": 0.010703260424223253,
    },
    "grid_lasso": {
        "score": 0.567,
        "MAE": 0.10149823951667605,
        "MSE": 0.019474712007525537,
    },
    "grid_sgd": {
        "score": 0.622,
        "MAE": 0.09323711306349952,
        "MSE": 0.017023095497541656,
    },
}

# Best hyper-parameters reported by each grid search in the recorded run.
# NOTE(review): the grid searches printed every key with a "regressor__"
# prefix; only the grid_knn entry keeps that prefix here, the other three
# entries store the bare estimator parameter names.
best_params = {
    "grid_ridge": {"alpha": 3.0},
    "grid_knn": {
        "regressor__n_neighbors": 5,
        "regressor__p": 1,
        "regressor__weights": "distance",
    },
    "grid_lasso": {"alpha": 0.01},
    "grid_sgd": {
        "alpha": 0.1,
        "loss": "epsilon_insensitive",
        "penalty": "l2",
    },
}