In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pylab import rcParams
rcParams['figure.figsize'] = (14, 10)

In [2]:
import typing
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]).mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    return np.median(np.abs(y_pred-y_true)/y_true)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}

EPS = 1e-8
assert deviation_metric(np.array([1,2,3,4,5]),np.array([1,2,3,4,5])) <= EPS
assert deviation_metric(np.array([1,2,3,4,5]),np.array([0.9,1.8,2.7,3.6,4.5])) <= EPS
assert deviation_metric(np.array([1,2,3,4,5]),np.array([1.1,2.2,3.3,4.4,5.5])) <= EPS
assert deviation_metric(np.array([1,2,3,4,5]),np.array([1.15,2.3,3.45,4.6,5.75])) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([1.3,2.6,3.9,5.2,6.5]))-1) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([0.7,1.4,2.1,2.8,3.5]))-1*NEGATIVE_WEIGHT) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([10,20,30,40,50]))-9) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([0,0,0,0,0]))-9*NEGATIVE_WEIGHT) <= EPS
assert np.abs(deviation_metric(np.array([1,2,3,4,5]),np.array([1,2.2,3.3,5,50])) - 85/45) <= EPS

In [3]:
RANDOM_SEED = 26
pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 80)

In [4]:
train_df = pd.read_csv('../data/train.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
print(train_df.shape)
train_df.sample(5)

(279792, 77)


Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,osm_catering_points_in_0.001,osm_catering_points_in_0.005,osm_catering_points_in_0.0075,osm_catering_points_in_0.01,osm_city_closest_dist,osm_city_nearest_name,osm_city_nearest_population,osm_crossing_closest_dist,osm_crossing_points_in_0.001,osm_crossing_points_in_0.005,osm_crossing_points_in_0.0075,osm_crossing_points_in_0.01,osm_culture_points_in_0.001,osm_culture_points_in_0.005,osm_culture_points_in_0.0075,osm_culture_points_in_0.01,osm_finance_points_in_0.001,osm_finance_points_in_0.005,osm_finance_points_in_0.0075,osm_finance_points_in_0.01,osm_healthcare_points_in_0.005,osm_healthcare_points_in_0.0075,osm_healthcare_points_in_0.01,osm_historic_points_in_0.005,osm_historic_points_in_0.0075,osm_historic_points_in_0.01,osm_hotels_points_in_0.005,osm_hotels_points_in_0.0075,osm_hotels_points_in_0.01,osm_leisure_points_in_0.005,osm_leisure_points_in_0.0075,osm_leisure_points_in_0.01,osm_offices_points_in_0.001,osm_offices_points_in_0.005,osm_offices_points_in_0.0075,osm_offices_points_in_0.01,osm_shops_points_in_0.001,osm_shops_points_in_0.005,osm_shops_points_in_0.0075,osm_shops_points_in_0.01,osm_subway_closest_dist,osm_train_stop_closest_dist,osm_train_stop_points_in_0.005,osm_train_stop_points_in_0.0075,osm_train_stop_points_in_0.01,osm_transport_stop_closest_dist,osm_transport_stop_points_in_0.005,osm_transport_stop_points_in_0.0075,osm_transport_stop_points_in_0.01,per_square_meter_price,reform_count_of_houses_1000,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,date,realty_type,price_type
9244,Москва,1.0,COL_9453,55.857143,37.558857,6,39,73,127,0,1,2,4,4,10,14,18,9.006,Долгопрудный,98788.0,0.078062,1,17,35,48,0,0,0,0,0,2,2,2,4,8,18,0,1,1,0,0,1,0,0,3,1,2,5,14,4,24,48,84,0.168722,0.862851,0,0,2,0.100717,13,25,35,902441.860465,78,12,3199.0,537.0,14.205128,13.833333,1989.782051,1986.75,Москва,43.0,S7194,2020-01-12,100,0
221798,Ермекеево,,COL_226641,54.076977,53.67011,0,1,1,1,0,0,0,0,0,0,0,0,127.542367,Альметьевск,151157.0,19.061062,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,243.14709,37.177882,0,0,0,4.079842,0,0,0,12416.666667,21,1,90.0,2.0,2.0,2.0,1978.095238,1973.0,Башкортостан,12.0,S20900,2020-07-12,110,0
247014,Стерлитамак,,COL_252331,53.5857,55.93389,0,11,13,13,0,0,0,0,0,1,1,1,5.331185,Стерлитамак,279692.0,1.578656,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,2,2,2,0,0,0,0,0,5,7,7,454.23851,97.080105,0,0,0,0.938855,0,0,2,2647.5,9,7,155.0,129.0,8.111111,8.571429,1996.555556,1998.0,Башкортостан,5600.0,S6517,2020-08-02,10,0
44118,Новосибирск,1.0,COL_45143,55.024925,82.95606,1,14,17,30,0,0,0,2,0,1,1,2,2.111045,Новосибирск,1625600.0,0.624388,0,0,3,5,0,0,0,0,0,0,0,0,2,2,2,0,0,1,0,1,1,2,2,4,0,1,1,2,1,9,9,16,1.266559,2.281024,0,0,0,0.281842,3,10,13,63196.894848,70,6,1018.0,136.0,7.014493,16.833333,1973.642857,2004.833333,Новосибирская область,1417.0,S20324,2020-02-09,110,0
40157,Томск,,COL_41096,56.460358,84.98654,2,42,58,80,0,0,1,1,0,5,9,13,3.790106,Томск,576624.0,0.073114,1,15,25,33,0,4,4,4,0,2,2,5,2,2,4,3,3,4,2,3,4,2,4,4,0,1,3,6,2,24,31,38,202.144856,0.295158,2,2,2,0.126337,13,14,25,107783.333333,126,58,1915.0,734.0,5.853211,4.641509,1971.842593,1966.055556,Томская область,60.0,S7495,2020-02-02,10,0


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279792 entries, 0 to 279791
Data columns (total 77 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   city                                 279792 non-null  object 
 1   floor                                103555 non-null  object 
 2   id                                   279792 non-null  object 
 3   lat                                  279792 non-null  float64
 4   lng                                  279792 non-null  float64
 5   osm_amenity_points_in_0.001          279792 non-null  int64  
 6   osm_amenity_points_in_0.005          279792 non-null  int64  
 7   osm_amenity_points_in_0.0075         279792 non-null  int64  
 8   osm_amenity_points_in_0.01           279792 non-null  int64  
 9   osm_building_points_in_0.001         279792 non-null  int64  
 10  osm_building_points_in_0.005         279792 non-null  int64  
 11  osm_building_

In [7]:
train_df.describe()

Unnamed: 0,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,osm_catering_points_in_0.001,osm_catering_points_in_0.005,osm_catering_points_in_0.0075,osm_catering_points_in_0.01,osm_city_closest_dist,osm_city_nearest_population,osm_crossing_closest_dist,osm_crossing_points_in_0.001,osm_crossing_points_in_0.005,osm_crossing_points_in_0.0075,osm_crossing_points_in_0.01,osm_culture_points_in_0.001,osm_culture_points_in_0.005,osm_culture_points_in_0.0075,osm_culture_points_in_0.01,osm_finance_points_in_0.001,osm_finance_points_in_0.005,osm_finance_points_in_0.0075,osm_finance_points_in_0.01,osm_healthcare_points_in_0.005,osm_healthcare_points_in_0.0075,osm_healthcare_points_in_0.01,osm_historic_points_in_0.005,osm_historic_points_in_0.0075,osm_historic_points_in_0.01,osm_hotels_points_in_0.005,osm_hotels_points_in_0.0075,osm_hotels_points_in_0.01,osm_leisure_points_in_0.005,osm_leisure_points_in_0.0075,osm_leisure_points_in_0.01,osm_offices_points_in_0.001,osm_offices_points_in_0.005,osm_offices_points_in_0.0075,osm_offices_points_in_0.01,osm_shops_points_in_0.001,osm_shops_points_in_0.005,osm_shops_points_in_0.0075,osm_shops_points_in_0.01,osm_subway_closest_dist,osm_train_stop_closest_dist,osm_train_stop_points_in_0.005,osm_train_stop_points_in_0.0075,osm_train_stop_points_in_0.01,osm_transport_stop_closest_dist,osm_transport_stop_points_in_0.005,osm_transport_stop_points_in_0.0075,osm_transport_stop_points_in_0.01,per_square_meter_price,reform_count_of_houses_1000,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,total_square,realty_type,price_type
count,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279737.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,279792.0,265196.0,252558.0,263084.0,249624.0,263553.0,250155.0,279792.0,279792.0,279792.0
mean,54.364078,47.76354,2.709084,40.605146,81.596171,133.285458,0.037442,0.885701,2.046467,3.748163,0.614414,8.81097,17.408561,28.341454,15.516565,2584430.0,0.519421,0.737344,14.097683,29.307475,48.71462,0.071857,1.487837,3.258067,5.555917,0.145883,2.050423,4.041327,6.442375,2.030673,4.152703,6.836268,1.681188,3.679873,6.225678,1.042767,2.143657,3.555641,2.022713,4.188147,7.047953,0.239188,3.109102,6.216915,10.21709,1.620772,22.637438,44.802353,72.46694,170.93094,9.604875,0.078637,0.19636,0.347601,0.575846,6.262863,12.951189,21.61852,110195.5,100.453691,30.110661,2042.541716,644.610557,7.051233,7.360464,1967.532599,1967.98858,507.833604,54.974088,0.016058
std,4.245713,17.044625,4.202451,53.293388,105.193169,172.290136,0.391014,6.858338,14.801566,25.679859,1.695269,17.713251,33.267316,53.790038,34.755675,4110835.0,4.011926,1.386548,15.136481,30.23217,49.24749,0.37292,4.407199,8.815882,13.994591,0.457838,3.371304,6.11108,9.351825,3.331527,6.429165,10.338075,3.908801,7.823028,12.736713,3.312283,6.422509,10.342464,3.042804,5.686815,9.094224,0.737926,5.51767,10.720621,17.458145,2.659499,28.428653,54.182739,86.560839,217.64173,26.602806,0.530265,0.913866,1.310374,4.596875,5.559396,10.206379,16.263372,172050.6,85.488575,27.686234,1359.884747,445.699329,3.542084,4.231369,45.807699,54.110015,1704.251771,47.856417,0.1257
min,42.651897,19.892178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000802,44389.0,0.000223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001062,0.001644,0.0,0.0,0.0,0.000475,0.0,0.0,0.0,389.6104,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.1,10.0,0.0
25%,53.2266,37.582988,0.0,7.0,16.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.554834,262250.0,0.071507,0.0,3.0,7.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,4.0,9.0,16.0,1.245974,1.342095,0.0,0.0,0.0,0.099268,2.0,5.0,9.0,29701.49,39.0,10.0,932.0,290.0,4.591837,4.619959,1960.07,1959.890097,65.9,10.0,0.0
50%,55.67909,39.702435,1.0,22.0,46.0,77.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,9.0,5.32942,1013468.0,0.135857,0.0,10.0,20.0,35.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,3.0,1.0,2.0,3.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,3.0,0.0,1.0,2.0,4.0,1.0,13.0,27.0,45.0,60.636113,2.727866,0.0,0.0,0.0,0.176285,5.0,11.0,19.0,59710.68,86.0,25.0,1949.0,602.0,6.368932,6.395349,1970.890411,1971.647059,128.737034,10.0,0.0
75%,56.306976,55.957523,4.0,51.0,101.0,164.0,0.0,0.0,1.0,2.0,0.0,8.0,16.0,26.0,10.74514,1468833.0,0.275292,1.0,20.0,41.0,68.0,0.0,1.0,2.0,5.0,0.0,3.0,5.0,8.0,3.0,5.0,8.0,2.0,4.0,6.0,1.0,2.0,2.0,3.0,6.0,10.0,0.0,4.0,7.0,11.0,2.0,30.0,59.0,96.0,325.071715,5.825944,0.0,0.0,0.0,0.313665,9.0,19.0,32.0,119300.0,140.0,43.0,2978.0,936.0,8.698925,9.1,1983.701754,1986.95,336.0,110.0,0.0
max,69.50074,151.777,46.0,468.0,851.0,1392.0,30.0,586.0,949.0,1162.0,23.0,169.0,323.0,519.0,1107.810806,12630290.0,1072.660533,19.0,135.0,198.0,267.0,20.0,162.0,265.0,336.0,6.0,28.0,42.0,60.0,30.0,56.0,78.0,53.0,89.0,129.0,80.0,113.0,159.0,35.0,75.0,106.0,17.0,63.0,127.0,178.0,40.0,309.0,509.0,809.0,2925.919581,1321.700776,29.0,31.0,31.0,1006.664769,58.0,78.0,109.0,1990000.0,733.0,289.0,18392.0,6105.0,53.717949,221.666667,2019.0,2020.0,40000.0,110.0,1.0


In [8]:
train_df.isnull().sum().sort_values(ascending=False)

floor                                  176237
reform_mean_floor_count_500             30168
reform_mean_year_building_500           29637
reform_house_population_500             27234
reform_mean_floor_count_1000            16708
reform_mean_year_building_1000          16239
reform_house_population_1000            14596
street                                   1606
osm_city_nearest_population                55
price_type                                  0
osm_culture_points_in_0.001                 0
osm_crossing_closest_dist                   0
osm_crossing_points_in_0.001                0
osm_crossing_points_in_0.005                0
osm_crossing_points_in_0.0075               0
osm_crossing_points_in_0.01                 0
osm_finance_points_in_0.005                 0
osm_culture_points_in_0.005                 0
osm_culture_points_in_0.0075                0
osm_culture_points_in_0.01                  0
osm_finance_points_in_0.001                 0
osm_city_closest_dist             

In [9]:
def plot_target_dist(train_df):
    _, ax = plt.subplots(4, 1)
    sns.distplot(train_df['per_square_meter_price'], ax=ax[0])
    sns.distplot(train_df['per_square_meter_price'].apply(np.log), ax=ax[1]);
    sns.boxplot(train_df['per_square_meter_price'], ax=ax[2])
    sns.boxplot(train_df['per_square_meter_price'].apply(np.log), ax=ax[3]);
    plt.show();

In [10]:
import geopy.distance
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import geopy.distance
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import typing

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.exceptions import NotFittedError


class SmoothedTargetEncoding(BaseEstimator,TransformerMixin):
    """Регуляризованный таргет энкодинг.

    :param categorical_features: список из столбцов с категориальными признаками, которые нужно заэнкодить
    :param alpha: параметр регуляризации
    """

    def __init__(self, categorical_features: typing.List[str], alpha: float = 50.0):
        self.__is_fitted = False
        self.categorical_features = categorical_features
        self.alpha = alpha
        self.mean_price = None
        self.mean_price_by_cat = {}
        self.encoded_preffix = "encoded_"
        self.target = 'per_square_meter_price'

    def smoothed_target_encoding(self, y: pd.Series) -> pd.Series:
        """Реализация регуляризованного таргед энкодинга.

        Принцип такой - чем меньше исходных данных, тем сильнее будет регуляризация
        Параметр регуляризации регуляризует мин. кол-во необходимых данных
        :param y: pd.Series с ценой
        :return: pd.Series с регуляризованной ценой
        """
        nrows = y.notnull().sum()
        return (y.mean() * nrows + self.alpha * self.mean_price) / (nrows + self.alpha)

    def fit(self, X: pd.DataFrame, y: typing.Union[np.array, pd.Series] = None):
        """На основе обучающей выборки запоминает средние цены в разрезе категорий.

        :param X: pd.DataFrame, обучающая выборка
        :param y: target
        :return:
        """
        X[self.target] = y
        self.mean_price = X[self.target].mean()
        for col in self.categorical_features:
            self.mean_price_by_cat[col] = (
                X.groupby(col)[self.target].apply(lambda x: self.smoothed_target_encoding(x)).fillna(self.mean_price)
            )

        X.drop(self.target, axis=1, inplace=True)
        self.__is_fitted = True
        return self

    def transform(self, X: pd.DataFrame, y: typing.Union[np.array, pd.Series] = None):
        """Применение регуляризованного таргет энкодинга.

        :param X: pd.DataFrame, обучающая выборка
        :return:
        """
        X_cp = X.copy()
        if self.__is_fitted:
            encoded_cols = []
            for col in self.categorical_features:
                new_col = self.encoded_preffix + col
                X_cp[new_col] = X_cp[col].map(self.mean_price_by_cat[col]).fillna(self.mean_price)
                encoded_cols.append(new_col)
            return X_cp
        else:
            raise NotFittedError(
                "This {} instance is not fitted yet. Call 'fit' with appropriate arguments before using this transformer".format(
                    type(self).__name__
                )
            )

In [11]:
CITIES_MILLIONS = ['Москва', 'Санкт-Петербург', 'Новосибирск', 'Екатеринбург', 'Казань',
                   'Нижний Новгород', 'Челябинск', 'Самара', 'Омск', 'Ростов-на-Дону',
                   'Уфа', 'Красноярск', 'Воронеж', 'Пермь', 'Волгоград']
CITIES_MILLIONS = [city.lower() for city in CITIES_MILLIONS]

In [12]:
train_df['city_million'] = train_df['city'].apply(lambda x: '1' if x.strip().lower() in CITIES_MILLIONS else '0')
train_df['near_the_city_million'] = train_df['osm_city_nearest_name'].apply(lambda x: '1' if x.strip().lower() in CITIES_MILLIONS else '0')
train_df.loc[train_df['city_million'] == '1', 'near_the_city_million'] = '1'

In [13]:
train_df = train_df.sort_values('date')
train_df = train_df.drop(['date'], axis=1)
train_target = train_df.pop('per_square_meter_price')
X_train, X_test, y_train, y_test = train_test_split(train_df, train_target, shuffle=False, test_size=.2)

In [14]:
MY_CAT_FEATURES = ['region', 'city', 'realty_type', 'floor']#, 'near_the_city_million', 'city_million']
ENCODED_NUM_CAT_FEATURES = [f'encoded_{feature}' for feature in MY_CAT_FEATURES]
ENCODED_NUM_CAT_FEATURES

['encoded_region', 'encoded_city', 'encoded_realty_type', 'encoded_floor']

In [15]:
CENTER_MSK_LAT = 55.751663
CENTER_MSK_LNG = 37.618937

In [16]:
class DataPreparation(TransformerMixin):
    def __init__(self, features=None):
        self.columns = features
        
    def clean_floor(self, data):
        # почистим признак floor
        data['floor'] = data['floor'].mask(data['floor'] == '-1.0', -1) \
              .mask(data['floor'] == '-2.0', -2) \
              .mask(data['floor'] == '-3.0', -3) \
              .mask(data['floor'] == 'подвал, 1', 1) \
              .mask(data['floor'] == 'подвал', -1) \
              .mask(data['floor'] == 'цоколь, 1', 1) \
              .mask(data['floor'] == '1,2,антресоль', 1) \
              .mask(data['floor'] == 'цоколь', 0) \
              .mask(data['floor'] == 'тех.этаж (6)', 6) \
              .mask(data['floor'] == 'Подвал', -1) \
              .mask(data['floor'] == 'Цоколь', 0) \
              .mask(data['floor'] == 'фактически на уровне 1 этажа', 1) \
              .mask(data['floor'] == '1,2,3', 1) \
              .mask(data['floor'] == '1, подвал', 1) \
              .mask(data['floor'] == '1,2,3,4', 1) \
              .mask(data['floor'] == '1,2', 1) \
              .mask(data['floor'] == '1,2,3,4,5', 1) \
              .mask(data['floor'] == '5, мансарда', 5) \
              .mask(data['floor'] == '1-й, подвал', 1) \
              .mask(data['floor'] == '1, подвал, антресоль', 1) \
              .mask(data['floor'] == 'мезонин', 2) \
              .mask(data['floor'] == 'подвал, 1-3', 1) \
              .mask(data['floor'] == '1 (Цокольный этаж)', 0) \
              .mask(data['floor'] == '3, Мансарда (4 эт)', 3) \
              .mask(data['floor'] == 'подвал,1', 1) \
              .mask(data['floor'] == '1, антресоль', 1) \
              .mask(data['floor'] == '1-3', 1) \
              .mask(data['floor'] == 'мансарда (4эт)', 4) \
              .mask(data['floor'] == '1, 2.', 1) \
              .mask(data['floor'] == 'подвал , 1 ', 1) \
              .mask(data['floor'] == '1, 2', 1) \
              .mask(data['floor'] == 'подвал, 1,2,3', 1) \
              .mask(data['floor'] == '1 + подвал (без отделки)', 1) \
              .mask(data['floor'] == 'мансарда', 3) \
              .mask(data['floor'] == '2,3', 2) \
              .mask(data['floor'] == '4, 5', 4) \
              .mask(data['floor'] == '1-й, 2-й', 1) \
              .mask(data['floor'] == '1 этаж, подвал', 1) \
              .mask(data['floor'] == '1, цоколь', 1) \
              .mask(data['floor'] == 'подвал, 1-7, техэтаж', 1) \
              .mask(data['floor'] == '3 (антресоль)', 3) \
              .mask(data['floor'] == '1, 2, 3', 1) \
              .mask(data['floor'] == 'Цоколь, 1,2(мансарда)', 1) \
              .mask(data['floor'] == 'подвал, 3. 4 этаж', 3) \
              .mask(data['floor'] == 'подвал, 1-4 этаж', 1) \
              .mask(data['floor'] == 'подва, 1.2 этаж', 1) \
              .mask(data['floor'] == '2, 3', 2) \
              .mask(data['floor'] == '7,8', 7) \
              .mask(data['floor'] == '1 этаж', 1) \
              .mask(data['floor'] == '1-й', 1) \
              .mask(data['floor'] == '3 этаж', 3) \
              .mask(data['floor'] == '4 этаж', 4) \
              .mask(data['floor'] == '5 этаж', 5) \
              .mask(data['floor'] == 'подвал,1,2,3,4,5', 1) \
              .mask(data['floor'] == 'подвал, цоколь, 1 этаж', 1) \
              .mask(data['floor'] == '3, мансарда', 3) \
              .mask(data['floor'] == 'цоколь, 1, 2,3,4,5,6', 1) \
              .mask(data['floor'] == ' 1, 2, Антресоль', 1) \
              .mask(data['floor'] == '3 этаж, мансарда (4 этаж)', 3) \
              .mask(data['floor'] == 'цокольный', 0) \
              .mask(data['floor'] == '1,2 ', 1) \
              .mask(data['floor'] == '3,4', 3) \
              .mask(data['floor'] == 'подвал, 1 и 4 этаж', 1) \
              .mask(data['floor'] == '5(мансарда)', 5) \
              .mask(data['floor'] == 'технический этаж,5,6', 5) \
              .mask(data['floor'] == ' 1-2, подвальный', 1) \
              .mask(data['floor'] == '1, 2, 3, мансардный', 1) \
              .mask(data['floor'] == 'подвал, 1, 2, 3', 1) \
              .mask(data['floor'] == '1,2,3, антресоль, технический этаж', 1) \
              .mask(data['floor'] == '3, 4', 3) \
              .mask(data['floor'] == '1-3 этажи, цоколь (188,4 кв.м), подвал (104 кв.м)', 1) \
              .mask(data['floor'] == '1,2,3,4, подвал', 1) \
              .mask(data['floor'] == '2-й', 2) \
              .mask(data['floor'] == '1, 2 этаж', 1) \
              .mask(data['floor'] == 'подвал, 1, 2', 1) \
              .mask(data['floor'] == '1-7', 1) \
              .mask(data['floor'] == '1 (по док-м цоколь)', 1) \
              .mask(data['floor'] == '1,2,подвал ', 1) \
              .mask(data['floor'] == 'подвал, 2', 2) \
              .mask(data['floor'] == 'подвал,1,2,3', 1) \
              .mask(data['floor'] == '1,2,3 этаж, подвал ', 1) \
              .mask(data['floor'] == '1,2,3 этаж, подвал', 1) \
              .mask(data['floor'] == '2, 3, 4, тех.этаж', 2) \
              .mask(data['floor'] == 'цокольный, 1,2', 1) \
              .mask(data['floor'] == 'Техническое подполье', -1) \
              .mask(data['floor'] == '1.2', 1) \
              .astype(str)
        return data
    
    def fit(self, df, y=None):
        df = df.copy()
        if self.columns is None:
            cols = list(df.columns) #+ ['city_million', 'near_the_city_million']
            self.columns = ['distance_from_moscow_center'] + cols
        df['distance_from_moscow_center'] = df.apply(
            lambda x: geopy.distance.distance((x['lat'], x['lng']), (CENTER_MSK_LAT, CENTER_MSK_LNG)).km, axis=1)
        self.means = dict(df[train_df.select_dtypes(exclude='object').columns].mean())
        return self
    
    def transform(self, df, y=None):
        df = df.copy()
        #df['city_million'] = df['city'].apply(lambda x: 1 if x.strip().lower() in CITIES_MILLIONS else 0)
        #df['near_the_city_million'] = df['osm_city_nearest_name'].apply(lambda x: 1 if x.strip().lower() in CITIES_MILLIONS else 0)
        #df.loc[df['city_million'] == 1, 'near_the_city_million'] = 1
        df['distance_from_moscow_center'] = df.apply(
            lambda x: geopy.distance.distance((x['lat'], x['lng']), (CENTER_MSK_LAT, CENTER_MSK_LNG)).km, axis=1)
        df = df[self.columns]
        df.loc[:, df.select_dtypes(include='object').columns] = df[df.select_dtypes(include='object').columns].applymap(str)
        df = self.clean_floor(df)
        df[df.select_dtypes(exclude='object').columns].fillna(self.means, inplace=True)
#         df['sub_square'] = df['osm_subway_closest_dist'] * df['total_square']
        df.fillna('nan', inplace=True)
        return df

In [17]:
for en_num in ENCODED_NUM_CAT_FEATURES:
    train_df[en_num] = 0

In [18]:
import geopy.distance
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import typing

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.exceptions import NotFittedError


class SmoothedTargetEncoding(BaseEstimator, TransformerMixin):
    """Регуляризованный таргет энкодинг.

    :param categorical_features: список из столбцов с категориальными признаками, которые нужно заэнкодить
    :param alpha: параметр регуляризации
    """

    def __init__(self, categorical_features: typing.List[str], alpha: float = 50.0):
        self.__is_fitted = False
        self.categorical_features = categorical_features
        self.alpha = alpha
        self.mean_price = None
        self.mean_price_by_cat = {}
        self.encoded_preffix = "encoded_"
        self.target = 'per_square_meter_price'

    def smoothed_target_encoding(self, y: pd.Series) -> pd.Series:
        """Реализация регуляризованного таргед энкодинга.

        Принцип такой - чем меньше исходных данных, тем сильнее будет регуляризация
        Параметр регуляризации регуляризует мин. кол-во необходимых данных
        :param y: pd.Series с ценой
        :return: pd.Series с регуляризованной ценой
        """
        nrows = y.notnull().sum()
        return (y.mean() * nrows + self.alpha * self.mean_price) / (nrows + self.alpha)

    def fit(self, X: pd.DataFrame, y: typing.Union[np.array, pd.Series] = None):
        """На основе обучающей выборки запоминает средние цены в разрезе категорий.

        :param X: pd.DataFrame, обучающая выборка
        :param y: target
        :return:
        """
        X[self.target] = y
        self.mean_price = X[self.target].mean()
        for col in self.categorical_features:
            self.mean_price_by_cat[col] = (
                X.groupby(col)[self.target].apply(lambda x: self.smoothed_target_encoding(x)).fillna(self.mean_price)
            )

        X.drop(self.target, axis=1, inplace=True)
        self.__is_fitted = True
        return self

    def transform(self, X: pd.DataFrame, y: typing.Union[np.array, pd.Series] = None):
        """Применение регуляризованного таргет энкодинга.

        :param X: pd.DataFrame, обучающая выборка
        :return:
        """
        X_cp = X.copy()
        if self.__is_fitted:
            encoded_cols = []
            for col in self.categorical_features:
                new_col = self.encoded_preffix + col
                X_cp[new_col] = X_cp[col].map(self.mean_price_by_cat[col]).fillna(self.mean_price)
                encoded_cols.append(new_col)
            return X_cp
        else:
            raise NotFittedError(
                "This {} instance is not fitted yet. Call 'fit' with appropriate arguments before using this transformer".format(
                    type(self).__name__
                )
            )

In [19]:
from catboost import CatBoostRegressor

In [111]:
pipeline = Pipeline([
    ('smoothed_target_encoder', SmoothedTargetEncoding(MY_CAT_FEATURES, 30)),
    ('preparation', DataPreparation()),
    ('model', CatBoostRegressor(
        num_trees=500,
        loss_function='MAPE',
        per_float_feature_quantization=['79:border_count=2048', '72:border_count=1024'], #79, 72
        verbose=True,
        random_state=RANDOM_SEED,
        cat_features=list(train_df.select_dtypes('object').columns),
        early_stopping_rounds=3,
    ))
])

In [112]:
pipeline.fit(X_train[X_train['price_type'] == 1], np.log(y_train[X_train['price_type'] == 1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


0:	learn: 0.0369762	total: 32.1ms	remaining: 16s
1:	learn: 0.0364162	total: 64.2ms	remaining: 16s
2:	learn: 0.0358652	total: 93.8ms	remaining: 15.5s
3:	learn: 0.0353762	total: 124ms	remaining: 15.3s
4:	learn: 0.0348186	total: 155ms	remaining: 15.3s
5:	learn: 0.0343356	total: 184ms	remaining: 15.2s
6:	learn: 0.0338307	total: 214ms	remaining: 15.1s
7:	learn: 0.0334184	total: 229ms	remaining: 14.1s
8:	learn: 0.0329694	total: 258ms	remaining: 14.1s
9:	learn: 0.0325119	total: 287ms	remaining: 14.1s
10:	learn: 0.0320539	total: 316ms	remaining: 14.1s
11:	learn: 0.0316200	total: 346ms	remaining: 14.1s
12:	learn: 0.0311774	total: 376ms	remaining: 14.1s
13:	learn: 0.0307683	total: 407ms	remaining: 14.1s
14:	learn: 0.0303586	total: 437ms	remaining: 14.1s
15:	learn: 0.0299511	total: 467ms	remaining: 14.1s
16:	learn: 0.0295625	total: 497ms	remaining: 14.1s
17:	learn: 0.0291955	total: 528ms	remaining: 14.2s
18:	learn: 0.0288567	total: 561ms	remaining: 14.2s
19:	learn: 0.0285190	total: 591ms	remainin

Pipeline(steps=[('smoothed_target_encoder',
                 SmoothedTargetEncoding(alpha=30,
                                        categorical_features=['region', 'city',
                                                              'realty_type',
                                                              'floor'])),
                ('preparation',
                 <__main__.DataPreparation object at 0x7f5748bd6be0>),
                ('model',
                 <catboost.core.CatBoostRegressor object at 0x7f5748bd6e10>)])

In [113]:
pred = pipeline.predict(X_test[X_test['price_type'] == 1])
pred_exp = np.exp(pred)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [99]:
deviation_metric(y_test[X_test['price_type'] == 1].values, pred_exp * 0.9)

1.2688524350252495

In [114]:
deviation_metric(y_test[X_test['price_type'] == 1].values, pred_exp * 0.9)

1.2688524350252495

In [47]:
pd.DataFrame({'name': pipeline['model'].feature_names_, 'importance': pipeline['model'].get_feature_importance()})\
.sort_values('importance', ascending=False)

Unnamed: 0,name,importance
79,encoded_city,17.149620
72,total_square,10.329937
1,city,5.482423
19,osm_city_nearest_name,5.241857
17,osm_catering_points_in_0.01,4.351328
...,...,...
56,osm_train_stop_points_in_0.005,0.035793
77,near_the_city_million,0.027893
10,osm_building_points_in_0.001,0.024836
75,price_type,0.000000


# Сабмит

In [None]:
results = {}
results['knn'] = pred_exp

In [100]:
test_df = pd.read_csv('../data/test.csv')

In [101]:
test_df = test_df.drop(['date'], axis=1)

In [102]:
pipeline = Pipeline([
    ('smoothed_target_encoder', SmoothedTargetEncoding(MY_CAT_FEATURES, 30)),
    ('preparation', DataPreparation()),
    ('model', CatBoostRegressor(
        num_trees=3000,
        loss_function='MAPE',
        per_float_feature_quantization=['79:border_count=2048', '72:border_count=1024'], #79, 72
        verbose=True,
        random_state=RANDOM_SEED,
        cat_features=list(train_df.select_dtypes('object').columns),
        early_stopping_rounds=3,
    ))
])
pipeline.fit(train_df[train_df['price_type'] == 1], np.log(train_target[train_df['price_type'] == 1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


0:	learn: 0.0367614	total: 30.9ms	remaining: 1m 32s
1:	learn: 0.0362210	total: 62.7ms	remaining: 1m 33s
2:	learn: 0.0357627	total: 93.3ms	remaining: 1m 33s
3:	learn: 0.0352618	total: 125ms	remaining: 1m 33s
4:	learn: 0.0347575	total: 158ms	remaining: 1m 34s
5:	learn: 0.0342791	total: 191ms	remaining: 1m 35s
6:	learn: 0.0337980	total: 225ms	remaining: 1m 36s
7:	learn: 0.0333257	total: 256ms	remaining: 1m 35s
8:	learn: 0.0328520	total: 287ms	remaining: 1m 35s
9:	learn: 0.0324561	total: 318ms	remaining: 1m 35s
10:	learn: 0.0320108	total: 350ms	remaining: 1m 35s
11:	learn: 0.0316119	total: 383ms	remaining: 1m 35s
12:	learn: 0.0312667	total: 416ms	remaining: 1m 35s
13:	learn: 0.0308501	total: 447ms	remaining: 1m 35s
14:	learn: 0.0304599	total: 479ms	remaining: 1m 35s
15:	learn: 0.0301009	total: 509ms	remaining: 1m 34s
16:	learn: 0.0297746	total: 542ms	remaining: 1m 35s
17:	learn: 0.0293917	total: 574ms	remaining: 1m 35s
18:	learn: 0.0290393	total: 604ms	remaining: 1m 34s
19:	learn: 0.028744

Pipeline(steps=[('smoothed_target_encoder',
                 SmoothedTargetEncoding(alpha=30,
                                        categorical_features=['region', 'city',
                                                              'realty_type',
                                                              'floor'])),
                ('preparation',
                 <__main__.DataPreparation object at 0x7f5744c1d080>),
                ('model',
                 <catboost.core.CatBoostRegressor object at 0x7f574f538588>)])

In [103]:
test_df['city_million'] = test_df['city'].apply(lambda x: '1' if x.strip().lower() in CITIES_MILLIONS else '0')
test_df['near_the_city_million'] = test_df['osm_city_nearest_name'].apply(lambda x: 1 if x.strip().lower() in CITIES_MILLIONS else '0')
test_df.loc[test_df['city_million'] == '1', 'near_the_city_million'] = '1'

In [104]:
pred_test = pipeline.predict(test_df)
pred_test_exp = np.exp(pred_test) * 0.9

In [105]:
pred_test_exp

array([42924.43358522, 41470.79918273, 38138.80510315, ...,
       36862.19370004, 40038.77466023, 46780.85141798])

In [106]:
pd.DataFrame({'id': test_df['id'], 'per_square_meter_price': pred_test_exp}).to_csv('catboost+lenafeat9+encoder_8.csv', index=False)