In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Suppress every warning so that cell outputs stay compact.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Use a larger default font size for all matplotlib figures.
matplotlib.rcParams.update({'font.size': 14})


Загрузка данных

In [2]:
# Read the training and test samples from CSV files located in the
# working directory; both files use a comma as the field separator.
train_df, test_df = (
    pd.read_csv(csv_path, sep=',') for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Summary statistics of the numeric columns; used to spot gaps
# (counts below the row total) and implausible extreme values.
train_df.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,4859.01902,43.587592,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,16798.0,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


Id - идентификационный номер квартиры
DistrictId - идентификационный номер района
Rooms - количество комнат
Square - площадь
LifeSquare - жилая площадь
KitchenSquare - площадь кухни
Floor - этаж
HouseFloor - количество этажей в доме
HouseYear - год постройки дома
Ecology_1, Ecology_2, Ecology_3 - экологические показатели местности
Social_1, Social_2, Social_3 - социальные показатели местности
Healthcare_1, Helthcare_2 - показатели местности, связанные с охраной здоровья
Shops_1, Shops_2 - показатели, связанные с наличием магазинов, торговых центров
Price - цена квартиры

Приводим типы данных

In [4]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     7887 non-null   float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  int64  
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  object 
 11  Ecology_3      10000 non-null  object 
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   5202 non-null   float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

Меняем тип DistrictId на str, потому что это номинативное (категориальное) поле, и тип Id — потому что это бесполезное для модели поле (идентификатор).

In [5]:
# Identifiers are labels rather than quantities, so store them as strings;
# this keeps them out of numeric summaries such as describe().
for id_column in ('Id', 'DistrictId'):
    train_df[id_column] = train_df[id_column].astype(str)

Подготавливаем данные

In [6]:
class DataPreprocessing:
    """Clean the raw housing data: flag and impute gaps, repair outliers.

    Usage: call ``fit`` once on the training frame to learn the statistics,
    then call ``transform`` on every frame that has to be cleaned.

    Attributes set by ``fit``:
        medians: per-column medians of the numeric columns (pandas Series).
        kitchen_square_quantile: 0.975 quantile of ``KitchenSquare``.
    """

    def __init__(self):
        """Create an unfitted preprocessor; statistics are filled in by ``fit``."""
        self.medians = None
        self.kitchen_square_quantile = None

    def fit(self, X):
        """Learn the statistics used later by ``transform``.

        Args:
            X: DataFrame that contains at least a ``KitchenSquare`` column.
        """
        # numeric_only=True skips string columns (Id, DistrictId, Ecology_2, ...);
        # without it recent pandas versions raise TypeError on such columns.
        self.medians = X.median(numeric_only=True)
        self.kitchen_square_quantile = X['KitchenSquare'].quantile(.975)

    def transform(self, X):
        """Clean ``X`` and add indicator columns for gaps and outliers.

        The frame is modified IN PLACE and the very same object is returned.
        ``fit`` must have been called before.

        Added columns: ``<column>_nan`` for every column of ``X`` that has
        missing values, plus ``Rooms_outlier``, ``HouseFloor_outlier`` and
        ``HouseYear_outlier``.

        Args:
            X: DataFrame with the columns Rooms, Square, LifeSquare,
                KitchenSquare, Floor, HouseFloor and HouseYear.

        Returns:
            The same DataFrame ``X`` after cleaning.
        """
        # Missing values: remember where they were in a "<column>_nan" flag.
        # The flags are derived from X itself, never from a notebook-level frame.
        columns_with_gaps = [
            name for name, has_gap in X.isna().any().items() if has_gap
        ]
        for name in columns_with_gaps:
            X[name + '_nan'] = X[name].isna() * 1

        # LifeSquare: where it is missing but both areas are known, estimate it
        # as total area minus kitchen minus 3.
        # NOTE(review): KitchenSquare outliers are repaired only further below,
        # so an extreme kitchen value can yield a negative estimate here; the
        # later "LifeSquare < 15" step raises such values to 15.
        condition = (
            X['LifeSquare'].isna()
            & X['Square'].notna()
            & X['KitchenSquare'].notna()
        )
        X.loc[condition, 'LifeSquare'] = (
            X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare'] - 3
        )
        # Every remaining gap gets the median learned in fit().
        X.fillna(self.medians, inplace=True)

        # Rooms: 0 or >= 6 rooms is treated as an outlier.
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1

        X.loc[X['Rooms'] == 0, 'Rooms'] = 1
        X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']

        # KitchenSquare: values above the fitted quantile become the median,
        # values below 6 are raised to 6.
        condition = X['KitchenSquare'] > self.kitchen_square_quantile
        X.loc[condition, 'KitchenSquare'] = self.medians['KitchenSquare']
        X.loc[X['KitchenSquare'] < 6, 'KitchenSquare'] = 6

        # LifeSquare: enforce a minimum of 15.
        X.loc[X['LifeSquare'] < 15, 'LifeSquare'] = 15

        # Square: the total area must not be smaller than its parts
        # (the right-hand side is aligned to the selected rows by index).
        X.loc[X['Square'] < X['LifeSquare'], 'Square'] = X['LifeSquare'] + 3
        X.loc[X['Square'] < X['KitchenSquare'], 'Square'] = X['KitchenSquare'] + 3

        # HouseFloor, Floor: flag houses with 0 floors and flats located above
        # the top floor of their house.
        X['HouseFloor_outlier'] = 0
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1

        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']

        # An impossible floor is replaced by a random floor of the same house.
        # HouseFloor is stored as float, while random.randint needs integers
        # (Python 3.12+ raises TypeError on floats), hence the int() cast;
        # max(..., 1) keeps the range valid for a house height below 1.
        # The draw uses the global `random` state: seed it for reproducible runs.
        floor_outliers = X.loc[X['Floor'] > X['HouseFloor']].index
        X.loc[floor_outliers, 'Floor'] = X.loc[floor_outliers, 'HouseFloor']\
                                            .apply(lambda x: random.randint(1, max(int(x), 1)))

        # HouseYear: a construction year in the future is capped at the year
        # in which this code runs.
        current_year = datetime.now().year

        X['HouseYear_outlier'] = 0
        X.loc[X['HouseYear'] > current_year, 'HouseYear_outlier'] = 1

        X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year

        return X

In [7]:
# Learn the medians and the kitchen-area quantile from the training data,
# then clean that same data.
# transform() edits the frame it receives and returns that same object, so
# train_dp and train_df refer to one and the same (already cleaned) DataFrame.
dp = DataPreprocessing()
dp.fit(train_df)
train_dp = dp.transform(train_df)
train_dp.describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price,LifeSquare_nan,Healthcare_1_nan,Rooms_outlier,HouseFloor_outlier,HouseYear_outlier
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.8876,57.378202,41.410962,7.3178,7.3959,12.9591,1984.8728,0.118858,24.687,5352.1574,8.0392,1026.3589,1.3195,4.2313,214138.857399,0.2113,0.4798,0.0012,0.1825,0.0002
std,0.811438,77.344217,77.591997,1.770005,5.035464,6.44346,18.417132,0.119025,17.532614,4006.799803,23.831875,746.662828,1.493601,4.806341,92872.293865,0.408251,0.499617,0.034622,0.386275,0.014141
min,1.0,16.117154,15.0,6.0,1.0,1.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028,0.0,0.0,0.0,0.0,0.0
25%,1.0,42.017323,25.355889,6.0,3.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,830.0,0.0,1.0,153872.633942,0.0,0.0,0.0,0.0,0.0
50%,2.0,52.744619,36.24331,6.0,6.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879,0.0,0.0,0.0,0.0,0.0
75%,2.0,66.148685,49.122175,8.0,11.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,990.0,2.0,6.0,249135.462171,0.0,1.0,0.0,0.0,0.0
max,5.0,7483.592129,7480.592129,13.0,42.0,117.0,2022.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657,1.0,1.0,1.0,1.0,1.0


In [8]:
# Inspect rows whose living area is below 20 after cleaning
# (values under 15 were raised to 15 by the preprocessing step).
train_dp.loc[train_dp['LifeSquare']<20]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,LifeSquare_nan,Healthcare_1_nan,Rooms_outlier,HouseFloor_outlier,HouseYear_outlier
15,4328,96,1.0,36.673407,16.285522,9.0,3,12.0,2003,0.041116,...,900.0,1,4,B,168143.345700,0,1,0,0,0
26,8553,88,3.0,83.262530,15.000000,6.0,1,1.0,1977,0.127376,...,900.0,3,9,B,410883.892020,0,1,0,1,0
28,1924,24,1.0,34.127059,19.435738,6.0,5,5.0,1960,0.111627,...,1970.0,2,3,B,173155.770779,0,0,0,0,0
40,10954,119,1.0,32.994994,17.717022,6.0,4,5.0,1963,0.033494,...,1322.0,3,8,B,158611.394377,0,0,0,0,0
56,16780,42,1.0,40.893527,18.251862,10.0,4,24.0,2007,0.158249,...,900.0,1,0,B,171531.569155,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9972,12778,28,1.0,37.817065,19.774407,6.0,5,9.0,1983,0.118537,...,1183.0,1,0,B,168151.013690,0,0,0,0,0
9982,10268,27,1.0,36.112393,15.000000,6.0,9,16.0,1977,0.211401,...,900.0,0,1,B,94881.691800,0,1,0,0,0
9985,277,93,1.0,34.723984,19.840550,9.0,6,16.0,1988,0.521867,...,900.0,0,0,B,149649.082219,0,1,0,0,0
9990,10635,21,1.0,18.983962,15.000000,6.0,1,5.0,1967,0.194489,...,125.0,3,5,B,112411.221700,0,0,0,0,0


In [9]:
# Sanity check: after cleaning no row should have a total area smaller than
# its living area, so an empty result is expected.
train_dp.loc[train_dp['Square']<train_dp['LifeSquare'], 'Square']

Series([], Name: Square, dtype: float64)