# Предсказание стоимости недвижимости в Москве
### ==========================Импорт Библиотек==========================

In [1]:
# Импорт библиотек
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler

# выбранная модель
from sklearn.ensemble import GradientBoostingRegressor

# Дата для работоспосбности кода в 2020+
from datetime import datetime

# Метрика
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold

# Magic commands
%matplotlib inline 
# вывод графики в ноутбук
%config InlineBackend.figure_format = 'svg' 
# более четкое отображение, формат фала фигуры svg

### ==========================Исходные Датасеты==========================

In [47]:
df_train = pd.read_csv('import/train.csv') # загружаем тренировочный датасет в датафрейм df_train
df_test = pd.read_csv('import/test.csv') # загружаем тестовый датасет в датафрейм df_test

#TRAIN_DATASET_PATH = './realestatepriceprediction/train.csv'
#TEST_DATASET_PATH = './realestatepriceprediction/test.csv'
#df_train = pd.read_csv(TRAIN_DATASET_PATH) # загружаем тренировочный датасет в датафрейм df_train
#df_test = pd.read_csv(TEST_DATASET_PATH) # загружаем тестовый датасет в датафрейм df_test

In [18]:
df_train.dtypes # Типы признаков

Id                 int64
DistrictId         int64
Rooms            float64
Square           float64
LifeSquare       float64
KitchenSquare    float64
Floor              int64
HouseFloor       float64
HouseYear          int64
Ecology_1        float64
Ecology_2         object
Ecology_3         object
Social_1           int64
Social_2           int64
Social_3           int64
Healthcare_1     float64
Helthcare_2        int64
Shops_1            int64
Shops_2           object
Price            float64
dtype: object

In [48]:
# Превратим ID и DistrictId в строку, так как по сути это название района.
df_train['Id'] = df_train['Id'].astype(str) 
df_train['DistrictId'] = df_train['DistrictId'].astype(str)
df_test['Id'] = df_test['Id'].astype(str) 
df_test['DistrictId'] = df_test['DistrictId'].astype(str)

### ==========================EDA(exploratory data analysis)==========================

In [None]:
# Обзор целевой переменной

plt.figure(figsize = (10, 3))

df_train['Price'].hist(bins=30)
plt.ylabel('Count')
plt.xlabel('Price')

plt.title('Target distribution')
plt.show()

In [None]:
df_train.describe() # Обзор всех числовых признаков

In [None]:
df_train.select_dtypes(include='object').columns.tolist() # Категориальные признаки

### ==========================Обработка данных==========================

In [46]:
class Cleaner():
    district_reach = []
    
    def __init__(self):
        #self.med_price_by_district = None
        print(type(self))

    
    def transform(self, X):
        
        # Добавляю признак неадекватности комнат
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1
        
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1
        
        
        X.loc[(X['Square'] > 280), 'Square'] = ((X.loc[(X['Square'] > 280), 'Square']) / 10)
        X.loc[(X['Square'] < 16), 'Square'] = ((X.loc[(X['Square'] < 16), 'Square']) * 10)
        
        
        # заменяю неадекватное кол-во комнат на 1,2,3,4 в зависимости от площади
        missing_rooms_data = X.loc[ (X['Rooms'] ==0) | (X['Rooms'] > 6), ['Square', 'Rooms']]
        if missing_rooms_data['Square'].count() !=0:
            for i in missing_rooms_data.index:
                if (missing_rooms_data.loc[(i),'Square']) < 43:
                    (missing_rooms_data.loc[(i),'Rooms']) = 1
                elif (missing_rooms_data.loc[(i),'Square']) < 60:
                    (missing_rooms_data.loc[(i),'Rooms']) = 2
                elif (missing_rooms_data.loc[(i),'Square']) < 100:
                    (missing_rooms_data.loc[(i),'Rooms']) = 3
                elif (missing_rooms_data.loc[(i),'Square']) > 100:
                    (missing_rooms_data.loc[(i),'Rooms']) = 4
            X.loc[ (X['Rooms'] ==0) | (X['Rooms'] > 6), 'Rooms'] = missing_rooms_data['Rooms']
        
        # Делаем выборку наблюдений, где площадь кухни больше площади всей квартиры, либо <5м, либо >50 
        var_kitchen = X.loc[(X['Square'] < X['KitchenSquare']) | 
                    (X['KitchenSquare'] < 5) | 
                    (X['KitchenSquare'] > 50), 
                    ['KitchenSquare','Square']]
        
        # если в выборку попали наблюдения, то умножаем площадь квартиры на 0.14 и записываем в KitchenSquare
        if var_kitchen['Square'].count() !=0:
            #print('ok')
            for i in var_kitchen.index:
                var_kitchen.loc[(i), 'KitchenSquare'] = round((var_kitchen.loc[(i), 'Square']) * 0.14)
        X.loc[(X['Square'] < X['KitchenSquare']) | 
                    (X['KitchenSquare'] < 5) | 
                    (X['KitchenSquare'] > 50), 
                    'KitchenSquare'] = var_kitchen['KitchenSquare']
        
        # Healthcare_1 удаляем так как много пропусков
        if 'Healthcare_1' in X.columns:
            X.drop('Healthcare_1', axis=1, inplace=True)
        
        
        #  Преобразовываем экстремальные данные площади жилой
        X["LifeSquare"].fillna(0, inplace=True)
        missing_lifesquare = X.loc[(X['LifeSquare'] == 0) | 
                                   (X['LifeSquare'] > X['Square']), 
                                   ['Square', 'LifeSquare']]
        if missing_lifesquare['LifeSquare'].count() !=0:
            for i in missing_lifesquare.index:
                missing_lifesquare.loc[(i), 'LifeSquare'] = round((missing_lifesquare.loc[(i), 'Square']) * 0.64)
        X.loc[(X['LifeSquare'] == 0) | 
              (X['LifeSquare'] > X['Square']), 
              'LifeSquare'] = missing_lifesquare['LifeSquare']
        
        
        #if 'HouseYear' in X.keys():
        #    X['age_house'] = (current_year - X['HouseYear'])
        #var_house_age = X.loc[(X['age_house'] < 0) |(X['age_house'] > 110) ,'age_house']
        
        #HouseYear
        current_year = datetime.now().year
        X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year
        
        
        # Заменяем в наблюдениях по признаку Shops_2  буквы A и B на цифры 0 и 1
        binary_to_numbers = {"B": 1, "A": 0,}
        X["Shops_2"] = X["Shops_2"].replace(binary_to_numbers) 

        # Заменяем в наблюдениях по признаку Ecology_2  буквы A и B на цифры 0 и 1
        X["Ecology_2"] = X["Ecology_2"].replace(binary_to_numbers)

        # Заменяем в наблюдениях по признаку Ecology_3  буквы A и B на цифры 0 и 1
        X["Ecology_3"] = X["Ecology_3"].replace(binary_to_numbers)
        
        return X
    
    # DistrictReach
    def districtReach(self, X):   
        if 'Price' in X.keys():
            self.district_reach = X.groupby('DistrictId').median()[['Price']]\
                                .rename(columns={'Price':'district_median_price'})

            X = X.merge(self.district_reach, on=['DistrictId'], how='left')
        
        return X
    
    
    def addReach(self, X):
        X['district_median_price'] = X.merge(self.district_reach, on=['DistrictId'], how='left')
        
        return X

In [49]:
X_clean = Cleaner()

<class '__main__.Cleaner'>


In [50]:
X = X_clean.transform(df_train)
X = X_clean.districtReach(df_train)

In [51]:
var_district_reach = X_clean.district_reach
var_district_reach

Unnamed: 0_level_0,district_median_price
DistrictId,Unnamed: 1_level_1
0,165963.054142
1,183663.443595
10,217749.656911
100,234032.836228
101,184276.502773
...,...
95,293249.106761
96,200440.676982
97,193772.806849
98,191914.221629


### 6. Разбиение на test и train

In [52]:
y = X[['Price']]
X = X.drop(['Price','Id'], axis=1)

In [53]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.38, shuffle=True, random_state=66)

### 7. Построение модели

In [55]:
model_gbt = GradientBoostingRegressor(random_state=66, n_estimators=334)
model_gbt.fit(X_train, y_train)

  return f(**kwargs)


GradientBoostingRegressor(n_estimators=334, random_state=66)

In [56]:
y_train_preds = model_gbt.predict(X_train)
y_valid_preds = model_gbt.predict(X_valid)

In [57]:
cv_score = cross_val_score(model_gbt, X, y, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=66))
cv_score

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


array([0.73692109, 0.73964952, 0.74103823])

In [58]:
feature_importances = pd.DataFrame(zip(X_train.columns, model_gbt.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature_name,importance
2,Square,0.445802
18,district_median_price,0.344542
1,Rooms,0.036306
7,HouseYear,0.025246
13,Social_3,0.024645
12,Social_2,0.024414
11,Social_1,0.023225
6,HouseFloor,0.015055
5,Floor,0.01302
3,LifeSquare,0.012791


In [59]:
df_test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [60]:
X_test = X_clean.transform(df_test)

In [61]:
X_test = X_test.merge(var_district_reach, on=['DistrictId'], how='left')

In [62]:
predictions = pd.DataFrame(index = df_test['Id'])

In [63]:
X_test.drop('Id', axis=1, inplace=True)

In [64]:
X_test.isnull().sum()

DistrictId               0
Rooms                    0
Square                   0
LifeSquare               0
KitchenSquare            0
Floor                    0
HouseFloor               0
HouseYear                0
Ecology_1                0
Ecology_2                0
Ecology_3                0
Social_1                 0
Social_2                 0
Social_3                 0
Helthcare_2              0
Shops_1                  0
Shops_2                  0
Rooms_outlier            0
district_median_price    8
dtype: int64

In [65]:
X_test['district_median_price'].fillna(X_test['district_median_price'].median(), inplace=True)

In [66]:
y_pred = model_gbt.predict(X_test)

In [67]:
predictions['Price'] = y_pred
predictions

Unnamed: 0_level_0,Price
Id,Unnamed: 1_level_1
725,153096.302411
15856,222724.232139
5480,383902.477311
15664,345364.937986
14275,133571.565447
...,...
8180,238320.459638
4695,134182.873829
5783,331220.800494
4780,189839.800219


In [68]:
predictions.to_csv('try_test_14.csv', sep=',')