https://www.kaggle.com/c/realestatepriceprediction

### Загрузка библиотек

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,RandomForestClassifier,ExtraTreesRegressor
from sklearn.metrics import r2_score,mean_absolute_error as mae,mean_squared_error as mse

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as img

import warnings
warnings.filterwarnings('ignore')

In [2]:
# import xgboost as xgb
# import lightgbm as lgb

In [3]:
matplotlib.rcParams.update({'font.size':14})

# 1. Предобработка df_train

## 1. Загрузка данных

In [4]:
df_train = pd.read_csv('train.csv')

In [5]:
df_test = pd.read_csv('test.csv')

In [6]:
print('df_train',df_train.shape)
print('df_test',df_test.shape)

df_train (10000, 20)
df_test (5000, 19)


## 2. Обработка Id

#### Проверка уникальности Id

In [7]:
len(df_train['Id'].unique()) == df_train.shape[0]

True

#### Перевод Id в индекс

In [8]:
df_train = df_train.set_index('Id')

----

In [9]:
features = df_train.columns.drop('Price')
features

Index(['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2', 'Ecology_3',
       'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2',
       'Shops_1', 'Shops_2'],
      dtype='object')

## 2. Обзор признаков и цели

### Введение X и y

In [10]:
y = pd.DataFrame(df_train.Price)
X = df_train[features]

### Обзор целевой переменной

In [11]:
def Distribution_of_Price(col):
    # col = 'Price'
    target_mean = round(df_train[col].mean(), 2)
    target_median = df_train[col].median()
    target_mode = df_train[col].mode()[0]

    plt.figure(figsize=(8, 8 / 2))
    sns.distplot(df_train[col])

    y = np.linspace(0, 0.000005, 100)
    plt.plot([target_mean] * 100, y, label='mean', linestyle=':')
    plt.plot([target_median] * 100, y, label='median', linestyle='--')
    plt.plot([target_mode] * 100, y, label='mode', linestyle=':')

    plt.title('Distribution of Price')
    plt.legend()
    plt.show()


# Distribution_of_Price('Price')

In [12]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 14038 to 6306
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Price   10000 non-null  float64
dtypes: float64(1)
memory usage: 156.2 KB


### Обзор признаков

In [13]:
# X.info()

## 3. Обработка пропусков

In [14]:
# Найдем столбцы с пропусками
pd.Series(X.isnull().sum())[X.isnull().sum() > 0]

LifeSquare      2113
Healthcare_1    4798
dtype: int64

In [15]:
# Заменим на медианы
for feat in ['LifeSquare', 'Healthcare_1']:
    X.loc[X[feat].isnull(), feat] = X[feat].median()

In [16]:
# Проверим пропуски еще раз
pd.Series(X.isnull().sum())[X.isnull().sum() > 0]

Series([], dtype: int64)

## 4. Обработка выбросов и ошибок
**NB!** На test не удалять строки!!!

In [17]:
X.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,50.4008,1.8905,56.315775,36.26604,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1026.3589,1.3195,4.2313
std,43.587592,0.839512,21.058732,76.609981,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,746.662828,1.493601,4.806341
min,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,20.0,1.0,41.774881,25.527399,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,830.0,0.0,1.0
50%,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0
75%,75.0,2.0,65.900625,41.427234,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,990.0,2.0,6.0
max,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


#### Обработка DistrictId

In [18]:
# нет

#### Обработка Rooms

In [19]:
np.sort(X.Rooms.unique()).tolist()

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 19.0]

In [20]:
# Вычислим средн.. площадь для каждого количества комнат
for room in np.sort(X.Rooms.unique()).tolist():
    print('среднее Square для Rooms = {}'.format(room), X.loc[X['Rooms'] == room].median()['Square'])

среднее Square для Rooms = 0.0 65.48747379855399
среднее Square для Rooms = 1.0 40.40658991236909
среднее Square для Rooms = 2.0 55.84181213903055
среднее Square для Rooms = 3.0 77.41364294803941
среднее Square для Rooms = 4.0 98.66805357992513
среднее Square для Rooms = 5.0 116.08486083443316
среднее Square для Rooms = 6.0 59.41433379627719
среднее Square для Rooms = 10.0 59.96412034862395
среднее Square для Rooms = 19.0 42.00604570550408


In [21]:
X.loc[X['Rooms'] == 0, :]
# комнат может и не быть

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
12638,27,0.0,138.427694,136.215499,0.0,4,3.0,2016,0.075424,B,B,11,3097,0,900.0,0,0,B
7917,27,0.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,B,B,9,1892,0,900.0,0,1,B
7317,27,0.0,41.790881,32.78126,0.0,13,0.0,1977,0.211401,B,B,9,1892,0,900.0,0,1,B
770,28,0.0,49.483501,32.78126,0.0,16,0.0,2015,0.118537,B,B,30,6207,1,1183.0,1,0,B
456,6,0.0,81.491446,32.78126,0.0,4,0.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B
3224,27,0.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,B,B,2,469,0,900.0,0,0,B
3159,88,0.0,38.697117,19.345131,9.0,9,16.0,1982,0.127376,B,B,43,8429,3,900.0,3,9,B
9443,27,0.0,87.762616,85.125471,0.0,5,15.0,1977,0.211401,B,B,9,1892,0,900.0,0,1,B


In [22]:
X.loc[X.index == 12638, 'Square']

Id
12638    138.427694
Name: Square, dtype: float64

In [23]:
X.loc[X.index == 12638, 'Rooms'] = 5
X.loc[X.index == 7917, 'Rooms'] = 5
X.loc[X.index == 7317, 'Rooms'] = 1
X.loc[X.index == 770, 'Rooms'] = 1
X.loc[X.index == 456, 'Rooms'] = 3
X.loc[X.index == 3159, 'Rooms'] = 1
X.loc[X.index == 9443, 'Rooms'] = 4

In [24]:
# есть одна выброс, с очень маленькой площадью => удалим:
X = X.drop(3224)
y = y.drop(3224)

In [25]:
# 6,10, 19 комнат - похожи на выбросы
X.loc[X['Rooms'] >= 6].sort_values(by='Square')

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
8491,1,19.0,42.006046,21.779288,7.0,17,17.0,2014,0.007122,B,B,1,264,0,900.0,0,1,B
5927,57,10.0,59.056975,36.223072,10.0,22,22.0,2002,0.090799,B,B,74,19083,2,900.0,5,15,B
14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,B,B,66,10573,1,1322.0,3,8,B
14865,9,10.0,60.871266,38.420681,10.0,3,2.0,1994,0.161532,B,B,25,5648,1,30.0,2,4,B


In [26]:
# Поправим в соответствии со средними Square
X.loc[X['Rooms'] == 6] = 2
X.loc[X['Rooms'] == 10] = 2
X.loc[X['Rooms'] == 19] = 1

#### Обработка Square

In [27]:
# Отбор обхектов менее площадь < 10
# X.loc[X['Square'] <10].sort_values(by='Square')
(X.loc[X['Square'] <10]['Square']*10 - X.loc[X['Square'] <10]['LifeSquare']).sort_values()

Id
13265   -31.531170
1748    -12.658103
10527     3.001421
14786     6.842852
8491      9.000000
15744    17.247209
5927     18.000000
14003    18.000000
14865    18.000000
10202    21.358569
6782     24.285808
4504     38.292536
7657     40.409290
11526    44.365015
9487     45.742758
dtype: float64

In [28]:
# X.loc[X.index == 13265]

In [29]:
for i in [15744,10527,14786]:
    X.loc[X.index == i, 'Square'] = X.loc[X.index == i, 'Square'] * 10

In [30]:
# есть одна выброс, с очень маленькой площадью => удалим:
for i in [8491,5927,14003,14865,13265,1748]:
    X = X.drop(i)
    y = y.drop(i)

#### Обработка LifeSquare

In [31]:
# удалим столбец полностью
X.drop(['LifeSquare'], axis=1, inplace=True)

In [32]:
# экстремальные выбросы
# X.loc[X['LifeSquare'] > 200].sort_values(by='LifeSquare')

In [33]:
# X.loc[X.index == 14990, 'LifeSquare'] = X.loc[X.index == 14990, 'LifeSquare'] / 10
# X.loc[X.index == 16550, 'LifeSquare'] = X.loc[X.index == 16550, 'LifeSquare'] / 100
# X.loc[X.index == 15886, 'LifeSquare'] = X.loc[X.index == 15886, 'LifeSquare'] / 10

In [34]:
# X.loc[X['LifeSquare'] < 1 ].sort_values(by='LifeSquare')

In [35]:
# X.loc[X.index == 13491, 'LifeSquare'] = X.loc[X.index == 13491, 'LifeSquare'] * 100
# X.loc[X.index == 4378, 'LifeSquare'] = X.loc[X.index == 4378, 'LifeSquare'] * 100

In [36]:
# # NaN => заменю разницей между Square и KitchenSquare
# X.loc[X.index == 16593, 'LifeSquare'] = X.loc[X.index == 16593, 'Square'] - X.loc[X.index == 16593, 'KitchenSquare']

#### Обработка KitchenSquare

In [37]:
# экстремальные выбросы
X.loc[X['KitchenSquare'] > 50, ['Square','KitchenSquare']].sort_values(by='KitchenSquare')

Unnamed: 0_level_0,Square,KitchenSquare
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
673,54.418214,51.0
12918,51.440463,51.0
4265,53.216778,53.0
7162,56.738764,54.0
12666,60.603363,58.0
11739,61.070298,58.0
299,66.787523,60.0
16593,64.859242,62.0
6508,67.146049,63.0
2371,68.841073,66.0


In [38]:
# опечатка => заменю медианой
X.loc[X.index == 14656, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].median()
X.loc[X.index == 14679, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].median()

In [39]:
# возможно, слишком большой метраж кухни, заменю средним по комнатам

# X.loc[X.index == 16395, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
# X.loc[X.index == 2371, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 12507, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 4265, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 12390, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
# X.loc[X.index == 7441, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
# X.loc[X.index == 6508, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 299, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 12552, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
# X.loc[X.index == 13703, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].mean()
# X.loc[X.index == 11739, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 16593, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 673, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].mean()
# X.loc[X.index == 12666, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 4966, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
# X.loc[X.index == 6569, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].mean()
# X.loc[X.index == 7162, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 12918, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
# X.loc[X.index == 2737, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()

#### Обработка Floor

In [40]:
#  аномалии
X.loc[X['Floor'] > 50]

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


#### Обработка HouseFloor

In [41]:
#  аномалии
X.loc[X['HouseFloor'] > 50].sort_values(by='HouseYear')

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10806,5,1.0,51.944587,1.0,6,99.0,1977,0.150818,B,B,16,3433,4,2643.0,4,5,B
9300,74,2.0,71.747869,9.0,5,99.0,1977,0.075779,B,B,6,1437,3,900.0,0,2,B
78,30,2.0,65.773749,1.0,8,117.0,1977,7.8e-05,B,B,22,6398,141,1046.0,3,23,B


In [42]:
# Этажость > 50 соответствует 1977 => заменим на медиану 1977 года
X.loc[X['HouseFloor'] > 50, 'HouseFloor'] = X.loc[X['HouseYear'] == 1977, 'HouseFloor'].median()

#### Обработка HouseYear

In [43]:
# аномалии. Невозможно больше 2020
X.loc[(X['HouseYear'] > 2020)]

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10814,109,1.0,37.26507,9.0,9,12.0,20052011,0.13633,B,B,30,6141,10,262.0,3,6,B
11607,147,2.0,44.791836,5.0,4,9.0,4968,0.319809,B,B,25,4756,16,2857.0,5,8,B


In [44]:
# исправлю опечатки
X.loc[X.index == 11607, 'HouseYear'] = 1968

In [45]:
# спорный момент - удалю
X.drop(10814,axis=0, inplace=True)
y.drop(10814,axis=0, inplace=True)

#### Обработка Ecology_2, Ecology_3, Shops_2

In [46]:
# перевожу категориальные признаки в численные
for i in X.select_dtypes(include='object').columns:
    print(i, pd.unique(X[i]))
    print('-'*30)

Ecology_2 ['B' 'A']
------------------------------
Ecology_3 ['B' 'A']
------------------------------
Shops_2 ['B' 'A']
------------------------------


In [47]:
# Преобразуем категориальные признаки в несколько бинарных
X = pd.concat([X, pd.get_dummies(X['Ecology_2'], prefix='Ecology_2')], axis=1)
X = pd.concat([X, pd.get_dummies(X['Ecology_3'], prefix='Ecology_3')], axis=1)
X = pd.concat([X, pd.get_dummies(X['Shops_2'], prefix='Shops_2')], axis=1)

In [48]:
for i in ['Ecology_2','Ecology_3','Shops_2']:
    X = X.drop(i,axis=1)

In [49]:
# for i in ['Ecology_2','Ecology_3','Shops_2']:
#     print(i, pd.unique(X[i]))
#     print('-'*30)

In [50]:
X.head()

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
14038,35,2.0,47.981561,6.0,7,9.0,1969,0.08904,33,7976,5,900.0,0,11,0,1,0,1,0,1
15053,41,3.0,65.68364,8.0,7,9.0,1978,7e-05,46,10309,1,240.0,1,16,0,1,0,1,0,1
4765,53,2.0,44.947953,0.0,8,12.0,1968,0.049637,34,7759,0,229.0,1,3,0,1,0,1,0,1
5809,58,2.0,53.352981,9.0,8,17.0,1977,0.437885,23,5735,3,1084.0,0,5,0,1,0,1,0,1
10783,99,1.0,39.649192,7.0,11,12.0,1976,0.012339,35,5776,1,2078.0,2,4,0,1,0,1,0,1


# 2. Обучение модели

#### Подготовка формы для модели

In [51]:
def create_model(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    r2 = r2_score(y_test, y_pred) 
    print(f'r2 = {r2}')

In [52]:
# оценка прогнозных значений
def evaluate_preds(true_values, pred_values):
    print("R2:\t" + str(round(r2(true_values, pred_values), 3)))

In [53]:
X_train,X_valid,y_train,y_valid= train_test_split(X,y,test_size=0.25,shuffle=True,random_state=42)

### Обучение модели на train dataset на линейной регрессии

In [54]:
model_lr = LinearRegression()

In [55]:
create_model(X_train, y_train, X_valid, y_valid, model_lr)

r2 = 0.5028071046556275


#### Нормализация признаков

In [56]:
scaler = StandardScaler()

In [57]:
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

X_valid_scaled = scaler.transform(X_valid)
X_valid_scaled = pd.DataFrame(X_valid_scaled, columns=X_valid.columns)    

In [58]:
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model_lr)

r2 = 0.502807104655637


### Lasso, Ridge

In [59]:
for i in np.linspace(0.01,100,4):
    model_lasso = Lasso(alpha=i)
    print('alpha',i)
    create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model_lasso)
    print(' ')

alpha 0.01
r2 = 0.5028070954810862
 
alpha 33.339999999999996
r2 = 0.50276641830732
 
alpha 66.67
r2 = 0.5027055351743656
 
alpha 100.0
r2 = 0.5026245742434488
 


In [60]:
for i in np.linspace(0.01,100,4):
    model_ridge = Ridge(alpha=i)
    print('alpha',i)
    create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model_ridge)
    print(' ')

alpha 0.01
r2 = 0.5028070812651483
 
alpha 33.339999999999996
r2 = 0.5027203591649436
 
alpha 66.67
r2 = 0.5026242904066224
 
alpha 100.0
r2 = 0.5025252022949493
 


### Random Forest

In [61]:
model_forest = RandomForestRegressor(max_depth=400, random_state=42, n_estimators=1000)

In [62]:
y_train = y_train.values.ravel()

In [63]:
%%time
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model_forest)

r2 = 0.7351106893754011
Wall time: 1min 14s


#### ExtraTreesRegressor

In [64]:
model_ExtraTrees = ExtraTreesRegressor(max_depth=400, random_state=42, n_estimators=1000)

In [65]:
%%time
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model_ExtraTrees)

r2 = 0.7397164304983108
Wall time: 56.8 s


#### Gradient Boosting

In [66]:
model_gb = GradientBoostingRegressor()

In [67]:
%%time
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model_gb)

r2 = 0.7218331344602011
Wall time: 1.82 s


### Настройка и оценка финальной модели

In [68]:
# %%time
# params={
#     'n_estimators':[1000,1500,2000,3000],
#     'max_depth':[400,500,600,700,800]
# }
# gs=GridSearchCV(
#     model_ExtraTrees, params, scoring='r2', cv=KFold(n_splits=3,random_state=21,shuffle=True), n_jobs=-1)
# gs.fit(X, y)

In [69]:
# gs.best_params_

In [70]:
model_final = ExtraTreesRegressor(n_estimators=1000, max_depth=400, random_state=42)

In [72]:
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model_final)

r2 = 0.7397164304983108


# 3. Предобработка df_test

In [73]:
df_test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


## 1. Перевод Id в индекс

In [74]:
df_test = df_test.set_index('Id')

## 2. Обработка пропусков

In [75]:
# Найдем столбцы с пропусками
pd.Series(df_test.isnull().sum())[df_test.isnull().sum()>0]

LifeSquare      1041
Healthcare_1    2377
dtype: int64

In [76]:
# Заменим на медианы
for feat in ['LifeSquare','Healthcare_1']:
    df_test.loc[df_test[feat].isnull(), feat] = df_test[feat].median()

In [77]:
# Проверим пропуски еще раз
pd.Series(df_test.isnull().sum())[df_test.isnull().sum()>0]

Series([], dtype: int64)

## 4. Обработка выбросов и ошибок
**NB!** На test не удалять строки!!!

In [78]:
df_test.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,51.2792,1.91,56.4495,35.485549,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1029.3964,1.3194,4.2428
std,44.179466,0.838594,19.092787,15.915345,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,766.595258,1.47994,4.777365
min,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,21.0,1.0,41.906231,25.850152,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,810.0,0.0,1.0
50%,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,77.0,2.0,66.285129,41.769526,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,990.0,2.0,6.0
max,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


#### Обработка DistrictId

In [79]:
# нет

#### Обработка Rooms

In [80]:
np.sort(df_test.Rooms.unique()).tolist()

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 17.0]

In [81]:
# Вычислим средн.. площадь для каждого количества комнат
for room in np.sort(df_test.Rooms.unique()).tolist():
    print('среднее Square для Rooms = {}'.format(room), df_test.loc[df_test['Rooms'] == room].median()['Square'])

среднее Square для Rooms = 0.0 96.58467749889924
среднее Square для Rooms = 1.0 40.11768891770818
среднее Square для Rooms = 2.0 55.828079347894004
среднее Square для Rooms = 3.0 77.67927907920622
среднее Square для Rooms = 4.0 95.14424552779224
среднее Square для Rooms = 5.0 120.33072670890957
среднее Square для Rooms = 6.0 167.10195725313875
среднее Square для Rooms = 17.0 52.86610662396232


In [82]:
df_test.loc[df_test['Rooms'] == 0,:]
# комнат может и не быть

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3343,58,0.0,116.824201,113.692424,0.0,3,3.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B
10729,27,0.0,76.345154,42.820796,12.0,14,0.0,1977,0.017647,B,B,2,469,0,900.0,0,0,B


In [83]:
# заменяю средними из датасета train
df_test.loc[df_test.index == 3343, 'Rooms'] = 5
df_test.loc[df_test.index == 10729, 'Rooms'] = 3

In [84]:
# Проверка 6 комн квартиры
df_test.loc[df_test['Rooms'] == 6]
# норма

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10793,23,6.0,110.750226,32.925087,0.0,2,2.0,2015,0.014073,B,B,2,475,0,900.0,0,0,B
4058,27,6.0,223.453689,104.113552,16.0,2,2.0,2017,0.041116,B,B,53,14892,4,900.0,1,4,B


In [85]:
# Проверка 17 комн квартиры
df_test.loc[df_test['Rooms'] == 17].sort_values(by='Square')

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1435,111,17.0,52.866107,32.528342,8.0,15,17.0,1987,0.093443,B,B,23,4635,5,3300.0,2,4,B


In [86]:
# Заменим на среднее для этой площади
df_test.loc[df_test.index == 1435, 'Rooms'] = 2

#### Обработка Square

In [87]:
# Отбор объектов менее площадь < 10
df_test.loc[df_test['Square'] <10].sort_values(by='Square')
# (X.loc[X['Square'] <10]['Square']*10 - X.loc[X['Square'] <10]['LifeSquare']).sort_values()

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9011,53,1.0,1.378543,1.353573,1.0,1,1.0,1977,0.049637,B,B,34,7759,0,229.0,1,3,B
1420,45,1.0,1.975769,2.900371,1.0,1,1.0,1977,0.195781,B,B,23,5212,6,900.0,3,2,B
1165,27,1.0,2.372101,1.899119,1.0,2,17.0,1977,0.011654,B,B,4,915,0,900.0,0,0,B
16401,30,1.0,2.645046,4.338755,1.0,2,1.0,1977,7.8e-05,B,B,22,6398,141,1046.0,3,23,B
170,6,2.0,2.900586,61.468563,1.0,18,17.0,2014,0.243205,B,B,5,1564,0,540.0,0,0,B
7855,6,1.0,4.967143,2.968086,1.0,3,1.0,2018,0.243205,B,B,5,1564,0,540.0,0,0,B
10120,6,1.0,5.100672,3.86178,1.0,3,1.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B
2138,27,1.0,5.647458,1.501582,1.0,1,1.0,1977,0.017647,B,B,2,469,0,900.0,0,0,B


In [88]:
# Умножим площади на 10
for i in [9011,1165,7855,10120,2138]:
    df_test.loc[df_test.index == i, 'Square'] = df_test.loc[df_test.index == i, 'Square'] * 10
    df_test.loc[df_test.index == i, 'LifeSquare'] = df_test.loc[df_test.index == i, 'LifeSquare'] * 10

#### Обработка LifeSquare

In [89]:
# удалим столбец полностью
df_test.drop(['LifeSquare'], axis=1, inplace=True)

In [90]:
# # экстремальные выбросы
# df_test.loc[df_test['LifeSquare'] > 200].sort_values(by='LifeSquare')

In [91]:
# # Разделим площадь на 10
# df_test.loc[df_test.index == 11533, 'LifeSquare'] = df_test.loc[df_test.index == 11533, 'LifeSquare'] / 10

In [92]:
# # Проверка LifeSquare меньше 1.0
# df_test.loc[df_test['LifeSquare'] < 2 ].sort_values(by='LifeSquare')

In [93]:
# df_test.loc[df_test.index == 6406, 'LifeSquare'] = df_test.loc[df_test.index == 6406, 'LifeSquare'] * 100

#### Обработка KitchenSquare

In [94]:
# экстремальные выбросы/ Проверка
df_test.loc[df_test['KitchenSquare'] > 50].sort_values(by='KitchenSquare')

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5199,27,2.0,59.05499,57.0,13,12.0,2016,0.211401,B,B,9,1892,0,900.0,0,1,B
12612,27,2.0,60.988496,60.0,5,17.0,2013,0.072158,B,B,2,629,1,900.0,0,0,A
5428,27,2.0,62.326044,61.0,12,17.0,1977,0.072158,B,B,2,629,1,900.0,0,0,A
8015,27,1.0,66.099096,62.0,3,7.0,2016,0.014058,B,B,1,290,0,900.0,0,0,B
5260,73,3.0,69.358242,65.0,6,6.0,1931,0.042032,B,B,37,6856,84,1940.0,2,5,B
12640,6,2.0,54.629142,97.0,4,17.0,2015,0.243205,B,B,5,1564,0,540.0,0,0,B
3341,62,3.0,112.114019,112.0,3,3.0,2017,0.072158,B,B,2,629,1,900.0,0,0,A
14594,11,2.0,42.795304,620.0,11,14.0,1972,0.038693,B,B,28,6533,1,1015.0,2,5,B


In [95]:
df_test.loc[df_test.index == 14594, 'KitchenSquare'] = df_test.loc[df_test.index == 14594, 'KitchenSquare'] / 100
df_test.loc[(df_test['KitchenSquare'] > 50), 'KitchenSquare'] = df_test.loc[(df_test['KitchenSquare'] > 50), 'KitchenSquare'] / 10

#### Обработка Floor

In [96]:
#  аномалии
df_test.loc[df_test['Floor'] > 50]

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
15759,17,2.0,57.60187,10.0,78,22.0,1989,0.0,B,B,25,5027,4,46.0,1,1,B


In [97]:
# Заменим на первое (скорее всего опечатка)
df_test.loc[df_test.index == 15759, 'Floor'] = 7

#### Обработка HouseFloor

In [98]:
#  аномалии
df_test.loc[df_test['HouseFloor'] > 50].sort_values(by='HouseYear')

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
15864,27,3.0,47.722835,9.0,18,99.0,1977,0.072158,B,B,2,629,1,900.0,0,0,A


In [99]:
# Этажость > 50 не соответствует 1977 => заменим на медиану 1977 года датасета train
df_test.loc[df_test.index == 15864, 'HouseFloor'] = X.loc[X['HouseYear'] == 1977, 'HouseFloor'].median()

In [100]:
#  аномалии
df_test.loc[((df_test['HouseFloor'] ==0) & (df_test['HouseYear'] >1977))].sort_values(by='HouseYear')

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
13829,47,1.0,39.89618,0.0,4,0.0,1979,7e-05,B,B,46,10309,1,240.0,1,16,B
1358,78,2.0,54.209252,8.0,6,0.0,1998,0.092291,B,B,21,4346,2,165.0,1,2,B
16053,17,4.0,168.729035,0.0,4,0.0,2013,0.093443,B,B,23,4635,5,3300.0,2,4,B
12694,27,3.0,83.670032,0.0,8,0.0,2015,0.072158,B,B,2,629,1,900.0,0,0,A
16735,23,3.0,70.650625,0.0,5,0.0,2015,0.005767,B,B,1,388,0,900.0,0,0,B
11296,27,1.0,47.478618,0.0,6,0.0,2016,0.017647,B,B,2,469,0,900.0,0,0,B
6406,27,2.0,66.46951,0.0,4,0.0,2018,0.017647,B,B,2,469,0,900.0,0,0,B
1347,45,2.0,56.227827,0.0,4,0.0,2018,0.195781,B,B,23,5212,6,900.0,3,2,B


In [101]:
df_test.loc[df_test.index == 13829, 'HouseFloor'] = X.loc[X['HouseYear'] == 1979, 'HouseFloor'].median()
df_test.loc[df_test.index == 1358, 'HouseFloor'] = X.loc[X['HouseYear'] == 1998, 'HouseFloor'].median()
df_test.loc[df_test.index == 16053, 'HouseFloor'] = X.loc[X['HouseYear'] == 2013, 'HouseFloor'].median()
df_test.loc[df_test.index == 12694, 'HouseFloor'] = X.loc[X['HouseYear'] == 2015, 'HouseFloor'].median()
df_test.loc[df_test.index == 16735, 'HouseFloor'] = X.loc[X['HouseYear'] == 2015, 'HouseFloor'].median()
df_test.loc[df_test.index == 11296, 'HouseFloor'] = X.loc[X['HouseYear'] == 2016, 'HouseFloor'].median()
df_test.loc[df_test.index == 6406, 'HouseFloor'] = X.loc[X['HouseYear'] == 2018, 'HouseFloor'].median()
df_test.loc[df_test.index == 1347, 'HouseFloor'] = X.loc[X['HouseYear'] == 2018, 'HouseFloor'].median()

In [102]:
#  аномалии
df_test.loc[df_test['HouseFloor'] ==0].sort_values(by='HouseYear').HouseYear.value_counts()

1977    123
Name: HouseYear, dtype: int64

In [103]:
# заменим на медиану 1977 года датасета train
df_test.loc[df_test['HouseFloor'] ==0, 'HouseFloor'] = X.loc[X['HouseYear'] == 1977, 'HouseFloor'].median()

#### Обработка HouseYear

In [104]:
# аномалии. Невозможно больше 2020
X.loc[(X['HouseYear'] > 2020)]

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1


#### Обработка Ecology_2, Ecology_3, Shops_2

In [105]:
# перевожу категориальные признаки в численные
for i in df_test.select_dtypes(include='object').columns:
    print(i, pd.unique(df_test[i]))
    print('-'*30)

Ecology_2 ['B' 'A']
------------------------------
Ecology_3 ['B' 'A']
------------------------------
Shops_2 ['B' 'A']
------------------------------


In [106]:
# Преобразуем категориальные признаки в несколько бинарных
df_test = pd.concat([df_test, pd.get_dummies(df_test['Ecology_2'], prefix='Ecology_2')], axis=1)
df_test = pd.concat([df_test, pd.get_dummies(df_test['Ecology_3'], prefix='Ecology_3')], axis=1)
df_test = pd.concat([df_test, pd.get_dummies(df_test['Shops_2'], prefix='Shops_2')], axis=1)

In [107]:
for i in ['Ecology_2','Ecology_3','Shops_2']:
    df_test = df_test.drop(i,axis=1)

In [108]:
# for i in ['Ecology_2','Ecology_3','Shops_2']:
#     print(i, pd.unique(X[i]))
#     print('-'*30)

In [109]:
df_test.head(3)

Unnamed: 0_level_0,DistrictId,Rooms,Square,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
725,58,2.0,49.882643,6.0,6,14.0,1972,0.310199,11,2748,1,900.0,0,0,0,1,0,1,0,1
15856,74,2.0,69.263183,1.0,6,1.0,1977,0.075779,6,1437,3,900.0,0,2,0,1,0,1,0,1
5480,190,1.0,13.597819,12.0,2,5.0,1909,0.0,30,7538,87,4702.0,5,5,0,1,0,1,0,1


In [110]:
df_test_scaled = scaler.fit_transform(df_test)
df_test_scaled = pd.DataFrame(df_test_scaled, columns=df_test.columns)

In [111]:
# df_test['Price'] = model_ExtraTrees_final.predict(df_test_scaled)

In [112]:
# df_test[['Id','Price']].to_csv('Koryagin_predictions.csv',index=False)

In [114]:
y_result = model_final.predict(df_test_scaled)

In [115]:
submit = pd.DataFrame(list(zip(df_test.index,y_result)), columns = ['Id', 'Price'])
submit.head()

Unnamed: 0,Id,Price
0,725,148400.72958
1,15856,238457.407141
2,5480,256784.282152
3,15664,318678.317417
4,14275,148594.102988


In [116]:
submit.to_csv('Koryagin_predictions_ver3.csv', sep=',', index=False)