In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

## Загрузка данных

In [2]:
# Load the training set; the trailing bare expression displays (rows, columns).
data = pd.read_csv(filepath_or_buffer='data/train.csv')
data.shape

(10000, 20)

In [3]:
# Average gap between total area and living area over the raw training rows
# (mean() skips NaN differences). Later cells subtract this gap from Square
# to estimate LifeSquare.
dif_mean = data['Square'].sub(data['LifeSquare']).mean()

In [4]:
# A living area larger than the total area is replaced with an estimate:
# total area minus the average Square/LifeSquare gap.
data.loc[data['LifeSquare'] > data['Square'], 'LifeSquare'] = data['Square'] - dif_mean

# Keep only the rows that pass every sanity filter at once. between() is
# inclusive on both ends, and comparisons against NaN are False, so rows with
# a missing LifeSquare are dropped here.
data = data.loc[
    data['Rooms'].between(1, 9)
    & (data['Square'] > 15)
    & (data['LifeSquare'] > 15)
    & data['Price'].between(30000, 600000)
    & (data['HouseYear'] < 2020)
]

# One-hot encode the categorical columns, then summarize the resulting frame.
data = pd.get_dummies(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7576 entries, 0 to 9999
Data columns (total 23 columns):
Id               7576 non-null int64
DistrictId       7576 non-null int64
Rooms            7576 non-null float64
Square           7576 non-null float64
LifeSquare       7576 non-null float64
KitchenSquare    7576 non-null float64
Floor            7576 non-null int64
HouseFloor       7576 non-null float64
HouseYear        7576 non-null int64
Ecology_1        7576 non-null float64
Social_1         7576 non-null int64
Social_2         7576 non-null int64
Social_3         7576 non-null int64
Healthcare_1     4301 non-null float64
Helthcare_2      7576 non-null int64
Shops_1          7576 non-null int64
Price            7576 non-null float64
Ecology_2_A      7576 non-null uint8
Ecology_2_B      7576 non-null uint8
Ecology_3_A      7576 non-null uint8
Ecology_3_B      7576 non-null uint8
Shops_2_A        7576 non-null uint8
Shops_2_B        7576 non-null uint8
dtypes: float64(8), int64(9), uint8(6)

In [5]:
# Hold out 20% of the cleaned rows for validation; the fixed seed makes the
# split repeatable. The cell displays both resulting shapes.
train, valid = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)
(train.shape, valid.shape)

((6060, 23), (1516, 23))

In [6]:
# Model inputs: every column except Id, DistrictId, the Price target, and
# Healthcare_1 (which still has missing values per the info() output).
ex_col = ['Id', 'DistrictId', 'Price', 'Healthcare_1']
features = [column for column in data.columns if column not in ex_col]
features

['Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Ecology_2_A',
 'Ecology_2_B',
 'Ecology_3_A',
 'Ecology_3_B',
 'Shops_2_A',
 'Shops_2_B']

In [7]:
# Baseline: linear regression on the selected features.
# The cell displays R^2 on the training rows.
lr = LinearRegression().fit(train[features], train['Price'])
pred_lr = lr.predict(train[features])
r2_score(y_true=train['Price'], y_pred=pred_lr)

0.47282999359032896

In [8]:
# R^2 of the linear model on the held-out validation rows.
pred_lr_valid = lr.predict(valid[features])
r2_score(y_true=valid['Price'], y_pred=pred_lr_valid)

0.4756060325551753

In [9]:
# Random forest with a capped tree depth and a minimum leaf size; the seed is
# fixed so refitting gives the same forest. The cell displays training R^2.
RFR = RandomForestRegressor(
    n_estimators=20,
    max_depth=12,
    min_samples_leaf=2,
    random_state=42,
)
RFR.fit(train[features], train['Price'])
pred_RFR = RFR.predict(train[features])
r2_score(y_true=train['Price'], y_pred=pred_RFR)

0.8707835613684645

In [10]:
# R^2 of the random forest on the held-out validation rows.
pred_RFR_valid = RFR.predict(valid[features])
r2_score(y_true=valid['Price'], y_pred=pred_RFR_valid)

0.6731205418108978

# Предсказание на тесте

In [11]:
# Load the test set; the trailing bare expression displays (rows, columns).
test = pd.read_csv(filepath_or_buffer='data/test.csv')
test.shape

(5000, 19)

In [12]:
# One-hot encode the test set's categorical columns, mirroring the encoding
# applied to the training frame, then show the new shape.
test = pd.get_dummies(data=test)
test.shape

(5000, 22)

In [13]:
# Summarize the encoded test frame; the non-null counts show which columns
# still contain missing values.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Ecology_2_A      5000 non-null uint8
Ecology_2_B      5000 non-null uint8
Ecology_3_A      5000 non-null uint8
Ecology_3_B      5000 non-null uint8
Shops_2_A        5000 non-null uint8
Shops_2_B        5000 non-null uint8
dtypes: float64(7), int64(9), uint8(6)
memory usage: 654.4 KB


In [14]:
# Replace every missing value with 0 so no NaN reaches the model. Healthcare_1
# is zero-filled as well, but it is not part of `features`.
test = test.fillna(0)

# Re-estimate LifeSquare as (Square - dif_mean) wherever the value is unusable:
#   * 0 -> it was missing (filled above) or recorded as zero;
#   * greater than Square -> the same impossible case that the training cell
#     corrects before fitting.
# Applying the second rule here keeps test preprocessing consistent with the
# data the models were trained on.
life_square_invalid = (test['LifeSquare'] == 0) | (test['Square'] < test['LifeSquare'])
test.loc[life_square_invalid, 'LifeSquare'] = test['Square'] - dif_mean
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       5000 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     5000 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Ecology_2_A      5000 non-null uint8
Ecology_2_B      5000 non-null uint8
Ecology_3_A      5000 non-null uint8
Ecology_3_B      5000 non-null uint8
Shops_2_A        5000 non-null uint8
Shops_2_B        5000 non-null uint8
dtypes: float64(7), int64(9), uint8(6)
memory usage: 654.4 KB


In [15]:
# Select the model inputs in training column order. The test set was one-hot
# encoded separately from the training data, so a category that never occurs in
# test.csv would leave its dummy column missing; reindex() creates any such
# column filled with 0 instead of failing on the column lookup. When every
# column is present the result is the same as test.loc[:, features].
# NOTE(review): predictions come from the linear model even though the random
# forest has the higher recorded validation R^2 (0.673 vs 0.476) - confirm
# that this is intended.
pred_test = lr.predict(test.reindex(columns=features, fill_value=0))
pred_test

array([193565.31954557, 194532.19813994, 183215.84659636, ...,
       380894.93142481, 186492.80730833, 309339.5752478 ])