In [66]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_regression
from xgboost import XGBRegressor, XGBClassifier

In [3]:
df_train_raw = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/train.csv')
df_test_raw = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/test.csv')
df_macro_data = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/macro.csv')
df_macro_euusd = df_macro_data[['timestamp', 'usdrub', 'eurrub']]

df_train_raw.shape, df_test_raw.shape

((30471, 292), (7662, 291))

In [4]:
def time_year(x):
    year = x.split('-')
    return int(year[0])

In [5]:
def time_month(x):
    month = x.split('-')
    return int(month[1])

In [6]:
def conv_to_bool(x):
    if x == 'yes':
        return 1
    else:
        return 0

In [7]:
df_train_raw = pd.merge(df_train_raw, df_macro_data, on='timestamp')
df_test_raw = pd.merge(df_test_raw, df_macro_data, on='timestamp')

In [14]:
df_train = pd.DataFrame()
df_test = pd.DataFrame()

feature_cols = ['id', 'full_sq', 'sport_count_5000', 'sport_count_3000', 'trc_count_5000', 'zd_vokzaly_avto_km',
                'sadovoe_km', 'sport_count_2000', 'bulvar_ring_km', 'kremlin_km', 'ttk_km', 'trc_sqm_5000',
                'nuclear_reactor_km', 'sport_count_1500', 'office_sqm_5000', 'sport_objects_raion', 'trc_count_3000',
                'stadium_km', 'cafe_count_5000_price_1000', 'detention_facility_km', 'basketball_km',
                'cafe_count_5000_price_1500', 'office_km', 'cafe_count_5000', 'cafe_count_5000_na_price',
                'university_km', 'trc_sqm_3000', 'cafe_count_5000_price_500', 'workplaces_km',
                'cafe_count_5000_price_2500', 'office_sqm_3000', 'swim_pool_km', 'thermal_power_plant_km',
                'office_count_5000', 'catering_km', 'exhibition_km', 'church_count_5000', 'office_sqm_2000',
                'cafe_count_5000_price_high', 'cafe_count_5000_price_4000', 'big_church_km',
                'school_education_centers_raion', 'sport_count_1000', 'fitness_km', 'metro_min_avto',
                'market_count_5000', 'park_km', 'big_church_count_5000', 'leisure_count_5000',
                'office_sqm_1500', 'ekder_male', 'metro_km_avto', 'trc_count_2000', 'shopping_centers_km',
                'public_healthcare_km', 'ekder_all', 'ekder_female', 'cafe_count_3000_price_1000',
                'office_count_1500', 'raion_popul', 'usdrub', 'eurrub', 'cafe_count_2000', 'theater_km',
                'office_raion', 'indust_part']


for col in feature_cols:
    df_train[col] = df_train_raw[col]
    df_test[col] = df_test_raw[col]


df_train['sale_year'] = df_train_raw['timestamp'].apply(time_year)
df_train['sale_month'] = df_train_raw['timestamp'].apply(time_month)

df_test['sale_year'] = df_test_raw['timestamp'].apply(time_year)
df_test['sale_month'] = df_test_raw['timestamp'].apply(time_month)

df_train['price'] = df_train_raw['price_doc']
df_train['price_eur'] = df_train_raw['price_doc'] / df_train_raw['eurrub']
df_train['price_usd'] = df_train_raw['price_doc'] / df_train_raw['usdrub']


nan_data = df_train['full_sq'] > 2000
print('train: ', nan_data.sum())
df_train = df_train[~nan_data]

nan_data = df_test['full_sq'] > 2000
print('test: ', nan_data.sum())
df_test = df_test[~nan_data]

df_train.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v0.csv', index=False)
df_test.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v0.csv', index=False)

train:  1
test:  0


In [13]:
def clean(dft, name='life_sq_lg1'):
    dft.loc[dft['full_sq'] < dft[name],name] = dft['full_sq'][dft['full_sq'] < dft[name]]
    return dft

In [49]:
df_train = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v0.csv')
df_test = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v0.csv')

df_train = pd.merge(df_train, df_train_raw[['id', 'life_sq']], on='id')
df_test = pd.merge(df_test, df_test_raw[['id', 'life_sq']], on='id')   

df_train['life_sq_lg1'] = df_train['life_sq']
df_test['life_sq_lg1'] = df_test['life_sq']

df_train = clean(df_train)
df_test = clean(df_test)


nan_ind = ~pd.isnull(df_train['life_sq_lg1'])

lgreg_life_sq = LinearRegression(fit_intercept=True)

tmp_feat_arr = np.zeros([df_train['full_sq'][nan_ind].values.shape[0], 1])
tmp_feat_arr[:, 0] = df_train['full_sq'][nan_ind].values[:]
tmp_targ_arr = np.reshape(df_train['life_sq_lg1'][nan_ind].values,
                          (df_train['life_sq_lg1'][nan_ind].values.shape[0], 1))
lgreg_life_sq.fit(tmp_feat_arr, np.log(tmp_targ_arr + 1))
tmp_feat_arr = np.zeros([df_train['full_sq'][~nan_ind].values.shape[0], 1])
tmp_feat_arr[:, 0] = df_train['full_sq'][~nan_ind].values[:]

pred_lsq = lgreg_life_sq.predict(tmp_feat_arr)
df_train.loc[~nan_ind, 'life_sq_lg1'] = np.exp(pred_lsq[:, 0]) - 1
df_train.loc[df_train['life_sq_lg1'] < 0, 'life_sq_lg1'] = 0

nan_ind = ~pd.isnull(df_test['life_sq'])
tmp_feat_arr = np.zeros([df_test['full_sq'][~nan_ind].values.shape[0], 1])
tmp_feat_arr[:, 0] = df_test['full_sq'][~nan_ind].values[:]
pred_lsq = lgreg_life_sq.predict(tmp_feat_arr)
df_test.loc[~nan_ind, 'life_sq_lg1'] = np.exp(pred_lsq[:, 0]) - 1
df_test.loc[df_train['life_sq_lg1'] < 0, 'life_sq_lg1'] = 0

df_train.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v1.csv', index=False)
df_test.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v1.csv', index=False)

In [4]:
df_train = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v1.csv')
df_test = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v1.csv')


df_train = pd.merge(df_train, df_train_raw[['id', 'floor']], on='id')
df_test = pd.merge(df_test, df_test_raw[['id', 'floor']], on='id')   

df_train.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v2.csv', index=False)
df_test.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v2.csv', index=False)

In [108]:
df_train = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v2.csv')
df_test = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v2.csv')

df_train = pd.merge(df_train, df_train_raw[['id', 'max_floor', 'num_room', 'kitch_sq']], on='id')
df_test = pd.merge(df_test, df_test_raw[['id', 'max_floor', 'num_room', 'kitch_sq']], on='id')   

df_train = df_train[~pd.isnull(df_train['floor'])]

df_train['max_floor_lg1'] = df_train['max_floor']
df_test['max_floor_lg1'] = df_test['max_floor']

nan_ind = ~pd.isnull(df_train['max_floor_lg1'])

lgreg_life_sq = LinearRegression(fit_intercept=True)

tmp_feat_arr = np.zeros([df_train['full_sq'][nan_ind].values.shape[0], 3])
tmp_feat_arr[:, 0] = df_train['price_eur'][nan_ind].values[:]**0.5
tmp_feat_arr[:, 1] = df_train['floor'][nan_ind].values[:]**0.5
tmp_feat_arr[:, 2] = df_train['sadovoe_km'][nan_ind].values[:]
tmp_targ_arr = np.reshape(df_train['max_floor_lg1'][nan_ind].values,
                          (df_train['max_floor_lg1'][nan_ind].values.shape[0], 1))
lgreg_life_sq.fit(tmp_feat_arr, np.log(tmp_targ_arr + 1))

tmp_feat_arr = np.zeros([df_train['full_sq'][~nan_ind].values.shape[0], 3])
tmp_feat_arr[:, 0] = df_train['price_eur'][~nan_ind].values[:]**0.5
tmp_feat_arr[:, 1] = df_train['floor'][~nan_ind].values[:]**0.5
tmp_feat_arr[:, 2] = df_train['sadovoe_km'][~nan_ind].values[:]

pred_lsq = lgreg_life_sq.predict(tmp_feat_arr)
df_train.loc[~nan_ind, 'max_floor_lg1'] = np.exp(pred_lsq[:, 0]) - 1
indarr = df_train['max_floor_lg1'] < df_train['floor']
df_train.loc[indarr, 'max_floor_lg1'] = df_train.loc[indarr, 'floor']
df_train.loc[df_train['max_floor_lg1'] < 1, 'max_floor_lg1'] = 1

df_train['kitch_sq_lg1'] = df_train['kitch_sq']
df_test['kitch_sq_lg1'] = df_test['kitch_sq']



lgreg_life_sq = LinearRegression(fit_intercept=True)

tmp_feat_arr = np.zeros([df_train['full_sq'][nan_ind].values.shape[0], 7])
tmp_feat_arr[:, 0] = df_train['life_sq_lg1'][nan_ind].values[:]
tmp_feat_arr[:, 1] = df_train['full_sq'][nan_ind].values[:]
tmp_feat_arr[:, 2] = df_train['life_sq_lg1'][nan_ind].values[:]
tmp_feat_arr[:, 3] = df_train['life_sq_lg1'][nan_ind].values[:] / (df_train['full_sq'][nan_ind].values[:] + 1)
tmp_feat_arr[:, 4] = df_train['ekder_all'][nan_ind].values[:]
tmp_feat_arr[:, 5] = df_train['office_raion'][nan_ind].values[:]
tmp_feat_arr[:, 6] = df_train['sport_count_3000'][nan_ind].values[:]
tmp_targ_arr = np.reshape(df_train['kitch_sq_lg1'][nan_ind].values,
                          (df_train['kitch_sq_lg1'][nan_ind].values.shape[0], 1))
lgreg_life_sq.fit(tmp_feat_arr, np.log(tmp_targ_arr + 1))

tmp_feat_arr = np.zeros([df_train['full_sq'][~nan_ind].values.shape[0], 7])
tmp_feat_arr[:, 0] = df_train['life_sq_lg1'][~nan_ind].values[:]
tmp_feat_arr[:, 1] = df_train['full_sq'][~nan_ind].values[:]
tmp_feat_arr[:, 2] = df_train['life_sq_lg1'][~nan_ind].values[:]
tmp_feat_arr[:, 3] = df_train['life_sq_lg1'][~nan_ind].values[:] / (df_train['full_sq'][~nan_ind].values[:] + 1)
tmp_feat_arr[:, 4] = df_train['ekder_all'][~nan_ind].values[:]
tmp_feat_arr[:, 5] = df_train['office_raion'][~nan_ind].values[:]
tmp_feat_arr[:, 6] = df_train['sport_count_3000'][~nan_ind].values[:]

pred_lsq = lgreg_life_sq.predict(tmp_feat_arr)
df_train.loc[~nan_ind, 'kitch_sq_lg1'] = np.exp(pred_lsq[:, 0]) - 1
df_train.loc[df_train['kitch_sq_lg1'] < 0, 'kitch_sq_lg1'] = 0

df_train.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v3.csv', index=False)
df_test.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v3.csv', index=False)

In [109]:
df_train = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v3.csv')
df_test = pd.read_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v3.csv')

df_train = pd.merge(df_train, df_train_raw[['id', 'state', 'material', 'build_year']], on='id')
df_test = pd.merge(df_test, df_test_raw[['id', 'state', 'material', 'build_year']], on='id')   

df_train['build_year'][pd.isnull(df_train['build_year'])] = 0
df_test['build_year'][pd.isnull(df_test['build_year'])] = 0
# pd.isnull(df_train['build_year']).sum()

df_train['unbuilt'] = (df_train['build_year'] > df_train['sale_year']).apply(lambda x: 1 if x else 0)
df_test['unbuilt'] = (df_test['build_year'] > df_test['sale_year']).apply(lambda x: 1 if x else 0)

df_train = df_train[df_train['state'] != 33.0] 

nan_ind = ~pd.isnull(df_train['state'])
clf = XGBClassifier(max_depth=4, n_estimators=500, learning_rate=0.05)



df_train['state_xg1'] = df_train['state']
df_test['state_xg1'] = df_test['state']



nan_ind = ~pd.isnull(df_train['state_xg1'])

lgreg_life_sq = LinearRegression(fit_intercept=True)

tmp_feat_arr = np.zeros([df_train['state_xg1'][nan_ind].values.shape[0], 4])
tmp_feat_arr[:, 0] = df_train['ekder_all'][nan_ind].values[:]
tmp_feat_arr[:, 1] = df_train['thermal_power_plant_km'][nan_ind].values[:]
tmp_feat_arr[:, 2] = df_train['full_sq'][nan_ind].values[:]
tmp_feat_arr[:, 3] = df_train['kitch_sq_lg1'][nan_ind].values[:]
tmp_targ_arr = np.reshape(df_train['state_xg1'][nan_ind].values,
                          (df_train['state_xg1'][nan_ind].values.shape[0], 1))
clf.fit(tmp_feat_arr, tmp_targ_arr)
tmp_feat_arr = np.zeros([df_train['state_xg1'][~nan_ind].values.shape[0], 4])
tmp_feat_arr[:, 0] = df_train['ekder_all'][~nan_ind].values[:]
tmp_feat_arr[:, 1] = df_train['thermal_power_plant_km'][~nan_ind].values[:]
tmp_feat_arr[:, 2] = df_train['full_sq'][~nan_ind].values[:]
tmp_feat_arr[:, 3] = df_train['kitch_sq_lg1'][~nan_ind].values[:]

pred = clf.predict(tmp_feat_arr)
df_train.loc[~nan_ind, 'state_xg1'] = pred
# df_train.loc[df_train['life_sq_lg1'] < 0, 'life_sq_lg1'] = 0

nan_ind = ~pd.isnull(df_test['state_xg1'])
tmp_feat_arr = np.zeros([df_test['state_xg1'][~nan_ind].values.shape[0], 4])
tmp_feat_arr[:, 0] = df_test['ekder_all'][~nan_ind].values[:]
tmp_feat_arr[:, 1] = df_test['thermal_power_plant_km'][~nan_ind].values[:]
tmp_feat_arr[:, 2] = df_test['full_sq'][~nan_ind].values[:]
tmp_feat_arr[:, 3] = df_test['kitch_sq_lg1'][~nan_ind].values[:]

pred = clf.predict(tmp_feat_arr)
df_test.loc[~nan_ind, 'state_xg1'] = pred

state_enc = OneHotEncoder(sparse=False)
state_onehot = state_enc.fit_transform(df_train['state_xg1'].values.reshape(df_train['state_xg1'].values.shape[0], 1))

for ii in range(state_onehot.shape[1]):
    df_train['state_'+str(ii)] = state_onehot[:, ii]
    
state_onehot = state_enc.transform(df_test['state_xg1'].values.reshape(df_test['state_xg1'].values.shape[0], 1))

for ii in range(state_onehot.shape[1]):
    df_test['state_'+str(ii)] = state_onehot[:, ii]
    
df_train.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/train_v4.csv', index=False)
df_test.to_csv('/Volumes/KM/Archive/KaggleStuff/2017SBER/transformed_data/new/test_v4.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Below are some cells which were used to test various missing data reconstruction approaches

In [94]:
df_train = df_train[~pd.isnull(df_train['state'])]

nan_ind = ~pd.isnull(df_train['state'])

# lgreg_life_sq = LinearRegression(fit_intercept=True)
# lgreg_life_sq = XGBRegressor(max_depth=2, learning_rate=0.01, n_estimators=600, subsample=1., gamma=0.0)
clf = XGBClassifier(max_depth=4, n_estimators=500, learning_rate=0.05)
# clf = GradientBoostingClassifier()

last_ind = 4

tmp_feat_arr = np.zeros([df_train['ekder_all'][nan_ind].values.shape[0], last_ind])


tmp_feat_arr[:, 0] = df_train['ekder_all'][nan_ind].values[:]
tmp_feat_arr[:, 1] = df_train['thermal_power_plant_km'][nan_ind].values[:]
tmp_feat_arr[:, 2] = df_train['full_sq'][nan_ind].values[:]
tmp_feat_arr[:, 3] = df_train['kitch_sq_lg1'][nan_ind].values[:]
output_res = []

# cols_list = []
# for col in cols_list:
#     tmp_feat_arr[:, last_ind - 1] = df_train[col][nan_ind].values[:]**1.

tmp_targ_arr = np.reshape(df_train['state'][nan_ind].values,
                          (df_train['state'][nan_ind].values.shape[0], 1))
n_splits = 3
kf = StratifiedKFold(n_splits=n_splits)
res = []
for train_index, test_index in kf.split(tmp_feat_arr, tmp_targ_arr.ravel()):
    scaler = StandardScaler()
    scaler.fit(tmp_feat_arr[train_index])

    train_features = scaler.transform(tmp_feat_arr[train_index])
    nel = tmp_targ_arr[train_index].shape[0]
    clf.fit(train_features, tmp_targ_arr[train_index])
    pred_pr = clf.predict_proba(scaler.transform(tmp_feat_arr[test_index]))
    nel = tmp_targ_arr[test_index].shape[0]
#         print((tmp_targ_arr[test_index] == 0).sum(), (tmp_targ_arr[test_index] == 1).sum())
    res.append(log_loss(tmp_targ_arr[test_index], pred_pr))

output_res.append([col, np.sum(res) / n_splits])

sorted(output_res, key=lambda x: x[1])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[['max_floor_lg1', 0.75740804191186351]]

In [84]:
df_train = df_train[~pd.isnull(df_train['floor'])]

nan_ind = ~pd.isnull(df_train['max_floor'])

lgreg_life_sq = LinearRegression(fit_intercept=True)
# lgreg_life_sq = XGBRegressor(max_depth=2, learning_rate=0.01, n_estimators=600, subsample=1., gamma=0.0)

last_ind = 4

tmp_feat_arr = np.zeros([df_train['full_sq'][nan_ind].values.shape[0], last_ind])
tmp_feat_arr[:, 0] = df_train['price_eur'][nan_ind].values[:]**0.5
tmp_feat_arr[:, 1] = df_train['floor'][nan_ind].values[:]**0.5
# tmp_feat_arr[:, 2] = df_train['full_sq'][nan_ind].values[:]**0.15
# tmp_feat_arr[:, 1] = df_train['kremlin_km'][nan_ind].values[:] ** 0.8
# tmp_feat_arr[:, 2] = df_train['workplaces_km'][nan_ind].values[:]
# tmp_feat_arr[:, 3] = df_train['church_synagogue_km'][nan_ind].values[:]
# # tmp_feat_arr[:, 4] = df_train['trc_sqm_5000'][nan_ind].values[:]**2.5
# tmp_feat_arr[:, 5] = df_train['hospice_morgue_km'][nan_ind].values[:]**1.5

output_res = []

cols_list = ['full_sq', 'sadovoe_km', 'bulvar_ring_km', ]

for col in cols_list:
    tmp_feat_arr[:, last_ind - 1] = df_train[col][nan_ind].values[:]**1.

    tmp_targ_arr = np.reshape(df_train['max_floor'][nan_ind].values,
                              (df_train['max_floor'][nan_ind].values.shape[0], 1))
    n_splits = 4
    kf = KFold(n_splits=n_splits)
    res = []
    for train_index, test_index in kf.split(tmp_feat_arr):
        scaler = StandardScaler()
        scaler.fit(tmp_feat_arr[train_index])

        train_features = scaler.transform(tmp_feat_arr[train_index])

        lgreg_life_sq.fit(train_features, np.log(tmp_targ_arr[train_index] + 1))
        pred = lgreg_life_sq.predict(scaler.transform(tmp_feat_arr[test_index]))

        pred = np.exp(pred) - 1
#         res.append(np.sqrt(np.mean(np.square(np.log(pred + 1) - np.log(tmp_targ_arr[test_index] + 1)))))
        res.append(np.sqrt(np.mean(np.square(np.log(pred + 1) - np.log(tmp_targ_arr[test_index] + 1)))))
#     print(res, np.sum(res) / n_splits)
    output_res.append([col, np.sum(res) / n_splits])
#     print(col, np.sum(res) / n_splits)
sorted(output_res, key=lambda x: x[1])

[['sadovoe_km', 0.68258484628061311],
 ['bulvar_ring_km', 0.68267852629789894],
 ['full_sq', 0.68405072657366195]]

In [90]:
targ = 'kitch_sq'


df_train = df_train[~pd.isnull(df_train['floor'])]

nan_ind = ~pd.isnull(df_train[targ])

lgreg_life_sq = LinearRegression(fit_intercept=True)
# lgreg_life_sq = XGBRegressor(max_depth=2, learning_rate=0.01, n_estimators=600, subsample=1., gamma=0.0)

last_ind = 7

tmp_feat_arr = np.zeros([df_train['full_sq'][nan_ind].values.shape[0], last_ind])
tmp_feat_arr[:, 0] = df_train['price_eur'][nan_ind].values[:]
tmp_feat_arr[:, 1] = df_train['life_sq_lg1'][nan_ind].values[:]
tmp_feat_arr[:, 2] = df_train['full_sq'][nan_ind].values[:]
tmp_feat_arr[:, 3] = df_train['life_sq_lg1'][nan_ind].values[:] / (df_train['full_sq'][nan_ind].values[:] + 1)
tmp_feat_arr[:, 4] = df_train['ekder_all'][nan_ind].values[:]
tmp_feat_arr[:, 5] = df_train['office_raion'][nan_ind].values[:]
tmp_feat_arr[:, 6] = df_train['sport_count_3000'][nan_ind].values[:]
# tmp_feat_arr[:, 1] = df_train['kremlin_km'][nan_ind].values[:] ** 0.8
# tmp_feat_arr[:, 2] = df_train['workplaces_km'][nan_ind].values[:]
# tmp_feat_arr[:, 3] = df_train['church_synagogue_km'][nan_ind].values[:]
# # tmp_feat_arr[:, 4] = df_train['trc_sqm_5000'][nan_ind].values[:]**2.5
# tmp_feat_arr[:, 5] = df_train['hospice_morgue_km'][nan_ind].values[:]**1.5

output_res = []

cols_list = ['full_sq', 'sport_count_5000', 'sport_count_3000', 'trc_count_5000', 'zd_vokzaly_avto_km',
                'sadovoe_km', 'sport_count_2000', 'bulvar_ring_km', 'kremlin_km', 'ttk_km', 'trc_sqm_5000',
                'nuclear_reactor_km', 'sport_count_1500', 'office_sqm_5000', 'sport_objects_raion', 'trc_count_3000',
                'stadium_km', 'cafe_count_5000_price_1000', 'detention_facility_km', 'basketball_km',
                'cafe_count_5000_price_1500', 'office_km', 'cafe_count_5000', 'cafe_count_5000_na_price',
                'university_km', 'trc_sqm_3000', 'cafe_count_5000_price_500', 'workplaces_km',
                'cafe_count_5000_price_2500', 'office_sqm_3000', 'swim_pool_km', 'thermal_power_plant_km',
                'office_count_5000', 'catering_km', 'exhibition_km', 'church_count_5000', 'office_sqm_2000',
                'cafe_count_5000_price_high', 'cafe_count_5000_price_4000', 'big_church_km',
                'school_education_centers_raion', 'sport_count_1000', 'fitness_km', 'metro_min_avto',
                'market_count_5000', 'park_km', 'big_church_count_5000', 'leisure_count_5000',
                'office_sqm_1500', 'ekder_male', 'metro_km_avto', 'trc_count_2000', 'shopping_centers_km',
                'public_healthcare_km', 'ekder_female', 'cafe_count_3000_price_1000',
                'office_count_1500', 'raion_popul', 'usdrub', 'eurrub', 'cafe_count_2000', 'theater_km',
                'office_raion', 'indust_part']

for col in cols_list:
    tmp_feat_arr[:, last_ind - 1] = df_train[col][nan_ind].values[:]**1.

    tmp_targ_arr = np.reshape(df_train[targ][nan_ind].values,
                              (df_train[targ][nan_ind].values.shape[0], 1))
    n_splits = 4
    kf = KFold(n_splits=n_splits)
    res = []
    for train_index, test_index in kf.split(tmp_feat_arr):
        scaler = StandardScaler()
        scaler.fit(tmp_feat_arr[train_index])

        train_features = scaler.transform(tmp_feat_arr[train_index])

        lgreg_life_sq.fit(train_features, np.log(tmp_targ_arr[train_index] + 1))
        pred = lgreg_life_sq.predict(scaler.transform(tmp_feat_arr[test_index]))

        pred = np.exp(pred) - 1
#         res.append(np.sqrt(np.mean(np.square(np.log(pred + 1) - np.log(tmp_targ_arr[test_index] + 1)))))
        res.append(np.sqrt(np.mean(np.square(np.log(pred + 1) - np.log(tmp_targ_arr[test_index] + 1)))))
#     print(res, np.sum(res) / n_splits)
    output_res.append([col, np.sum(res) / n_splits])
#     print(col, np.sum(res) / n_splits)
sorted(output_res, key=lambda x: x[1])

[['sport_count_3000', 0.6952740393938861],
 ['sport_count_5000', 0.69554000134020255],
 ['trc_count_5000', 0.6958130703714942],
 ['sport_count_2000', 0.69801556209740245],
 ['sport_count_1500', 0.69965608335033691],
 ['trc_sqm_5000', 0.69969018833236019],
 ['market_count_5000', 0.70163349535839725],
 ['cafe_count_5000_price_1000', 0.70245196477818594],
 ['cafe_count_5000_price_500', 0.70247545372726472],
 ['trc_count_3000', 0.70250754837103357],
 ['cafe_count_5000_na_price', 0.70283095461487921],
 ['office_count_5000', 0.70317755370223334],
 ['cafe_count_5000', 0.7035759005064709],
 ['cafe_count_5000_price_1500', 0.70383113113769724],
 ['office_sqm_5000', 0.70400490196135623],
 ['big_church_count_5000', 0.70433399431501564],
 ['sport_count_1000', 0.70442632532885496],
 ['office_sqm_3000', 0.70448668996528907],
 ['church_count_5000', 0.70457526639150947],
 ['zd_vokzaly_avto_km', 0.70467158022617415],
 ['workplaces_km', 0.7047980026950863],
 ['trc_sqm_3000', 0.70511158153291209],
 ['fitn

In [107]:
targ = 'num_room'


df_train = df_train[~pd.isnull(df_train['floor'])]

nan_ind = ~pd.isnull(df_train[targ])

lgreg_life_sq = LinearRegression(fit_intercept=True)
# lgreg_life_sq = XGBRegressor(max_depth=2, learning_rate=0.01, n_estimators=600, subsample=1., gamma=0.0)

last_ind = 7

tmp_feat_arr = np.zeros([df_train['full_sq'][nan_ind].values.shape[0], last_ind])
tmp_feat_arr[:, 0] = df_train['price_eur'][nan_ind].values[:]
tmp_feat_arr[:, 1] = df_train['life_sq_lg1'][nan_ind].values[:]
tmp_feat_arr[:, 2] = df_train['full_sq'][nan_ind].values[:]
# tmp_feat_arr[:, 3] = df_train['kitch_sq_lg1'][nan_ind].values[:]
# tmp_feat_arr[:, 4] = df_train['life_sq_lg1'][nan_ind].values[:] / (df_train['full_sq'][nan_ind].values[:] + 1)
# tmp_feat_arr[:, 1] = df_train['kremlin_km'][nan_ind].values[:] ** 0.8
# tmp_feat_arr[:, 2] = df_train['workplaces_km'][nan_ind].values[:]
# tmp_feat_arr[:, 3] = df_train['church_synagogue_km'][nan_ind].values[:]
# # tmp_feat_arr[:, 4] = df_train['trc_sqm_5000'][nan_ind].values[:]**2.5
# tmp_feat_arr[:, 5] = df_train['hospice_morgue_km'][nan_ind].values[:]**1.5

output_res = []

cols_list = ['full_sq', 'sport_count_5000', 'sport_count_3000', 'trc_count_5000', 'zd_vokzaly_avto_km',
                'sadovoe_km', 'sport_count_2000', 'bulvar_ring_km', 'kremlin_km', 'ttk_km', 'trc_sqm_5000',
                'nuclear_reactor_km', 'sport_count_1500', 'office_sqm_5000', 'sport_objects_raion', 'trc_count_3000',
                'stadium_km', 'cafe_count_5000_price_1000', 'detention_facility_km', 'basketball_km',
                'cafe_count_5000_price_1500', 'office_km', 'cafe_count_5000', 'cafe_count_5000_na_price',
                'university_km', 'trc_sqm_3000', 'cafe_count_5000_price_500', 'workplaces_km',
                'cafe_count_5000_price_2500', 'office_sqm_3000', 'swim_pool_km', 'thermal_power_plant_km',
                'office_count_5000', 'catering_km', 'exhibition_km', 'church_count_5000', 'office_sqm_2000',
                'cafe_count_5000_price_high', 'cafe_count_5000_price_4000', 'big_church_km',
                'school_education_centers_raion', 'sport_count_1000', 'fitness_km', 'metro_min_avto',
                'market_count_5000', 'park_km', 'big_church_count_5000', 'leisure_count_5000',
                'office_sqm_1500', 'ekder_male', 'metro_km_avto', 'trc_count_2000', 'shopping_centers_km',
                'public_healthcare_km', 'ekder_female', 'cafe_count_3000_price_1000',
                'office_count_1500', 'raion_popul', 'usdrub', 'eurrub', 'cafe_count_2000', 'theater_km',
                'office_raion', 'indust_part']

for col in cols_list:
#     tmp_feat_arr[:, last_ind - 1] = df_train[col][nan_ind].values[:]**1.

    tmp_targ_arr = np.reshape(df_train[targ][nan_ind].values,
                              (df_train[targ][nan_ind].values.shape[0], 1))
    n_splits = 4
    kf = KFold(n_splits=n_splits)
    res = []
    for train_index, test_index in kf.split(tmp_feat_arr):
        scaler = StandardScaler()
        scaler.fit(tmp_feat_arr[train_index])

        train_features = scaler.transform(tmp_feat_arr[train_index])

        lgreg_life_sq.fit(train_features, np.log(tmp_targ_arr[train_index] + 1))
        pred = lgreg_life_sq.predict(scaler.transform(tmp_feat_arr[test_index]))

        pred = np.exp(pred) - 1
#         res.append(np.sqrt(np.mean(np.square(np.log(pred + 1) - np.log(tmp_targ_arr[test_index] + 1)))))
        res.append(np.sqrt(np.mean(np.square(np.log(pred + 1) - np.log(tmp_targ_arr[test_index] + 1)))))
#     print(res, np.sum(res) / n_splits)
    output_res.append([col, np.sum(res) / n_splits])
#     print(col, np.sum(res) / n_splits)
sorted(output_res, key=lambda x: x[1])

[['full_sq', 0.20291984253450826],
 ['sport_count_5000', 0.20291984253450826],
 ['sport_count_3000', 0.20291984253450826],
 ['trc_count_5000', 0.20291984253450826],
 ['zd_vokzaly_avto_km', 0.20291984253450826],
 ['sadovoe_km', 0.20291984253450826],
 ['sport_count_2000', 0.20291984253450826],
 ['bulvar_ring_km', 0.20291984253450826],
 ['kremlin_km', 0.20291984253450826],
 ['ttk_km', 0.20291984253450826],
 ['trc_sqm_5000', 0.20291984253450826],
 ['nuclear_reactor_km', 0.20291984253450826],
 ['sport_count_1500', 0.20291984253450826],
 ['office_sqm_5000', 0.20291984253450826],
 ['sport_objects_raion', 0.20291984253450826],
 ['trc_count_3000', 0.20291984253450826],
 ['stadium_km', 0.20291984253450826],
 ['cafe_count_5000_price_1000', 0.20291984253450826],
 ['detention_facility_km', 0.20291984253450826],
 ['basketball_km', 0.20291984253450826],
 ['cafe_count_5000_price_1500', 0.20291984253450826],
 ['office_km', 0.20291984253450826],
 ['cafe_count_5000', 0.20291984253450826],
 ['cafe_count_5