In [78]:
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import warnings
import sys
import datetime
import scipy as sp
import statsmodels.stats.api as sms
import statsmodels.api as sm
from patsy import dmatrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

min_corr = 0.3

In [85]:
# %%writefile preprocessing.py

def clean_data(df):
    # build_year 1500이전 nan으로
    df.loc[df.build_year < 1500, 'build_year'] = np.nan
    df.loc[df.build_year > 2016, 'build_year'] = np.nan
    
    # floor가 0이면 nan으로
    df.loc[df.floor==0, 'floor'] = np.nan
    
    # max_floor가 0이면 nan으로
    df.loc[df.max_floor==0, 'max_floor'] = np.nan
    
    # max_floor가 floor보다 크면 nan으로
    df.loc[df.floor>df.max_floor, 'max_floor'] = np.nan
    
    # full_sq, life_sq, kitch_sq가 0이면 nan으로
    df.loc[df.full_sq==0, 'full_sq'] = np.nan
    df.loc[df.life_sq==0, 'life_sq'] = np.nan
    df.loc[df.kitch_sq==0, 'kitch_sq'] = np.nan
    
    # full_sq가 life_sq보다 작으면 nan으로
    df.loc[df.life_sq>df.full_sq, 'life_sq'] = np.nan
    
    # kitch_sq가 life_sq보다 크면 nan으로
    df.loc[df.kitch_sq>df.life_sq, 'kitch_sq'] = np.nan
    
    df.loc[df.state == 33, 'state'] = 3
    
    df.loc[df.full_sq > 210, 'full_sq'] == np.nan
    df.loc[df.full_sq > 200, 'full_sq'] == np.nan    

    df.loc[df.num_room < 0, 'num_room'] = np.nan
    
    df['material'].fillna(0, inplace=True)
    
    # 이상한 숫자값들 45,34 ...
    if 'modern_education_share' in df: del df['modern_education_share']
    if 'old_education_build_share' in df: del df['old_education_build_share']
    if 'child_on_acc_pre_school' in df: del df['child_on_acc_pre_school']
        
    consts = [col for col in df.columns if len(df[col].value_counts().index) == 1]
    for const in consts:
        del df[const]
        
    df = df.replace(['no data'], ['nodata'])
    
#     # 뉴머릭한 카테고리컬 독립변수들인데 유니크값이 너무 많아서 없앤다.
#     del df['ID_railroad_station_walk']
#     del df['ID_railroad_station_avto']
#     del df['ID_big_road1']
#     del df['ID_big_road2']
#     del df['ID_railroad_terminal']
#     del df['ID_bus_terminal']
#     del df['ID_metro']
#     # too many dummy variables
#     del df['sub_area']
    
#     50% 이상 미싱 데이터가 있으면 없애버린다
    if 'provision_retail_space_sqm' in df: del df['provision_retail_space_sqm']
    if 'theaters_viewers_per_1000_cap' in df: del df['theaters_viewers_per_1000_cap']
    if 'museum_visitis_per_100_cap' in df: del df['museum_visitis_per_100_cap']
    
    # material은 카테고리
#     df['material'] = df['material'].astype(np.str, copy=False)
#     df['material'] = df['material'].replace([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0], ['a', 'b', 'c', 'd', 'e', 'f', 'e'])
    return df


def col_renames(df):
    df.rename(columns={'build_count_1921-1945': 'build_count_1921_1945', 'build_count_1946-1970': 'build_count_1946_1970', 'build_count_1971-1995': 'build_count_1971_1995'}, inplace=True)
    return df

def del_many_unique(df):
 # 뉴머릭한 카테고리컬 독립변수들인데 유니크값이 너무 많아서 없앤다.
    del df['ID_railroad_station_walk']
    del df['ID_railroad_station_avto']
    del df['ID_big_road1']
    del df['ID_big_road2']
    del df['ID_railroad_terminal']
    del df['ID_bus_terminal']
    del df['ID_metro']
    # too many dummy variables
    del df['sub_area']
    del df['prom_part_3000']
    return df

def categorize(df):
    df['material'] = df['material'].astype(np.object, copy=False)
    df['material'] = df['material'].replace([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0], ['a', 'b', 'c', 'd', 'e', 'f', 'e'])
# def find_missing_data_columns(df):
#     missing_df = df.isnull().sum(axis=0).reset_index()
#     missing_df.columns = ['missing_column', 'missing_count']
#     missing_df = missing_df.loc[missing_df['missing_count'] > 0]
#     return missing_df


def impute_num_mode(df):
    for col in df._get_numeric_data().columns[df._get_numeric_data().isnull().any()]:
        df[col].fillna(df[col].mean(), inplace=True)

def imput_cat_mode(df):
    for col in df.column[df.isnull().any()].tolist():
        df[col].fillna(df[col].mean(), inplace=True)
        
def apply_log(df, numeric_cols):
    for col in numeric_cols:
        min_val = min(df[col].value_counts().index)
        if min_val < 0:
            df[col] -= min_val
            df[col] += 1
        else:
            df[col] += 1
    df[numeric_cols].apply(np.log)

def scale_up_positive(df, numeric_cols):
    for col in numeric_cols:
        min_val = min(df[col].value_counts().index)
        if min_val < 0:
            df[col] -= min_val
            df[col] += 1
        else:
            df[col] += 1
            
def remove_outliers(df, formula, repeat=1):
    result = None
    for i in range(repeat):
        model = sm.OLS.from_formula(formula, data=df)
        result = model.fit()
        influence = result.get_influence()
        distances, pvalues = influence.cooks_distance
        threshold = 4/(len(distances) - len(df.columns.drop(['_timestamp', '_price_doc']))-1)
        outliers = [idx for idx, d in enumerate(distances) if d > threshold]
        df.drop(df.index[outliers], inplace=True)
    return df, model, result

def remove_features_by_vif(df):
    features_to_remove = [ 
        'raion_popul', \
        'preschool_education_centers_raion', \
        'school_education_centers_raion', \
        'sport_objects_raion', \
        'office_raion', \
        'young_all', \
        'work_all', \
        'ekder_all', \
        '0_17_all', \
        'raion_build_count_with_material_info', \
        'raion_build_count_with_builddate_info', \
        'build_count_1946-1970', \
        'metro_min_avto', \
        'metro_km_avto', \
        'metro_min_walk', \
        'school_km', \
        'park_km', \
        'railroad_station_walk_min', \
        'railroad_station_avto_min', \
        'ttk_km', \
        'sadovoe_km', \
        'bulvar_ring_km', \
        'kremlin_km', \
        'zd_vokzaly_avto_km', \
        'bus_terminal_avto_km', \
        'oil_chemistry_km', \
        'nuclear_reactor_km', \
        'radiation_km', \
        'power_transmission_line_km', \
        'thermal_power_plant_km', \
        'ts_km', \
        'swim_pool_km', \
        'ice_rink_km', \
        'stadium_km', \
        'basketball_km', \
        'detention_facility_km', \
        'public_healthcare_km', \
        'university_km', \
        'workplaces_km', \
        'shopping_centers_km', \
        'preschool_km', \
        'big_church_km', \
        'mosque_km', \
        'theater_km', \
        'museum_km', \
        'exhibition_km', \
        'cafe_count_500', \
        'cafe_sum_500_min_price_avg', \
        'cafe_avg_price_500', \
        'office_count_1000', \
        'cafe_count_1000', \
        'cafe_sum_1000_min_price_avg', \
        'cafe_sum_1000_max_price_avg', \
        'cafe_avg_price_1000', \
        'cafe_count_1000_na_price', \
        'cafe_count_1000_price_1000', \
        'cafe_count_1000_price_1500', \
        'office_count_1500', \
        'cafe_count_1500', \
        'cafe_sum_1500_max_price_avg', \
        'cafe_avg_price_1500', \
        'cafe_count_1500_na_price', \
        'cafe_count_1500_price_500', \
        'cafe_count_1500_price_1000', \
        'cafe_count_1500_price_1500', \
        'cafe_count_1500_price_2500', \
        'cafe_count_1500_price_high', \
        'leisure_count_1500', \
        'sport_count_1500', \
        'green_part_2000', \
        'office_count_2000', \
        'office_sqm_2000', \
        'trc_count_2000', \
        'cafe_count_2000', \
        'cafe_sum_2000_min_price_avg', \
        'cafe_sum_2000_max_price_avg', \
        'cafe_avg_price_2000', \
        'cafe_count_2000_na_price', \
        'cafe_count_2000_price_500', \
        'cafe_count_2000_price_1000', \
        'cafe_count_2000_price_1500', \
        'cafe_count_2000_price_2500', \
        'cafe_count_2000_price_high', \
        'sport_count_2000', \
        'green_part_3000', \
        'office_count_3000', \
        'office_sqm_3000', \
        'trc_count_3000', \
        'cafe_count_3000', \
        'cafe_count_3000_na_price', \
        'cafe_count_3000_price_500', \
        'cafe_count_3000_price_1000', \
        'cafe_count_3000_price_1500', \
        'cafe_count_3000_price_2500', \
        'cafe_count_3000_price_4000', \
        'cafe_count_3000_price_high', \
        'big_church_count_3000', \
        'church_count_3000', \
        'leisure_count_3000', \
        'sport_count_3000', \
        'green_part_5000',\
        'office_count_5000',\
        'office_sqm_5000',\
        'trc_count_5000',\
        'trc_sqm_5000',\
        'cafe_count_5000',\
        'cafe_count_5000_na_price', \
        'cafe_count_5000_price_500', \
        'cafe_count_5000_price_1000', \
        'cafe_count_5000_price_1500', \
        'cafe_count_5000_price_2500', \
        'cafe_count_5000_price_4000', \
        'cafe_count_5000_price_high', \
        'big_church_count_5000', \
        'church_count_5000', \
        'leisure_count_5000', \
        'sport_count_5000', \
        'market_count_5000', \
        'avg_price_ID_metro', \
        'avg_price_ID_railroad_station_walk', \
        'avg_price_ID_big_road1', \
        'avg_price_ID_big_road2', \
        'avg_price_ID_railroad_terminal', \
        'avg_price_ID_bus_terminal', \
        'avg_price_sub_area' \
]
for f in features_to_remove:
    if f in df_train:
        del df_train[f]

def scale_up_positive(df, numeric_cols):
    for col in numeric_cols:
        min_val = min(df[col].value_counts().index)
        if min_val < 0:
            df[col] -= min_val
            df[col] += 1
        else:
            df[col] += 1

def remove_features_by_high_corr(df):            
    features_to_remove = [
        'children_preschool', 'children_school', 'male_f', \
        'female_f', 'young_male', 'young_female', 'work_male', \
        'work_female', 'ekder_male', 'ekder_female', '16_29_all',\
        '0_6_all', '0_6_male', '0_6_female',\
        '7_14_all', '7_14_male', '7_14_female', '0_17_male', '0_17_female',\
        '16_29_male', '16_29_female', '0_13_all', '0_13_male', '0_13_female',\
        'metro_km_walk', 'railroad_station_walk_km',\
        'railroad_station_avto_km', 'public_transport_station_km' \
    ]
    for f in features_to_remove:
        del df[f]
        
def remove_outliers(df, formula, repeat=1):
    result = None
    for i in range(repeat):
        model = sm.OLS.from_formula(formula, data=df)
        result = model.fit()
        influence = result.get_influence()
        distances, pvalues = influence.cooks_distance
        threshold = 4/(len(distances) - len(df.columns.drop(['_timestamp', '_price_doc']))-1)
        outliers = [idx for idx, d in enumerate(distances) if d > threshold]
        df.drop(df.index[outliers], inplace=True)
    return df, model, result


In [26]:
df_macro = pd.read_csv('./data/macro.csv', parse_dates=['timestamp'])
df_train = pd.read_csv('./data/train.csv', index_col=0, parse_dates=['timestamp'])
df_test = pd.read_csv('./data/test.csv', index_col=0, parse_dates=['timestamp'])

In [27]:
######################################
# Drop data with extremely big price #
######################################
df_train = df_train.drop([2121]) 

######################################
# Replace outliers with proper value #
######################################
df_train.loc[df_train.state == 33, 'state'] = 3
df_train.loc[df_train['life_sq'] > 1000,     'life_sq']       = np.median(df_train['life_sq'].dropna())
df_train.loc[df_train['kitch_sq'] > 250,     'kitch_sq']      = np.median(df_train['kitch_sq'].dropna())
df_train.loc[df_train['num_room'] > 6,       'num_room']      = np.median(df_train['num_room'].dropna())
df_train.loc[df_train['build_year'] > 2017,  'build_year']    = np.median(df_train['build_year'].dropna())
df_train.loc[df_train['build_year'] < 1800,  'build_year']    = np.median(df_train['build_year'].dropna())
df_train.loc[df_train['floor'] > 50,         'floor']         = np.median(df_train['floor'].dropna())
df_train.loc[df_train['max_floor'] > 60,     'max_floor']     = np.median(df_train['max_floor'].dropna())
df_train.loc[df_train.full_sq == 0, 'full_sq'] = 50
df_train = df_train[df_train.price_doc/df_train.full_sq <= 600000]
df_train = df_train[df_train.price_doc/df_train.full_sq >= 10000]

df_test.loc[df_test['life_sq'] > 1000,     'life_sq']       = np.median(df_test['life_sq'].dropna())
df_test.loc[df_test['kitch_sq'] > 250,     'kitch_sq']      = np.median(df_test['kitch_sq'].dropna())
df_test.loc[df_test['num_room'] > 6,       'num_room']      = np.median(df_test['num_room'].dropna())
df_test.loc[df_test['build_year'] > 2017,  'build_year']    = np.median(df_test['build_year'].dropna())
df_test.loc[df_test['build_year'] < 1800,  'build_year']    = np.median(df_test['build_year'].dropna())
df_test.loc[df_test['floor'] > 50,         'floor']         = np.median(df_test['floor'].dropna())
df_test.loc[df_test['max_floor'] > 60,     'max_floor']     = np.median(df_test['max_floor'].dropna())
df_test.loc[df_test.full_sq == 0, 'full_sq'] = 50

In [28]:
#############################
# Add month and day of week #
#############################
df_train['month'] = df_train.timestamp.dt.month
df_train['dow'] = df_train.timestamp.dt.dayofweek

df_test['month'] = df_test.timestamp.dt.month
df_test['dow'] = df_test.timestamp.dt.dayofweek

#######################################
# Create new features that might help #
#######################################
df_train['rel_floor'] = df_train['floor'] / df_train['max_floor'].astype(float)
df_train['rel_kitch_sq'] = df_train['kitch_sq'] / df_train['full_sq'].astype(float)

df_test['rel_floor'] = df_test['floor'] / df_test['max_floor'].astype(float)
df_test['rel_kitch_sq'] = df_test['kitch_sq'] / df_test['full_sq'].astype(float)

df_train.apartment_name=df_train.sub_area + df_train['metro_km_avto'].astype(str)
df_test.apartment_name=df_test.sub_area + df_train['metro_km_avto'].astype(str)

df_train['room_size'] = df_train['life_sq'] / df_train['num_room'].astype(float)
df_test['room_size'] = df_test['life_sq'] / df_test['num_room'].astype(float)



In [29]:
####################################################
# Average price corresponding to sub_area and ID_* #
####################################################

id_features = ['ID_metro',
    'ID_railroad_station_walk', \
    'ID_big_road1', \
    'ID_big_road2', \
    'ID_railroad_terminal', \
    'ID_bus_terminal']

for id_f in id_features:
    df_test['avg_price_' + id_f] = 0.0
    for val in df_test[id_f].unique():
        if val == 171 and id_f == 'ID_metro':
            df_test.loc[df_test.ID_metro == 171, 'avg_price_ID_metro'] = df_train[df_train.ID_metro == 170]['price_doc'].mean()
            continue
        if val == 132 and id_f == 'ID_railroad_station_walk':
            df_test.loc[df_test.ID_railroad_station_walk == 132, 'avg_price_ID_railroad_station_walk'] = df_train[df_train.ID_railroad_station_walk == 131]['price_doc'].mean()
            continue
        if val == 121 and id_f == 'ID_railroad_station_walk':
            df_test.loc[df_test.ID_railroad_station_walk == 122, 'avg_price_ID_railroad_station_walk'] = df_train[df_train.ID_railroad_station_walk == 131]['price_doc'].mean()
            continue
        avg = df_train[df_train[id_f] == val]['price_doc'].mean()
        df_test.loc[df_test[id_f] == val, 'avg_price_' + id_f] = avg
    del df_test[id_f]
    
for id_f in id_features:
    df_train['avg_price_' + id_f] = 0.0
    for val in df_train[id_f].unique():
        avg = df_train[df_train[id_f] == val]['price_doc'].mean()
        df_train.loc[df_train[id_f] == val, 'avg_price_' + id_f] = avg
    del df_train[id_f]
    
cols = list(df_train.columns.values)
cols.pop(cols.index('price_doc'))
df_train = df_train[cols + ['price_doc']]


df_test['avg_price_sub_area'] = 0.0
df_train['avg_price_sub_area'] = 0.0
for subarea in df_train['sub_area'].unique():
    avg = df_train[df_train['sub_area'] == subarea]['price_doc'].mean()
    df_train.loc[df_train['sub_area'] == subarea, 'avg_price_sub_area'] = avg
    df_test.loc[df_test['sub_area'] == subarea, 'avg_price_sub_area'] = avg
del df_train['sub_area']
del df_test['sub_area']

In [30]:
###########
# numeric #
###########
for col in df_train._get_numeric_data().columns[df_train._get_numeric_data().columns.isnull().any()].tolist():
    df_train[col].fillna(df_train[col].mean(), inplace=True)
for col in df_test._get_numeric_data().columns[df_test._get_numeric_data().columns.isnull().any()].tolist():
    df_test[col].fillna(df_train[col].mean(), inplace=True)

###############
# categorical #
###############
for col in df_train.columns[df_train.isnull().any()].tolist():
    df_train[col].fillna(df_train[col].value_counts().index[0], inplace=True)
for col in df_test.columns[df_test.isnull().any()].tolist():
    df_test[col].fillna(df_train[col].value_counts().index[0], inplace=True)

In [31]:
# def pick_highly_correlated_features(df, columns, min_corr):
#     pairs = []
#     for col in columns:
#         if not np.issubdtype(df[col].dtype, np.number):
#             continue
#         corrs = [(col, c, abs(df[col].corr(df[c]))) for c in df.columns.values.tolist() if c != col]
#         corrs.sort(key=lambda item: item[2], reverse=True)
#         for item in corrs:
#             if item[2] > min_corr:
#                 pairs.append(item)
#             else:
#                 break
#     return pd.DataFrame(pairs, columns=['missing_col', 'highest corr with', 'corr'])

# def pick_highly_correlated_IVs(df, target_col, min_corr, min_unique_values = 0):
#     if not np.issubdtype(df[target_col].dtype, np.number):
#         Exception('{}은 numeric data가 아닙니다.'.format(target_col))
#     # if len(df[col].value_counts().index) < min_unique_values:
#     #     Exception('{}로 상관관계를 계산하기에는 유니크한 값이 너무 작습니다.'.format(col))

#     corrs = []
#     for col in df._get_numeric_data().drop(target_col, axis=1).columns:
#         if len(df[col].value_counts().index) < min_unique_values: continue
#         corr = abs(df[target_col].corr(df[col]))
#         if corr > min_corr:
#             corrs.append((col, corr))
    
#     return corrs 


In [33]:
#############################################
# Features with high correlation with other #
#############################################
features_to_remove = [
    'children_preschool', 'children_school', 'male_f', \
    'female_f', 'young_male', 'young_female', 'work_male', \
    'work_female', 'ekder_male', 'ekder_female',\
    '0_6_all', '0_6_male', '0_6_female',\
    '7_14_all', '7_14_male', '7_14_female', '0_17_male', '0_17_female',\
    '16_29_male', '16_29_female', '0_13_all', '0_13_male', '0_13_female',\
    'metro_km_walk', 'railroad_station_walk_km',\
    'railroad_station_avto_km', 'public_transport_station_km' \
]
for f in features_to_remove:
    del df_train[f]
    del df_test[f]
    

In [34]:
################################
# Macro features with bad data #
################################
del df_macro['modern_education_share']
del df_macro['old_education_build_share']
del df_macro['child_on_acc_pre_school']

In [35]:
#####################  
# Constant features #
#####################
consts = [col for col in df_train.columns if len(df_train[col].value_counts().index) == 1]
for const in consts:
    del df_train[const]
    del df_test[const]
    
consts = [col for col in df_macro.columns if len(df_macro[col].value_counts().index) == 1]
for const in consts:
    del df_macro[const]

In [37]:
##############################
# Low correlation with price #
##############################
corr_limit = 0.1
for column in df_train._get_numeric_data().columns.drop('price_doc').values:
    if abs(df_train[column].corr(df_train['price_doc'])) < corr_limit:
        df_train = df_train.drop(column, axis=1)
        if column in df_test.columns.values:
            df_test = df_test.drop(column, axis=1)
            
for column in df_macro._get_numeric_data().columns.values:
    if abs(df_macro[column].corr(df_train['price_doc'])) < corr_limit:
        df_macro = df_macro.drop(column, axis=1)

In [79]:
list(vif.loc[vif['VIF Factor'] > 30].features)

['raion_popul',
 'preschool_education_centers_raion',
 'school_education_centers_raion',
 'sport_objects_raion',
 'office_raion',
 'young_all',
 'work_all',
 'ekder_all',
 '0_17_all',
 'raion_build_count_with_material_info',
 'raion_build_count_with_builddate_info',
 'build_count_1946-1970',
 'metro_min_avto',
 'metro_km_avto',
 'metro_min_walk',
 'school_km',
 'park_km',
 'railroad_station_walk_min',
 'railroad_station_avto_min',
 'ttk_km',
 'sadovoe_km',
 'bulvar_ring_km',
 'kremlin_km',
 'zd_vokzaly_avto_km',
 'bus_terminal_avto_km',
 'oil_chemistry_km',
 'nuclear_reactor_km',
 'radiation_km',
 'power_transmission_line_km',
 'thermal_power_plant_km',
 'ts_km',
 'swim_pool_km',
 'ice_rink_km',
 'stadium_km',
 'basketball_km',
 'detention_facility_km',
 'public_healthcare_km',
 'university_km',
 'workplaces_km',
 'shopping_centers_km',
 'preschool_km',
 'big_church_km',
 'mosque_km',
 'theater_km',
 'museum_km',
 'exhibition_km',
 'cafe_count_500',
 'cafe_sum_500_min_price_avg',
 'caf

In [55]:
#################
# Calculate VIF #
#################
df_train[df_train==np.inf]=np.nan
df_train.fillna(df_train.mean(), inplace=True)
categorial_ivs = set(df_train.columns.drop('timestamp')) - set(df_train._get_numeric_data().columns)
numeric_ivs = df_train._get_numeric_data().columns.drop('price_doc')
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(
    df_train[numeric_ivs].values, i) for i in range(df_train[numeric_ivs].shape[1])]
vif["features"] = df_train[numeric_ivs].columns

Unnamed: 0,VIF Factor,features
0,14.76838,full_sq
1,8.198999,life_sq
2,3.734615,floor
3,17.81401,num_room
4,2.310847,kitch_sq
5,16.03865,state
6,21.40726,area_m
7,inf,raion_popul
8,113.0173,preschool_education_centers_raion
9,98.54285,school_education_centers_raion


In [83]:
#####################################
# Features with VIF greater than 30 #
#####################################

features_to_remove = [ 
    'raion_popul', \
    'preschool_education_centers_raion', \
    'school_education_centers_raion', \
    'sport_objects_raion', \
    'office_raion', \
    'young_all', \
    'work_all', \
    'ekder_all', \
    '0_17_all', \
    'raion_build_count_with_material_info', \
    'raion_build_count_with_builddate_info', \
    'build_count_1946-1970', \
    'metro_min_avto', \
    'metro_km_avto', \
    'metro_min_walk', \
    'school_km', \
    'park_km', \
    'railroad_station_walk_min', \
    'railroad_station_avto_min', \
    'ttk_km', \
    'sadovoe_km', \
    'bulvar_ring_km', \
    'kremlin_km', \
    'zd_vokzaly_avto_km', \
    'bus_terminal_avto_km', \
    'oil_chemistry_km', \
    'nuclear_reactor_km', \
    'radiation_km', \
    'power_transmission_line_km', \
    'thermal_power_plant_km', \
    'ts_km', \
    'swim_pool_km', \
    'ice_rink_km', \
    'stadium_km', \
    'basketball_km', \
    'detention_facility_km', \
    'public_healthcare_km', \
    'university_km', \
    'workplaces_km', \
    'shopping_centers_km', \
    'preschool_km', \
    'big_church_km', \
    'mosque_km', \
    'theater_km', \
    'museum_km', \
    'exhibition_km', \
    'cafe_count_500', \
    'cafe_sum_500_min_price_avg', \
    'cafe_avg_price_500', \
    'office_count_1000', \
    'cafe_count_1000', \
    'cafe_sum_1000_min_price_avg', \
    'cafe_sum_1000_max_price_avg', \
    'cafe_avg_price_1000', \
    'cafe_count_1000_na_price', \
    'cafe_count_1000_price_1000', \
    'cafe_count_1000_price_1500', \
    'office_count_1500', \
    'cafe_count_1500', \
    'cafe_sum_1500_max_price_avg', \
    'cafe_avg_price_1500', \
    'cafe_count_1500_na_price', \
    'cafe_count_1500_price_500', \
    'cafe_count_1500_price_1000', \
    'cafe_count_1500_price_1500', \
    'cafe_count_1500_price_2500', \
    'cafe_count_1500_price_high', \
    'leisure_count_1500', \
    'sport_count_1500', \
    'green_part_2000', \
    'office_count_2000', \
    'office_sqm_2000', \
    'trc_count_2000', \
    'cafe_count_2000', \
    'cafe_sum_2000_min_price_avg', \
    'cafe_sum_2000_max_price_avg', \
    'cafe_avg_price_2000', \
    'cafe_count_2000_na_price', \
    'cafe_count_2000_price_500', \
    'cafe_count_2000_price_1000', \
    'cafe_count_2000_price_1500', \
    'cafe_count_2000_price_2500', \
    'cafe_count_2000_price_high', \
    'sport_count_2000', \
    'green_part_3000', \
    'office_count_3000', \
    'office_sqm_3000', \
    'trc_count_3000', \
    'cafe_count_3000', \
    'cafe_count_3000_na_price', \
    'cafe_count_3000_price_500', \
    'cafe_count_3000_price_1000', \
    'cafe_count_3000_price_1500', \
    'cafe_count_3000_price_2500', \
    'cafe_count_3000_price_4000', \
    'cafe_count_3000_price_high', \
    'big_church_count_3000', \
    'church_count_3000', \
    'leisure_count_3000', \
    'sport_count_3000', \
    'green_part_5000',\
    'office_count_5000',\
    'office_sqm_5000',\
    'trc_count_5000',\
    'trc_sqm_5000',\
    'cafe_count_5000',\
    'cafe_count_5000_na_price', \
    'cafe_count_5000_price_500', \
    'cafe_count_5000_price_1000', \
    'cafe_count_5000_price_1500', \
    'cafe_count_5000_price_2500', \
    'cafe_count_5000_price_4000', \
    'cafe_count_5000_price_high', \
    'big_church_count_5000', \
    'church_count_5000', \
    'leisure_count_5000', \
    'sport_count_5000', \
    'market_count_5000', \
    'avg_price_ID_metro', \
    'avg_price_ID_railroad_station_walk', \
    'avg_price_ID_big_road1', \
    'avg_price_ID_big_road2', \
    'avg_price_ID_railroad_terminal', \
    'avg_price_ID_bus_terminal', \
    'avg_price_sub_area' \
]
for f in features_to_remove:
    if f in df_train:
        del df_train[f]

In [84]:
df_train_macro = df_train.merge(df_macro, left_on='timestamp', right_on='timestamp', how='left').set_index(df_train.index)
df_test_macro = df_test.merge(df_macro, left_on='timestamp', right_on='timestamp', how='left').set_index(df_test.index)
cols = list(df_train_macro.columns.values)
cols.pop(cols.index('price_doc'))
df_train_macro = df_train_macro[cols + ['price_doc']]
df_train.to_csv('./data/train_macro.csv', header=True, index=True)
df_test.to_csv('./data/test_macro.csv', header=True, index=True)