# Preprocessing & Feature Engineering

## 1. Feature/Data Transformation
- Outliers
- New Features

## 2. Missing Data Imputation
- Regression
- Mode

## 3. Dimensionality Reduction
- Features with Bad or Constant Data
- Multicollinearity and Variance Inflation Factor

In [1]:
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings("ignore")
import os
import sys
import datetime
import scipy as sp
import statsmodels.stats.api as sms
import statsmodels.api as sm
from patsy import dmatrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('github'))))
import utils.preprocessing as pp 
import utils.correlation as cr
import utils.statsmodel_helper as st
import utils.var_inflation_factor as vif

df_macro = pd.read_csv('../input/macro.csv', parse_dates=['timestamp'])
df_train = pd.read_csv('../input/train.csv', index_col=0, parse_dates=['timestamp'])
df_test = pd.read_csv('../input/test.csv', index_col=0, parse_dates=['timestamp'])

min_corr = 0.3

# 1. Feature/Data Transformation
## Outliers

In [2]:
# Drop data with extremely big price #
df_train = df_train.drop([2121])

# Recode the state value 33 as 3 (training frame only) #
df_train.loc[df_train.state == 33, 'state'] = 3


def _replace_outliers_with_mean(df, column, is_outlier):
    """Overwrite outlying values of ``df[column]`` in place with the column mean.

    The mean is taken over the values that are NOT flagged as outliers
    (missing values are skipped), so the extreme values being replaced cannot
    inflate their own replacement.

    Parameters:
        df: DataFrame to modify in place.
        column: label of the column to clean.
        is_outlier: callable mapping the column Series to a boolean mask that
            is True for the rows to replace.
    """
    outliers = is_outlier(df[column])
    if outliers.any():
        df.loc[outliers, column] = df.loc[~outliers, column].mean()


# (column, predicate flagging implausible values); thresholds as before.
_OUTLIER_RULES = [
    ('life_sq', lambda s: s > 1000),
    ('kitch_sq', lambda s: s > 250),
    ('num_room', lambda s: s > 6),
    ('build_year', lambda s: (s > 2017) | (s < 1800)),
    ('floor', lambda s: s > 50),
    ('max_floor', lambda s: s > 60),
]

# Replace outliers with proper value (same rules for train and test) #
for _frame in (df_train, df_test):
    for _column, _is_outlier in _OUTLIER_RULES:
        _replace_outliers_with_mean(_frame, _column, _is_outlier)
    # A zero total area is replaced by a fixed 50.
    _frame.loc[_frame.full_sq == 0, 'full_sq'] = 50

# Keep only training rows whose price per unit of full_sq lies in
# [10000, 600000]; rows where the ratio is NaN are dropped as well.
_price_per_sq = df_train.price_doc / df_train.full_sq
df_train = df_train[(_price_per_sq <= 600000) & (_price_per_sq >= 10000)]

## New features

In [3]:
# Derived features, built identically for the training and the test frame.
for _frame in (df_train, df_test):
    # Add month and day of week #
    _frame['month'] = _frame.timestamp.dt.month
    _frame['dow'] = _frame.timestamp.dt.dayofweek

    # Year and week number packed into one integer (year * 100 + week).
    # NOTE(review): ``.dt.weekofyear`` was removed in pandas 2.0; switch to
    # ``.dt.isocalendar().week`` when the environment is upgraded.
    _frame['yearweek'] = _frame['timestamp'].dt.year * 100 + _frame['timestamp'].dt.weekofyear

    # Create new features that might help #
    # A zero divisor yields +/-inf or NaN in these ratios.
    _frame['rel_floor'] = _frame['floor'] / _frame['max_floor'].astype(float)
    _frame['rel_kitch_sq'] = _frame['kitch_sq'] / _frame['full_sq'].astype(float)

    # sub_area concatenated with the metro distance, as a string key.
    # Bracket assignment is required to create a column: attribute assignment
    # (``df.apartment_name = ...``) only sets a Python attribute on the frame.
    # Each frame uses its own ``metro_km_avto`` so rows stay aligned.
    _frame['apartment_name'] = _frame['sub_area'] + _frame['metro_km_avto'].astype(str)
    del _frame['metro_km_avto']

    _frame['room_size'] = _frame['life_sq'] / _frame['num_room'].astype(float)

In [4]:
# Average price corresponding to sub_area and ID_* #
id_features = [
    'ID_metro',
    'ID_railroad_station_walk',
    'ID_big_road1',
    'ID_big_road2',
    'ID_railroad_terminal',
    'ID_bus_terminal',
]

# Test-set IDs whose average price is borrowed from a different training ID:
# (feature, ID value in test) -> ID value in train to take the mean from.
test_id_donors = {
    ('ID_metro', 171): 170,
    ('ID_railroad_station_walk', 132): 131,
    # NOTE(review): the original branch for 121 wrote to the rows with ID 122
    # and left the 121 rows at the 0.0 placeholder. The donor 131 is kept from
    # the original code but looks copied from the 132 case -- confirm.
    ('ID_railroad_station_walk', 121): 131,
}

# Test frame first: it needs the ID columns of df_train, which are deleted in
# the next loop. Every ID value present in test receives the mean training
# price of the same (or donor) ID; the result is NaN when train has no rows
# for it. Rows whose ID is missing keep the 0.0 placeholder.
for id_f in id_features:
    avg_col = 'avg_price_' + id_f
    df_test[avg_col] = 0.0
    for val in df_test[id_f].unique():
        donor = test_id_donors.get((id_f, val), val)
        avg = df_train.loc[df_train[id_f] == donor, 'price_doc'].mean()
        df_test.loc[df_test[id_f] == val, avg_col] = avg
    del df_test[id_f]

# Training frame: mean price of all training rows sharing the same ID.
for id_f in id_features:
    avg_col = 'avg_price_' + id_f
    df_train[avg_col] = 0.0
    for val in df_train[id_f].unique():
        same_id = df_train[id_f] == val
        df_train.loc[same_id, avg_col] = df_train.loc[same_id, 'price_doc'].mean()
    del df_train[id_f]

# Move the target column to the end of the training frame.
cols = list(df_train.columns.values)
cols.pop(cols.index('price_doc'))
df_train = df_train[cols + ['price_doc']]


# Mean training price per sub_area, applied to both frames. Test rows whose
# sub_area does not occur in train keep the 0.0 placeholder.
df_test['avg_price_sub_area'] = 0.0
df_train['avg_price_sub_area'] = 0.0
for subarea in df_train['sub_area'].unique():
    avg = df_train.loc[df_train['sub_area'] == subarea, 'price_doc'].mean()
    df_train.loc[df_train['sub_area'] == subarea, 'avg_price_sub_area'] = avg
    df_test.loc[df_test['sub_area'] == subarea, 'avg_price_sub_area'] = avg
del df_train['sub_area']
del df_test['sub_area']

## 2. Missing Data Imputation
### Regression

In [5]:
# Turn +/-inf (e.g. from the ratio features divided by zero) into NaN so that
# they are treated as missing values by the imputation below.
for _frame in (df_train, df_test, df_macro):
    _frame.replace([np.inf, -np.inf], np.nan, inplace=True)

# Regression-based imputation from the project helper module. The meaning of
# the two positional arguments (4, 0.1) is defined in utils.preprocessing.
df_train = pp.impute_by_regression(df_train, 4, 0.1)
df_test = pp.impute_by_regression(df_test, 4, 0.1)
df_macro = pp.impute_by_regression(df_macro, 4, 0.1)

# Clamp negative numeric values in train/test to 0. This assigns the clipped
# columns back explicitly instead of writing through the private
# ``_get_numeric_data()`` accessor, which only works while that accessor
# happens to return a view of the frame. NaN values are left untouched.
for _frame in (df_train, df_test):
    _numeric_cols = _frame.select_dtypes(include='number').columns
    _frame[_numeric_cols] = _frame[_numeric_cols].clip(lower=0)

In [6]:
# Report the df_train columns that still have missing values after the
# regression imputation (the cell output is the resulting table).
pp.find_missing_data_columns(df_train)

Unnamed: 0,missing_column,missing_count


In [7]:
# Report the df_test columns that still have missing values after the
# regression imputation (the cell output is the resulting table).
pp.find_missing_data_columns(df_test)

Unnamed: 0,missing_column,missing_count
10,product_type,33


In [8]:
# Report the df_macro columns that still have missing values after the
# regression imputation (the cell output is the resulting table).
pp.find_missing_data_columns(df_macro)

Unnamed: 0,missing_column,missing_count
78,child_on_acc_pre_school,658
81,modern_education_share,1389
82,old_education_build_share,1389


### Mode

In [9]:
def _columns_with_nulls(df):
    """Return the labels of the columns of ``df`` holding at least one null."""
    # ``df.isnull().any()`` tests the cell values column by column.
    # ``df.columns.isnull().any()`` would test the column labels instead and
    # collapse to a single scalar, selecting nothing.
    return df.columns[df.isnull().any()].tolist()


# numeric: fill with the mean of the TRAINING column (for test as well) #
for col in _columns_with_nulls(df_train.select_dtypes(include='number')):
    df_train[col] = df_train[col].fillna(df_train[col].mean())
for col in _columns_with_nulls(df_test.select_dtypes(include='number')):
    df_test[col] = df_test[col].fillna(df_train[col].mean())

# categorical: whatever still has nulls gets the most frequent TRAINING value #
for col in _columns_with_nulls(df_train):
    df_train[col] = df_train[col].fillna(df_train[col].value_counts().index[0])
for col in _columns_with_nulls(df_test):
    df_test[col] = df_test[col].fillna(df_train[col].value_counts().index[0])

# 3. Dimensionality Reduction
## Features with Bad or Constant Data

In [10]:
# Features that are highly correlated with other features #
features_to_remove = [
    'children_preschool', 'children_school', 'male_f',
    'female_f', 'young_male', 'young_female', 'work_male',
    'work_female', 'ekder_male', 'ekder_female',
    '0_6_all', '0_6_male', '0_6_female',
    '7_14_all', '7_14_male', '7_14_female', '0_17_male', '0_17_female',
    '16_29_male', '16_29_female', '0_13_all', '0_13_male', '0_13_female',
]
# Raises KeyError if a listed column is absent from either frame.
df_train = df_train.drop(features_to_remove, axis=1)
df_test = df_test.drop(features_to_remove, axis=1)

# Macro features with bad data #
# (the columns still reported as missing after imputation)
df_macro = df_macro.drop(
    ['modern_education_share', 'old_education_build_share', 'child_on_acc_pre_school'],
    axis=1,
)

# Constant features: exactly one distinct non-null value #
consts = [col for col in df_train.columns if df_train[col].nunique() == 1]
df_train = df_train.drop(consts, axis=1)
df_test = df_test.drop(consts, axis=1)

consts = [col for col in df_macro.columns if df_macro[col].nunique() == 1]
df_macro = df_macro.drop(consts, axis=1)

# Low correlation with price #
# Collect first, drop once: every correlation is computed on the same frame
# and the frame is copied a single time instead of once per dropped column.
# A NaN correlation is not "< corr_limit", so such columns are kept.
corr_limit = 0.1
_numeric_features = df_train._get_numeric_data().columns.drop('price_doc')
_weak = [
    column for column in _numeric_features
    if abs(df_train[column].corr(df_train['price_doc'])) < corr_limit
]
df_train = df_train.drop(_weak, axis=1)
df_test = df_test.drop([column for column in _weak if column in df_test.columns], axis=1)

# Macro indicators: correlate with the price of the transactions made on the
# same date. df_macro and df_train do not share an index (row number vs.
# transaction id), so correlating the two Series directly would pair
# unrelated rows. Join on timestamp first.
corr_limit = 0.05
_macro_on_train = df_train[['timestamp', 'price_doc']].merge(
    df_macro, on='timestamp', how='left')
_weak_macro = [
    column for column in df_macro._get_numeric_data().columns
    if abs(_macro_on_train[column].corr(_macro_on_train['price_doc'])) < corr_limit
]
df_macro = df_macro.drop(_weak_macro, axis=1)

## Multicollinearity and Variance Inflation Factor

In [11]:
# df_train[df_train==np.inf]=np.nan
# df_train.fillna(df_train.median(), inplace=True)
# categorial_ivs = set(df_train.columns.drop('timestamp')) - set(df_train._get_numeric_data().columns)
# numeric_ivs = df_train._get_numeric_data().columns.drop('price_doc')
# temp = vif.VarInflationFactor(impute=True, thresh=10.0).fit_transform(df_train[numeric_ivs])
# df_train = pd.concat([df_train['timestamp'], temp, df_train[categorial_ivs], df_train['price_doc']], axis=1)

### VIF elimination log (low-correlation-with-price threshold: 0.1)
- Dropping raion_popul with vif=inf
- Dropping cafe_count_3000 with vif=inf
- Dropping cafe_count_5000 with vif=inf
- Dropping cafe_avg_price_1500 with vif=14570698.395478534
- Dropping raion_build_count_with_builddate_info with vif=307429.81510292145
- Dropping kremlin_km with vif=84072.4496292116
- Dropping cafe_count_2000 with vif=74874.23978052993
- Dropping sadovoe_km with vif=41969.994861328116
- Dropping cafe_count_1500 with vif=21039.17010239846
- Dropping 0_17_all with vif=13646.574347445614
- Dropping bulvar_ring_km with vif=6616.584984269324
- Dropping cafe_sum_1500_max_price_avg with vif=1820.2389885218406
- Dropping cafe_count_5000_price_1000 with vif=1740.0338127865925
- Dropping yearweek with vif=1681.450093689153
- Dropping school_km with vif=1335.81431222714
- Dropping cafe_count_5000_price_1500 with vif=1147.081887683671
- Dropping cafe_count_5000_price_2500 with vif=1029.403433436397
- Dropping cafe_count_3000_price_1500 with vif=991.0424861610946
- Dropping office_count_5000 with vif=897.5418693955999
- Dropping cafe_count_1000 with vif=847.7716902197002
- Dropping cafe_count_3000_price_500 with vif=762.3462932143118
- Dropping office_count_3000 with vif=663.3000988405182
- Dropping cafe_count_3000_price_2500 with vif=649.4909582035716
- Dropping ttk_km with vif=623.0343503068661
- Dropping cafe_count_2000_price_1500 with vif=575.0925791712715
- Dropping cafe_count_2000_price_500 with vif=478.6132614851965
- Dropping cafe_count_5000_price_500 with vif=456.50055580818736
- Dropping avg_price_ID_railroad_terminal with vif=450.89525902835766
- Dropping office_count_2000 with vif=392.24816093741754
- Dropping church_count_5000 with vif=387.01978334666796
- Dropping cafe_count_2000_price_2500 with vif=363.595326511891
- Dropping cafe_count_3000_price_1000 with vif=320.1052674949561
- Dropping cafe_count_1500_price_1500 with vif=282.746038549093
- Dropping cafe_count_5000_na_price with vif=273.08773319959846
- Dropping cafe_count_2000_price_1000 with vif=241.08785114367652
- Dropping work_all with vif=210.87351861655605
- Dropping zd_vokzaly_avto_km with vif=204.2106884705741
- Dropping church_count_3000 with vif=195.53832783449283
- Dropping oil_chemistry_km with vif=170.16364619582987
- Dropping cafe_count_1500_price_500 with vif=161.52313340418294
- Dropping cafe_count_5000_price_4000 with vif=161.05266399808895
- Dropping avg_price_ID_bus_terminal with vif=160.64037784706218
- Dropping cafe_count_3000_price_4000 with vif=152.6997344107902
- Dropping office_count_1500 with vif=142.04934406000297
- Dropping cafe_count_3000_na_price with vif=132.21990919926546
- Dropping trc_count_5000 with vif=128.6064110563572
- Dropping cafe_count_1500_price_1000 with vif=113.77495741603722
- Dropping sport_count_5000 with vif=112.4224905925094
- Dropping leisure_count_5000 with vif=111.13325674128215
- Dropping avg_price_sub_area with vif=104.69793518191196
- Dropping radiation_km with vif=102.64468640327081
- Dropping big_church_count_3000 with vif=98.0565880785707
- Dropping cafe_count_2000_na_price with vif=90.06136291617442
- Dropping cafe_count_1500_price_2500 with vif=88.45223167705826
- Dropping basketball_km with vif=88.18375546300162
- Dropping avg_price_ID_big_road1 with vif=84.25375404590979
- Dropping preschool_education_centers_raion with vif=83.51831699052997
- Dropping green_part_3000 with vif=80.69687373913787
- Dropping cafe_count_500 with vif=79.33669277723212
- Dropping avg_price_ID_railroad_station_walk with vif=71.02406358359809
- Dropping cafe_count_1000_price_1500 with vif=67.11836768927695
- Dropping stadium_km with vif=62.725296129688836
- Dropping avg_price_ID_big_road2 with vif=61.00583829565759
- Dropping leisure_count_3000 with vif=59.1763655451112
- Dropping preschool_km with vif=58.5367444833199
- Dropping sport_count_3000 with vif=55.95603612412343
- Dropping power_transmission_line_km with vif=53.51652710044996
- Dropping office_sqm_3000 with vif=53.44907932128648
- Dropping museum_km with vif=52.46393358334705
- Dropping exhibition_km with vif=49.61405227768155
- Dropping cafe_count_2000_price_high with vif=47.08872672928615
- Dropping mosque_km with vif=45.106132716126076
- Dropping workplaces_km with vif=42.99593357783127
- Dropping cafe_count_1500_na_price with vif=41.018995362179574
- Dropping university_km with vif=40.254959137620595
- Dropping trc_count_3000 with vif=39.81452554144815
- Dropping young_all with vif=38.17886789191855
- Dropping cafe_sum_2000_max_price_avg with vif=37.695448912360646
- Dropping office_count_1000 with vif=36.72871406591059
- Dropping sport_count_2000 with vif=36.4826868577771
- Dropping num_room with vif=35.68873158182139
- Dropping office_sqm_5000 with vif=35.01348813441707
- Dropping thermal_power_plant_km with vif=34.03738323300284
- Dropping office_raion with vif=29.89884708835293
- Dropping swim_pool_km with vif=29.71620942288434
- Dropping railroad_station_avto_min with vif=28.87338406354351
- Dropping green_part_5000 with vif=28.70960472644583
- Dropping trc_count_2000 with vif=27.67002975966542
- Dropping cafe_count_1000_price_1000 with vif=26.853821092056368
- Dropping detention_facility_km with vif=26.05922781884997
- Dropping big_church_km with vif=25.561778861496684
- Dropping cafe_count_3000_price_high with vif=25.282559922207252
- Dropping shopping_centers_km with vif=22.131764462314816
- Dropping avg_price_ID_metro with vif=21.965512627738207
- Dropping leisure_count_1500 with vif=21.95436798807885
- Dropping nuclear_reactor_km with vif=21.12462602292889
- Dropping cafe_sum_1500_min_price_avg with vif=20.6501108508169
- Dropping office_sqm_2000 with vif=20.620193158569712
- Dropping trc_sqm_5000 with vif=18.504195464013076
- Dropping sport_objects_raion with vif=18.291767574631205
- Dropping park_km with vif=18.053696961744084
- Dropping full_sq with vif=18.0474213830802
- Dropping big_road2_km with vif=17.408880983567126
- Dropping ekder_all with vif=15.920877255305822
- Dropping sport_count_1500 with vif=15.148369054142712
- Dropping state with vif=13.91628839390911
- Dropping public_healthcare_km with vif=13.856163768507596
- Dropping ts_km with vif=13.674950877943553
- Dropping big_church_count_5000 with vif=13.027726667821936
- Dropping bus_terminal_avto_km with vif=12.75146274098609
- Dropping theater_km with vif=12.495088881795647
- Dropping area_m with vif=12.232469566632604
- Dropping room_size with vif=12.030123165912915
- Dropping raion_build_count_with_material_info with vif=11.589386861738545
- Dropping cafe_count_1500_price_high with vif=11.431084074893445
- Dropping office_sqm_1500 with vif=10.481504376675996
- Dropping market_count_3000 with vif=10.194467884784672

In [12]:
# Features eliminated by the variance-inflation-factor run logged above
# (VIF > 10). Every entry needs its trailing comma: adjacent string literals
# without commas are concatenated by Python into ONE long string.
feature_to_removes = [
    "raion_popul",
    "cafe_count_3000",
    "cafe_count_5000",
    "cafe_avg_price_1500",
    "raion_build_count_with_builddate_info",
    "kremlin_km",
    "cafe_count_2000",
    "sadovoe_km",
    "cafe_count_1500",
    "0_17_all",
    "bulvar_ring_km",
    "cafe_sum_1500_max_price_avg",
    "cafe_count_5000_price_1000",
    "yearweek",
    "school_km",
    "cafe_count_5000_price_1500",
    "cafe_count_5000_price_2500",
    "cafe_count_3000_price_1500",
    "office_count_5000",
    "cafe_count_1000",
    "cafe_count_3000_price_500",
    "office_count_3000",
    "cafe_count_3000_price_2500",
    "ttk_km",
    "cafe_count_2000_price_1500",
    "cafe_count_2000_price_500",
    "cafe_count_5000_price_500",
    "avg_price_ID_railroad_terminal",
    "office_count_2000",
    "church_count_5000",
    "cafe_count_2000_price_2500",
    "cafe_count_3000_price_1000",
    "cafe_count_1500_price_1500",
    "cafe_count_5000_na_price",
    "cafe_count_2000_price_1000",
    "work_all",
    "zd_vokzaly_avto_km",
    "church_count_3000",
    "oil_chemistry_km",
    "cafe_count_1500_price_500",
    "cafe_count_5000_price_4000",
    "avg_price_ID_bus_terminal",
    "cafe_count_3000_price_4000",
    "office_count_1500",
    "cafe_count_3000_na_price",
    "trc_count_5000",
    "cafe_count_1500_price_1000",
    "sport_count_5000",
    "leisure_count_5000",
    "avg_price_sub_area",
    "radiation_km",
    "big_church_count_3000",
    "cafe_count_2000_na_price",
    "cafe_count_1500_price_2500",
    "basketball_km",
    "avg_price_ID_big_road1",
    "preschool_education_centers_raion",
    "green_part_3000",
    "cafe_count_500",
    "avg_price_ID_railroad_station_walk",
    "cafe_count_1000_price_1500",
    "stadium_km",
    "avg_price_ID_big_road2",
    "leisure_count_3000",
    "preschool_km",
    "sport_count_3000",
    "power_transmission_line_km",
    "office_sqm_3000",
    "museum_km",
    "exhibition_km",
    "cafe_count_2000_price_high",
    "mosque_km",
    "workplaces_km",
    "cafe_count_1500_na_price",
    "university_km",
    "trc_count_3000",
    "young_all",
    "cafe_sum_2000_max_price_avg",
    "office_count_1000",
    "sport_count_2000",
    "num_room",
    "office_sqm_5000",
    "thermal_power_plant_km",
    "office_raion",
    "swim_pool_km",
    "railroad_station_avto_min",
    "green_part_5000",
    "trc_count_2000",
    "cafe_count_1000_price_1000",
    "detention_facility_km",
    "big_church_km",
    "cafe_count_3000_price_high",
    "shopping_centers_km",
    "avg_price_ID_metro",
    "leisure_count_1500",
    "nuclear_reactor_km",
    "cafe_sum_1500_min_price_avg",
    "office_sqm_2000",
    "trc_sqm_5000",
    "sport_objects_raion",
    "park_km",
    "full_sq",
    "big_road2_km",
    "ekder_all",
    "sport_count_1500",
    "state",
    "public_healthcare_km",
    "ts_km",
    "big_church_count_5000",
    "bus_terminal_avto_km",
    "theater_km",
    "area_m",
    "room_size",
    "raion_build_count_with_material_info",
    "cafe_count_1500_price_high",
    "office_sqm_1500",
    "market_count_3000",
]

# Delete each listed COLUMN (``del df_train`` without the subscript would
# delete the whole frame). Columns already removed by an earlier step are
# skipped, and the test frame is kept in step with the training frame.
for f in feature_to_removes:
    if f in df_train:
        del df_train[f]
    if f in df_test:
        del df_test[f]

In [13]:
# Attach the macro indicators to every transaction by date. The left join keeps
# all train/test rows; the merge resets the index, so the original row index
# of each frame is put back afterwards.
df_train_macro = pd.merge(df_train, df_macro, how='left', on='timestamp')
df_train_macro = df_train_macro.set_index(df_train.index)
df_test_macro = pd.merge(df_test, df_macro, how='left', on='timestamp')
df_test_macro = df_test_macro.set_index(df_test.index)

# Put the target column last in the training frame.
cols = list(df_train_macro.columns.values)
cols.remove('price_doc')
df_train_macro = df_train_macro[cols + ['price_doc']]

# Persist both merged frames next to the raw inputs, index included.
df_train_macro.to_csv('../input/train_macro.csv', header=True, index=True)
df_test_macro.to_csv('../input/test_macro.csv', header=True, index=True)