# Preprocessing & Feature Engineering

## 1. Feature/Data Transformation
- Outliers
- New Features

## 2. Missing Data Imputation
- Interpolation
- Mode

## 3. Dimensionality Reduction
- Features with Bad or Constant Data
- Multicollinearity and Variance Inflation Factor

In [1]:
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings("ignore")
import sys
import datetime
import scipy as sp
import statsmodels.stats.api as sms
import statsmodels.api as sm
from patsy import dmatrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import utils.preprocessing as pp 


df_macro = pd.read_csv('./data/macro.csv', parse_dates=['timestamp'])
df_train = pd.read_csv('./data/train.csv', index_col=0, parse_dates=['timestamp'])
df_test = pd.read_csv('./data/test.csv', index_col=0, parse_dates=['timestamp'])

min_corr = 0.3

# 1. Feature/Data Transformation
## Outliers

In [2]:
# Remove the one training row (index label 2121) with an extremely big price #
df_train = df_train.drop(index=[2121])

# Recode state 33 as 3 (train only) #
df_train.loc[df_train['state'] == 33, 'state'] = 3

# (column, predicate) pairs: rows matching the predicate get the column median.
# Rules run in this order and the median is recomputed for every rule, so the
# two build_year rules see each other's effect.
outlier_rules = [
    ('life_sq', lambda s: s > 1000),
    ('kitch_sq', lambda s: s > 250),
    ('num_room', lambda s: s > 6),
    ('build_year', lambda s: s > 2017),
    ('build_year', lambda s: s < 1800),
    ('floor', lambda s: s > 50),
    ('max_floor', lambda s: s > 60),
]


def _replace_with_median(frame, column, is_outlier):
    """Overwrite the rows of `column` flagged by `is_outlier` with the column's
    median (NaNs ignored), computed on `frame` itself."""
    column_median = np.median(frame[column].dropna())
    frame.loc[is_outlier(frame[column]), column] = column_median


for frame in (df_train, df_test):
    for column, is_outlier in outlier_rules:
        _replace_with_median(frame, column, is_outlier)
    # A zero total area is replaced by a fixed 50
    frame.loc[frame['full_sq'] == 0, 'full_sq'] = 50

# Keep only training rows whose price per square metre is within [10000, 600000]
price_per_sq = df_train['price_doc'] / df_train['full_sq']
df_train = df_train[price_per_sq.between(10000, 600000)]

## New features

In [3]:
# Derived features, built the same way for train and test #
for frame in (df_train, df_test):
    # Add month and day of week #
    frame['month'] = frame['timestamp'].dt.month
    frame['dow'] = frame['timestamp'].dt.dayofweek

    # Create new features that might help #
    # (zero denominators give +/-inf here; a later cell turns those into NaN)
    frame['rel_floor'] = frame['floor'] / frame['max_floor'].astype(float)
    frame['rel_kitch_sq'] = frame['kitch_sq'] / frame['full_sq'].astype(float)

    # The original wrote `df.apartment_name = ...`, which only sets a Python
    # attribute and never creates a column; it also built the test feature from
    # df_train's metro distance. Use item assignment on the frame's own data.
    frame['apartment_name'] = frame['sub_area'] + frame['metro_km_avto'].astype(str)

    frame['room_size'] = frame['life_sq'] / frame['num_room'].astype(float)

# Average price corresponding to sub_area and ID_* #
id_features = [
    'ID_metro',
    'ID_railroad_station_walk',
    'ID_big_road1',
    'ID_big_road2',
    'ID_railroad_terminal',
    'ID_bus_terminal',
]

# Hand-picked substitutions for test IDs: {feature: {test ID: train ID whose
# mean price is borrowed}}.
# The original guard for the third entry tested `val == 121` while writing to
# the rows with ID 122, so ID 121 rows kept the 0.0 placeholder and the ID 122
# value depended on iteration order. The guard now matches the rows written.
# NOTE(review): the donor for 122 is kept at 131 as in the original; the other
# entries borrow from the preceding ID, so 121 may have been intended - confirm.
fallback_ids = {
    'ID_metro': {171: 170},
    'ID_railroad_station_walk': {132: 131, 122: 131},
}

# Test first: it needs the ID columns of df_train, which the second loop deletes.
for id_f in id_features:
    avg_col = 'avg_price_' + id_f
    donors = fallback_ids.get(id_f, {})
    df_test[avg_col] = 0.0
    for val in df_test[id_f].unique():
        donor = donors.get(val, val)
        # Mean of an empty selection is NaN, i.e. IDs unseen in train become NaN
        avg = df_train.loc[df_train[id_f] == donor, 'price_doc'].mean()
        df_test.loc[df_test[id_f] == val, avg_col] = avg
    del df_test[id_f]

for id_f in id_features:
    avg_col = 'avg_price_' + id_f
    df_train[avg_col] = 0.0
    for val in df_train[id_f].unique():
        same_id = df_train[id_f] == val
        df_train.loc[same_id, avg_col] = df_train.loc[same_id, 'price_doc'].mean()
    del df_train[id_f]
    
# Re-order the training columns so that price_doc is the last one #
cols = df_train.columns.tolist()
cols.remove('price_doc')
df_train = df_train[cols + ['price_doc']]


# Mean training price per sub_area, written to both frames #
# Rows whose sub_area never occurs in train keep the 0.0 placeholder.
for frame in (df_test, df_train):
    frame['avg_price_sub_area'] = 0.0

for subarea in df_train['sub_area'].unique():
    area_avg = df_train.loc[df_train['sub_area'] == subarea, 'price_doc'].mean()
    for frame in (df_train, df_test):
        frame.loc[frame['sub_area'] == subarea, 'avg_price_sub_area'] = area_avg

# The raw sub_area column is dropped once the averages are in place
for frame in (df_train, df_test):
    del frame['sub_area']


# Add the Macro Feature #
# Date -> USD/RUB rate lookup built from the macro table
usdrub_pairs = dict(zip(df_macro['timestamp'], df_macro['usdrub']))

# Swap each transaction date for that day's rate. The original called
# `df['timestamp'].replace(..., inplace=True)`, an in-place edit on a column
# selection that is not guaranteed to reach the parent frame (it does nothing
# under copy-on-write pandas) and does a slow per-key replace. `map` with an
# explicit assignment is reliable; dates missing from the macro table become
# NaN and are imputed later instead of staying as raw timestamps.
df_train['timestamp'] = df_train['timestamp'].map(usdrub_pairs)
df_test['timestamp'] = df_test['timestamp'].map(usdrub_pairs)

# The column now holds rates, so rename it in place (position is unchanged)
df_train.rename(columns={'timestamp' : 'usdrub'}, inplace=True)
df_test.rename(columns={'timestamp' : 'usdrub'}, inplace=True)

# 2. Missing Data Imputation
- Interpolation
- Mode

In [4]:
# Normalise impossible values before imputation #
for frame in (df_train, df_test):
    # +/-inf (e.g. from the ratio features' zero denominators) -> NaN
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Negative numeric values -> 0. The original assigned through the private
    # `_get_numeric_data()` result, which only works when pandas happens to hand
    # back a view; clipping the public numeric selection and assigning it back
    # does not depend on that. NaNs are left untouched by clip.
    numeric_cols = frame.select_dtypes(include=[np.number]).columns
    frame[numeric_cols] = frame[numeric_cols].clip(lower=0)

In [5]:
# Impute with the project's interpolation helper: train first, then test #
df_train, df_test = [pp.imput_by_interpolate(frame) for frame in (df_train, df_test)]

In [6]:
# numeric #
# Fill remaining numeric gaps with the TRAIN column mean (test reuses the train
# statistic). The original selected columns with `columns.isnull().any()`,
# which tests the column labels rather than the data, so this step never ran
# and numeric gaps fell through to the mode fill below.
train_numeric = df_train.select_dtypes(include=[np.number])
train_means = train_numeric.mean()
for col in train_numeric.columns[train_numeric.isnull().any()]:
    df_train[col] = df_train[col].fillna(train_means[col])

test_numeric = df_test.select_dtypes(include=[np.number])
for col in test_numeric.columns[test_numeric.isnull().any()]:
    if col in train_means.index:
        df_test[col] = df_test[col].fillna(train_means[col])

# categorical #
# Whatever is still missing gets the most frequent TRAIN value of that column.
# Columns with no observed train value are skipped instead of raising IndexError.
for col in df_train.columns[df_train.isnull().any()]:
    counts = df_train[col].value_counts()
    if not counts.empty:
        df_train[col] = df_train[col].fillna(counts.index[0])

for col in df_test.columns[df_test.isnull().any()]:
    counts = df_train[col].value_counts()
    if not counts.empty:
        df_test[col] = df_test[col].fillna(counts.index[0])

# 3. Dimensionality Reduction
## Features with Bad or Constant Data

In [7]:
# Features dropped for being highly correlated with other features #
features_to_remove = [
    'children_preschool', 'children_school', 'male_f',
    'female_f', 'young_male', 'young_female', 'work_male',
    'work_female', 'ekder_male', 'ekder_female',
    '0_6_all', '0_6_male', '0_6_female',
    '7_14_all', '7_14_male', '7_14_female', '0_17_male', '0_17_female',
    '16_29_male', '16_29_female', '0_13_all', '0_13_male', '0_13_female',
    'metro_km_walk', 'railroad_station_walk_km',
    'railroad_station_avto_km', 'public_transport_station_km',
]

# Every listed column must exist in both frames; a missing one raises KeyError
for f in features_to_remove:
    for frame in (df_train, df_test):
        frame.drop(columns=[f], inplace=True)

In [8]:
# Constant features #
# A column counts as constant when it has exactly one distinct non-null value.
train_constants = [name for name in df_train.columns if df_train[name].nunique() == 1]
for name in train_constants:
    # Decided on train, removed from both train and test
    df_train.drop(columns=[name], inplace=True)
    df_test.drop(columns=[name], inplace=True)

macro_constants = [name for name in df_macro.columns if df_macro[name].nunique() == 1]
for name in macro_constants:
    df_macro.drop(columns=[name], inplace=True)

## Multicollinearity and Variance Inflation Factor

In [9]:
# Low correlation with price #
corr_limit = 0.1
target = df_train['price_doc']

# Numeric train features whose |correlation| with price_doc is below the limit.
# A NaN correlation compares False, so such columns are kept.
weak_train = [
    name
    for name in df_train._get_numeric_data().columns.drop('price_doc')
    if abs(df_train[name].corr(target)) < corr_limit
]
df_train = df_train.drop(columns=weak_train)
df_test = df_test.drop(columns=[name for name in weak_train if name in df_test.columns])

# Same screen for the macro table.
# NOTE(review): Series.corr aligns on index labels; df_macro and df_train appear
# to be indexed differently (default index vs. first CSV column), so this pairs
# rows that may be unrelated - confirm this is what is wanted.
weak_macro = [
    name
    for name in df_macro._get_numeric_data().columns
    if abs(df_macro[name].corr(target)) < corr_limit
]
df_macro = df_macro.drop(columns=weak_macro)

In [12]:
# Calculate VIF #
# Clean up once more: +inf -> NaN, then NaN -> column mean. `numeric_only=True`
# keeps the mean from raising on the remaining text columns in pandas >= 2.0;
# older versions silently skipped those columns, so the result is the same.
df_train.replace(np.inf, np.nan, inplace=True)
df_train.fillna(df_train.mean(numeric_only=True), inplace=True)

categorial_ivs = set(df_train.columns) - set(df_train._get_numeric_data().columns)
numeric_ivs = df_train._get_numeric_data().columns.drop('price_doc')

# Build the design matrix once instead of re-slicing the frame for every column
design = df_train[numeric_ivs].values

# One VIF per numeric predictor, in the same order as `numeric_ivs`
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(design, i) for i in range(design.shape[1])]
vif["features"] = numeric_ivs

In [32]:
# Show the rows of the VIF table whose factor exceeds 20 #
vif[vif['VIF Factor'].gt(20)]

Unnamed: 0,VIF Factor,features
6,20.2061,area_m
7,inf,raion_popul
8,103.4745,preschool_education_centers_raion
9,91.6833,school_education_centers_raion
14,28.58999,sport_objects_raion
16,30.39614,office_raion
17,inf,young_all
18,inf,work_all
19,inf,ekder_all
20,12900.35,0_17_all


In [33]:
# Features removed after the VIF screen #
# NOTE(review): the original comment said "greater than 30", but the filter cell
# uses 20 and area_m (VIF ~20.2) is listed - confirm the intended cut-off.
features_to_remove = [
    'area_m',
    'raion_popul',
    'preschool_education_centers_raion',
    'school_education_centers_raion',
    'sport_objects_raion',
    'office_raion',
    'young_all',
    'work_all',
    'ekder_all',
    '0_17_all',
    'metro_min_avto',
    'metro_km_avto',
    'metro_min_walk',
    'kindergarten_km',
    'school_km',
    'park_km',
    'railroad_station_walk_min',
    'railroad_station_avto_min',
    'ttk_km',
    'sadovoe_km',
    'bulvar_ring_km',
    'kremlin_km',
    'big_road2_km',
    'zd_vokzaly_avto_km',
    'bus_terminal_avto_km',
    'oil_chemistry_km',
    'nuclear_reactor_km',
    'radiation_km',
    'power_transmission_line_km',
    'thermal_power_plant_km',
    'ts_km',
    'market_shop_km',
    'swim_pool_km',
    'ice_rink_km',
    'stadium_km',
    'basketball_km',
    'hospice_morgue_km',
    'detention_facility_km',
    'public_healthcare_km',
    'university_km',
    'workplaces_km',
    'shopping_centers_km',
    'office_km',
    'preschool_km',
    'big_church_km',
    'mosque_km',
    'theater_km',
    'museum_km',
    'exhibition_km',
    'cafe_count_500',
    'cafe_count_500_price_1500',
    'office_count_1000',
    'cafe_count_1000',
    'cafe_count_1000_na_price',
    'cafe_count_1000_price_1000',
    'cafe_count_1000_price_1500',
    'office_count_1500',
    'office_sqm_1500',
    'trc_count_1500',
    'cafe_count_1500',
    'cafe_sum_1500_max_price_avg',
    'cafe_avg_price_1500',
    'cafe_count_1500_na_price',
    'cafe_count_1500_price_500',
    'cafe_count_1500_price_1000',
    'cafe_count_1500_price_1500',
    'cafe_count_1500_price_2500',
    'cafe_count_1500_price_high',
    'leisure_count_1500',
    'sport_count_1500',
    'green_part_2000',
    'office_count_2000',
    'office_sqm_2000',
    'trc_count_2000',
    'cafe_count_2000',
    'cafe_sum_2000_max_price_avg',
    'cafe_avg_price_2000',
    'cafe_count_2000_na_price',
    'cafe_count_2000_price_500',
    'cafe_count_2000_price_1000',
    'cafe_count_2000_price_1500',
    'cafe_count_2000_price_2500',
    'cafe_count_2000_price_high',
    'sport_count_2000',
    'green_part_3000',
    'office_count_3000',
    'office_sqm_3000',
    'trc_count_3000',
    'cafe_count_3000',
    'cafe_count_3000_na_price',
    'cafe_count_3000_price_500',
    'cafe_count_3000_price_1000',
    'cafe_count_3000_price_1500',
    'cafe_count_3000_price_2500',
    'cafe_count_3000_price_4000',
    'cafe_count_3000_price_high',
    'big_church_count_3000',
    'church_count_3000',
    'leisure_count_3000',
    'sport_count_3000',
    'green_part_5000',
    'office_count_5000',
    'office_sqm_5000',
    'trc_count_5000',
    'trc_sqm_5000',
    'cafe_count_5000',
    'cafe_count_5000_na_price',
    'cafe_count_5000_price_500',
    'cafe_count_5000_price_1000',
    'cafe_count_5000_price_1500',
    'cafe_count_5000_price_2500',
    'cafe_count_5000_price_4000',
    'cafe_count_5000_price_high',
    'big_church_count_5000',
    'church_count_5000',
    'leisure_count_5000',
    'sport_count_5000',
    'market_count_5000',
    'avg_price_ID_metro',
    'avg_price_ID_railroad_station_walk',
    'avg_price_ID_big_road1',
    'avg_price_ID_big_road2',
    'avg_price_ID_railroad_terminal',
    'avg_price_ID_bus_terminal',
    'avg_price_sub_area',
]

# Drop from BOTH frames. The original pruned df_train only, so the saved train
# and test sets ended up with different feature columns, unlike every earlier
# pruning cell. Columns already removed by an earlier step are skipped.
for f in features_to_remove:
    for frame in (df_train, df_test):
        if f in frame:
            del frame[f]

In [34]:
# Put price_doc last again, then write both processed frames to disk #
cols = df_train.columns.tolist()
cols.remove('price_doc')
df_train = df_train[cols + ['price_doc']]

# Header row and index column are written for both files
for frame, out_path in ((df_train, './data/train_macro_2.csv'),
                        (df_test, './data/test_macro_2.csv')):
    frame.to_csv(out_path, header=True, index=True)