# Preprocessing & Feature Engineering

In [1]:
import warnings

# numpy and pandas are used by every cell below (and by the display option
# two lines down) but were never imported, so a fresh kernel failed with
# NameError on `pd`.
import numpy as np
import pandas as pd

# Silence library warnings so cell output stays readable.
# NOTE(review): this also hides pandas' chained-assignment and deprecation
# warnings; consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")
# Allow wide frames to be displayed with up to 300 columns.
pd.options.display.max_columns = 300
%matplotlib inline

## 1. Feature/Data Transformation

In [2]:
# Read the raw train and test tables; the `timestamp` column is parsed as a
# datetime while loading so the `.dt` accessors can be used on it later.
df_train, df_test = (
    pd.read_csv(csv_path, parse_dates=['timestamp'])
    for csv_path in ('./data/train.csv', './data/test.csv')
)

## outlier & clean data

In [3]:
## clean data
def _blank_implausible_areas(df, full_sq_cap):
    """Replace implausible area values in ``df`` with NaN, in place.

    Rules, applied in this order (later rules see the result of earlier ones):
      1. life_sq larger than full_sq            -> life_sq  = NaN
      2. life_sq below 5                        -> life_sq  = NaN
      3. full_sq below 5                        -> full_sq  = NaN
      4. kitch_sq not smaller than life_sq      -> kitch_sq = NaN
      5. kitch_sq equal to 0 or 1               -> kitch_sq = NaN
      6. full_sq above ``full_sq_cap`` while life_sq is under 30% of full_sq
                                                -> full_sq  = NaN

    Comparisons against NaN evaluate to False, so values that are already
    missing never match a rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``full_sq``, ``life_sq`` and ``kitch_sq`` columns.
        Modified in place; nothing is returned.
    full_sq_cap : float
        Threshold on ``full_sq`` used by rule 6.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    df.loc[df.life_sq > df.full_sq, "life_sq"] = np.nan
    df.loc[df.life_sq < 5, "life_sq"] = np.nan
    df.loc[df.full_sq < 5, "full_sq"] = np.nan
    df.loc[df.kitch_sq >= df.life_sq, "kitch_sq"] = np.nan
    df.loc[df.kitch_sq.isin([0, 1]), "kitch_sq"] = np.nan
    df.loc[(df.full_sq > full_sq_cap) & (df.life_sq / df.full_sq < 0.3), "full_sq"] = np.nan


# Row 13117: copy kitch_sq into build_year. This has to run BEFORE the
# kitch_sq rules; the original ran it afterwards, so whenever rule 4 or 5 had
# already blanked that row's kitch_sq, NaN was copied into build_year instead.
# NOTE(review): presumably the build year was entered in the kitch_sq field
# for this row - confirm against the raw data.
df_train.loc[[13117], "build_year"] = df_train.loc[[13117], "kitch_sq"]

# NOTE(review): train and test use different full_sq caps (210 vs 200); kept
# exactly as in the original - confirm the difference is intentional.
_blank_implausible_areas(df_train, full_sq_cap=210)
_blank_implausible_areas(df_test, full_sq_cap=200)

## outlier
# Recode state 33 as 3 (presumably a data-entry typo - confirm).
df_train.loc[df_train.state == 33, 'state'] = 3

# Values above a per-column ceiling are replaced by that column's median.
# The median ignores NaNs and is taken before the replacement happens.
for column, ceiling in [('life_sq', 1000),
                        ('kitch_sq', 250),
                        ('num_room', 6),
                        ('floor', 50),
                        ('max_floor', 60)]:
    column_median = np.median(df_train[column].dropna())
    df_train.loc[df_train[column] > ceiling, column] = column_median

# A zero total area is replaced by a fixed 50.
df_train.loc[df_train.full_sq == 0, 'full_sq'] = 50

# brings error down a lot by removing extreme price per sqm
# Only rows with 10000 <= price_doc / full_sq <= 600000 are kept. Rows whose
# ratio is NaN (for example full_sq missing) fail both comparisons and are
# dropped as well.
price_per_sqm = df_train.price_doc / df_train.full_sq
df_train = df_train[(price_per_sqm <= 600000) & (price_per_sqm >= 10000)]

## New feature

In [4]:
# Calendar, ratio and name features, built identically for train and test.
for frame in (df_train, df_test):
    # Add month and day of week
    frame['month'] = frame.timestamp.dt.month
    frame['dow'] = frame.timestamp.dt.dayofweek

    # Other feature engineering
    # NOTE(review): a zero denominator yields inf here, and inf survives the
    # later mean imputation - check whether max_floor / full_sq / num_room
    # can be 0 in the data.
    frame['rel_floor'] = frame['floor'] / frame['max_floor'].astype(float)
    frame['rel_kitch_sq'] = frame['kitch_sq'] / frame['full_sq'].astype(float)

    # Two fixes compared with the original:
    #  * `df.apartment_name = ...` is attribute assignment and does not create
    #    a DataFrame column, so the feature never existed; bracket assignment
    #    creates it.
    #  * the test-side value was built from df_train['metro_km_avto'], which
    #    combined test sub_area with train distances; each frame now uses its
    #    own column.
    frame['apartment_name'] = frame['sub_area'] + frame['metro_km_avto'].astype(str)

    frame['room_size'] = frame['life_sq'] / frame['num_room'].astype(float)

# Average price corresponding to sub_area and to each location ID column.
# Each key column is replaced by 'avg_price_<key>': the mean train price_doc
# of the rows sharing that key value.
id_features = ['ID_metro',
    'ID_railroad_station_walk',
    'ID_big_road1',
    'ID_big_road2',
    'ID_railroad_terminal',
    'ID_bus_terminal']

# NOTE(review): the means include each train row's own price_doc, so the train
# side of these features leaks the target - consider out-of-fold means.
for key in ['sub_area'] + id_features:
    feature = 'avg_price_' + key
    # One groupby per key replaces the original per-value Python loop.
    group_mean = df_train.groupby(key)['price_doc'].mean()
    for frame in (df_train, df_test):
        # The original built and dropped the ID_* encodings on df_train only,
        # leaving df_test with raw ID columns and no avg_price_ID_* features.
        # Both frames are handled the same way now; a key column missing from
        # a frame is skipped rather than raising.
        if key not in frame.columns:
            continue
        # Key values never seen in train (or missing) get 0.0, as before.
        frame[feature] = frame[key].map(group_mean).fillna(0.0)
        del frame[key]
    
# Reorder the training frame so the target column `price_doc` comes last;
# `cols` is left holding every other column name in its current order.
cols = df_train.columns.tolist()
cols.remove('price_doc')
df_train = df_train[cols + ['price_doc']]

## 2. Missing Data Imputation

Impute numeric data with mean and categorical data with mode.

In [5]:
# numeric: fill gaps with the column mean of the same frame
for frame in (df_train, df_test):
    # select_dtypes is the public replacement for the private _get_numeric_data().
    numeric_part = frame.select_dtypes(include='number')
    for col in numeric_part.columns[numeric_part.isnull().any()]:
        # Assign the result back instead of calling fillna(inplace=True) on
        # frame[col]: that is chained in-place assignment, which leaves the
        # frame unchanged under pandas Copy-on-Write.
        frame[col] = frame[col].fillna(frame[col].mean())

# categorical: every column that still has gaps gets the most frequent value
# observed in TRAIN (for both frames, as in the original)
for frame in (df_train, df_test):
    for col in frame.columns[frame.isnull().any()].tolist():
        # Fall back to the frame's own values when train lacks the column;
        # the original raised KeyError in that case.
        source = df_train[col] if col in df_train.columns else frame[col]
        counts = source.value_counts()
        if counts.empty:
            # No observed value to impute from (column is entirely missing);
            # the original raised IndexError here.
            continue
        frame[col] = frame[col].fillna(counts.index[0])