In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder

from scipy import stats

import lightgbm as lgb
from catboost import CatBoostRegressor

import math

import matplotlib.pylab as plt
plt.style.use('ggplot')


In [2]:
# Divisor (10**12) applied to the metric values printed by the evaluation cells.
scaling_factor = 10 ** 12

In [3]:
# Load the apartment listings and the building table, then attach each
# apartment's building attributes through its building_id.
train_apartment = pd.read_csv('../data/apartments_train.csv')
train_building = pd.read_csv('../data/buildings_train.csv')
train = pd.merge(train_apartment, train_building, left_on='building_id', right_on='id')
# Both inputs have an 'id' column, so the merge produces 'id_x' (apartment)
# and 'id_y' (building). 'id_y' duplicates building_id and is removed.
train = train.rename(columns={'id_x': 'apartment_id'}).drop(columns='id_y')

In [4]:
# Only the last expression of a cell is rendered, so the output below is
# train.dtypes; the train['district'] lookup produces no visible output.
train['district']
train.dtypes

apartment_id            int64
seller                float64
price                 float64
area_total            float64
area_kitchen          float64
area_living           float64
floor                 float64
rooms                 float64
layout                float64
ceiling               float64
bathrooms_shared      float64
bathrooms_private     float64
windows_court         float64
windows_street        float64
balconies             float64
loggias               float64
condition             float64
phones                float64
building_id             int64
new                   float64
latitude              float64
longitude             float64
district              float64
street                 object
address                object
constructed           float64
material              float64
stories               float64
elevator_without      float64
elevator_passenger    float64
elevator_service      float64
parking               float64
garbage_chute         float64
heating   

In [5]:
# Load the test apartments/buildings and join them the same way as the
# training data. The printed lengths show whether the join kept every apartment.
test_apartment = pd.read_csv('../data/apartments_test.csv')
test_building = pd.read_csv('../data/buildings_test.csv')
print('app length', len(test_apartment))
print('buildings length', len(test_building))
test = pd.merge(test_apartment, test_building, left_on='building_id', right_on='id')
print('merged length', len(test))
# Keep the apartment id as 'id'; drop the duplicated building id and the
# free-text street/address columns.
test = test.rename(columns={'id_x': 'id'}).drop(columns=['id_y', 'street', 'address'])


app length 9937
buildings length 2931
merged length 9937


In [6]:
# Display the raw test apartment table (before the building merge).
test_apartment

Unnamed: 0,id,seller,area_total,area_kitchen,area_living,floor,rooms,layout,ceiling,bathrooms_shared,bathrooms_private,windows_court,windows_street,balconies,loggias,condition,phones,building_id
0,23285,,71.4,,,2.0,1.0,,,,,1.0,0.0,1.0,0.0,0.0,1.0,5148
1,23286,,54.0,4.0,50.0,4.0,1.0,,3.00,,,1.0,0.0,,,3.0,1.0,1290
2,23287,3.0,39.4,19.6,11.2,10.0,1.0,,2.65,1.0,0.0,1.0,0.0,,,,1.0,681
3,23288,,60.9,,,5.0,2.0,1.0,,1.0,0.0,0.0,1.0,,,0.0,1.0,4261
4,23289,,34.0,9.0,17.0,14.0,1.0,,2.75,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,7530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9932,33217,3.0,106.0,19.9,56.7,16.0,3.0,,3.30,,,,,0.0,1.0,,1.0,4252
9933,33218,,82.0,,,3.0,3.0,,,2.0,0.0,1.0,0.0,,,1.0,2.0,7380
9934,33219,,49.3,,,15.0,1.0,,0.00,,,,,,,,1.0,1480
9935,33220,,38.8,10.5,15.1,14.0,1.0,,3.30,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,2154


In [7]:
# Apartment-level columns; 'price' (the target) is kept last so it can be
# sliced off when building the feature list.
column_names_apartments = [
    'apartment_id', 'seller', 'area_total', 'area_kitchen', 'area_living',
    'floor', 'rooms', 'layout', 'ceiling',
    'bathrooms_shared', 'bathrooms_private',
    'windows_court', 'windows_street',
    'balconies', 'loggias', 'condition', 'phones',
    'building_id', 'price',
]
# Building-level columns.
column_names_buildings = [
    'new', 'latitude', 'longitude', 'district', 'street', 'address',
    'constructed', 'material', 'stories',
    'elevator_without', 'elevator_passenger', 'elevator_service',
    'parking', 'garbage_chute', 'heating',
]
# Every column except the target.
all_features = [*column_names_apartments[:-1], *column_names_buildings]
# Features selected based on gini importance in the LGBM model in sequential_all_data.
selected_features = [
    'area_total', 'area_kitchen', 'floor', 'bathrooms_private',
    'longitude', 'latitude', 'district', 'constructed', 'stories',
]

In [8]:
def preprocess(data, price_outlier_rejection=False, fill=True):
    """Clean a merged apartment/building frame before modelling.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. Must contain a 'price' column when
        ``price_outlier_rejection`` is True.
    price_outlier_rejection : bool, default False
        When True, keep only the rows whose price z-score has an absolute
        value below 3.9.
    fill : bool, default True
        When True, replace missing values with the mean of their column;
        otherwise drop every row that contains a missing value.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    if price_outlier_rejection is True:
        data = data[np.abs(stats.zscore(data['price'])) < 3.9]
    if fill is True:
        # Average the numeric columns only: text columns have no mean, and
        # pandas >= 2.0 raises TypeError instead of silently skipping them.
        # Non-numeric columns therefore keep their missing values.
        return data.fillna(data.mean(numeric_only=True))
    return data.dropna()

In [9]:
# Default preprocessing: no price outlier rejection, missing values filled with column means.
processed_data = preprocess(train)

In [10]:
# Apply the default preprocessing to the test frame and keep only the model inputs.
processed_test = preprocess(test)[selected_features]

In [11]:
# Feature matrix (selected columns only) and regression target.
X = processed_data.loc[:, selected_features]
y = processed_data.loc[:, 'price']

In [12]:
# Hold out 20% of the training rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# AdaBoost

In [13]:
# AdaBoost regressor: 1500 boosting rounds with a small learning rate and
# the 'square' loss for re-weighting; seeded for reproducibility.
adaboost = AdaBoostRegressor(
    loss='square',
    learning_rate=0.05,
    n_estimators=1500,
    random_state=42,
)

In [14]:
# Fit on the training split.
# NOTE(review): the saved run of this cell ended in a KeyboardInterrupt, so
# the cells below it have no recorded outputs.
adaboost.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Predict prices for the hold-out split.
ada_prediction = adaboost.predict(X_test)

In [None]:
# Score the hold-out predictions with the root mean squared log error (RMSLE).
# MSLE is computed on log-transformed values, so it needs no rescaling: the
# previous round(..., 2) / scaling_factor collapsed the score to roughly zero.
# Arguments follow the documented (y_true, y_pred) order.
ada_mse = mean_squared_log_error(y_test, ada_prediction)
print('Test RMSLE:', math.sqrt(ada_mse))

In [None]:
# Predicted vs. actual price on the hold-out split.
plt.scatter(ada_prediction, y_test)
plt.xlabel('Adaboost prediction')
plt.ylabel('Ground Truth')

# Gradient Boosting Machine

In [None]:
# Gradient boosting regressor; every constructor argument is spelled out,
# grouped by what it controls.
# NOTE(review): criterion='mse' is the older scikit-learn spelling; recent
# releases call it 'squared_error' -- confirm against the installed version.
grad_boost_regr = GradientBoostingRegressor(
    # Boosting schedule
    n_estimators=2000,
    learning_rate=0.01,
    subsample=1.0,
    init=None,
    warm_start=False,
    # Loss-related settings
    criterion='mse',
    alpha=0.9,
    # Shape of the individual trees
    max_depth=9,
    max_features=None,
    max_leaf_nodes=None,
    min_samples_split=4,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
    # Early stopping (disabled while n_iter_no_change is None)
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=0.0001,
    # Reproducibility and logging
    random_state=0,
    verbose=0,
)

In [None]:
# Fit on the training split.
grad_boost_regr.fit(X_train, y_train)

In [None]:
# Predict prices for the hold-out split.
gb_prediction = grad_boost_regr.predict(X_test)

In [None]:
# gb_mse holds the mean squared log error; its square root (RMSLE) is what
# gets printed, so the label says RMSLE rather than MSE.
# Arguments follow the documented (y_true, y_pred) order.
# NOTE(review): mean_squared_log_error rejects negative values -- confirm the
# model never predicts a negative price.
gb_mse = mean_squared_log_error(y_test, gb_prediction)
print('Test RMSLE:', math.sqrt(gb_mse))

In [None]:
# Predicted vs. actual price on the hold-out split, with both axes on a log scale.
plt.scatter(gb_prediction, y_test)
plt.xlabel('Gradient Boosting prediction')
plt.ylabel('Ground Truth')
plt.xscale('log')
plt.yscale('log')

In [None]:
# Bar chart of the fitted gradient boosting model's feature importances,
# labelled with the feature column names.
importances = grad_boost_regr.feature_importances_
forest_importances = pd.Series(data=importances, index=X.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
ax.set(title="MDI or Gini Importance", ylabel="Mean decrease in impurity")
fig.tight_layout()

# LightGBM

In [None]:
# LightGBM regressor: many shallow trees with mild row/column subsampling.
# NOTE(review): newer LightGBM releases replace silent=True with verbose=-1 --
# confirm against the installed version.
light_gbm = lgb.LGBMRegressor(
    # Boosting schedule
    n_estimators=2000,
    learning_rate=0.05,
    # Tree size
    max_depth=5,
    num_leaves=10,
    # Subsampling
    subsample=0.9,
    colsample_bytree=0.95,
    # Evaluation metric
    metric='mse',
    # Runtime and reproducibility
    n_jobs=4,
    random_state=42,
    silent=True,
)

In [None]:
# Fit on the training split.
light_gbm.fit(X_train, y_train)

In [None]:
# Predict prices for the hold-out split.
lgbm_prediction = light_gbm.predict(X_test)

In [None]:
# Score the hold-out predictions with the root mean squared log error (RMSLE).
# MSLE is computed on log-transformed values, so it needs no rescaling: the
# previous round(..., 2) / scaling_factor collapsed the score to roughly zero.
# Arguments follow the documented (y_true, y_pred) order.
# NOTE(review): mean_squared_log_error rejects negative values -- confirm the
# model never predicts a negative price.
lgb_mse = mean_squared_log_error(y_test, lgbm_prediction)
print('Test RMSLE:', math.sqrt(lgb_mse))

In [None]:
# Predicted vs. actual price on the hold-out split.
plt.scatter(lgbm_prediction, y_test)
plt.xlabel('LightGBM prediction')
plt.ylabel('Ground Truth')

In [None]:
# Bar chart of the fitted LightGBM model's feature importances.
# The model above is built without an importance_type, so LightGBM reports
# its default 'split' importance: how many times each feature is used to
# split. That is not an impurity (Gini) measure, so the labels say so.
importances = light_gbm.feature_importances_
forest_importances = pd.Series(importances, index=X.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
ax.set_title("LightGBM split importance")
ax.set_ylabel("Number of splits")
fig.tight_layout()

# CatBoost

In [None]:
# CatBoost regressor: 500 depth-7 trees, all available threads, seeded.
# od_type="Iter" was left disabled in the original configuration.
catboost = CatBoostRegressor(
    # Boosting schedule
    n_estimators=500,
    learning_rate=0.1,
    # Tree depth and bagging
    depth=7,
    bagging_temperature=0.2,
    # Runtime and reproducibility
    thread_count=-1,
    random_seed=42,
    silent=True,
)

In [None]:
# Fit on the training split.
catboost.fit(X_train, y_train)

In [None]:
# Predict prices for the hold-out split.
cat_prediction = catboost.predict(X_test)

In [None]:
# Score the hold-out predictions with the root mean squared log error (RMSLE).
# MSLE is computed on log-transformed values, so it needs no rescaling: the
# previous round(..., 2) / scaling_factor collapsed the score to roughly zero.
# Arguments follow the documented (y_true, y_pred) order.
# NOTE(review): mean_squared_log_error rejects negative values -- confirm the
# model never predicts a negative price.
cat_mse = mean_squared_log_error(y_test, cat_prediction)
print('Test RMSLE:', math.sqrt(cat_mse))

In [None]:
# Predicted vs. actual price on the hold-out split.
plt.scatter(cat_prediction, y_test)
plt.xlabel('CatBoost prediction')
plt.ylabel('Ground Truth')

# Gradient boosting: refit on all training data and create the submission

In [None]:
# Refit the gradient boosting model on the full training set (X, y) instead of only the training split.
grad_boost_regr.fit(X, y)

In [None]:
# Predict prices for the competition test set. This rebinds gb_prediction,
# which previously held the hold-out predictions.
gb_prediction = grad_boost_regr.predict(processed_test)

In [None]:
# Display the predicted prices.
gb_prediction

In [None]:
# Sanity check: the number of predictions should equal the number of test rows.
print(len(gb_prediction))
print(len(test))

In [None]:
# Wrap the predictions in a single-column frame named as the submission expects.
df = pd.DataFrame(gb_prediction, columns = ['price_prediction'])

In [None]:
# Display the prediction frame.
df

In [None]:
# Put each test id next to its prediction. concat(axis=1) aligns rows by index
# label, not by position.
# NOTE(review): this assumes test['id'] and df share the same index labels --
# confirm if rows are ever dropped or the frame is re-indexed before this cell.
result = pd.concat([test['id'], df], axis=1)

In [None]:
# Display the assembled submission frame.
result

In [None]:
# Write the id/prediction pairs to submission.csv without the index column.
result[['id','price_prediction']].to_csv('submission.csv', index=False)

# Numerical features plus encoded categorical features

In [None]:
# Columns treated as numerical features.
numerical_features = [
    "area_total",
    "area_kitchen",
    "area_living",
    "floor",
    "rooms",
    "ceiling",
    "bathrooms_shared",
    "bathrooms_private",
    "balconies",
    "latitude",
    "longitude",
    "constructed",
]

## Map categorical values to numerical indices

In [None]:
# Integer-encode the district labels on a separate copy of the training frame.
# A plain `train_categorical_removed = train` only creates a second name for
# the same DataFrame, so the assignment below would also overwrite
# train['district']. Taking a copy leaves the original frame untouched.
le = LabelEncoder()
train_categorical_removed = train.copy()
train_categorical_removed['district'] = le.fit_transform(train_categorical_removed['district'])

In [None]:
# Display the district column of the training frame.
train['district']