In [1]:
import json
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

np.random.seed(123)
sns.set_style('darkgrid')
pd.set_option('display.max_colwidth', None)

# Overview
## Loadin data and utilities etc
## Basic EDA
- Pairs plot and correlation matrix
- Exploring NaN-values and groups
## Data cleaning
- Fixing building with wrong coords.
- Imputing some features?
## Feature engineering
- bathrooms_total, windows, ballog etc.
- metro, park and square distance
## Modeling
- PyTorch
- Random Forest, sklearn etc.
- LightGBM
- XGBoost
- CatBoost
- Combinations, stacking?

# Utilities 

In [30]:
def root_mean_squared_log_error(y_true, y_pred):
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    assert (y_true >= 0).all() 
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

def check_if_has_nans(df):
    a = df.isna().any()
    print("The following columns have no NaN-values: ")
    for pairs in a.items():
        if(not pairs[1]):
            print(pairs[0])
        

# Load Data

In [45]:
apartments = pd.read_csv('resources/data/apartments_train.csv')
buildings = pd.read_csv('resources/data/buildings_train.csv')
data = pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)

apartments_test = pd.read_csv('resources/data/apartments_test.csv')
buildings_test = pd.read_csv('resources/data/buildings_test.csv')
data_test = pd.merge(apartments_test, buildings_test.set_index('id'), how='left', left_on='building_id', right_index=True)

check_if_has_nans(data)
check_if_has_nans(data_test)

The following columns have no NaN-values: 
id
price
area_total
floor
rooms
building_id
latitude
longitude
street
address
stories
The following columns have no NaN-values: 
id
area_total
floor
rooms
building_id
street
address
stories


# Linear Regression

In [47]:
from sklearn.linear_model import LinearRegression
import sklearn.model_selection as model_selection

data_train, data_valid = model_selection.train_test_split(data, test_size=0.33, stratify=np.log(data.price).round())

X_train = data_train[['latitude', 'longitude', 'area_total', 'floor', 'rooms', 'stories']]
y_train = data_train.loc[X_train.index].price
X_valid = data_valid[['latitude', 'longitude', 'area_total', 'floor', 'rooms', 'stories']]
y_valid = data_valid.loc[X_valid.index].price

linreg = LinearRegression(normalize= True).fit(X_train, y_train)
preds_train = linreg.predict(X_train)
preds_valid = linreg.predict(X_valid)

# GET NEGATIVE VALUES from regression ://
#print(root_mean_squared_log_error(preds_train, y_train))
#print(root_mean_squared_log_error(preds_valid, y_valid))


18696      7500000.0
3747      12647097.0
10268    207600000.0
18701      7556237.0
3659      59100000.0
            ...     
143        5300000.0
12251     12900000.0
16373     29960000.0
23171      5847957.0
20996     12000000.0
Name: price, Length: 15600, dtype: float64


# Random Forest

In [49]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 1000, max_depth=2).fit(X_train, y_train)
preds_train = rf.predict(X_train)
preds_valid = rf.predict(X_valid)
# Pretty darn shitty results
print(root_mean_squared_log_error(preds_train, y_train))
print(root_mean_squared_log_error(preds_valid, y_valid))


0.5684917177129843
0.5738513556218434
