
 # Machine Learning – Moscow House Pricing (Short Notebook)
 Group 97: Ilaria Crivellari (ID: 567219), Kirill Menke (ID: 566609), Moritz Leugers (ID: 566563)

In [1]:
import random
import os
import sys
sys.path.append('/proj/ciptmp/ny70konu/python3.9/site-packages')

import numpy as np
import pandas as pd
import geopandas as gp

import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.neighbors import BallTree
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.decomposition import PCA

In [2]:
# Global notebook configuration: silence pandas' chained-assignment warning
# and fix NumPy's global random seed.
pd.options.mode.chained_assignment = None
np.random.seed(0)


def _attach_buildings(apartments, buildings):
    """Join every apartment to its building row.

    Building columns that clash with apartment columns receive the ``_r``
    suffix (so the building id becomes ``id_r``). The result is sorted by the
    apartment ``id`` and indexed by it.
    """
    joined = apartments.merge(buildings, left_on='building_id', right_on='id', suffixes=('', '_r'))
    return joined.sort_values('id').set_index('id')


# Read train/ test data
apartments_train = pd.read_csv("data/apartments_train.csv")
buildings_train = pd.read_csv("data/buildings_train.csv")
apartments_test = pd.read_csv("data/apartments_test.csv")
buildings_test = pd.read_csv("data/buildings_test.csv")
metro_stations = pd.read_csv("data/metro_stations.csv")

# External datasources
sub_area_centers = pd.read_csv("data/sberbank_sub_areas.csv")
sub_areas = gp.read_file('data/mo_kag_SRHM.shp')
sberbank = pd.read_csv("data/sberbank.csv")

# Merge Tables: Apartments and Buildings
train_df = _attach_buildings(apartments_train, buildings_train)
test_df = _attach_buildings(apartments_test, buildings_test)

# Merge train and test data (train rows first, test rows after them)
data_df = pd.concat([train_df, test_df])

In [3]:
# Merge features
data_df["street_and_address"] = data_df.street + " " + data_df.address

# Imputing coordinates outside of moscow and NaNs
data_df.latitude[data_df.street_and_address == "Бунинские Луга ЖК к2/2/1"] = 55.5415152
data_df.longitude[data_df.street_and_address == "Бунинские Луга ЖК к2/2/1"] = 37.4821752
data_df.latitude[data_df.street_and_address == "Бунинские Луга ЖК к2/2/2"] = 55.5415152
data_df.longitude[data_df.street_and_address == "Бунинские Луга ЖК к2/2/2"] = 37.4821752
data_df.latitude[data_df.street_and_address == "улица 1-я Линия 57"] = 55.6324711
data_df.longitude[data_df.street_and_address == "улица 1-я Линия 57"] = 37.4536057
data_df.latitude[data_df.street_and_address == "улица Центральная 75"] = 55.5415152
data_df.longitude[data_df.street_and_address == "улица Центральная 75"] = 37.4821752
data_df.latitude[data_df.street_and_address == "улица Центральная 48"] = 55.5415152
data_df.longitude[data_df.street_and_address == "улица Центральная 48"] = 37.4821752

# NaNs
data_df.latitude[data_df.street_and_address == "пос. Коммунарка Москва А101 ЖК"] = 55.5676692
data_df.longitude[data_df.street_and_address == "пос. Коммунарка Москва А101 ЖК"] = 37.4816608

# Encode streets to integers
data_df["street"] = LabelEncoder().fit_transform(data_df["street"])

In [4]:
def haversine_array(lat1, lng1, lat2 = 55.752, lng2 = 37.617):
    """Great-circle distance in kilometres between two points given in degrees.

    Works element-wise on scalars, NumPy arrays or pandas Series (anything
    ``np.radians`` accepts). The default second point (55.752, 37.617) is the
    reference the notebook uses as the city center.

    Parameters
    ----------
    lat1, lng1 : latitude / longitude of the first point(s), in degrees.
    lat2, lng2 : latitude / longitude of the second point(s), in degrees.

    Returns
    -------
    Distance(s) in km on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371  # mean Earth radius
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(delta_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam * 0.5) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

In [5]:
# Add nearest sub_area based on its center
bulding_locs = np.asarray(list(zip(data_df['latitude'], data_df['longitude'])))
sub_area_locs = np.asarray(list(zip(sub_area_centers['longitude'], sub_area_centers['latitude'])))
closest_sub_idx = np.argmin(haversine_distances(bulding_locs, sub_area_locs), axis=1)
data_df["sub_area_"] = sub_area_centers['sub_area'].iloc[closest_sub_idx].values

# Mapping each building to its real sub_area
geo_df = gp.GeoDataFrame(data_df, geometry=gp.points_from_xy(data_df.longitude, data_df.latitude))
geo_df.crs = "EPSG:4326"
data_df = gp.sjoin(sub_areas, geo_df, how='right', predicate='contains')
data_df = data_df.drop(["DISTRICT", "geometry", "OKATO", "OKTMO", "OKATO_AO", "index_left"], axis=1)
    
# Add distance from city center
data_df['center_distance'] = haversine_array(data_df['latitude'], data_df['longitude'])

# Add closest metro_station
metro_locs = np.asarray(list(zip(metro_stations['latitude'], metro_stations['longitude'])))
closest_metro_dist = np.min(haversine_distances(bulding_locs, metro_locs), axis=1)
data_df['closest_metro'] = closest_metro_dist

# Add subarea related features from sberbank dataset
data_df['mean_green_area'] = data_df['sub_area'].map(sberbank.groupby("sub_area").mean().green_zone_part)

# Ordinal Encoding for both sub_area variants
data_df["sub_area"] = LabelEncoder().fit_transform(data_df['sub_area'])
data_df["sub_area_"] = LabelEncoder().fit_transform(data_df['sub_area_'])

# PCA transforming the two variants above
pca = PCA(n_components=1)
data_df["sub_area_pca"] = pca.fit_transform(data_df[["sub_area", "sub_area_"]]).squeeze()
    
# Add average price in the neighborhood
tree = BallTree(data_df[["latitude", "longitude"]])
dist, ind = tree.query(data_df[["latitude", "longitude"]], k=300)

mean_sqm_price = []

for rows in ind:
    mean_sqm_price.append(np.nanmean(data_df['price'].iloc[rows] / np.log(data_df['area_total'].iloc[rows])))

data_df["mean_sqm_price"] = mean_sqm_price

# Drop unimportant features
data_df = data_df.drop(["street_and_address", "address", "windows_court", "windows_street", "elevator_service", "elevator_passenger", "garbage_chute", 
      "layout", "heating", "elevator_without", "district", "phones", "building_id", "id_r", "material", "parking"], axis = 1)

data_df.fillna(-999, inplace = True)

In [6]:
# --- Base learners -----------------------------------------------------------
model_xgb = XGBRegressor(
    n_estimators=1500,
    max_depth=6,
    learning_rate=0.1,
)
model_lgbm = LGBMRegressor(
    max_depth=6,
    n_estimators=1200,
    learning_rate=0.1,
)
model_cat = CatBoostRegressor(
    iterations=1000,
    depth=7,
    learning_rate=0.1,
    silent=True,
)
model_extra = ExtraTreesRegressor(
    n_estimators=1000,
    random_state=0,
)
model_hist = HistGradientBoostingRegressor(
    max_depth=7,
    max_iter=1000,
    learning_rate=0.1,
    random_state=0,
)
# AdaBoost wrapped around the LightGBM configuration above.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` - check against the installed version before upgrading.
model_ada_lgbm = AdaBoostRegressor(
    base_estimator = model_lgbm,
    n_estimators=50,
    random_state = 0,
)

# --- Meta learner ------------------------------------------------------------
final_model = RidgeCV()

# (name, estimator) pairs handed to the stacking ensemble
base_learners = [
    ('xgb_tree', model_xgb),
    ("lgbm", model_lgbm),
    ('catboost', model_cat),
    ('extra_trees', model_extra),
    ('hist_boost', model_hist),
    ("ada_lgbm", model_ada_lgbm),
]

# Stack the base learners with 5-fold CV, then train on log1p(target) and map
# predictions back with expm1.
model_stacking = StackingRegressor(
    estimators=base_learners,
    final_estimator=final_model,
    cv=5,
)
trans_stacking = TransformedTargetRegressor(
    regressor=model_stacking,
    func=np.log1p,
    inverse_func=np.expm1,
)

In [7]:
# Splitting into train and test data
X_train = data_df[0:len(train_df)]
X_train = X_train.drop('price', axis=1)
X_test = data_df[len(train_df):len(data_df)].drop('price', axis=1)
y_train = train_df['price']

y_pred = trans_stacking.fit(X_train, y_train).predict(X_test)
result = np.column_stack((X_test.index.to_numpy(), y_pred))
np.savetxt(r'./result.csv', result, fmt=['%d', ' %.3f'], delimiter=',', header="id,price_prediction", comments='')