# Competition: **Moscow Housing**
## Team Name: **Group 7**
## Team Members:
- **Vegard Skui** (506824)
- **Albert Lesniewski** (488094)
- **Jim Totland** (490741)

Student IDs are given in the parentheses.

# Instructions to Reader
- Our best predictions can be generated by running the second-to-last cell in this notebook (after running all cells above it).
- Our second submission which we selected on Kaggle can also be generated from this notebook by running the last cell.

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import lightgbm as lgb
import optuna
import catboost
import geopandas as gpd

# Set up the KML driver for GeoPandas
# (the point-of-interest files read later in this notebook are .kml files)
gpd.io.file.fiona.drvsupport.supported_drivers["KML"] = "rw"

# Suppress every Python warning from here on.
# NOTE(review): this also hides pandas chained-assignment and deprecation
# warnings raised by later cells.
import warnings
warnings.filterwarnings("ignore")

# Seed handed to the Optuna samplers and to the LightGBM / CatBoost models below
SEED = 42

  import pandas.util.testing as tm


# Data Cleaning/Feature Engineering

In [2]:
# Load the apartment listings and tag every row with the split it came from
apartments_train = (
    pd.read_csv("resources/data/apartments_train.csv")
    .set_index("id")
    .assign(split="train")
)
apartments_test = (
    pd.read_csv("resources/data/apartments_test.csv")
    .set_index("id")
    .assign(split="test")
)

# Stack both splits into a single DataFrame of all apartments
apartments = pd.concat([apartments_train, apartments_test])

# Load the building tables the same way
buildings_train = (
    pd.read_csv("resources/data/buildings_train.csv")
    .set_index("id")
    .assign(split="train")
)
buildings_test = (
    pd.read_csv("resources/data/buildings_test.csv")
    .set_index("id")
    .assign(split="test")
)

# Stack the buildings and attach a point geometry (longitude = x, latitude = y)
buildings = pd.concat([buildings_train, buildings_test])
building_points = gpd.points_from_xy(
    buildings.longitude, buildings.latitude, crs="EPSG:4326"
)
buildings = gpd.GeoDataFrame(buildings, geometry=building_points)

In [3]:
# Flag buildings where at least one of the two coordinates is missing
no_coords = buildings[["latitude", "longitude"]].isna().any(axis=1)

# Show the identifying attributes of the flagged buildings
buildings.loc[
    no_coords,
    [
        "split", "latitude", "longitude", "district", "street", "address",
        "constructed", "material", "stories",
    ],
]

Unnamed: 0_level_0,split,latitude,longitude,district,street,address,constructed,material,stories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3803,test,,,,пос. Коммунарка,Москва А101 ЖК,2019.0,2.0,12.0


In [4]:
# Buildings on the same street that do have coordinates
on_same_street = buildings.street == "пос. Коммунарка"
street = buildings.loc[on_same_street & ~no_coords]

# Show their identifying attributes
street.loc[
    :,
    [
        "split", "latitude", "longitude", "district", "street", "address",
        "constructed", "material", "stories",
    ],
]

Unnamed: 0_level_0,split,latitude,longitude,district,street,address,constructed,material,stories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2036,train,55.569133,37.474408,11.0,пос. Коммунарка,22,2013.0,,14.0
2461,train,55.571632,37.473492,11.0,пос. Коммунарка,20,2011.0,,17.0
6375,train,55.567285,37.477929,11.0,пос. Коммунарка,17,1994.0,3.0,12.0
6280,train,55.572986,37.475189,11.0,пос. Коммунарка,4,1963.0,4.0,5.0
8577,train,55.570843,37.473635,11.0,пос. Коммунарка,20А,2012.0,,17.0
6200,test,55.560891,37.473761,11.0,пос. Коммунарка,101,2020.0,0.0,16.0


In [5]:
# Impute the missing location from the other buildings on the street:
# mean coordinates and the most frequent district
street_fill = {
    "latitude": street.latitude.mean(),
    "longitude": street.longitude.mean(),
    "district": street.district.mode()[0],
}
for column, value in street_fill.items():
    buildings.loc[no_coords, column] = value

# Re-check that no building is left without coordinates
still_missing = buildings.latitude.isna() | buildings.longitude.isna()
print("There are now", still_missing.sum(), "buildings with missing coordinates.")

There are now 0 buildings with missing coordinates.


In [6]:
# Rectangle approximately encompassing Moscow, given by its southwest and
# northeast corners (latitude, longitude in degrees)
MOSCOW_SW_LAT, MOSCOW_SW_LON = 55.101131, 36.754394
MOSCOW_NE_LAT, MOSCOW_NE_LON = 56.117476, 38.218283

In [7]:
# Find all buildings with coordinates outside of Moscow.
# A point lies outside the bounding rectangle as soon as EITHER coordinate
# leaves its range, so the latitude and longitude tests are combined with OR.
# Requiring both to be out of range would treat a building with a single bad
# coordinate as correctly located.
lat_outside = (buildings.latitude < MOSCOW_SW_LAT) | (buildings.latitude > MOSCOW_NE_LAT)
lon_outside = (buildings.longitude < MOSCOW_SW_LON) | (buildings.longitude > MOSCOW_NE_LON)
outside = lat_outside | lon_outside

# Show the identifying attributes of the misplaced buildings
buildings[outside][[
    "split", "latitude", "longitude", "district", "street", "address",
    "constructed", "material", "stories"
]]

Unnamed: 0_level_0,split,latitude,longitude,district,street,address,constructed,material,stories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5667,test,43.352968,132.759204,,улица Центральная,48,,,2.0
4412,test,17.141734,-61.7905,,Бунинские Луга ЖК,к2/2/2,2021.0,,9.0
4202,test,42.9147,74.517184,,улица 1-я Линия,57,2016.0,,3.0
8811,test,43.353545,132.768097,,улица Центральная,75,2007.0,,5.0
4636,test,17.141734,-61.7905,,Бунинские Луга ЖК,к2/2/1,2021.0,3.0,17.0


In [8]:
# For every street that has a misplaced building, count how many correctly
# located buildings on that street exist in each split
streets = buildings[outside].street.unique()
located = buildings[~outside]


def _count_on_street(split_name, street_name):
    """Number of correctly located buildings of one split on one street."""
    matches = (located.split == split_name) & (located.street == street_name)
    return len(located[matches])


pd.DataFrame(
    {
        split_name: [_count_on_street(split_name, name) for name in streets]
        for split_name in ("test", "train")
    },
    index=streets,
)

Unnamed: 0,test,train
улица Центральная,8,16
Бунинские Луга ЖК,1,8
улица 1-я Линия,0,0


In [9]:
# Set the location (and district) of the buildings with coordinates outside of Moscow.
# Only rows flagged by `outside` are modified in the loop, so the frame of
# correctly located buildings and its mean position are computed once up front.
inside = buildings[~outside]
fallback_lat = inside.latitude.mean()
fallback_lon = inside.longitude.mean()

for idx, building in buildings[outside].iterrows():
    street = inside[inside.street == building.street]
    if len(street):
        # Use the centre of the correctly located buildings on the same street
        buildings.loc[idx, "latitude"] = street.latitude.mean()
        buildings.loc[idx, "longitude"] = street.longitude.mean()
        # mode() is empty when every district on the street is NaN; in that
        # case the district is left as it is instead of raising
        district_mode = street.district.mode()
        if len(district_mode):
            buildings.loc[idx, "district"] = district_mode[0]
    else:
        # No reference building on the street: fall back to the overall mean
        # position; the district is left untouched here
        buildings.loc[idx, "latitude"] = fallback_lat
        buildings.loc[idx, "longitude"] = fallback_lon

# Verify against the bounding rectangle: a building is outside when either
# coordinate is out of range (OR, not AND)
lat_out = (buildings.latitude < MOSCOW_SW_LAT) | (buildings.latitude > MOSCOW_NE_LAT)
lon_out = (buildings.longitude < MOSCOW_SW_LON) | (buildings.longitude > MOSCOW_NE_LON)
print("There are now", (lat_out | lon_out).sum(), "buildings outside of Moscow")

There are now 0 buildings outside of Moscow


In [10]:
# Flag the buildings whose district is unknown and show their attributes
no_district = buildings["district"].isna()
buildings.loc[
    no_district,
    [
        "split", "latitude", "longitude", "district", "street", "address",
        "constructed", "material", "stories",
    ],
]

Unnamed: 0_level_0,split,latitude,longitude,district,street,address,constructed,material,stories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4162,train,55.59516,37.741109,,23-й км,5к1,2021.0,3.0,9.0
1647,test,55.59516,37.741109,,23-й км,7к1,2018.0,,25.0
6403,test,55.59516,37.741109,,23-й км,11к1,2019.0,,16.0
7317,test,55.583551,37.711356,,25-й км,вл1с1,,,13.0
2265,test,55.59516,37.741109,,23-й км,13к2,2018.0,,25.0
926,test,55.921627,37.781578,,В мкр,37,1989.0,2.0,16.0
6879,test,55.932127,37.793705,,В мкр,30,1975.0,2.0,9.0
4202,test,55.727684,37.592893,,улица 1-я Линия,57,2016.0,,3.0
183,test,55.59516,37.741109,,23-й км,11к2,2019.0,,25.0


In [11]:
def _nearest_district(b):
    """Return the district of the closest building that has one.

    "Closest" is the smallest sum of absolute latitude and longitude
    differences (in degrees) to building `b`.
    """
    with_district = buildings[~no_district][["latitude", "longitude"]]
    offsets = with_district - b[["latitude", "longitude"]]
    nearest_id = offsets.abs().sum(axis=1).idxmin()
    return buildings.loc[nearest_id].district


# Look up a district for every building that lacks one and write it back
districts = buildings.loc[no_district].apply(_nearest_district, axis=1)
districts = districts.rename("district")
buildings.update(districts)

print("There are now", buildings.district.isna().sum(), "buildings with district missing")

There are now 0 buildings with district missing


## Nearest POI

In [12]:
# The Earth's radius in meters, used by the haversine distances below
EARTH_RADIUS = 6371000

# Store the building coordinates in radians once so that the distance
# computations do not have to convert them again
for rad_col, deg_col in (("lat_rad", "latitude"), ("lon_rad", "longitude")):
    buildings[rad_col] = np.radians(buildings[deg_col])

In [13]:
# Load the metro station locations, dropping the unused description column
stations = gpd.read_file("resources/metro_stations.kml", driver="KML")
stations = stations.drop(columns=["Description"])
stations = stations.rename(columns={"Name": "name"})

# Station coordinates in radians (geometry.y is latitude, geometry.x is longitude)
station_points = stations.geometry
stations["lat_rad"] = np.radians(station_points.y)
stations["lon_rad"] = np.radians(station_points.x)

In [14]:
def _metro_distances(row):
    """Haversine distance from one building to every metro station.

    Uses EARTH_RADIUS, so the result is in the same unit (meters).
    """
    half_dlat = (stations.lat_rad - row.lat_rad) / 2
    half_dlon = (stations.lon_rad - row.lon_rad) / 2
    a = (
        np.sin(half_dlat) ** 2
        + np.cos(row.lat_rad) * np.cos(stations.lat_rad) * np.sin(half_dlon) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


# One row per building, one column per station
metro_dist = buildings.apply(_metro_distances, axis=1)

# Distance to the nearest station
buildings["metro_distance"] = metro_dist.min(axis=1)

# Number of 'close' stations, i.e. those nearer than 1000
buildings["metro_close"] = (metro_dist < 1000).sum(axis=1)

In [15]:
# Load the park and garden locations, dropping the unused description column
parks = gpd.read_file("resources/parks_and_gardens.kml", driver="KML")
parks = parks.drop(columns=["Description"])
parks = parks.rename(columns={"Name": "name"})

# Park coordinates in radians (geometry.x is longitude, geometry.y is latitude)
park_points = parks.geometry
parks["lon_rad"] = np.radians(park_points.x)
parks["lat_rad"] = np.radians(park_points.y)

In [16]:
def _park_distances(row):
    """Haversine distance from one building to every park or garden.

    Uses EARTH_RADIUS, so the result is in the same unit (meters).
    """
    half_dlat = (parks.lat_rad - row.lat_rad) / 2
    half_dlon = (parks.lon_rad - row.lon_rad) / 2
    a = (
        np.sin(half_dlat) ** 2
        + np.cos(row.lat_rad) * np.cos(parks.lat_rad) * np.sin(half_dlon) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


# One row per building, one column per park
park_dist = buildings.apply(_park_distances, axis=1)

# Distance to the nearest park or garden
buildings["park_distance"] = park_dist.min(axis=1)

# Number of 'close' parks, i.e. those nearer than 2000
buildings["park_close"] = (park_dist < 2000).sum(axis=1)


In [17]:
# Read square location data.
# The keyword is `driver`, matching the metro and park reads; the original
# spelling `dirver` meant the KML driver was never passed under that name.
squares = gpd.read_file("resources/squares.kml", driver="KML").drop(columns=["Description"]).rename(columns={"Name": "name"})

# Create columns for coordinates given in radians
# (geometry.x is longitude, geometry.y is latitude)
squares["lon_rad"] = np.radians(squares.geometry.x)
squares["lat_rad"] = np.radians(squares.geometry.y)

In [18]:
def _square_distances(row):
    """Haversine distance from one building to every square.

    Uses EARTH_RADIUS, so the result is in the same unit (meters).
    """
    half_dlat = (squares.lat_rad - row.lat_rad) / 2
    half_dlon = (squares.lon_rad - row.lon_rad) / 2
    a = (
        np.sin(half_dlat) ** 2
        + np.cos(row.lat_rad) * np.cos(squares.lat_rad) * np.sin(half_dlon) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


# One row per building, one column per square
square_dist = buildings.apply(_square_distances, axis=1)

# Distance to the nearest square
buildings["square_distance"] = square_dist.min(axis=1)

# Number of 'close' squares, i.e. those nearer than 2000
buildings["square_close"] = (square_dist < 2000).sum(axis=1)

In [19]:
# Reference point named `rublevka` (degrees), also kept in radians
rublevka_lat = 55.73870
rublevka_lon = 37.25904
rublevka_lat_rad, rublevka_lon_rad = np.radians([rublevka_lat, rublevka_lon])


def _rublevka_haversine(row):
    """Haversine distance from one building to the `rublevka` reference point."""
    dlat = rublevka_lat_rad - row.lat_rad
    dlon = rublevka_lon_rad - row.lon_rad
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(row.lat_rad) * np.cos(rublevka_lat_rad) * np.sin(dlon / 2) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


buildings["rublevka_dist"] = buildings.apply(_rublevka_haversine, axis=1)

In [20]:
# Reference point named `ostoz` (degrees), also kept in radians
ostoz_lat = 55.74041592186014
ostoz_lon = 37.59606083068356
ostoz_lat_rad, ostoz_lon_rad = np.radians([ostoz_lat, ostoz_lon])


def _ostoz_haversine(row):
    """Haversine distance from one building to the `ostoz` reference point."""
    dlat = ostoz_lat_rad - row.lat_rad
    dlon = ostoz_lon_rad - row.lon_rad
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(row.lat_rad) * np.cos(ostoz_lat_rad) * np.sin(dlon / 2) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


buildings["ostoz_dist"] = buildings.apply(_ostoz_haversine, axis=1)

In [21]:
# Reference point named `khamo` (degrees), also kept in radians
khamo_lat = 55.733078399352074
khamo_lon = 37.574861386328806
khamo_lat_rad, khamo_lon_rad = np.radians([khamo_lat, khamo_lon])


def _khamo_haversine(row):
    """Haversine distance from one building to the `khamo` reference point."""
    dlat = khamo_lat_rad - row.lat_rad
    dlon = khamo_lon_rad - row.lon_rad
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(row.lat_rad) * np.cos(khamo_lat_rad) * np.sin(dlon / 2) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


buildings["khamo_dist"] = buildings.apply(_khamo_haversine, axis=1)

In [22]:
# Reference point named `yaki` (degrees), also kept in radians
yaki_lat = 55.732884073626586
yaki_lon = 37.610828401028435
yaki_lat_rad, yaki_lon_rad = np.radians([yaki_lat, yaki_lon])


def _yaki_haversine(row):
    """Haversine distance from one building to the `yaki` reference point."""
    dlat = yaki_lat_rad - row.lat_rad
    dlon = yaki_lon_rad - row.lon_rad
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(row.lat_rad) * np.cos(yaki_lat_rad) * np.sin(dlon / 2) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


buildings["yaki_dist"] = buildings.apply(_yaki_haversine, axis=1)

In [23]:
# Reference point named `arbat` (degrees), also kept in radians
arbat_lat = 55.751646651040474
arbat_lon = 37.601199713824386
arbat_lat_rad, arbat_lon_rad = np.radians([arbat_lat, arbat_lon])


def _arbat_haversine(row):
    """Haversine distance from one building to the `arbat` reference point."""
    dlat = arbat_lat_rad - row.lat_rad
    dlon = arbat_lon_rad - row.lon_rad
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(row.lat_rad) * np.cos(arbat_lat_rad) * np.sin(dlon / 2) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


buildings["arbat_dist"] = buildings.apply(_arbat_haversine, axis=1)

In [24]:
# Reference point named `pres` (degrees), also kept in radians
pres_lat = 55.7638390539512
pres_lon = 37.559469484210624
pres_lat_rad, pres_lon_rad = np.radians([pres_lat, pres_lon])


def _pres_haversine(row):
    """Haversine distance from one building to the `pres` reference point."""
    dlat = pres_lat_rad - row.lat_rad
    dlon = pres_lon_rad - row.lon_rad
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(row.lat_rad) * np.cos(pres_lat_rad) * np.sin(dlon / 2) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


buildings["pres_dist"] = buildings.apply(_pres_haversine, axis=1)

In [25]:
# Reference point named `tver` (degrees), also kept in radians
tver_lat = 55.76960731290398
tver_lon = 37.608153625557165
tver_lat_rad, tver_lon_rad = np.radians([tver_lat, tver_lon])


def _tver_haversine(row):
    """Haversine distance from one building to the `tver` reference point."""
    dlat = tver_lat_rad - row.lat_rad
    dlon = tver_lon_rad - row.lon_rad
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(row.lat_rad) * np.cos(tver_lat_rad) * np.sin(dlon / 2) ** 2
    )
    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))


buildings["tver_dist"] = buildings.apply(_tver_haversine, axis=1)

In [26]:
# The point-of-interest tables are not needed once the distances are computed
del stations
del parks
del squares

# Remove the helper columns holding the coordinates in radians
buildings = buildings.drop(columns=["lat_rad", "lon_rad"])

## More Cleaning and Feature Engineering

In [27]:
# Mean and median construction year per district, rounded to whole years
constructed_by_district = buildings.groupby("district")["constructed"]
district_constructed_mean = constructed_by_district.mean().round()
district_constructed_median = constructed_by_district.median().round()

In [28]:
# Mean and median construction year per street, rounded to whole years;
# streets without any known construction year are dropped
constructed_by_street = buildings.groupby("street")["constructed"]
street_constructed_mean = constructed_by_street.mean().round().dropna()
street_constructed_median = constructed_by_street.median().round().dropna()

In [29]:
# True where the construction year is known
idx = buildings["constructed"].notna()

# Start the corrected column as a copy of the raw construction year
buildings["constructed_fix"] = buildings["constructed"]

In [30]:
def _impute_constructed(b):
    """Median construction year of the building's street, else of its district."""
    if b.street in street_constructed_median:
        return street_constructed_median[b.street]
    return district_constructed_median[b.district]


# Fill the construction year only where it is unknown
missing_year = ~idx
buildings.loc[missing_year, "constructed_fix"] = buildings[missing_year].apply(
    _impute_constructed, axis=1
)

In [31]:
# Ceiling values above 50 are scaled down by a factor of 100
# (presumably they were entered in centimeters - TODO confirm)
idx = apartments["ceiling"] > 50

# New feature `ceiling_fix`: the rescaled value where flagged, the raw one otherwise
apartments["ceiling_fix"] = apartments["ceiling"].mask(idx, apartments["ceiling"] / 100)

In [32]:
# Natural log of the total area divided by the number of rooms
area_per_room = apartments["area_total"].div(apartments["rooms"])
apartments["log_area_per_room"] = np.log(area_per_room)

## Merge

In [33]:
# Attach the building attributes to every apartment. The join matches on both
# the building id and the split, and the result is re-indexed by `id`
apartments_flat = apartments.reset_index()
merged = apartments_flat.merge(
    buildings,
    how="left",
    left_on=["building_id", "split"],
    right_on=["id", "split"],
)
data = merged.set_index("id")

## Extra features

In [34]:
# Combined counts (NaN whenever either operand is NaN)
data["ballog"] = data["balconies"].add(data["loggias"])
data["bathrooms_total"] = data["bathrooms_private"].add(data["bathrooms_shared"])

# Log-transformed price per square meter and total area
data["log_price_per_sqm"] = np.log(data["price"].div(data["area_total"]))
data["log_area_total"] = np.log(data["area_total"])

In [35]:
# Price per square meter (NaN wherever `price` is NaN)
data['price_per_sqm'] = data.price/data.area_total

# Mean price per square meter of each district. Only the needed column is
# aggregated, so string columns in `data` are never averaged; the mean skips
# NaN prices by default.
avg_price_per_sqm = data.groupby('district')['price_per_sqm'].mean()

# Give every apartment the average of its district. Mapping on the district
# value does not depend on the row labels, whereas the previous
# `data.district[i]` lookup only worked when the `id` index was exactly 0..n-1.
distr_avg = data['district'].map(avg_price_per_sqm).to_numpy()
data['distr_avg'] = distr_avg

In [36]:
# Separate the merged table back into independent train and test frames
is_train = data["split"] == "train"
is_test = data["split"] == "test"
data_train = data.loc[is_train].copy()
data_test = data.loc[is_test].copy()

# Modeling and prediction

In [37]:
# Input columns of the F4LC models, in the order they are passed to the model
LIGHTGBM_F4LC_FEATURES = [
    "latitude",
    "longitude",
    "district",
    "constructed_fix",
    "log_area_total",
    "rooms",
    "ballog",
    "metro_distance",
    "park_distance",
    "square_distance",
    "material",
    "condition",
    "heating",
    "stories",
    "floor",
    "ceiling_fix",
    "bathrooms_total",
    "new",
]

# Subset of the features above that LightGBM is told to treat as categorical
_F4LC_CATEGORICAL = {"district", "material", "condition", "heating", "new"}
LIGHTGBM_F4LC_CATEGORICAL_FEATURES = [
    name for name in LIGHTGBM_F4LC_FEATURES if name in _F4LC_CATEGORICAL
]

In [38]:
# Open the stored Optuna study that holds the tuned LightGBM hyperparameters
lgb_f4lc_study = optuna.create_study(
    study_name="LightGBM-F4LC",
    storage='sqlite:///resources/vegard_optuna.sqlite',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    direction="minimize",
    load_if_exists=True,
)

# Training inputs; the regression target is log(price / area)
X = data_train[LIGHTGBM_F4LC_FEATURES]
y = data_train.price
area = data_train.area_total

# Build the model from the study's best parameters and fit it
lgb_f4lc_mod = lgb.LGBMRegressor(
    **lgb_f4lc_study.best_params,
    random_state=SEED,
    metric="rmse",
    n_estimators=5000,
    n_jobs=3,
)
lgb_f4lc_mod.fit(
    X,
    np.log(y / area),
    categorical_feature=LIGHTGBM_F4LC_CATEGORICAL_FEATURES,
)

[32m[I 2021-11-19 20:25:50,667][0m Using an existing study with name 'LightGBM-F4LC' instead of creating a new one.[0m


LGBMRegressor(boosting_type='gbdt', cat_smooth=56, class_weight=None,
              colsample_bytree=0.6, importance_type='split', learning_rate=0.03,
              max_depth=121, metric='rmse', min_child_samples=1,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=5000,
              n_jobs=3, num_leaves=515, objective=None, random_state=42,
              reg_alpha=0.13020331134351731, reg_lambda=1.2937692395596345,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

In [39]:
# The model predicts log(price / area); undo the log and multiply by the area
lgb_f4lc_log_ppsqm = lgb_f4lc_mod.predict(data_test[LIGHTGBM_F4LC_FEATURES])
lgb_f4lc_preds = np.exp(lgb_f4lc_log_ppsqm) * data_test.area_total

In [40]:
# This CatBoost model uses the same feature list as the LightGBM F4LC model
CATBOOST_T2LC_FEATURES = LIGHTGBM_F4LC_FEATURES

# Open the stored Optuna study that holds the tuned CatBoost hyperparameters
cat_t2lc_study = optuna.create_study(
    study_name="CatBoost-T2LC",
    storage='sqlite:///resources/vegard_optuna.sqlite',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    direction="minimize",
    load_if_exists=True,
)

# Build the model from the study's best parameters and fit it on
# log(price / area); `y` and `area` come from the previous training cell
cat_t2lc_mod = catboost.CatBoostRegressor(
    **cat_t2lc_study.best_params,
    objective="RMSE",
    random_seed=SEED,
    silent=True,
    thread_count=1,
)
cat_t2lc_mod.fit(data_train[CATBOOST_T2LC_FEATURES], np.log(y / area))

[32m[I 2021-11-19 20:26:43,402][0m Using an existing study with name 'CatBoost-T2LC' instead of creating a new one.[0m


<catboost.core.CatBoostRegressor at 0x7f8fbad5f8d0>

In [41]:
# The model predicts log(price / area); undo the log and multiply by the area
cat_t2lc_log_ppsqm = cat_t2lc_mod.predict(data_test[CATBOOST_T2LC_FEATURES])
cat_t2lc_preds = np.exp(cat_t2lc_log_ppsqm) * data_test.area_total

In [42]:
# Numeric inputs of the H2 models
NUMERIC_FEATURES_H2 = ["latitude", "longitude", "constructed", "log_area_total", "log_area_per_room", "rooms", "metro_distance", "park_distance",
            "square_distance", "metro_close", "park_close", "square_close", "stories", "floor", "ceiling_fix", "bathrooms_total", "ballog", "distr_avg", 
            'khamo_dist', 'ostoz_dist', 'rublevka_dist', 'yaki_dist', 'arbat_dist', 'pres_dist', 'tver_dist']

# Inputs that are converted to the pandas `category` dtype for LightGBM
CATEGORICAL_FEATURES_H2 = ['condition', 'district', 'garbage_chute', 'heating', 'layout', 'material',
                         'new', 'parking', 'seller', 'windows_court', 'windows_street']


# Both design matrices are explicit copies, so the dtype conversion below
# writes to an independent frame rather than to a slice of data_train/data_test
X = data_train[NUMERIC_FEATURES_H2 + CATEGORICAL_FEATURES_H2].copy()
X[CATEGORICAL_FEATURES_H2] = X[CATEGORICAL_FEATURES_H2].astype('category')

X_test = data_test[NUMERIC_FEATURES_H2 + CATEGORICAL_FEATURES_H2].copy()
X_test[CATEGORICAL_FEATURES_H2] = X_test[CATEGORICAL_FEATURES_H2].astype('category')

# The regression target is log(price / area)
y = data_train.price
area = data_train.area_total

# Open the stored Optuna study that holds the tuned hyperparameters
lgb_h2_study = optuna.create_study(
    storage="sqlite:///resources/jim_optuna.sqlite",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    study_name="LightGBM-h2",
    direction="minimize",
    load_if_exists=True
)

# Build the model from the study's best parameters and fit it
lgb_h2_mod = lgb.LGBMRegressor(random_state=SEED, metric="rmse", n_jobs=3, n_estimators=6000, **lgb_h2_study.best_params)
lgb_h2_mod.fit(X, np.log(y/area), categorical_feature=CATEGORICAL_FEATURES_H2)

[32m[I 2021-11-19 20:27:14,399][0m Using an existing study with name 'LightGBM-h2' instead of creating a new one.[0m


LGBMRegressor(boosting_type='gbdt', cat_smooth=8, class_weight=None,
              colsample_bytree=0.4, importance_type='split',
              learning_rate=0.006, max_depth=32, metric='rmse',
              min_child_samples=1, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=6000, n_jobs=3, num_leaves=327, objective=None,
              random_state=42, reg_alpha=0.04332781805200478,
              reg_lambda=0.02011247431970239, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [43]:
# The model predicts log(price / area); undo the log and multiply by the area
lgb_h2_log_ppsqm = lgb_h2_mod.predict(X_test)
lgb_h2_preds = np.exp(lgb_h2_log_ppsqm) * data_test.area_total

In [44]:
# Open the stored Optuna study that holds the tuned CatBoost hyperparameters
cat_h2_study = optuna.create_study(
    storage="sqlite:///resources/jim_optuna.sqlite",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    study_name="cat-lh",
    direction="minimize",
    load_if_exists=True
)

# Rebuild the design matrices for CatBoost. Both are explicit copies, so the
# dtype conversion below writes to an independent frame rather than to a
# slice of data_train/data_test. `district` is cast to int because it is the
# one column passed to CatBoost as a categorical feature.
X = data_train[NUMERIC_FEATURES_H2 + CATEGORICAL_FEATURES_H2].copy()
X['district'] = X['district'].astype(int)
X_test = data_test[NUMERIC_FEATURES_H2 + CATEGORICAL_FEATURES_H2].copy()
X_test['district'] = X_test['district'].astype(int)

# Build the model from the study's best parameters and fit it on
# log(price / area); `y` and `area` come from the earlier training cell
cat_h2_mod = catboost.CatBoostRegressor(objective="RMSE", random_seed=SEED, silent=True, thread_count=4, **cat_h2_study.best_params)
cat_h2_mod.fit(X, np.log(y/area), cat_features = ['district'])

[32m[I 2021-11-19 20:28:56,708][0m Using an existing study with name 'cat-lh' instead of creating a new one.[0m


<catboost.core.CatBoostRegressor at 0x7f8fbad689e8>

In [45]:
# The model predicts log(price / area); undo the log and multiply by the area
cat_h2_log_ppsqm = cat_h2_mod.predict(X_test)
cat_h2_preds = np.exp(cat_h2_log_ppsqm) * data_test.area_total

## Best Predictions

In [46]:
# Blend all four models. Each model is weighted by the negated best value of
# its Optuna study; np.average divides by the sum of the weights.
# NOTE(review): with these weights a model's share grows with the magnitude of
# its study's best value - confirm this is intended for studies created with
# direction="minimize".
best_blend = [
    (lgb_f4lc_preds, lgb_f4lc_study),
    (cat_t2lc_preds, cat_t2lc_study),
    (lgb_h2_preds, lgb_h2_study),
    (cat_h2_preds, cat_h2_study),
]
final_preds = np.average(
    [preds for preds, _ in best_blend],
    axis=0,
    weights=[-study.best_value for _, study in best_blend],
)

# Write the submission file: one row per test apartment
submission = pd.DataFrame(
    {"id": data_test.index, "price_prediction": final_preds}
)
submission.to_csv('best_preds.csv', index=False)

## Second Submission (next best submission)

In [47]:
# Blend only the two F4LC/T2LC models, weighted by the negated best value of
# their Optuna studies; np.average divides by the sum of the weights
second_blend = [
    (lgb_f4lc_preds, lgb_f4lc_study),
    (cat_t2lc_preds, cat_t2lc_study),
]
second_preds = np.average(
    [preds for preds, _ in second_blend],
    axis=0,
    weights=[-study.best_value for _, study in second_blend],
)

# Write the submission file: one row per test apartment
second_submission = pd.DataFrame(
    {"id": data_test.index, "price_prediction": second_preds}
)
second_submission.to_csv('next_best_preds.csv', index=False)