# 1. Install and import libraries and modules

In [1]:
%load_ext autoreload

In [2]:
import sys
!{sys.executable} -m pip install -r requirements.txt



In [3]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import geopy
import os
import shutil
import geopandas as gpd
import catboost as cb
import optuna
import lightgbm as lgb
import geopandas as gpd

from pyproj import Geod
from shapely import wkt
from shapely.geometry import Point, LineString
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.cluster import DBSCAN
from typing import Callable, Dict, List, Tuple
from catboost.utils import get_gpu_device_count
from tqdm import tqdm

tqdm.pandas()

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None  # default='warn'

SEED = 23

spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv').set_index(['grunnkrets_id', 'year']).add_prefix('income_').reset_index()
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('data/sample_submission.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')

train = pd.read_csv('data/stores_train.csv')
train_extra = pd.read_csv('data/stores_extra.csv')
test = pd.read_csv('data/stores_test.csv') 

submission = pd.read_csv('data/sample_submission.csv')  # Please do not delete this file

Create geopandas version of some of the datasets:

In [4]:
busstops_geo = gpd.GeoDataFrame(busstops, geometry=busstops.geometry.apply(wkt.loads))

train_geo = gpd.GeoDataFrame(train[['store_id', 'lon','lat']], geometry=gpd.points_from_xy(train.lon, train.lat)).drop(columns=['lon', 'lat'])
train_geo = train_geo.set_crs('epsg:4326', allow_override=True).to_crs('epsg:3857')

train_geo_extra = gpd.GeoDataFrame(train_extra[['store_id', 'lon','lat']], geometry=gpd.points_from_xy(train_extra.lon, train_extra.lat)).drop(columns=['lon', 'lat'])
train_geo_extra = train_geo_extra.set_crs('epsg:4326', allow_override=True).to_crs('epsg:3857')

test_geo = gpd.GeoDataFrame(test[['store_id', 'lon','lat']], geometry=gpd.points_from_xy(test.lon, test.lat)).drop(columns=['lon', 'lat'])
test_geo = test_geo.set_crs('epsg:4326', allow_override=True).to_crs('epsg:3857')

## 1.2 Helper functions

In [5]:
def rmsle(y_true, y_pred):
    y_pred[y_pred < 0] = 0 + 1e-6
    y_true[y_true < 0] = 0 + 1e-6
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


rmsle_scorer = make_scorer(lambda y, y_true: rmsle(y, y_true), greater_is_better=False)

def to_categorical(df: pd.DataFrame):
    for cat_col in df.select_dtypes(include=[object]).columns:
        df[cat_col] = df[cat_col].astype('category')
    return df


def object_encoder(df: pd.DataFrame):
    enc = OrdinalEncoder()
    obj_cols = df.select_dtypes(include=[object]).columns
    df[obj_cols] = enc.fit_transform(df[obj_cols])
    return df


def nan_to_string(df: pd.DataFrame):
    nan = '#N/A'
    cols = df[df.columns[df.isna().any()]].columns
    df[cols] = df[cols].fillna(nan)
    return df


def meter_distance(lat1, lon1, lat2, lon2):
    line_string = LineString([Point(lon1, lat1), Point(lon2, lat2)])
    geod = Geod(ellps="WGS84")
    return geod.geometry_length(line_string)


def add_city_centre_dist(X: pd.DataFrame):
    old_shape = X.shape

    city_centres = X.groupby(['municipality_name'])[['lat', 'lon']].apply(lambda x: x.sum() / (x.count()))[['lat', 'lon']]
    X = X.merge(city_centres, on=['municipality_name'], how='left', suffixes=(None, '_center'))
    assert X.shape[0] == old_shape[0]

    X.fillna(value={'lat_center': X.lat, 'lon_center': X.lon}, inplace=True)

    X['dist_to_center'] = X.apply(lambda row: meter_distance(row.lat, row.lon, row.lat_center, row.lon_center), axis=1)
    assert X.shape[0] == old_shape[0]

    return X


def group_ages(age: pd.DataFrame, age_ranges: List[Tuple[int, int]]):
    age_new = age[['grunnkrets_id', 'year']].drop_duplicates(subset=['grunnkrets_id'], keep='last')

    for rng in age_ranges:
        cols = [f'age_{age}' for age in range(rng[0], rng[1] + 1)]
        rng_sum = age[cols].sum(axis=1).astype(int)
        age_new[f'age_{rng[0]}_{rng[-1]}'] = rng_sum

    age = age.drop_duplicates(subset='grunnkrets_id').drop(columns=['year', *(f'age_{age}' for age in range(0, 91))], axis=1)
    age = age.merge(age_new.drop(columns=['year']), on='grunnkrets_id')

    return age


def only_latest_data(df: pd.DataFrame):
    df = df.sort_values(by='year', ascending=False)
    df = df.drop_duplicates(subset='grunnkrets_id', keep='first')
    return df


def clean_out_nan_heavy_rows(df: pd.DataFrame, age, age_ranges, spatial_2016, income_2016, households_2016):
    """Cleans out rows that have no match in the age, spatial, income or household datasets."""

    # df2 = df.merge(group_ages(age, age_ranges), on='grunnkrets_id', how='left')
    df2 = df.merge(spatial_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
    df2 = df2.merge(income_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
    df2 = df2.merge(households_2016.drop(columns=['year']), on='grunnkrets_id', how='left')

    df_cleaned = df2[
        ~(df2.couple_children_0_to_5_years.isna() | df2.grunnkrets_name.isna() | df2.income_all_households.isna()) # df2.age_0_19.isna() | 
    ]

    print(f'Cleaned out {len(df) - len(df_cleaned)} out of {len(df)} rows.')

    return df_cleaned


def create_busstops_files():
    """
    Creates a .parquet file that stores info aboute the number of busstops within a 1 kilometer
    radius of a given store, as well as the number of stops within the different importance categories.  
    """

    def bus_fields(row: pd.Series):
        output_dict = {}
        gpd.options.use_pygeos = True
        
        stops_with_dist = busstops[busstops_geo.distance(row.geometry) < 1000]
        output_dict['bus_stops_count'] = len(stops_with_dist)
        
        output_dict.update(stops_with_dist.importance_level.value_counts().reindex(
            busstops_geo.importance_level.unique(), fill_value=0
        ).to_dict())
        
        return output_dict

    train_with_extras_bus = pd.concat([train_geo, train_geo_extra], ignore_index=True)

    train_with_extras_bus = train_with_extras_bus.join(train_with_extras_bus.progress_apply(lambda row: bus_fields(row), axis=1, result_type='expand'))
    train_with_extras_bus.to_parquet('derived_data/stores_bus_stops_lt_1km_train.parquet')

    test_bus = test_bus.join(test_bus.progress_apply(lambda row: bus_fields(row), axis=1, result_type='expand'))
    test_bus.to_parquet('derived_data/stores_bus_stops_lt_1km_test.parquet')


def create_stores_in_vicinity_files():
    """
    Creates a .parquet file that stores info aboute the number of other stores within a 1 kilometer
    radius of a given store.
    """

    train_with_extras = pd.concat([train_geo, train_geo_extra], ignore_index=True)[['store_id', 'geometry']]

    def store_count_in_vicinity(row: pd.Series):
        stores_in_vicinity = train_with_extras[train_with_extras.distance(row.geometry) < 1000]
        return {'stores_count_lt_1km': len(stores_in_vicinity)}

    train_with_extras = train_with_extras.join(train_with_extras.progress_apply(lambda row: store_count_in_vicinity(row), axis=1, result_type='expand'))
    train_with_extras.to_parquet('derived_data/stores_count_lt_1km_train.parquet')

    test = test.join(test.progress_apply(lambda row: store_count_in_vicinity(row), axis=1, result_type='expand'))
    test.to_parquet('derived_data/stores_count_lt_1km_test.parquet')


def add_spatial_clusters(df: pd.DataFrame):
    clusters = DBSCAN(eps=0.145, min_samples=100)
    # clusters = DBSCAN(eps=0.12, min_samples=30)
    cl = clusters.fit_predict(df[['lat', 'lon']].to_numpy())
    cl_counts = dict(zip(*np.unique(cl, return_counts=True)))

    print(len(set(cl)), 'clusters created')
    print('Cluster counts:', cl_counts)

    df['cluster_id'] = cl
    df['cluster_member_count'] = df.apply(lambda row: cl_counts[row.cluster_id], axis=1)

    X_no_outliers = df[df.cluster_id != -1]
    cluster_centroids = X_no_outliers.groupby('cluster_id')[['lat', 'lon']].mean()

    def closest_centroid(lat, lon):
        dist_series = cluster_centroids.apply(lambda row: meter_distance(lat, lon, row.lat, row.lon), axis=1)
        return dist_series.min()

    print('Calculating distance to closest cluster for each data point...')
    df['closest_cluster_centroid_dist'] = df.progress_apply(lambda row: closest_centroid(row.lat, row.lon), axis=1)
    
    return df

# 2. Data cleaning

The train and test data only contains data from 2016, so for the other datasets with an age column
we only use the values from 2016, where possible. 

In [7]:
age_ranges = [
    (0, 19),
    (20, 39),
    (40, 59),
    (60, 79),
    (80, 90),
]

spatial_latest = only_latest_data(spatial)
income_latest = only_latest_data(income)
households_latest = only_latest_data(households)

train_spatial = train.merge(spatial_latest.drop(columns=['year']), on='grunnkrets_id', how='left')
muni_avg_revenue = train_spatial.groupby(by='municipality_name', as_index=False)['revenue'].mean()

Futhermore, we noticed that a number of rows in the train and test datasets didn't have  

In [8]:
def clean(df: pd.DataFrame, min_val=0, max_val=100):
    print('Length of data frame:', len(df))
    df = df[(df.revenue > min_val) & (df.revenue < max_val)]
    print('Length after removing extreme values and zero revenue retail stores:',  len(df))
    return df.drop(columns=['revenue']), df.revenue


def clean_out_nan_heavy_rows(df: pd.DataFrame):
    """Cleans out rows that have no match in the age, spatial, income or household datasets."""

    # df2 = df.merge(group_ages(age, age_ranges), on='grunnkrets_id', how='left')
    df2 = df.merge(spatial_latest.drop(columns=['year']), on='grunnkrets_id', how='left')
    df2 = df2.merge(income_latest.drop(columns=['year']), on='grunnkrets_id', how='left')
    df2 = df2.merge(households_latest.drop(columns=['year']), on='grunnkrets_id', how='left')

    df_cleaned = df[
        ~(df2.couple_children_0_to_5_years.isna() | df2.grunnkrets_name.isna() | df2.income_all_households.isna())  # | df2.age_0_19.isna() 
    ]

    print(f'Cleaned out {len(df) - len(df_cleaned)} out of {len(df)} rows.')

    return df_cleaned


train_test_clustered = add_spatial_clusters(pd.concat([train.drop(columns=['revenue']), test], axis=0).reset_index())
train_test_clustered = train_test_clustered[['store_id', 'cluster_id', 'cluster_member_count', 'closest_cluster_centroid_dist']]

train = train.merge(train_test_clustered, on='store_id', how='left')
test = test.merge(train_test_clustered, on='store_id', how='left')

train = clean_out_nan_heavy_rows(train)
label_name = 'revenue'
X = train.drop(columns=[label_name])
y = train[label_name]

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8, random_state=SEED)
X_train, y_train = clean(pd.merge(X_train, y_train, left_index=True, right_index=True))

y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

26 clusters created
Cluster counts: {-1: 5882, 0: 7293, 1: 155, 2: 219, 3: 823, 4: 1156, 5: 104, 6: 1413, 7: 181, 8: 143, 9: 139, 10: 414, 11: 310, 12: 385, 13: 520, 14: 515, 15: 211, 16: 180, 17: 414, 18: 315, 19: 113, 20: 113, 21: 105, 22: 126, 23: 103, 24: 104}
Calculating distance to closest cluster for each data point...


100%|██████████| 21436/21436 [01:01<00:00, 350.87it/s]

Cleaned out 127 out of 12859 rows.
Length of data frame: 10185
Length after removing extreme values and zero revenue retail stores: 9947





# 3. Feature generation

In [9]:
!mkdir -p derived_data

if len(os.listdir('derived_data')) == 0:
    create_busstops_files()
    create_stores_in_vicinity_files()       

In [12]:
def generate_features(df: pd.DataFrame, data_origin: str, predictor: str = ''):
    # Define datasets to be merged
    age_groups_merge = group_ages(age, age_ranges)
    spatial_merge = spatial_latest.drop(columns=['year'])
    income_merge = income_latest.drop(columns=['year'])
    households_merge = households_latest.drop(columns=['year'])
    plaace_merge = plaace.drop_duplicates(subset='plaace_hierarchy_id')
    bus_data_train_merge = gpd.read_parquet(f'derived_data/stores_bus_stops_lt_1km_{data_origin}.parquet').drop(columns=['geometry'])
    stores_vicinity_merge = gpd.read_parquet(f'derived_data/stores_count_lt_1km_{data_origin}.parquet').drop(columns=['geometry'])

    # Merge datasets
    df = df.merge(age_groups_merge, on='grunnkrets_id', how='left')
    df = df.merge(spatial_merge, on='grunnkrets_id', how='left')
    # df = df.merge(muni_avg_revenue, on='municipality_name', how='left', suffixes=(None, '_muni_avg'))
    df = df.merge(income_merge, on='grunnkrets_id', how='left')
    df = df.merge(households_merge, on='grunnkrets_id', how='left')
    df = df.merge(plaace_merge, how='left')
    df = df.merge(bus_data_train_merge, on='store_id', how='left')
    df = df.merge(stores_vicinity_merge, on='store_id', how='left')
    df = add_city_centre_dist(df).drop(columns=['lon_center', 'lat_center'])

    # Transformations and some post-merge cleaning
    df.stores_count_lt_1km = np.log(df.stores_count_lt_1km)
    df.closest_cluster_centroid_dist = np.log(df.closest_cluster_centroid_dist)
    df[age_groups_merge.columns] = df[age_groups_merge.columns].fillna(0)
    

    # Handle categories for different predictors
    if predictor == 'xgb':
        # df = to_categorical(df)
        df = object_encoder(df)
    elif predictor == 'cb':
        df = nan_to_string(df)
    elif predictor == 'lgb':
        df = to_categorical(df)
    else: 
        raise ValueError('Invalid predictor')

    features = [
        'store_name', 
        'mall_name', 
        'chain_name',
        'address', 
        'lat', 'lon',
        
        *age_groups_merge.drop(columns=['grunnkrets_id']).columns,
        *income_merge.drop(columns=['grunnkrets_id']).columns,
        *households_merge.drop(columns=['grunnkrets_id']).columns,
        'lv1_desc', 'lv2_desc', 'sales_channel_name',  # 'lv3_desc', 'lv4_desc',
        *bus_data_train_merge.drop(columns=['store_id']).columns,
        *stores_vicinity_merge.drop(columns=['store_id']).columns,
        'dist_to_center',
        'cluster_id', 'cluster_member_count', 'closest_cluster_centroid_dist'
    ]

    return df[features]

In [13]:
# Features adapted to Catboost
X_train_cb = generate_features(X_train, data_origin='train', predictor='cb')
X_val_cb = generate_features(X_val, data_origin='train', predictor='cb')
X_test_cb = generate_features(test, data_origin='test', predictor='cb')

# Features adapted to LightGBM
X_train_lgb = generate_features(X_train, data_origin='train', predictor='lgb')
X_val_lgb = generate_features(X_val, data_origin='train', predictor='lgb')
X_test_lgb = generate_features(test, data_origin='test', predictor='lgb')

# 4. Hyper parameter tuning

### Preparing pools and parameter grid for Catboost

In [14]:
def get_cb_pools():
    text_features = ['store_name', 'address', 'sales_channel_name'] 
    cat_features = ['mall_name', 'chain_name', 'lv1_desc', 'lv2_desc', 'cluster_id']

    train_pool = cb.Pool(
        X_train_cb,
        y_train,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(X_train_cb)
    )

    valid_pool = cb.Pool(
        X_val_cb,
        y_val,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(X_train_cb)
    )

    return train_pool, valid_pool


def get_cb_params(trial: optuna.Trial = None):
    gpu_count = get_gpu_device_count()
    non_tunable_cb_params = {
        'objective': 'RMSE',
        'eval_metric': 'RMSE',
        'task_type': 'GPU' if gpu_count else 'CPU', 
        'devices': f'0:{gpu_count}',
        'random_seed': SEED
    }

    if trial is None:
        return 'cb', non_tunable_cb_params
    
    tunable_params = {
        'depth': trial.suggest_int('depth', 4, 9),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2, 6),
        # 'iterations': trial.suggest_int('iterations', 1000, 2000),
        # 'learning_rate': trial.suggest_categorical('learning_rate', 0.1, 0.5)
    }

    return 'cb', non_tunable_cb_params, tunable_params

### Preparing DMatrices and parameter grid for LightGBM

In [15]:
def get_lgb_dmatrices():
    dtrain = lgb.Dataset(X_train_lgb, y_train, params={'verbose': -1}, free_raw_data=False)
    dvalid = lgb.Dataset(X_val_lgb, y_val, params={'verbose': -1}, free_raw_data=False)
    return dtrain, dvalid


def get_lgb_params(trial: optuna.Trial = None):
    non_tunable_lgb_params = {
        'objective': 'rmse',
        'verbose': -1,
        'seed': 1
    }

    if trial is None:
        return 'lgb', non_tunable_lgb_params

    tunable_params = {
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'goss', 'dart']),
    }

    if tunable_params['boosting_type'] != 'goss':
        tunable_params["bagging_fraction"]: trial.suggest_float("bagging_fraction", 0.4, 1.0)
        tunable_params["bagging_freq"]: trial.suggest_int("bagging_freq", 1, 7)

    return 'lgb', non_tunable_lgb_params, tunable_params

### Hyper parameter tuning with Optuna

In [16]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def objective(trial: optuna.Trial, param_grid_fn: Callable) -> float:
    model_name, non_tunable_params, tunable_params = param_grid_fn(trial)

    if model_name == 'cb':
        if tunable_params['bootstrap_type'] == 'Bayesian': 
            tunable_params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
        elif tunable_params['bootstrap_type'] == 'Bernoulli':
            tunable_params['subsample'] = trial.suggest_float('subsample', 0.1, 1, log=True)

        cbr = cb.CatBoostRegressor(**non_tunable_params, **tunable_params) 
        train_pool, valid_pool = get_cb_pools()
        cbr.fit(
            train_pool,
            eval_set=[(X_val_cb, y_val)],
            verbose=0,
            early_stopping_rounds=100,
        )
        y_pred = cbr.predict(X_val_cb)
    
    elif model_name == 'lgb':
        dtrain_lgb, dvalid_lgb = get_lgb_dmatrices()
        lgbr = lgb.train(
            params={**non_tunable_params, **tunable_params},
            train_set=dtrain_lgb,
            valid_sets=dvalid_lgb,
            verbose_eval=False,
        )
        y_pred = lgbr.predict(X_val_lgb)

    score = rmsle(np.expm1(y_val), np.expm1(y_pred))

    return score


def get_hyper_parameters(param_grid_fn: Callable, n_trials=100):
    study = optuna.create_study(
        study_name='hyperparam-tuning',
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), 
        direction='minimize'
    )
    objective_fn = lambda trial: objective(trial, param_grid_fn)
    study.optimize(objective_fn, n_trials=n_trials, timeout=900) 

    print('Number of finished trials: {}'.format(len(study.trials)))
    
    trial = study.best_trial
    print(f'Best trial ({trial.number}):')
    print('Value:', trial.value)
    print('Params:')
    print(trial.params)

    return param_grid_fn()[1], trial.params

# 5. Training step

### Making predictions with Catboost

In [17]:
non_tunable_cb_params, tuned_params = get_hyper_parameters(get_cb_params, n_trials=30)
train_pool, valid_pool = get_cb_pools()
cbm = cb.CatBoostRegressor(**non_tunable_cb_params, **tuned_params, iterations=1000) 
cbm.fit(train_pool, eval_set=valid_pool, verbose=50, plot=True, early_stopping_rounds=50)

[32m[I 2022-11-12 20:40:31,511][0m A new study created in memory with name: hyperparam-tuning[0m
[32m[I 2022-11-12 20:40:39,204][0m Trial 0 finished with value: 0.7353974356671156 and parameters: {'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'l2_leaf_reg': 2.012629426592682, 'subsample': 0.13707335228166675}. Best is trial 0 with value: 0.7353974356671156.[0m
[32m[I 2022-11-12 20:40:44,181][0m Trial 1 finished with value: 0.7320560939233565 and parameters: {'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'l2_leaf_reg': 3.188534746081909, 'subsample': 0.5362369742134349}. Best is trial 1 with value: 0.7320560939233565.[0m
[32m[I 2022-11-12 20:40:57,667][0m Trial 2 finished with value: 0.740570780182583 and parameters: {'depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'l2_leaf_reg': 5.559081900853538}. Best is trial 1 with value: 0.7320560939233565.[0m
[32m[I 2022-11-12 20:41:04,019][0m Trial 3 finished with value

Number of finished trials: 30
Best trial (26):
Value: 0.725313931538961
Params:
{'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'l2_leaf_reg': 4.530293719264458, 'subsample': 0.7109684663687614}


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.9766495	test: 0.9983013	best: 0.9983013 (0)	total: 27.4ms	remaining: 27.4s
50:	learn: 0.7455458	test: 0.7785081	best: 0.7785081 (50)	total: 1.51s	remaining: 28.2s
100:	learn: 0.7128559	test: 0.7523571	best: 0.7523571 (100)	total: 2.89s	remaining: 25.7s
150:	learn: 0.7010646	test: 0.7446445	best: 0.7446445 (150)	total: 4.16s	remaining: 23.4s
200:	learn: 0.6927123	test: 0.7399220	best: 0.7399220 (200)	total: 5.45s	remaining: 21.7s
250:	learn: 0.6865278	test: 0.7364377	best: 0.7364377 (250)	total: 6.63s	remaining: 19.8s
300:	learn: 0.6824240	test: 0.7346559	best: 0.7346559 (300)	total: 7.71s	remaining: 17.9s
350:	learn: 0.6782867	test: 0.7329806	best: 0.7329806 (350)	total: 8.75s	remaining: 16.2s
400:	learn: 0.6743727	test: 0.7312285	best: 0.7312285 (400)	total: 9.73s	remaining: 14.5s
450:	learn: 0.6712531	test: 0.7298971	best: 0.7298890 (449)	total: 10.7s	remaining: 13.1s
500:	learn: 0.6680591	test: 0.7291033	best: 0.7290995 (499)	total: 11.7s	remaining: 11.7s
550:	learn: 0.6

<catboost.core.CatBoostRegressor at 0x7f64ad299d90>

### Making predictions with LightGBM

In [18]:
non_tunable_lgb_params, tunable_lgb_params = get_hyper_parameters(get_lgb_params, n_trials=400)
dtrain_lgb, dvalid_lgb = get_lgb_dmatrices()
lgbm = lgb.train(
    params={**non_tunable_lgb_params, **tunable_lgb_params},
    train_set=dtrain_lgb,
    valid_sets=dvalid_lgb,
    verbose_eval=False
)

[32m[I 2022-11-12 20:46:40,485][0m A new study created in memory with name: hyperparam-tuning[0m
[32m[I 2022-11-12 20:46:41,790][0m Trial 0 finished with value: 0.7495174295153084 and parameters: {'lambda_l1': 7.091025725138702e-06, 'lambda_l2': 0.004756444007920753, 'num_leaves': 241, 'feature_fraction': 0.8525494191110508, 'min_child_samples': 97, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.7495174295153084.[0m
[32m[I 2022-11-12 20:46:41,920][0m Trial 1 finished with value: 0.7584480773623337 and parameters: {'lambda_l1': 4.924852251829013e-05, 'lambda_l2': 0.12362936507928729, 'num_leaves': 64, 'feature_fraction': 0.4526298862437346, 'min_child_samples': 89, 'boosting_type': 'goss'}. Best is trial 0 with value: 0.7495174295153084.[0m
[32m[I 2022-11-12 20:46:42,061][0m Trial 2 finished with value: 0.7588301707111884 and parameters: {'lambda_l1': 1.0110041546359765e-05, 'lambda_l2': 0.028502440614931072, 'num_leaves': 30, 'feature_fraction': 0.6831143920405299, 

Number of finished trials: 400
Best trial (174):
Value: 0.7332635871610907
Params:
{'lambda_l1': 0.3371997446134467, 'lambda_l2': 3.207693527732911e-06, 'num_leaves': 15, 'feature_fraction': 0.4217394939052542, 'min_child_samples': 75, 'boosting_type': 'gbdt'}


In [19]:
# Catboost validation prediction
y_pred_val_cb = np.expm1(cbm.predict(X_val_cb))
print('Catboost validation score:', rmsle(np.expm1(y_val), y_pred_val_cb))

# LightGBM validation prediction
y_val_pred_lgb = np.expm1(lgbm.predict(X_val_lgb))
print('LightGBM validation score:', rmsle(np.expm1(y_val), y_val_pred_lgb))

Catboost validation score: 0.725313931538961
LightGBM validation score: 0.7332635871610907


In [20]:
y_pred_test_cb = np.expm1(cbm.predict(X_test_cb))
y_pred_test_lgb = np.expm1(lgbm.predict(X_test_lgb))

test_stack = np.array([y_pred_test_cb, y_pred_test_lgb])
stack_test_avg = np.mean(test_stack, axis=0)

submission = pd.read_csv('data/sample_submission.csv')
submission['predicted'] = stack_test_avg
submission.to_csv('submissions/submission_lordag_kveld.csv', index=False)