# Train LGBMRegressor(tuned) + catboost(default) + combined dataset + 10 split fold + labeled datasources + distance to cities

# Imports

In [1]:
!pip install polars

Collecting polars
  Downloading polars-0.15.14-cp37-abi3-macosx_10_7_x86_64.whl (14.0 MB)
[K     |████████████████████████████████| 14.0 MB 712 kB/s eta 0:00:01
Installing collected packages: polars
Successfully installed polars-0.15.14


In [2]:
!pip install snoop

Collecting snoop
  Downloading snoop-0.4.3-py2.py3-none-any.whl (27 kB)
Collecting cheap-repr>=0.4.0
  Downloading cheap_repr-0.5.1-py2.py3-none-any.whl (12 kB)
Installing collected packages: cheap-repr, snoop
Successfully installed cheap-repr-0.5.1 snoop-0.4.3


In [3]:
import polars as pl
from snoop import pp
from polars.testing import assert_frame_equal, assert_series_equal

In [4]:
import pandas as pd
from pathlib import Path
import numpy as np

In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [6]:
train_pl = pl.read_csv('train.csv')
test_pl = pl.read_csv('test.csv')
sample_sub_pl = pl.read_csv('sample_submission.csv')

# join Kaggle dataset with SKlearn dataset

In [7]:
from sklearn.datasets import fetch_california_housing

original_data = fetch_california_housing()
original_data.feature_names
original_data.target_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

['MedHouseVal']

In [9]:
additional_data = ( 
    pl.concat([
        pl.from_numpy(original_data.data, original_data.feature_names), 
        pl.from_numpy(original_data.target, original_data.target_names),
    ],how='horizontal')
    .select([
        pl.all().exclude('MedHouseVal'),
        pl.lit(False).alias('is_generated'), # add labels to distinguish two datasets
        'MedHouseVal',
    ])
)
additional_data.head()
additional_data.columns

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,is_generated,MedHouseVal
f64,f64,f64,f64,f64,f64,f64,f64,bool,f64
8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,False,4.526
8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,False,3.585
7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,False,3.521
5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,False,3.413
3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,False,3.422


['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'MedHouseVal']

In [10]:
train_pl = (
    train_pl
    .select([
        pl.all().exclude('MedHouseVal'),
        pl.lit(True).alias('is_generated'),
        'MedHouseVal',
    ])
)
train_pl.head()


id,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,is_generated,MedHouseVal
i64,f64,f64,f64,f64,f64,f64,f64,f64,bool,f64
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,True,0.98
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,True,0.946
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,True,1.576
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,True,1.336
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,True,4.5


In [11]:
features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'is_generated'] # added ext_src
target = 'MedHouseVal'

In [12]:
features + [target]
train_joined_pl = train_pl[features + [target]].vstack(additional_data) # not including column `id`
train_joined_pl.shape
train_joined_pl.columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'MedHouseVal']

(57777, 10)

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'MedHouseVal']

# Feature: distance to cities
Thanks to @phongnguyen1, reference: https://www.kaggle.com/code/phongnguyen1/distance-to-cities-features-clustering?scriptVersionId=115694922&cellId=40

**About haversine_distances** [link](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html)

> The Haversine (or great circle) distance is the angular distance between two points on the surface of a sphere. The first coordinate of each point is assumed to be the latitude, the second is the longitude, given in radians. The dimension of the data must be 2.

> As the Earth is nearly spherical, the haversine formula provides a good approximation of the distance between two points of the Earth surface, with a less than 1% error on average.

In [13]:
def get_distance(lat1, long1, lat2, long2):
    from sklearn.metrics.pairwise import haversine_distances
    from math import radians

    loc1 = [radians(lat1), radians(long1)]
    loc2 = [radians(lat2), radians(long2)]
    result = haversine_distances([loc1, loc2])
    result * 6371000/1000  # multiply by Earth radius to get kilometers
    return result[0][1]

In [14]:
Sacramento = (38.576931, -121.494949)
SanFrancisco = (37.780080, -122.420160)
SanJose = (37.334789, -121.888138)
LosAngeles = (34.052235, -118.243683)
SanDiego = (32.715759, -117.163818)

In [15]:
train_joined_pl_add_dist = (
    train_joined_pl
    .with_columns([
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], Sacramento[0], Sacramento[1])).alias('dist2Sacramento'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanFrancisco[0], SanFrancisco[1])).alias('dist2SanFrancisco'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanJose[0], SanJose[1])).alias('dist2SanJose'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], LosAngeles[0], LosAngeles[1])).alias('dist2LosAngeles'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanDiego[0], SanDiego[1])).alias('dist2SanDiego'),        
    ])
    .with_columns([
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: min([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2nearestCity'),
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: max([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2furthestCity'),
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: sum([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2allCity')        
    ])
)
train_joined_pl_add_dist

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,is_generated,MedHouseVal,dist2Sacramento,dist2SanFrancisco,dist2SanJose,dist2LosAngeles,dist2SanDiego,dist2nearestCity,dist2furthestCity,dist2allCity
f64,f64,f64,f64,f64,f64,f64,f64,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64
2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,true,0.98,0.072033,0.064262,0.053868,0.028684,0.054086,0.028684,0.072033,0.272933
3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,true,0.946,0.004236,0.022867,0.025374,0.091098,0.119034,0.004236,0.119034,0.262609
4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,true,1.576,0.069057,0.06033,0.050106,0.033789,0.05905,0.033789,0.069057,0.272332
2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,true,1.336,0.12067,0.117263,0.106563,0.029551,0.001457,0.001457,0.12067,0.375504
3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,true,4.5,0.018478,0.000375,0.010865,0.087989,0.115991,0.000375,0.115991,0.233698
6.8075,26.0,6.764372,1.091787,2147.0,2.70354,33.84,-118.31,true,3.714,0.094041,0.090052,0.079356,0.003827,0.025782,0.003827,0.094041,0.293057
2.3654,21.0,4.734884,1.011396,1112.0,2.937247,35.65,-117.64,true,0.811,0.074069,0.076502,0.066456,0.029196,0.051671,0.029196,0.076502,0.297894
2.3562,31.0,5.186567,1.154229,1346.0,3.348259,32.68,-117.07,true,0.824,0.120514,0.117176,0.106475,0.029432,0.001513,0.001513,0.120514,0.375111
2.2672,23.0,4.640155,1.064302,628.0,1.774461,38.53,-121.43,true,0.995,0.001207,0.018867,0.021793,0.090078,0.118121,0.001207,0.118121,0.250065
5.637,20.0,4.391863,1.053312,1634.0,2.681388,33.88,-118.36,true,2.291,0.093087,0.089053,0.078358,0.003445,0.026784,0.003445,0.093087,0.290728


In [16]:
features = (train_joined_pl_add_dist.columns)

features.remove('MedHouseVal')

features, target
len(features)

(['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude',
  'is_generated',
  'dist2Sacramento',
  'dist2SanFrancisco',
  'dist2SanJose',
  'dist2LosAngeles',
  'dist2SanDiego',
  'dist2nearestCity',
  'dist2furthestCity',
  'dist2allCity'],
 'MedHouseVal')

17

# Training LGBMRegressor model

Let's begin by splitting our data into a train and validation set.

In [17]:
from lightgbm.sklearn import LGBMRegressor
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

The variable that we will be predicting is the `MedHouseVal`. We will use the rest of the columns (minus the id column) for training.

In [18]:
# these parameters come from soupmonsters awesome notebook here: https://www.kaggle.com/code/soupmonster/simple-lightgbm-baseline
params= {
 'learning_rate': 0.02, 
 'n_estimators': 100_000, 
 'metric': 'rmse',
 'lambda_l1': 1.945,
 'num_leaves': 87,
 'feature_fraction': 0.79,
 'bagging_fraction': 0.93,
 'bagging_freq': 4,
 'min_data_in_leaf': 103,
 'max_depth': 17,
}

In [19]:
clfs_f64pl = []
kf = KFold(n_splits=10, random_state=0, shuffle=True) # this line must be included in the same cell as the training block below
rmses = []

for i, (train_index, val_index) in enumerate(kf.split(train_joined_pl_add_dist)): # kf.split can work with pl.DataFrame
    X_train, X_val = train_joined_pl_add_dist[features][train_index].select(pl.all().cast(pl.Float64)), train_joined_pl_add_dist[features][val_index].select(pl.all().cast(pl.Float64))
    y_train, y_val = train_joined_pl_add_dist[target][train_index].cast(pl.Float64), train_joined_pl_add_dist[target][val_index].cast(pl.Float64)

    clf = LGBMRegressor(**params)
    clf.fit(X_train.to_numpy(),
            y_train.to_numpy(),
            eval_set=[(X_val.to_numpy(), y_val.to_numpy())], 
            callbacks=[lgbm.early_stopping(85, verbose=True)]) # why early_stop at 500 

    preds = clf.predict(X_val.to_numpy())
    
    clfs_f64pl.append(clf) # save 5 trained models into this list
    rmses.append(mean_squared_error(y_val.to_numpy(), preds, squared=False))
    
print(f'mean RMSE across all folds: {pl.Series(rmses).mean()}')

[1]	valid_0's rmse: 1.13748
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.1217
[3]	valid_0's rmse: 1.10775
[4]	valid_0's rmse: 1.09252
[5]	valid_0's rmse: 1.07783
[6]	valid_0's rmse: 1.06334
[7]	valid_0's rmse: 1.05029
[8]	valid_0's rmse: 1.03772
[9]	valid_0's rmse: 1.02457
[10]	valid_0's rmse: 1.01162
[11]	valid_0's rmse: 0.998749
[12]	valid_0's rmse: 0.986248
[13]	valid_0's rmse: 0.974235
[14]	valid_0's rmse: 0.962521
[15]	valid_0's rmse: 0.950954
[16]	valid_0's rmse: 0.940384
[17]	valid_0's rmse: 0.929345
[18]	valid_0's rmse: 0.919883
[19]	valid_0's rmse: 0.909517
[20]	valid_0's rmse: 0.900054
[21]	valid_0's rmse: 0.889968
[22]	valid_0's rmse: 0.880108
[23]	valid_0's rmse: 0.870514
[24]	valid_0's rmse: 0.86165
[25]	valid_0's rmse: 0.852518
[26]	valid_0's rmse: 0.843729
[27]	valid_0's rmse: 0.83532
[28]	valid_0's rmse: 0.827017
[29]	valid_0's rmse: 0.818716
[30]	valid_0's rmse: 0.810758
[31]	valid_0's rmse: 0.803132
[32]	valid_0's rmse: 0.795779
[

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.14325
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.12711
[3]	valid_0's rmse: 1.11308
[4]	valid_0's rmse: 1.09748
[5]	valid_0's rmse: 1.08246
[6]	valid_0's rmse: 1.06788
[7]	valid_0's rmse: 1.05443
[8]	valid_0's rmse: 1.04163
[9]	valid_0's rmse: 1.02808
[10]	valid_0's rmse: 1.01487
[11]	valid_0's rmse: 1.00179
[12]	valid_0's rmse: 0.989148
[13]	valid_0's rmse: 0.977046
[14]	valid_0's rmse: 0.965087
[15]	valid_0's rmse: 0.952996
[16]	valid_0's rmse: 0.942185
[17]	valid_0's rmse: 0.930795
[18]	valid_0's rmse: 0.921417
[19]	valid_0's rmse: 0.910563
[20]	valid_0's rmse: 0.901039
[21]	valid_0's rmse: 0.890679
[22]	valid_0's rmse: 0.880645
[23]	valid_0's rmse: 0.870902
[24]	valid_0's rmse: 0.862057
[25]	valid_0's rmse: 0.852935
[26]	valid_0's rmse: 0.844019
[27]	valid_0's rmse: 0.835681
[28]	valid_0's rmse: 0.827264
[29]	valid_0's rmse: 0.818858
[30]	valid_0's rmse: 0.810698
[31]	valid_0's rmse: 0.802847
[32]	valid_0's rmse: 0.795308

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.15133
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.13537
[3]	valid_0's rmse: 1.1212
[4]	valid_0's rmse: 1.10597
[5]	valid_0's rmse: 1.09099
[6]	valid_0's rmse: 1.07642
[7]	valid_0's rmse: 1.06311
[8]	valid_0's rmse: 1.05014
[9]	valid_0's rmse: 1.03674
[10]	valid_0's rmse: 1.02357
[11]	valid_0's rmse: 1.0105
[12]	valid_0's rmse: 0.99791
[13]	valid_0's rmse: 0.985959
[14]	valid_0's rmse: 0.974039
[15]	valid_0's rmse: 0.962355
[16]	valid_0's rmse: 0.951547
[17]	valid_0's rmse: 0.94047
[18]	valid_0's rmse: 0.930987
[19]	valid_0's rmse: 0.920435
[20]	valid_0's rmse: 0.910819
[21]	valid_0's rmse: 0.900741
[22]	valid_0's rmse: 0.890843
[23]	valid_0's rmse: 0.881252
[24]	valid_0's rmse: 0.872205
[25]	valid_0's rmse: 0.863025
[26]	valid_0's rmse: 0.854154
[27]	valid_0's rmse: 0.845676
[28]	valid_0's rmse: 0.837248
[29]	valid_0's rmse: 0.828952
[30]	valid_0's rmse: 0.820903
[31]	valid_0's rmse: 0.813088
[32]	valid_0's rmse: 0.805552
[33

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.13484
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.11922
[3]	valid_0's rmse: 1.10524
[4]	valid_0's rmse: 1.09032
[5]	valid_0's rmse: 1.07585
[6]	valid_0's rmse: 1.06173
[7]	valid_0's rmse: 1.04882
[8]	valid_0's rmse: 1.03618
[9]	valid_0's rmse: 1.02309
[10]	valid_0's rmse: 1.01032
[11]	valid_0's rmse: 0.99767
[12]	valid_0's rmse: 0.985423
[13]	valid_0's rmse: 0.973671
[14]	valid_0's rmse: 0.962144
[15]	valid_0's rmse: 0.95072
[16]	valid_0's rmse: 0.94032
[17]	valid_0's rmse: 0.929436
[18]	valid_0's rmse: 0.920236
[19]	valid_0's rmse: 0.90991
[20]	valid_0's rmse: 0.900634
[21]	valid_0's rmse: 0.890811
[22]	valid_0's rmse: 0.881179
[23]	valid_0's rmse: 0.871883
[24]	valid_0's rmse: 0.863362
[25]	valid_0's rmse: 0.85453
[26]	valid_0's rmse: 0.845954
[27]	valid_0's rmse: 0.837976
[28]	valid_0's rmse: 0.829864
[29]	valid_0's rmse: 0.821769
[30]	valid_0's rmse: 0.813922
[31]	valid_0's rmse: 0.806289
[32]	valid_0's rmse: 0.799037
[33

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.14598
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.13015
[3]	valid_0's rmse: 1.11637
[4]	valid_0's rmse: 1.10105
[5]	valid_0's rmse: 1.08635
[6]	valid_0's rmse: 1.07206
[7]	valid_0's rmse: 1.05913
[8]	valid_0's rmse: 1.0465
[9]	valid_0's rmse: 1.03312
[10]	valid_0's rmse: 1.01992
[11]	valid_0's rmse: 1.00699
[12]	valid_0's rmse: 0.994429
[13]	valid_0's rmse: 0.982241
[14]	valid_0's rmse: 0.970335
[15]	valid_0's rmse: 0.958651
[16]	valid_0's rmse: 0.948019
[17]	valid_0's rmse: 0.937012
[18]	valid_0's rmse: 0.927545
[19]	valid_0's rmse: 0.91707
[20]	valid_0's rmse: 0.907826
[21]	valid_0's rmse: 0.897731
[22]	valid_0's rmse: 0.887869
[23]	valid_0's rmse: 0.878232
[24]	valid_0's rmse: 0.86949
[25]	valid_0's rmse: 0.86039
[26]	valid_0's rmse: 0.85158
[27]	valid_0's rmse: 0.843392
[28]	valid_0's rmse: 0.834996
[29]	valid_0's rmse: 0.826772
[30]	valid_0's rmse: 0.818726
[31]	valid_0's rmse: 0.810981
[32]	valid_0's rmse: 0.803735
[33]

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.16901
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.15307
[3]	valid_0's rmse: 1.13899
[4]	valid_0's rmse: 1.12372
[5]	valid_0's rmse: 1.10882
[6]	valid_0's rmse: 1.09419
[7]	valid_0's rmse: 1.08065
[8]	valid_0's rmse: 1.06791
[9]	valid_0's rmse: 1.05456
[10]	valid_0's rmse: 1.04148
[11]	valid_0's rmse: 1.02851
[12]	valid_0's rmse: 1.01595
[13]	valid_0's rmse: 1.00379
[14]	valid_0's rmse: 0.991939
[15]	valid_0's rmse: 0.98018
[16]	valid_0's rmse: 0.969364
[17]	valid_0's rmse: 0.958214
[18]	valid_0's rmse: 0.948512
[19]	valid_0's rmse: 0.937951
[20]	valid_0's rmse: 0.928336
[21]	valid_0's rmse: 0.918174
[22]	valid_0's rmse: 0.9083
[23]	valid_0's rmse: 0.898549
[24]	valid_0's rmse: 0.889631
[25]	valid_0's rmse: 0.880531
[26]	valid_0's rmse: 0.8716
[27]	valid_0's rmse: 0.863281
[28]	valid_0's rmse: 0.854893
[29]	valid_0's rmse: 0.846585
[30]	valid_0's rmse: 0.838523
[31]	valid_0's rmse: 0.830828
[32]	valid_0's rmse: 0.823306
[33]	v

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.14072
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.12521
[3]	valid_0's rmse: 1.11136
[4]	valid_0's rmse: 1.0965
[5]	valid_0's rmse: 1.08223
[6]	valid_0's rmse: 1.06806
[7]	valid_0's rmse: 1.05518
[8]	valid_0's rmse: 1.04267
[9]	valid_0's rmse: 1.02982
[10]	valid_0's rmse: 1.01702
[11]	valid_0's rmse: 1.00457
[12]	valid_0's rmse: 0.992386
[13]	valid_0's rmse: 0.980748
[14]	valid_0's rmse: 0.969115
[15]	valid_0's rmse: 0.95773
[16]	valid_0's rmse: 0.947291
[17]	valid_0's rmse: 0.936552
[18]	valid_0's rmse: 0.927074
[19]	valid_0's rmse: 0.916978
[20]	valid_0's rmse: 0.907627
[21]	valid_0's rmse: 0.897814
[22]	valid_0's rmse: 0.888287
[23]	valid_0's rmse: 0.879014
[24]	valid_0's rmse: 0.870364
[25]	valid_0's rmse: 0.861557
[26]	valid_0's rmse: 0.85297
[27]	valid_0's rmse: 0.844812
[28]	valid_0's rmse: 0.836754
[29]	valid_0's rmse: 0.828886
[30]	valid_0's rmse: 0.821116
[31]	valid_0's rmse: 0.813552
[32]	valid_0's rmse: 0.806379
[3

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.13326
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.11749
[3]	valid_0's rmse: 1.10369
[4]	valid_0's rmse: 1.08857
[5]	valid_0's rmse: 1.074
[6]	valid_0's rmse: 1.05981
[7]	valid_0's rmse: 1.04674
[8]	valid_0's rmse: 1.03416
[9]	valid_0's rmse: 1.02109
[10]	valid_0's rmse: 1.0082
[11]	valid_0's rmse: 0.995579
[12]	valid_0's rmse: 0.983331
[13]	valid_0's rmse: 0.971488
[14]	valid_0's rmse: 0.95975
[15]	valid_0's rmse: 0.94819
[16]	valid_0's rmse: 0.937659
[17]	valid_0's rmse: 0.926762
[18]	valid_0's rmse: 0.917506
[19]	valid_0's rmse: 0.907171
[20]	valid_0's rmse: 0.897899
[21]	valid_0's rmse: 0.887766
[22]	valid_0's rmse: 0.878023
[23]	valid_0's rmse: 0.868683
[24]	valid_0's rmse: 0.859877
[25]	valid_0's rmse: 0.850888
[26]	valid_0's rmse: 0.842251
[27]	valid_0's rmse: 0.833986
[28]	valid_0's rmse: 0.825821
[29]	valid_0's rmse: 0.817719
[30]	valid_0's rmse: 0.809877
[31]	valid_0's rmse: 0.802226
[32]	valid_0's rmse: 0.794926
[33

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.1533
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.13739
[3]	valid_0's rmse: 1.12306
[4]	valid_0's rmse: 1.10786
[5]	valid_0's rmse: 1.09305
[6]	valid_0's rmse: 1.0786
[7]	valid_0's rmse: 1.0653
[8]	valid_0's rmse: 1.05234
[9]	valid_0's rmse: 1.03906
[10]	valid_0's rmse: 1.026
[11]	valid_0's rmse: 1.0131
[12]	valid_0's rmse: 1.0005
[13]	valid_0's rmse: 0.988413
[14]	valid_0's rmse: 0.976527
[15]	valid_0's rmse: 0.964747
[16]	valid_0's rmse: 0.953969
[17]	valid_0's rmse: 0.943016
[18]	valid_0's rmse: 0.93375
[19]	valid_0's rmse: 0.923255
[20]	valid_0's rmse: 0.913639
[21]	valid_0's rmse: 0.903433
[22]	valid_0's rmse: 0.893535
[23]	valid_0's rmse: 0.883963
[24]	valid_0's rmse: 0.8749
[25]	valid_0's rmse: 0.865816
[26]	valid_0's rmse: 0.857092
[27]	valid_0's rmse: 0.848673
[28]	valid_0's rmse: 0.840341
[29]	valid_0's rmse: 0.832063
[30]	valid_0's rmse: 0.824087
[31]	valid_0's rmse: 0.816328
[32]	valid_0's rmse: 0.809
[33]	valid_0'

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.12582
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.11034
[3]	valid_0's rmse: 1.0969
[4]	valid_0's rmse: 1.08206
[5]	valid_0's rmse: 1.06764
[6]	valid_0's rmse: 1.05354
[7]	valid_0's rmse: 1.04073
[8]	valid_0's rmse: 1.02851
[9]	valid_0's rmse: 1.01559
[10]	valid_0's rmse: 1.00279
[11]	valid_0's rmse: 0.990364
[12]	valid_0's rmse: 0.978189
[13]	valid_0's rmse: 0.966502
[14]	valid_0's rmse: 0.954912
[15]	valid_0's rmse: 0.9434
[16]	valid_0's rmse: 0.933092
[17]	valid_0's rmse: 0.922234
[18]	valid_0's rmse: 0.912905
[19]	valid_0's rmse: 0.902812
[20]	valid_0's rmse: 0.893731
[21]	valid_0's rmse: 0.883889
[22]	valid_0's rmse: 0.874218
[23]	valid_0's rmse: 0.864887
[24]	valid_0's rmse: 0.856489
[25]	valid_0's rmse: 0.847588
[26]	valid_0's rmse: 0.83898
[27]	valid_0's rmse: 0.830996
[28]	valid_0's rmse: 0.822789
[29]	valid_0's rmse: 0.814926
[30]	valid_0's rmse: 0.80723
[31]	valid_0's rmse: 0.799835
[32]	valid_0's rmse: 0.792853
[33

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

mean RMSE across all folds: 0.5122948870760979


# Train a catboost model

In [20]:
from catboost import CatBoostRegressor
clfs_f64pl_cat = []
rmses = []
kf = KFold(n_splits=10, random_state=1, shuffle=True)
for train_index, val_index in kf.split(train_joined_pl_add_dist):
    X_train = train_joined_pl_add_dist[features][train_index].select(pl.all().cast(pl.Float64))
    X_val = train_joined_pl_add_dist[features][val_index].select(pl.all().cast(pl.Float64))
    y_train = train_joined_pl_add_dist[target][train_index].cast(pl.Float64)
    y_val = train_joined_pl_add_dist[target][val_index].cast(pl.Float64)

    clf = CatBoostRegressor(iterations=100_000, loss_function='RMSE')
    clf.fit(X_train.to_numpy(),
            y_train.to_numpy(),
            eval_set=(X_val.to_numpy(), y_val.to_numpy()),
            early_stopping_rounds=1000, verbose=False)

    preds = clf.predict(X_val.to_numpy())

    clfs_f64pl_cat.append(clf)
    rmses.append(mean_squared_error(y_val.to_numpy(), preds, squared=False))
print(f'mean RMSE across all folds: {np.mean(rmses)}')

<catboost.core.CatBoostRegressor at 0x7fa6f0030370>

<catboost.core.CatBoostRegressor at 0x7fa7515c7d90>

<catboost.core.CatBoostRegressor at 0x7fa73017ceb0>

<catboost.core.CatBoostRegressor at 0x7fa751dba460>

<catboost.core.CatBoostRegressor at 0x7fa751db5730>

<catboost.core.CatBoostRegressor at 0x7fa751db6250>

<catboost.core.CatBoostRegressor at 0x7fa7201316a0>

<catboost.core.CatBoostRegressor at 0x7fa7201312b0>

<catboost.core.CatBoostRegressor at 0x7fa751db6fa0>

<catboost.core.CatBoostRegressor at 0x7fa751db63d0>

mean RMSE across all folds: 0.5116063811779467


Let us now look at the variables that are important according to our model.

In [21]:
(
    pl.DataFrame({
    "features": features,
    "importance": clf.feature_importances_, # using the latest model
    })
    .with_columns([
        (pl.col('importance')/pl.col('importance').sum()).alias('ratio')
    ])
    .sort('ratio', reverse=True)
)

features,importance,ratio
str,f64,f64
"""MedInc""",32.28483,0.322848
"""AveOccup""",11.392482,0.113925
"""dist2nearestCi...",9.955326,0.099553
"""Longitude""",9.010301,0.090103
"""Latitude""",7.208617,0.072086
"""dist2Sacrament...",5.178957,0.05179
"""dist2furthestC...",3.903603,0.039036
"""HouseAge""",3.42074,0.034207
"""dist2LosAngele...",2.985077,0.029851
"""AveRooms""",2.91111,0.029111


# prepare test set

In [22]:
features

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'dist2Sacramento',
 'dist2SanFrancisco',
 'dist2SanJose',
 'dist2LosAngeles',
 'dist2SanDiego',
 'dist2nearestCity',
 'dist2furthestCity',
 'dist2allCity']

In [23]:
test_pl_adddist = (

    test_pl
    .select([
        pl.all(),
        pl.lit(True).alias('is_generated'),
    ])
    .with_columns([
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], Sacramento[0], Sacramento[1])).alias('dist2Sacramento'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanFrancisco[0], SanFrancisco[1])).alias('dist2SanFrancisco'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanJose[0], SanJose[1])).alias('dist2SanJose'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], LosAngeles[0], LosAngeles[1])).alias('dist2LosAngeles'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanDiego[0], SanDiego[1])).alias('dist2SanDiego'),        
    ])
    .with_columns([
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: min([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2nearestCity'),
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: max([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2furthestCity'),
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: sum([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2allCity')        
    ])
)
test_pl_adddist

id,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,is_generated,dist2Sacramento,dist2SanFrancisco,dist2SanJose,dist2LosAngeles,dist2SanDiego,dist2nearestCity,dist2furthestCity,dist2allCity
i64,f64,f64,f64,f64,f64,f64,f64,f64,bool,f64,f64,f64,f64,f64,f64,f64,f64
37137,1.7062,35.0,4.966368,1.096539,1318.0,2.844411,39.75,-121.85,true,0.02103,0.035246,0.042157,0.111431,0.13932,0.02103,0.13932,0.349183
37138,1.3882,22.0,4.187035,1.098229,2296.0,3.180218,33.95,-118.29,true,0.09248,0.088755,0.078053,0.001906,0.027087,0.001906,0.09248,0.288281
37139,7.7197,21.0,7.129436,0.959276,1535.0,2.888889,33.61,-117.81,true,0.10105,0.097784,0.08708,0.009955,0.018241,0.009955,0.10105,0.31411
37140,4.6806,49.0,4.769697,1.048485,707.0,1.74359,34.17,-118.34,true,0.088763,0.085369,0.074665,0.002482,0.03062,0.002482,0.088763,0.2819
37141,3.1284,25.0,3.765306,1.081633,4716.0,2.003827,34.17,-118.29,true,0.089116,0.085847,0.075143,0.002162,0.030219,0.002162,0.089116,0.282486
37142,5.7268,23.0,6.0625,1.14527,1039.0,2.387097,33.81,-118.11,true,0.095873,0.092303,0.081601,0.00465,0.023567,0.00465,0.095873,0.297994
37143,3.3583,25.0,5.068783,1.227273,949.0,3.602564,33.14,-117.12,true,0.113253,0.110574,0.099871,0.022811,0.007432,0.007432,0.113253,0.353941
37144,4.1302,35.0,5.944724,1.062361,1043.0,3.165919,34.09,-117.98,true,0.092583,0.089894,0.079196,0.003869,0.026771,0.003869,0.092583,0.292312
37145,1.7991,23.0,4.928364,1.174061,848.0,2.558011,37.3,-120.89,true,0.023791,0.022773,0.013868,0.067969,0.096099,0.013868,0.096099,0.2245
37146,1.7857,44.0,5.717122,1.101644,4276.0,2.373069,33.98,-117.33,true,0.099327,0.097848,0.087194,0.013278,0.022198,0.013278,0.099327,0.319845


# Ensemble 

In [24]:
test_preds = []

for clf in (clfs_f64pl + clfs_f64pl_cat):
    preds = clf.predict(test_pl_adddist[features].to_numpy())
    test_preds.append(preds)

test_preds_mean_pl = (
    pl.DataFrame(test_preds)
    .transpose()
    .select([
        pl.all().explode()
    ])
    .mean(axis=1)
    .to_list()
)

# Make a submission

In [25]:
submission = pl.DataFrame({
    'id': test_pl.select('id').to_series(),
    'MedHouseVal': test_preds_mean_pl
})
# submission.head()

submission.write_csv('clfs_lgbm_cat_extsrc.csv')

This is shaping up to be a very excting challenge! 🥳 

**If you found this notebook useful, please upvote! 🙏 Thank you!**

All the best in the competition!

In [26]:
60/555

0.10810810810810811