# Train LGBMRegressor(tuned) + catboost(default) + combined dataset + 10 split fold + labeled datasources + distance to cities

# Imports

In [1]:
!pip install polars



In [2]:
!pip install snoop



In [3]:
import polars as pl
from snoop import pp
from polars.testing import assert_frame_equal, assert_series_equal

In [4]:
import pandas as pd
from pathlib import Path
import numpy as np

In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.datasets import fetch_california_housing

In [10]:
class Data:
    def __init__(self, data_type: str = 'train', scaler_type: str = 'minmax', use_original_data=False):
        self.data_type = data_type
        self.data = pd.read_csv('train.csv', index_col=0) if data_type == 'train' else pd.read_csv('test.csv',
                                                                                                   index_col=0)
        self.scaler_type = scaler_type
        self.use_original_data = use_original_data

        if self.data_type == 'train':
            self.processed_data = self.process_data(self.data, scaler_type=scaler_type, handle_last_col=False)
        if self.data_type == 'test':
            self.processed_data = self.process_data(self.data, scaler_type=scaler_type, handle_last_col=True)

        if self.use_original_data:
            self.data = self.add_raw_data_to_train(self.data)

    @staticmethod
    def add_raw_data_to_train(df):
        original_df = fetch_california_housing()['data']
        original_df_train = pd.DataFrame(original_df, columns=fetch_california_housing()['feature_names'])
        original_df_train['MedHouseVal'] = fetch_california_housing()['target']
        return pd.concat([df, original_df_train], axis=0)

    @staticmethod
    def process_data(df, scaler_type='minmax', handle_last_col=True):
        raw_cols = df.columns
        raw_index = df.index
        if scaler_type == 'minmax':
            # use minmax scaler
            scaler = MinMaxScaler()

            if handle_last_col:
                # handle all columns
                return pd.DataFrame(scaler.fit_transform(df), columns=raw_cols, index=raw_index)
            else:
                # use the original last column
                return pd.DataFrame(scaler.fit_transform(df.iloc[:, :-1]), columns=raw_cols[:-1], index=raw_index).join(
                    df.iloc[:, -1])

        elif scaler_type == 'standard':
            # use standard scaler
            scaler = StandardScaler()
            # use the original last column

            if handle_last_col:
                # handle all columns
                return pd.DataFrame(scaler.fit_transform(df), columns=raw_cols, index=raw_index)
            else:
                return pd.DataFrame(scaler.fit_transform(df.iloc[:, :-1]), columns=raw_cols[:-1], index=raw_index). \
                    join(df.iloc[:, -1])

        elif scaler_type == 'none':
            return df

    def get_train_test_split(self, test_size: float = 0.3, random_state: int = 42):
        if self.use_original_data:
            raw_train = self.add_raw_data_to_train(pd.read_csv('train.csv', index_col=0))
        else:
            raw_train = pd.read_csv('train.csv', index_col=0)
        processed_train = self.process_data(raw_train, scaler_type=self.scaler_type, handle_last_col=False)

        X = processed_train.drop('MedHouseVal', axis=1)
        y = processed_train['MedHouseVal']

        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return x_train, x_test, y_train, y_test

In [11]:
data = Data('train', scaler_type='none', use_original_data=True)

In [12]:
data.data.to_csv('train_added_raw.csv')

In [13]:
train_pl = pl.read_csv('train_added_raw.csv')
test_pl = pl.read_csv('test.csv')
sample_sub_pl = pl.read_csv('sample_submission.csv')

# join Kaggle dataset with SKlearn dataset

In [14]:
from sklearn.datasets import fetch_california_housing

original_data = fetch_california_housing()
original_data.feature_names
original_data.target_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

['MedHouseVal']

In [15]:
additional_data = ( 
    pl.concat([
        pl.from_numpy(original_data.data, original_data.feature_names), 
        pl.from_numpy(original_data.target, original_data.target_names),
    ],how='horizontal')
    .select([
        pl.all().exclude('MedHouseVal'),
        pl.lit(False).alias('is_generated'), # add labels to distinguish two datasets
        'MedHouseVal',
    ])
)
additional_data.head()
additional_data.columns

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,is_generated,MedHouseVal
f64,f64,f64,f64,f64,f64,f64,f64,bool,f64
8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,False,4.526
8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,False,3.585
7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,False,3.521
5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,False,3.413
3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,False,3.422


['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'MedHouseVal']

In [16]:
train_pl = (
    train_pl
    .select([
        pl.all().exclude('MedHouseVal'),
        pl.lit(True).alias('is_generated'),
        'MedHouseVal',
    ])
)
train_pl.head()


Unnamed: 0_level_0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,is_generated,MedHouseVal
i64,f64,f64,f64,f64,f64,f64,f64,f64,bool,f64
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,True,0.98
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,True,0.946
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,True,1.576
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,True,1.336
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,True,4.5


In [17]:
features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'is_generated'] # added ext_src
target = 'MedHouseVal'

In [18]:
features + [target]
train_joined_pl = train_pl[features + [target]].vstack(additional_data) # not including column `id`
train_joined_pl.shape
train_joined_pl.columns

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'MedHouseVal']

(78417, 10)

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'MedHouseVal']

# Feature: distance to cities
Thanks to @phongnguyen1, reference: https://www.kaggle.com/code/phongnguyen1/distance-to-cities-features-clustering?scriptVersionId=115694922&cellId=40

**About haversine_distances** [link](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html)

> The Haversine (or great circle) distance is the angular distance between two points on the surface of a sphere. The first coordinate of each point is assumed to be the latitude, the second is the longitude, given in radians. The dimension of the data must be 2.

> As the Earth is nearly spherical, the haversine formula provides a good approximation of the distance between two points of the Earth surface, with a less than 1% error on average.

In [19]:
def get_distance(lat1, long1, lat2, long2):
    from sklearn.metrics.pairwise import haversine_distances
    from math import radians

    loc1 = [radians(lat1), radians(long1)]
    loc2 = [radians(lat2), radians(long2)]
    result = haversine_distances([loc1, loc2])
    result * 6371000/1000  # multiply by Earth radius to get kilometers
    return result[0][1]

In [20]:
Sacramento = (38.576931, -121.494949)
SanFrancisco = (37.780080, -122.420160)
SanJose = (37.334789, -121.888138)
LosAngeles = (34.052235, -118.243683)
SanDiego = (32.715759, -117.163818)

In [21]:
train_joined_pl_add_dist = (
    train_joined_pl
    .with_columns([
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], Sacramento[0], Sacramento[1])).alias('dist2Sacramento'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanFrancisco[0], SanFrancisco[1])).alias('dist2SanFrancisco'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanJose[0], SanJose[1])).alias('dist2SanJose'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], LosAngeles[0], LosAngeles[1])).alias('dist2LosAngeles'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanDiego[0], SanDiego[1])).alias('dist2SanDiego'),        
    ])
    .with_columns([
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: min([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2nearestCity'),
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: max([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2furthestCity'),
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: sum([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2allCity')        
    ])
)
train_joined_pl_add_dist

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,is_generated,MedHouseVal,dist2Sacramento,dist2SanFrancisco,dist2SanJose,dist2LosAngeles,dist2SanDiego,dist2nearestCity,dist2furthestCity,dist2allCity
f64,f64,f64,f64,f64,f64,f64,f64,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64
2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,true,0.98,0.072033,0.064262,0.053868,0.028684,0.054086,0.028684,0.072033,0.272933
3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,true,0.946,0.004236,0.022867,0.025374,0.091098,0.119034,0.004236,0.119034,0.262609
4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,true,1.576,0.069057,0.06033,0.050106,0.033789,0.05905,0.033789,0.069057,0.272332
2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,true,1.336,0.12067,0.117263,0.106563,0.029551,0.001457,0.001457,0.12067,0.375504
3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,true,4.5,0.018478,0.000375,0.010865,0.087989,0.115991,0.000375,0.115991,0.233698
6.8075,26.0,6.764372,1.091787,2147.0,2.70354,33.84,-118.31,true,3.714,0.094041,0.090052,0.079356,0.003827,0.025782,0.003827,0.094041,0.293057
2.3654,21.0,4.734884,1.011396,1112.0,2.937247,35.65,-117.64,true,0.811,0.074069,0.076502,0.066456,0.029196,0.051671,0.029196,0.076502,0.297894
2.3562,31.0,5.186567,1.154229,1346.0,3.348259,32.68,-117.07,true,0.824,0.120514,0.117176,0.106475,0.029432,0.001513,0.001513,0.120514,0.375111
2.2672,23.0,4.640155,1.064302,628.0,1.774461,38.53,-121.43,true,0.995,0.001207,0.018867,0.021793,0.090078,0.118121,0.001207,0.118121,0.250065
5.637,20.0,4.391863,1.053312,1634.0,2.681388,33.88,-118.36,true,2.291,0.093087,0.089053,0.078358,0.003445,0.026784,0.003445,0.093087,0.290728


In [22]:
features = (train_joined_pl_add_dist.columns)

features.remove('MedHouseVal')

features, target
len(features)

(['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude',
  'is_generated',
  'dist2Sacramento',
  'dist2SanFrancisco',
  'dist2SanJose',
  'dist2LosAngeles',
  'dist2SanDiego',
  'dist2nearestCity',
  'dist2furthestCity',
  'dist2allCity'],
 'MedHouseVal')

17

# Training LGBMRegressor model

Let's begin by splitting our data into a train and validation set.

In [23]:
from lightgbm.sklearn import LGBMRegressor
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

The variable that we will be predicting is the `MedHouseVal`. We will use the rest of the columns (minus the id column) for training.

In [24]:
# these parameters come from soupmonsters awesome notebook here: https://www.kaggle.com/code/soupmonster/simple-lightgbm-baseline
params= {
 'learning_rate': 0.02, 
 'n_estimators': 100_000, 
 'metric': 'rmse',
 'lambda_l1': 1.945,
 'num_leaves': 87,
 'feature_fraction': 0.79,
 'bagging_fraction': 0.93,
 'bagging_freq': 4,
 'min_data_in_leaf': 103,
 'max_depth': 17,
}

In [25]:
clfs_f64pl = []
kf = KFold(n_splits=10, random_state=0, shuffle=True) # this line must be included in the same cell as the training block below
rmses = []

for i, (train_index, val_index) in enumerate(kf.split(train_joined_pl_add_dist)): # kf.split can work with pl.DataFrame
    X_train, X_val = train_joined_pl_add_dist[features][train_index].select(pl.all().cast(pl.Float64)), train_joined_pl_add_dist[features][val_index].select(pl.all().cast(pl.Float64))
    y_train, y_val = train_joined_pl_add_dist[target][train_index].cast(pl.Float64), train_joined_pl_add_dist[target][val_index].cast(pl.Float64)

    clf = LGBMRegressor(**params)
    clf.fit(X_train.to_numpy(),
            y_train.to_numpy(),
            eval_set=[(X_val.to_numpy(), y_val.to_numpy())], 
            callbacks=[lgbm.early_stopping(85, verbose=True)]) # why early_stop at 500 

    preds = clf.predict(X_val.to_numpy())
    
    clfs_f64pl.append(clf) # save 5 trained models into this list
    rmses.append(mean_squared_error(y_val.to_numpy(), preds, squared=False))
    
print(f'mean RMSE across all folds: {pl.Series(rmses).mean()}')

[1]	valid_0's rmse: 1.14643
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.13041
[3]	valid_0's rmse: 1.11599
[4]	valid_0's rmse: 1.10063
[5]	valid_0's rmse: 1.08576
[6]	valid_0's rmse: 1.0711
[7]	valid_0's rmse: 1.05779
[8]	valid_0's rmse: 1.04482
[9]	valid_0's rmse: 1.0313
[10]	valid_0's rmse: 1.01813
[11]	valid_0's rmse: 1.00481
[12]	valid_0's rmse: 0.992086
[13]	valid_0's rmse: 0.980017
[14]	valid_0's rmse: 0.967873
[15]	valid_0's rmse: 0.955951
[16]	valid_0's rmse: 0.945018
[17]	valid_0's rmse: 0.933796
[18]	valid_0's rmse: 0.924023
[19]	valid_0's rmse: 0.913454
[20]	valid_0's rmse: 0.903876
[21]	valid_0's rmse: 0.893681
[22]	valid_0's rmse: 0.883738
[23]	valid_0's rmse: 0.874009
[24]	valid_0's rmse: 0.864938
[25]	valid_0's rmse: 0.855728
[26]	valid_0's rmse: 0.84679
[27]	valid_0's rmse: 0.838217
[28]	valid_0's rmse: 0.829745
[29]	valid_0's rmse: 0.821309
[30]	valid_0's rmse: 0.813181
[31]	valid_0's rmse: 0.805276
[32]	valid_0's rmse: 0.797594
[3

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.13745
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.12145
[3]	valid_0's rmse: 1.10696
[4]	valid_0's rmse: 1.09157
[5]	valid_0's rmse: 1.07673
[6]	valid_0's rmse: 1.0622
[7]	valid_0's rmse: 1.04863
[8]	valid_0's rmse: 1.03561
[9]	valid_0's rmse: 1.02213
[10]	valid_0's rmse: 1.00885
[11]	valid_0's rmse: 0.995823
[12]	valid_0's rmse: 0.983184
[13]	valid_0's rmse: 0.970934
[14]	valid_0's rmse: 0.958901
[15]	valid_0's rmse: 0.947166
[16]	valid_0's rmse: 0.936078
[17]	valid_0's rmse: 0.924966
[18]	valid_0's rmse: 0.915066
[19]	valid_0's rmse: 0.904543
[20]	valid_0's rmse: 0.894802
[21]	valid_0's rmse: 0.884581
[22]	valid_0's rmse: 0.874667
[23]	valid_0's rmse: 0.865064
[24]	valid_0's rmse: 0.855819
[25]	valid_0's rmse: 0.846549
[26]	valid_0's rmse: 0.83765
[27]	valid_0's rmse: 0.829045
[28]	valid_0's rmse: 0.820599
[29]	valid_0's rmse: 0.812255
[30]	valid_0's rmse: 0.804117
[31]	valid_0's rmse: 0.796206
[32]	valid_0's rmse: 0.788465


LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.13733
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.12117
[3]	valid_0's rmse: 1.10657
[4]	valid_0's rmse: 1.09108
[5]	valid_0's rmse: 1.07606
[6]	valid_0's rmse: 1.06129
[7]	valid_0's rmse: 1.04774
[8]	valid_0's rmse: 1.03461
[9]	valid_0's rmse: 1.0211
[10]	valid_0's rmse: 1.00776
[11]	valid_0's rmse: 0.994672
[12]	valid_0's rmse: 0.981911
[13]	valid_0's rmse: 0.969624
[14]	valid_0's rmse: 0.9576
[15]	valid_0's rmse: 0.945691
[16]	valid_0's rmse: 0.934689
[17]	valid_0's rmse: 0.923408
[18]	valid_0's rmse: 0.913573
[19]	valid_0's rmse: 0.90295
[20]	valid_0's rmse: 0.893315
[21]	valid_0's rmse: 0.882967
[22]	valid_0's rmse: 0.872821
[23]	valid_0's rmse: 0.863095
[24]	valid_0's rmse: 0.853886
[25]	valid_0's rmse: 0.844614
[26]	valid_0's rmse: 0.835719
[27]	valid_0's rmse: 0.827054
[28]	valid_0's rmse: 0.818511
[29]	valid_0's rmse: 0.810087
[30]	valid_0's rmse: 0.801887
[31]	valid_0's rmse: 0.793885
[32]	valid_0's rmse: 0.786305
[3

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.13055
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.11463
[3]	valid_0's rmse: 1.10043
[4]	valid_0's rmse: 1.08506
[5]	valid_0's rmse: 1.07018
[6]	valid_0's rmse: 1.05573
[7]	valid_0's rmse: 1.04227
[8]	valid_0's rmse: 1.02926
[9]	valid_0's rmse: 1.01591
[10]	valid_0's rmse: 1.00277
[11]	valid_0's rmse: 0.989817
[12]	valid_0's rmse: 0.977202
[13]	valid_0's rmse: 0.965091
[14]	valid_0's rmse: 0.95314
[15]	valid_0's rmse: 0.941339
[16]	valid_0's rmse: 0.930377
[17]	valid_0's rmse: 0.919262
[18]	valid_0's rmse: 0.909606
[19]	valid_0's rmse: 0.899063
[20]	valid_0's rmse: 0.889308
[21]	valid_0's rmse: 0.879055
[22]	valid_0's rmse: 0.869263
[23]	valid_0's rmse: 0.859637
[24]	valid_0's rmse: 0.850484
[25]	valid_0's rmse: 0.841377
[26]	valid_0's rmse: 0.832559
[27]	valid_0's rmse: 0.823989
[28]	valid_0's rmse: 0.815616
[29]	valid_0's rmse: 0.807437
[30]	valid_0's rmse: 0.799404
[31]	valid_0's rmse: 0.791705
[32]	valid_0's rmse: 0.784056

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.16348
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.14746
[3]	valid_0's rmse: 1.13299
[4]	valid_0's rmse: 1.11767
[5]	valid_0's rmse: 1.10272
[6]	valid_0's rmse: 1.0881
[7]	valid_0's rmse: 1.07453
[8]	valid_0's rmse: 1.06144
[9]	valid_0's rmse: 1.04799
[10]	valid_0's rmse: 1.03467
[11]	valid_0's rmse: 1.02161
[12]	valid_0's rmse: 1.00892
[13]	valid_0's rmse: 0.996766
[14]	valid_0's rmse: 0.984734
[15]	valid_0's rmse: 0.972887
[16]	valid_0's rmse: 0.961818
[17]	valid_0's rmse: 0.950683
[18]	valid_0's rmse: 0.940935
[19]	valid_0's rmse: 0.930287
[20]	valid_0's rmse: 0.920494
[21]	valid_0's rmse: 0.910137
[22]	valid_0's rmse: 0.90021
[23]	valid_0's rmse: 0.890494
[24]	valid_0's rmse: 0.881234
[25]	valid_0's rmse: 0.871889
[26]	valid_0's rmse: 0.862829
[27]	valid_0's rmse: 0.854294
[28]	valid_0's rmse: 0.845662
[29]	valid_0's rmse: 0.837265
[30]	valid_0's rmse: 0.829169
[31]	valid_0's rmse: 0.821251
[32]	valid_0's rmse: 0.813562
[3

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.13106
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.11528
[3]	valid_0's rmse: 1.10109
[4]	valid_0's rmse: 1.08591
[5]	valid_0's rmse: 1.07124
[6]	valid_0's rmse: 1.05678
[7]	valid_0's rmse: 1.04341
[8]	valid_0's rmse: 1.03052
[9]	valid_0's rmse: 1.01727
[10]	valid_0's rmse: 1.00422
[11]	valid_0's rmse: 0.991464
[12]	valid_0's rmse: 0.978915
[13]	valid_0's rmse: 0.966883
[14]	valid_0's rmse: 0.954911
[15]	valid_0's rmse: 0.943116
[16]	valid_0's rmse: 0.932257
[17]	valid_0's rmse: 0.921118
[18]	valid_0's rmse: 0.911469
[19]	valid_0's rmse: 0.901052
[20]	valid_0's rmse: 0.891489
[21]	valid_0's rmse: 0.881297
[22]	valid_0's rmse: 0.871436
[23]	valid_0's rmse: 0.861829
[24]	valid_0's rmse: 0.852751
[25]	valid_0's rmse: 0.843643
[26]	valid_0's rmse: 0.834902
[27]	valid_0's rmse: 0.82635
[28]	valid_0's rmse: 0.817949
[29]	valid_0's rmse: 0.80963
[30]	valid_0's rmse: 0.801561
[31]	valid_0's rmse: 0.793707
[32]	valid_0's rmse: 0.786092


LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.14859
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.13289
[3]	valid_0's rmse: 1.11862
[4]	valid_0's rmse: 1.10348
[5]	valid_0's rmse: 1.08881
[6]	valid_0's rmse: 1.07451
[7]	valid_0's rmse: 1.06121
[8]	valid_0's rmse: 1.0484
[9]	valid_0's rmse: 1.03519
[10]	valid_0's rmse: 1.02214
[11]	valid_0's rmse: 1.00924
[12]	valid_0's rmse: 0.996794
[13]	valid_0's rmse: 0.984812
[14]	valid_0's rmse: 0.973034
[15]	valid_0's rmse: 0.961308
[16]	valid_0's rmse: 0.950557
[17]	valid_0's rmse: 0.939495
[18]	valid_0's rmse: 0.929702
[19]	valid_0's rmse: 0.919178
[20]	valid_0's rmse: 0.90972
[21]	valid_0's rmse: 0.899592
[22]	valid_0's rmse: 0.889913
[23]	valid_0's rmse: 0.880353
[24]	valid_0's rmse: 0.871286
[25]	valid_0's rmse: 0.862187
[26]	valid_0's rmse: 0.853278
[27]	valid_0's rmse: 0.844847
[28]	valid_0's rmse: 0.836471
[29]	valid_0's rmse: 0.828284
[30]	valid_0's rmse: 0.820293
[31]	valid_0's rmse: 0.812487
[32]	valid_0's rmse: 0.80497
[3

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.13974
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.12365
[3]	valid_0's rmse: 1.10889
[4]	valid_0's rmse: 1.09348
[5]	valid_0's rmse: 1.07852
[6]	valid_0's rmse: 1.06393
[7]	valid_0's rmse: 1.05027
[8]	valid_0's rmse: 1.03705
[9]	valid_0's rmse: 1.02356
[10]	valid_0's rmse: 1.01021
[11]	valid_0's rmse: 0.997186
[12]	valid_0's rmse: 0.984417
[13]	valid_0's rmse: 0.972222
[14]	valid_0's rmse: 0.960175
[15]	valid_0's rmse: 0.948335
[16]	valid_0's rmse: 0.937257
[17]	valid_0's rmse: 0.926006
[18]	valid_0's rmse: 0.916057
[19]	valid_0's rmse: 0.905368
[20]	valid_0's rmse: 0.895522
[21]	valid_0's rmse: 0.885288
[22]	valid_0's rmse: 0.875153
[23]	valid_0's rmse: 0.865485
[24]	valid_0's rmse: 0.856174
[25]	valid_0's rmse: 0.846918
[26]	valid_0's rmse: 0.83799
[27]	valid_0's rmse: 0.829231
[28]	valid_0's rmse: 0.820691
[29]	valid_0's rmse: 0.812277
[30]	valid_0's rmse: 0.803941
[31]	valid_0's rmse: 0.795957
[32]	valid_0's rmse: 0.788032

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.1422
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.12632
[3]	valid_0's rmse: 1.1122
[4]	valid_0's rmse: 1.09712
[5]	valid_0's rmse: 1.08224
[6]	valid_0's rmse: 1.0677
[7]	valid_0's rmse: 1.05438
[8]	valid_0's rmse: 1.04137
[9]	valid_0's rmse: 1.02805
[10]	valid_0's rmse: 1.01492
[11]	valid_0's rmse: 1.00196
[12]	valid_0's rmse: 0.989207
[13]	valid_0's rmse: 0.977025
[14]	valid_0's rmse: 0.965114
[15]	valid_0's rmse: 0.953313
[16]	valid_0's rmse: 0.942334
[17]	valid_0's rmse: 0.931205
[18]	valid_0's rmse: 0.921406
[19]	valid_0's rmse: 0.910906
[20]	valid_0's rmse: 0.901289
[21]	valid_0's rmse: 0.891166
[22]	valid_0's rmse: 0.881345
[23]	valid_0's rmse: 0.871781
[24]	valid_0's rmse: 0.862594
[25]	valid_0's rmse: 0.853372
[26]	valid_0's rmse: 0.844416
[27]	valid_0's rmse: 0.835951
[28]	valid_0's rmse: 0.827406
[29]	valid_0's rmse: 0.819016
[30]	valid_0's rmse: 0.810865
[31]	valid_0's rmse: 0.802907
[32]	valid_0's rmse: 0.795245
[3

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

[1]	valid_0's rmse: 1.14644
Training until validation scores don't improve for 85 rounds
[2]	valid_0's rmse: 1.13041
[3]	valid_0's rmse: 1.11599
[4]	valid_0's rmse: 1.10049
[5]	valid_0's rmse: 1.08555
[6]	valid_0's rmse: 1.07094
[7]	valid_0's rmse: 1.05745
[8]	valid_0's rmse: 1.04451
[9]	valid_0's rmse: 1.03106
[10]	valid_0's rmse: 1.01778
[11]	valid_0's rmse: 1.00484
[12]	valid_0's rmse: 0.992215
[13]	valid_0's rmse: 0.980154
[14]	valid_0's rmse: 0.96814
[15]	valid_0's rmse: 0.956242
[16]	valid_0's rmse: 0.945265
[17]	valid_0's rmse: 0.933958
[18]	valid_0's rmse: 0.924173
[19]	valid_0's rmse: 0.913559
[20]	valid_0's rmse: 0.903926
[21]	valid_0's rmse: 0.893718
[22]	valid_0's rmse: 0.88382
[23]	valid_0's rmse: 0.874189
[24]	valid_0's rmse: 0.865146
[25]	valid_0's rmse: 0.855877
[26]	valid_0's rmse: 0.846885
[27]	valid_0's rmse: 0.838303
[28]	valid_0's rmse: 0.829857
[29]	valid_0's rmse: 0.821614
[30]	valid_0's rmse: 0.813565
[31]	valid_0's rmse: 0.805739
[32]	valid_0's rmse: 0.798205
[

LGBMRegressor(bagging_fraction=0.93, bagging_freq=4, feature_fraction=0.79,
              lambda_l1=1.945, learning_rate=0.02, max_depth=17, metric='rmse',
              min_data_in_leaf=103, n_estimators=100000, num_leaves=87)

mean RMSE across all folds: 0.42904179521301045


# Train a catboost model

In [26]:
from catboost import CatBoostRegressor
clfs_f64pl_cat = []
rmses = []
kf = KFold(n_splits=10, random_state=1, shuffle=True)
for train_index, val_index in kf.split(train_joined_pl_add_dist):
    X_train = train_joined_pl_add_dist[features][train_index].select(pl.all().cast(pl.Float64))
    X_val = train_joined_pl_add_dist[features][val_index].select(pl.all().cast(pl.Float64))
    y_train = train_joined_pl_add_dist[target][train_index].cast(pl.Float64)
    y_val = train_joined_pl_add_dist[target][val_index].cast(pl.Float64)

    clf = CatBoostRegressor(iterations=100_000, loss_function='RMSE')
    clf.fit(X_train.to_numpy(),
            y_train.to_numpy(),
            eval_set=(X_val.to_numpy(), y_val.to_numpy()),
            early_stopping_rounds=1000, verbose=False)

    preds = clf.predict(X_val.to_numpy())

    clfs_f64pl_cat.append(clf)
    rmses.append(mean_squared_error(y_val.to_numpy(), preds, squared=False))
print(f'mean RMSE across all folds: {np.mean(rmses)}')

<catboost.core.CatBoostRegressor at 0x7fc870497550>

<catboost.core.CatBoostRegressor at 0x7fc870489610>

<catboost.core.CatBoostRegressor at 0x7fc8704899a0>

<catboost.core.CatBoostRegressor at 0x7fc8704965b0>

<catboost.core.CatBoostRegressor at 0x7fc870497b50>

<catboost.core.CatBoostRegressor at 0x7fc86af868b0>

<catboost.core.CatBoostRegressor at 0x7fc870489970>

<catboost.core.CatBoostRegressor at 0x7fc870497790>

<catboost.core.CatBoostRegressor at 0x7fc870497c40>

<catboost.core.CatBoostRegressor at 0x7fc8704974f0>

mean RMSE across all folds: 0.4356248721244428


Let us now look at the variables that are important according to our model.

In [27]:
(
    pl.DataFrame({
    "features": features,
    "importance": clf.feature_importances_, # using the latest model
    })
    .with_columns([
        (pl.col('importance')/pl.col('importance').sum()).alias('ratio')
    ])
    .sort('ratio', reverse=True)
)

features,importance,ratio
str,f64,f64
"""MedInc""",28.165776,0.281658
"""AveOccup""",11.746715,0.117467
"""dist2nearestCi...",9.821343,0.098213
"""Longitude""",9.040001,0.0904
"""Latitude""",7.004514,0.070045
"""dist2Sacrament...",5.085325,0.050853
"""AveRooms""",4.121815,0.041218
"""HouseAge""",3.958421,0.039584
"""dist2furthestC...",3.92325,0.039233
"""dist2LosAngele...",3.288231,0.032882


# prepare test set

In [28]:
features

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'dist2Sacramento',
 'dist2SanFrancisco',
 'dist2SanJose',
 'dist2LosAngeles',
 'dist2SanDiego',
 'dist2nearestCity',
 'dist2furthestCity',
 'dist2allCity']

In [29]:
test_pl_adddist = (

    test_pl
    .select([
        pl.all(),
        pl.lit(True).alias('is_generated'),
    ])
    .with_columns([
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], Sacramento[0], Sacramento[1])).alias('dist2Sacramento'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanFrancisco[0], SanFrancisco[1])).alias('dist2SanFrancisco'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanJose[0], SanJose[1])).alias('dist2SanJose'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], LosAngeles[0], LosAngeles[1])).alias('dist2LosAngeles'),
        pl.struct(['Latitude', 'Longitude']).apply(lambda x: get_distance(x['Latitude'], x['Longitude'], SanDiego[0], SanDiego[1])).alias('dist2SanDiego'),        
    ])
    .with_columns([
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: min([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2nearestCity'),
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: max([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2furthestCity'),
        pl.struct(['dist2Sacramento', 'dist2SanFrancisco', 'dist2SanJose', 'dist2LosAngeles', 'dist2SanDiego']).apply(lambda x: sum([x['dist2Sacramento'], x['dist2SanFrancisco'], x['dist2SanJose'], x['dist2LosAngeles'], x['dist2SanDiego']])).alias('dist2allCity')        
    ])
)
test_pl_adddist

id,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,is_generated,dist2Sacramento,dist2SanFrancisco,dist2SanJose,dist2LosAngeles,dist2SanDiego,dist2nearestCity,dist2furthestCity,dist2allCity
i64,f64,f64,f64,f64,f64,f64,f64,f64,bool,f64,f64,f64,f64,f64,f64,f64,f64
37137,1.7062,35.0,4.966368,1.096539,1318.0,2.844411,39.75,-121.85,true,0.02103,0.035246,0.042157,0.111431,0.13932,0.02103,0.13932,0.349183
37138,1.3882,22.0,4.187035,1.098229,2296.0,3.180218,33.95,-118.29,true,0.09248,0.088755,0.078053,0.001906,0.027087,0.001906,0.09248,0.288281
37139,7.7197,21.0,7.129436,0.959276,1535.0,2.888889,33.61,-117.81,true,0.10105,0.097784,0.08708,0.009955,0.018241,0.009955,0.10105,0.31411
37140,4.6806,49.0,4.769697,1.048485,707.0,1.74359,34.17,-118.34,true,0.088763,0.085369,0.074665,0.002482,0.03062,0.002482,0.088763,0.2819
37141,3.1284,25.0,3.765306,1.081633,4716.0,2.003827,34.17,-118.29,true,0.089116,0.085847,0.075143,0.002162,0.030219,0.002162,0.089116,0.282486
37142,5.7268,23.0,6.0625,1.14527,1039.0,2.387097,33.81,-118.11,true,0.095873,0.092303,0.081601,0.00465,0.023567,0.00465,0.095873,0.297994
37143,3.3583,25.0,5.068783,1.227273,949.0,3.602564,33.14,-117.12,true,0.113253,0.110574,0.099871,0.022811,0.007432,0.007432,0.113253,0.353941
37144,4.1302,35.0,5.944724,1.062361,1043.0,3.165919,34.09,-117.98,true,0.092583,0.089894,0.079196,0.003869,0.026771,0.003869,0.092583,0.292312
37145,1.7991,23.0,4.928364,1.174061,848.0,2.558011,37.3,-120.89,true,0.023791,0.022773,0.013868,0.067969,0.096099,0.013868,0.096099,0.2245
37146,1.7857,44.0,5.717122,1.101644,4276.0,2.373069,33.98,-117.33,true,0.099327,0.097848,0.087194,0.013278,0.022198,0.013278,0.099327,0.319845


# Ensemble 

In [30]:
test_preds = []

for clf in (clfs_f64pl + clfs_f64pl_cat):
    preds = clf.predict(test_pl_adddist[features].to_numpy())
    test_preds.append(preds)

test_preds_mean_pl = (
    pl.DataFrame(test_preds)
    .transpose()
    .select([
        pl.all().explode()
    ])
    .mean(axis=1)
    .to_list()
)

# Make a submission

In [31]:
submission = pl.DataFrame({
    'id': test_pl.select('id').to_series(),
    'MedHouseVal': test_preds_mean_pl
})
# submission.head()

submission.write_csv('clfs_lgbm_cat_extsrc.csv')

This is shaping up to be a very excting challenge! 🥳 

**If you found this notebook useful, please upvote! 🙏 Thank you!**

All the best in the competition!

In [26]:
60/555

0.10810810810810811