
# imports 

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
from scipy.optimize import minimize
from tqdm import tqdm

import ydf
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


# training p1

In [0]:
#data
train = pd.read_csv("data/train_features.csv")
y = pd.read_csv("data/train_labels.csv")
test = pd.read_csv("data/test_features.csv")
ss = pd.read_csv("data/submission_format.csv")

merged_df = pd.merge(train, y, on='uid', how='left')
merged_test = pd.merge(test, ss, on='uid', how='left')

def feature_engineering(data):
    data['rjob_hrswk_change'] = (data['rjob_hrswk_12'] - data['rjob_hrswk_03']).astype(float)
    data['max_work_year'] = data[['rjob_end_12','rjob_end_03']].max(axis=1).astype(float)
    data['years_since_work'] = (data['year'] - data['max_work_year']).astype(float)
    data['hincome_change'] = (data['hincome_12'] - data['hincome_03']).astype(float)
    data['niadl_change'] = (data['n_iadl_12'] - data['n_iadl_03']).astype(float)
    data['adl_change'] = (data['n_adl_12'] - data['n_adl_03']).astype(float)
    data['depr_change'] = (data['n_depr_12'] - data['n_depr_03']).astype(float)

    data['glob_hlth_03']=data['glob_hlth_03'].replace({
        '5. Poor':0, '4. Fair':1, '3. Good':2, '2. Very good':3, '1. Excellent':4}).astype(float)
    data['glob_hlth_12']=data['glob_hlth_12'].replace({
        '5. Poor':0, '4. Fair':1, '3. Good':2, '2. Very good':3, '1. Excellent':4}).astype(float)
    data['glob_hlth_change']=(data['glob_hlth_12'] - data['glob_hlth_03']).astype(float)

    data['bmi_03']=data['bmi_03'].replace({
        '1. Underweight':1, '2. Normal weight':2, '3. Overweight':3, '4. Obese':4, '5. Morbidly obese':5}).astype(float)
    data['bmi_12']=data['bmi_12'].replace({
        '1. Underweight':1, '2. Normal weight':2, '3. Overweight':3, '4. Obese':4, '5. Morbidly obese':5}).astype(float)
    data['bmi_change']=(data['bmi_12'] - data['bmi_03']).astype(float)

    data['employment_03']=data['employment_03'].replace({
        '1. Currently Working':'Working', 
        '2. Currently looking for work':'Looking for work', 
        '3. Dedicated to household chores':'House', 
        '4. Retired, incapacitated, or does not work':'No work'})
    data['employment_12']=data['employment_12'].replace({
        '1. Currently Working':'Working', 
        '2. Currently looking for work':'Looking for work', 
        '3. Dedicated to household chores':'House', 
        '4. Retired, incapacitated, or does not work':'No work'})

    data['memory_12']=data['memory_12'].replace({
        '5. Poor':0, '4. Fair':1, '3. Good':2, '2. Very good':3, '1. Excellent':4}).astype(float)

    data['edu_gru_03']=data['edu_gru_03'].replace({
        '0. No education':0,'1. 1–5 years':1, '2. 6 years':2, '3. 7–9 years':3,'4. 10+ years':4}).astype(float)
    data['edu_gru_12']=data['edu_gru_12'].replace({
        '0. No education':0,'1. 1–5 years':1, '2. 6 years':2, '3. 7–9 years':3,'4. 10+ years':4}).astype(float)
    data['edu_gru_change']=(data['edu_gru_12']-data['edu_gru_03']).astype(float)

    data['illnesses_change']=(data['n_illnesses_12'] - data['n_illnesses_03']).astype(float)
    return data

Y = merged_df['composite_score']
data = pd.concat((merged_df, merged_test)).reset_index(drop=True).copy()
data = feature_engineering(data)
data=data.drop(columns=['uid','composite_score'],axis=1)

object_cols = data.select_dtypes(include=['object']).columns
for col in object_cols:
    data[col] = pd.Categorical(data[col].fillna("Missing"))

merged_df = data[:len(Y)]
merged_test = data[len(Y):]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []
optimized_weights_list = []

templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
ydf_params = templates["benchmark_rank1v1"]


# training p2

In [0]:

for fold, (train_index, val_index) in enumerate(kf.split(merged_df), 1):
    print(f"========== Fold {fold} ==========")
    train_X, val_X = merged_df.iloc[train_index], merged_df.iloc[val_index]
    train_y, val_y = Y.iloc[train_index], Y.iloc[val_index]

    ##########################################################################################
    ##########################################################################################

    # Scikit-learn
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoded_train_X = encoder.fit_transform(train_X.select_dtypes(include=['category']))
    encoded_val_X = encoder.transform(val_X.select_dtypes(include=['category']))

    # Combine encoded features with the rest of the dataset
    train_X_encoded = np.hstack([train_X.select_dtypes(exclude=['category']).values, encoded_train_X])
    val_X_encoded = np.hstack([val_X.select_dtypes(exclude=['category']).values, encoded_val_X])
    
    # Impute missing values
    imputer = SimpleImputer(strategy='mean')
    train_X_encoded = imputer.fit_transform(train_X_encoded)
    val_X_encoded = imputer.transform(val_X_encoded)
    
    # Sklearn Gradient Boosting
    model5 = GradientBoostingRegressor(
        n_estimators=10000, learning_rate=0.01, max_depth=3, random_state=42
    )
    model5.fit(train_X_encoded, train_y)
    pred5 = model5.predict(val_X_encoded)


    ##########################################################################################
    ##########################################################################################

    # LightGBM
    train_data_lgb = lgb.Dataset(train_X, label=train_y, categorical_feature='auto')
    val_data_lgb = lgb.Dataset(val_X, label=val_y, categorical_feature='auto')
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'n_estimators': 10000,
        'random_seed': 42
    }
    model1 = lgb.train(
        params,
        train_data_lgb,
        valid_sets=[val_data_lgb],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=500, verbose=True),
            lgb.log_evaluation(100)
        ]
    )
    pred1 = model1.predict(val_X, num_iteration=model1.best_iteration)

    ##########################################################################################
    ##########################################################################################

    # CatBoost
    model2 = CatBoostRegressor(
        iterations=10000, learning_rate=0.01, depth=10, loss_function='RMSE',
        cat_features=train_X.select_dtypes(include=['category']).columns.to_list(),
        verbose=100, early_stopping_rounds=500
    )
    model2.fit(train_X, train_y, eval_set=(val_X, val_y))
    pred2 = model2.predict(val_X)

    ##########################################################################################
    ##########################################################################################

    # XGBoost
    model3 = XGBRegressor(
        n_estimators=10000, learning_rate=0.01,
        max_depth=3, random_state=42,
        enable_categorical=True,
        eval_metric='rmse', early_stopping_rounds=500, verbosity=1
    )
    model3.fit(train_X, train_y, eval_set=[(val_X, val_y)], verbose=100)
    pred3 = model3.predict(val_X)

    ##########################################################################################
    ##########################################################################################

    # YDF 
    train_data_combined = train_X.copy()
    train_data_combined['composite_score'] = train_y
    # train_ds = ydf.Dataset.from_pandas(train_data_combined, label='composite_score')
    
    model4 = (
        ydf.GradientBoostedTreesLearner(
            label='composite_score', 
            task=ydf.Task.REGRESSION,
            **ydf_params
        )
        .train(train_data_combined)
    )
    pred4 = model4.predict(val_X)  # predicts directly from val_X DataFrame


    ##########################################################################################
    ##########################################################################################

    # Optimize weights for all five models
    def loss_function(weights):
        w1, w2, w3, w4, w5 = weights
        combined_predictions = (w1*pred1 + w2*pred2 + w3*pred3 + w4*pred4 + w5*pred5)
        mse = np.mean((combined_predictions - val_y) ** 2)
        return mse

    initial_weights = np.array([1/5]*5)
    constraints = {'type': 'eq', 'fun': lambda w: w.sum() - 1}
    bounds = [(0,1)]*5

    result = minimize(loss_function, initial_weights, constraints=constraints, bounds=bounds)
    optimized_weights = result.x

    final_predictions = (optimized_weights[0]*pred1 +
                         optimized_weights[1]*pred2 +
                         optimized_weights[2]*pred3 +
                         optimized_weights[3]*pred4 +
                         optimized_weights[4]*pred5)

    fold_rmse = np.sqrt(mean_squared_error(val_y, final_predictions))
    print(f"Fold {fold} RMSE: {fold_rmse}")
    fold_results.append(fold_rmse)
    optimized_weights_list.append(optimized_weights)







[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007909 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1924
[LightGBM] [Info] Number of data points in the train set: 3474, number of used features: 195
[LightGBM] [Info] Start training from score 157.136730
Training until validation scores don't improve for 500 rounds
[100]	valid's rmse: 44.2152
[200]	valid's rmse: 40.554
[300]	valid's rmse: 39.0301
[400]	valid's rmse: 38.2889
[500]	valid's rmse: 37.7932
[600]	valid's rmse: 37.4694
[700]	valid's rmse: 37.1953
[800]	valid's rmse: 37.0456
[900]	valid's rmse: 36.9341
[1000]	valid's rmse: 36.8258
[1100]	valid's rmse: 36.7689
[1200]	valid's rmse: 36.7129
[1300]	valid's rmse: 36.6745
[1400]	valid's rmse: 36.686
[1500]	valid's rmse: 36.7099
[1600]	valid's rmse: 36.7041
[1700]	valid's rmse: 36.7297
[1800]	valid's rmse: 36.7263
Early stopping, best iteration is:
[1327]	valid's rmse: 36.6658
0:	learn: 61.106362



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1918
[LightGBM] [Info] Number of data points in the train set: 3474, number of used features: 195
[LightGBM] [Info] Start training from score 157.271445
Training until validation scores don't improve for 500 rounds
[100]	valid's rmse: 45.5276
[200]	valid's rmse: 40.5444
[300]	valid's rmse: 38.7598
[400]	valid's rmse: 37.8913
[500]	valid's rmse: 37.5243
[600]	valid's rmse: 37.218
[700]	valid's rmse: 37.0013
[800]	valid's rmse: 36.8715
[900]	valid's rmse: 36.76
[1000]	valid's rmse: 36.6967
[1100]	valid's rmse: 36.6146
[1200]	valid's rmse: 36.557
[1300]	valid's rmse: 36.5015
[1400]	valid's rmse: 36.4897
[1500]	valid's rmse: 36.4513
[1600]	valid's rmse: 36.4412
[1700]	valid's rmse: 36.4359
[1800]	valid's rmse: 36.4388
[1900]	valid's rmse: 36.4367
[2000]	valid's rmse: 36.4755
[2100]	valid's rmse: 36.469



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1932
[LightGBM] [Info] Number of data points in the train set: 3474, number of used features: 195
[LightGBM] [Info] Start training from score 156.375360
Training until validation scores don't improve for 500 rounds
[100]	valid's rmse: 45.4773
[200]	valid's rmse: 41.9952
[300]	valid's rmse: 40.7429
[400]	valid's rmse: 40.1255
[500]	valid's rmse: 39.8105
[600]	valid's rmse: 39.6052
[700]	valid's rmse: 39.4801
[800]	valid's rmse: 39.321
[900]	valid's rmse: 39.2487
[1000]	valid's rmse: 39.1792
[1100]	valid's rmse: 39.1557
[1200]	valid's rmse: 39.1131
[1300]	valid's rmse: 39.0614
[1400]	valid's rmse: 39.047
[1500]	valid's rmse: 39.0166
[1600]	valid's rmse: 39.0289
[1700]	valid's rmse: 39.0461
[1800]	valid's rmse: 39.0231
[1900]	valid's rmse: 39.0052
[2000]	valid's rmse: 38.9936
[2100]	valid's rmse: 38.9



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1924
[LightGBM] [Info] Number of data points in the train set: 3475, number of used features: 195
[LightGBM] [Info] Start training from score 157.237122
Training until validation scores don't improve for 500 rounds
[100]	valid's rmse: 45.119
[200]	valid's rmse: 40.8769
[300]	valid's rmse: 39.3643
[400]	valid's rmse: 38.7037
[500]	valid's rmse: 38.1786
[600]	valid's rmse: 37.7851
[700]	valid's rmse: 37.5267
[800]	valid's rmse: 37.2898
[900]	valid's rmse: 37.2023
[1000]	valid's rmse: 37.0639
[1100]	valid's rmse: 36.955
[1200]	valid's rmse: 36.8508
[1300]	valid's rmse: 36.807
[1400]	valid's rmse: 36.7527
[1500]	valid's rmse: 36.6901
[1600]	valid's rmse: 36.6676
[1700]	valid's rmse: 36.6405
[1800]	valid's rmse: 36.634
[1900]	valid's rmse: 36.6238
[2000]	valid's rmse: 36.5785
[2100]	valid's rmse: 36.549



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1910
[LightGBM] [Info] Number of data points in the train set: 3475, number of used features: 195
[LightGBM] [Info] Start training from score 157.063309
Training until validation scores don't improve for 500 rounds
[100]	valid's rmse: 46.2887
[200]	valid's rmse: 42.8784
[300]	valid's rmse: 41.7866
[400]	valid's rmse: 41.3726
[500]	valid's rmse: 41.0776
[600]	valid's rmse: 40.9306
[700]	valid's rmse: 40.8462
[800]	valid's rmse: 40.7456
[900]	valid's rmse: 40.6874
[1000]	valid's rmse: 40.6109
[1100]	valid's rmse: 40.5364
[1200]	valid's rmse: 40.5211
[1300]	valid's rmse: 40.4733
[1400]	valid's rmse: 40.4645
[1500]	valid's rmse: 40.4567
[1600]	valid's rmse: 40.4233
[1700]	valid's rmse: 40.3846
[1800]	valid's rmse: 40.3589
[1900]	valid's rm


# post epoch

In [0]:
# post epoch results

# Display average results
final_cv_rmse = np.mean(fold_results)
print(f"Average RMSE across folds: {final_cv_rmse}")
print(f"Optimized weights per fold: {optimized_weights_list}")

# Calculate the average weights from cross-validation
average_weights = np.mean(optimized_weights_list, axis=0)

# LightGBM
final_train_data_lgb = lgb.Dataset(merged_df, label=Y, categorical_feature='auto')
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'n_estimators': 10000,
    'random_seed': 42
}
final_model1 = lgb.train(
    params,
    final_train_data_lgb
)

# CatBoost
final_model2 = CatBoostRegressor(
    iterations=10000, learning_rate=0.01, depth=10, loss_function='RMSE',
    cat_features=merged_df.select_dtypes(include=['category']).columns.to_list(),
    verbose=False
)
final_model2.fit(merged_df, Y, verbose=False)

# XGBoost 
final_model3 = XGBRegressor(
    n_estimators=10000, learning_rate=0.01,
    max_depth=3, random_state=42,
    enable_categorical=True,
    eval_metric='rmse', verbosity=0
)
final_model3.fit(merged_df, Y, verbose=False)

# Yggdrusil 
train_data_full_combined = merged_df.copy()
train_data_full_combined['composite_score'] = Y
final_model4 = (
    ydf.GradientBoostedTreesLearner(
        label='composite_score', 
        task=ydf.Task.REGRESSION,
        **ydf_params
    )
    .train(train_data_full_combined)
)

# Scikit-learn 
final_model5 = GradientBoostingRegressor(
    n_estimators=10000, learning_rate=0.01, max_depth=3, random_state=42
)
final_model5.fit(merged_df, Y)


# Generate predictions on the test dataset
test_pred1 = final_model1.predict(merged_test)
test_pred2 = final_model2.predict(merged_test)
test_pred3 = final_model3.predict(merged_test)
test_pred4 = final_model4.predict(merged_test)
test_pred5 = final_model5.predict(merged_test)

final_test_predictions = (
    average_weights[0] * test_pred1 +
    average_weights[1] * test_pred2 +
    average_weights[2] * test_pred3 +
    average_weights[3] * test_pred4 +
    average_weights[4] * test_pred5
)

final_test_predictions = np.round(final_test_predictions).astype(int)

print("Final blended predictions for the test dataset:")
print(final_test_predictions)

ss['composite_score']=final_test_predictions
ss.to_csv('LGBM_CatBoost_XGBoost_YDF_SklearnGB_FINAL.csv', index=False)

print(f"FINAL ENSEMBLE RMSE (CV average): {final_cv_rmse}")

Average RMSE across folds: 37.006780187843816
Optimized weights per fold: [array([6.93830566e-02, 2.83848311e-01, 1.17890399e-01, 5.28878234e-01,
       1.80127708e-17]), array([0.326815  , 0.4468834 , 0.04331949, 0.09240241, 0.09058018]), array([1.68981217e-10, 6.10708456e-01, 2.23672114e-01, 1.65619430e-01,
       2.48355744e-10]), array([5.41519188e-01, 2.00986976e-01, 7.81143646e-02, 1.79379472e-01,
       7.64363522e-11]), array([7.20910897e-02, 6.39050236e-01, 2.65575661e-16, 2.18849886e-01,
       7.00087885e-02])]
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2030
[LightGBM] [Info] Number of data points in the train set: 4343, number of used features: 195
[LightGBM] [Info] Start training from score 157.016809




Train model on 4343 examples
Model trained in 0:00:04.554793


[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
[0;32m~/.ipykernel/2701/command-4042265090800783-2968310636[0m in [0;36m?[0;34m()[0m
[1;32m     54[0m [0;31m# Scikit-learn[0m[0;34m[0m[0;34m[0m[0m
[1;32m     55[0m final_model5 = GradientBoostingRegressor(
[1;32m     56[0m     [0mn_estimators[0m[0;34m=[0m[0;36m10000[0m[0;34m,[0m [0mlearning_rate[0m[0;34m=[0m[0;36m0.01[0m[0;34m,[0m [0mmax_depth[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m [0mrandom_state[0m[0;34m=[0m[0;36m42[0m[0;34m[0m[0;34m[0m[0m
[1;32m     57[0m )
[0;32m---> 58[0;31m [0mfinal_model5[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mmerged_df[0m[0;34m,[0m [0mY[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     59[0m [0;34m[0m[0m
[1;32m     60[0m [0;34m[0m[0m
[1;32m     61[0m [0;31m# Generate predictions on the test dataset[0m[0;34m[0m


## clean data for Scikit-learn

In [0]:
# Replace 'Missing' with NaN
merged_df.replace('Missing', np.nan, inplace=True)
merged_test.replace('Missing', np.nan, inplace=True)

# Convert all columns to numeric, coerce non-numeric to NaN
merged_df = merged_df.apply(pd.to_numeric, errors='coerce')
merged_test = merged_test.apply(pd.to_numeric, errors='coerce')

# Drop any columns in both train/test that are entirely NaN
all_nan_cols = merged_df.columns[merged_df.isnull().all()]
merged_df.drop(all_nan_cols, axis=1, inplace=True)
merged_test.drop(all_nan_cols, axis=1, inplace=True)

# Impute missing values using mean for each column
merged_df.fillna(merged_df.mean(), inplace=True)
merged_test.fillna(merged_test.mean(), inplace=True)

# Double-check no NaNs remain
print("Train NaN count after imputation:", merged_df.isnull().sum().sum())
print("Test NaN count after imputation:", merged_test.isnull().sum().sum())

# Now merged_df and merged_test should have no NaNs and be purely numeric.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_test.replace('Missing', np.nan, inplace=True)


Train NaN count after imputation: 0
Test NaN count after imputation: 0


In [0]:
# Handle missing values by replacing 'Missing' with NaN and then imputing or dropping
merged_df.replace('Missing', np.nan, inplace=True)

# Option 2: Impute missing values (e.g., with the mean of the column)
merged_df.fillna(merged_df.mean(), inplace=True)

# post epoch results

# Display average results
final_cv_rmse = np.mean(fold_results)
print(f"Average RMSE across folds: {final_cv_rmse}")
print(f"Optimized weights per fold: {optimized_weights_list}")

# Calculate the average weights from cross-validation
average_weights = np.mean(optimized_weights_list, axis=0)


# Scikit-learn 
final_model5 = GradientBoostingRegressor(
    n_estimators=10000, learning_rate=0.01, max_depth=3, random_state=42
)
final_model5.fit(merged_df, Y)

# LightGBM
final_train_data_lgb = lgb.Dataset(merged_df, label=Y, categorical_feature='auto')
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'n_estimators': 10000,
    'random_seed': 42
}
final_model1 = lgb.train(
    params,
    final_train_data_lgb
)

# CatBoost
final_model2 = CatBoostRegressor(
    iterations=10000, learning_rate=0.01, depth=10, loss_function='RMSE',
    cat_features=merged_df.select_dtypes(include=['category']).columns.to_list(),
    verbose=False
)
final_model2.fit(merged_df, Y, verbose=False)

# XGBoost 
final_model3 = XGBRegressor(
    n_estimators=10000, learning_rate=0.01,
    max_depth=3, random_state=42,
    enable_categorical=True,
    eval_metric='rmse', verbosity=0
)
final_model3.fit(merged_df, Y, verbose=False)


## ydf onward

In [0]:
# Yggdrusil 
train_data_full_combined = merged_df.copy()
train_data_full_combined['composite_score'] = Y
final_model4 = (
    ydf.GradientBoostedTreesLearner(
        label='composite_score', 
        task=ydf.Task.REGRESSION,
        **ydf_params
    )
    .train(train_data_full_combined)
)

# Generate predictions on the test dataset
test_pred1 = final_model1.predict(merged_test)
test_pred2 = final_model2.predict(merged_test)
test_pred3 = final_model3.predict(merged_test)
test_pred4 = final_model4.predict(merged_test)
test_pred5 = final_model5.predict(merged_test)

final_test_predictions = (
    average_weights[0] * test_pred1 +
    average_weights[1] * test_pred2 +
    average_weights[2] * test_pred3 +
    average_weights[3] * test_pred4 +
    average_weights[4] * test_pred5
)

final_test_predictions = np.round(final_test_predictions).astype(int)

print("Final blended predictions for the test dataset:")
print(final_test_predictions)

ss['composite_score'] = final_test_predictions
ss.to_csv('LGBM_CatBoost_XGBoost_YDF_SklearnGB_FINAL.csv', index=False)

print(f"FINAL ENSEMBLE RMSE (CV average): {final_cv_rmse}")

Train model on 4343 examples
Model trained in 0:00:09.632628
Final blended predictions for the test dataset:
[188 201 207 ... 185 166 149]
FINAL ENSEMBLE RMSE (CV average): 37.006780187843816
