In [3]:
import pickle
from urllib.request import urlopen
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so NumPy-driven randomness is repeatable.
np.random.seed(42)

from sklearn import preprocessing
from sklearn.ensemble import BaggingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

In [8]:
# dataset website: https://dev.meteostat.net/bulk/hourly.html#endpoints
# dataset 2020 https://bulk.meteostat.net/v2/hourly/2020/07156.csv.gz
# dataset 2021 https://bulk.meteostat.net/v2/hourly/2021/07156.csv.gz
# license https://dev.meteostat.net/terms.html#license

## This notebook lists the regressors we found useful, together with their mean CV and bagged scores.

# With Covid data (Standard scaling all variables)

## Catboost

In [21]:
# CatBoost configuration: 1000 iterations, learning rate 0.05, depth 10, l2_leaf_reg=10.
regressor = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=10,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    random_strength=5,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.341 ± 0.0324  64.1 ± 25.51
	valid  0.679 ± 0.1041    3.3 ± 0.48
	test   0.565 ± 0.0198    0.4 ± 0.06
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.687
	test   0.545


In [22]:
# CatBoost configuration: 1000 iterations, learning rate 0.05, depth 10, l2_leaf_reg=5.
regressor = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=5,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    random_strength=5,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.336 ± 0.0341  61.6 ± 28.64
	valid  0.678 ± 0.1085    3.1 ± 0.13
	test   0.561 ± 0.0207    0.4 ± 0.06
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.686
	test   0.541


In [5]:
# CatBoost configuration: 1000 iterations, learning rate 0.05, depth 10, l2_leaf_reg=15.
regressor = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=15,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    random_strength=5,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.344 ± 0.0313  64.4 ± 26.56
	valid  0.679 ± 0.1119    3.2 ± 0.27
	test   0.565 ± 0.0199    0.4 ± 0.04
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.688
	test   0.548

In [6]:
# CatBoost configuration: fewer iterations (700) with a smaller learning rate (0.03),
# bagging_temperature=0.4 and random_strength=6.
regressor = CatBoostRegressor(
    iterations=700,
    learning_rate=0.03,
    depth=10,
    l2_leaf_reg=5,
    bootstrap_type="Bayesian",
    bagging_temperature=0.4,
    random_strength=6,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.415 ± 0.0393  44.9 ± 19.86
	valid    0.7 ± 0.1018    2.8 ± 0.11
	test   0.574 ± 0.0138    0.3 ± 0.01
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.707
	test   0.559


## XGBoost

In [23]:
# XGBoost configuration: 200 trees of depth 9, learning rate 0.1, gamma 0.3,
# 40% of the columns sampled per tree.
regressor = XGBRegressor(
    colsample_bytree=0.4,
    eval_metric='rmse',
    gamma=0.3,
    learning_rate=0.1,
    max_cat_threshold=64,
    max_cat_to_onehot=4,
    max_depth=9,
    min_child_weight=3,
    n_estimators=200,
    random_state=0,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.341 ± 0.0398  35.8 ± 19.08
	valid   0.709 ± 0.106    4.4 ± 0.32
	test     0.6 ± 0.0206    0.5 ± 0.02
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.717
	test   0.564

In [25]:
# XGBoost configuration: 200 trees of depth 9, learning rate 0.05, gamma 0.5,
# with row subsampling (subsample=0.5) and min_child_weight=4.
regressor = XGBRegressor(
    colsample_bytree=0.4,
    eval_metric='rmse',
    gamma=0.5,
    learning_rate=0.05,
    max_cat_threshold=64,
    max_cat_to_onehot=4,
    max_depth=9,
    min_child_weight=4,
    n_estimators=200,
    subsample=0.5,
    random_state=0,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.418 ± 0.0416  31.1 ± 16.31
	valid   0.717 ± 0.092    4.2 ± 0.19
	test     0.59 ± 0.014    0.5 ± 0.02
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.723
	test   0.566


## LightGBM

In [30]:
# LightGBM configuration: 1000 estimators, learning rate 0.05,
# colsample_bytree=0.7 and reg_lambda=0.5.
regressor = LGBMRegressor(
    learning_rate=0.05,
    n_estimators=1000,
    colsample_bytree=0.7,
    reg_lambda=0.5,
)

Mean CV scores
----------------------------
	score            rmse         time
	train  0.368 ± 0.0345    6.1 ± 2.5
	valid  0.738 ± 0.1012  11.2 ± 0.76
	test   0.606 ± 0.0315   1.1 ± 0.04
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.745
	test   0.566


## Gradient Boosting

In [31]:
# scikit-learn gradient boosting configuration: 200 estimators of depth 4, fixed random_state.
regressor = GradientBoostingRegressor(
    max_depth=4,
    n_estimators=200,
    random_state=0,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.516 ± 0.0562  83.3 ± 49.52
	valid  0.761 ± 0.1123    3.4 ± 0.07
	test   0.642 ± 0.0177    0.4 ± 0.01
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.769
	test   0.614

# Final Best Data

## CatBoost

In [55]:
# CatBoost configuration: 1000 iterations, learning rate 0.05, depth 10, l2_leaf_reg=5.
regressor = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=5,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    random_strength=5,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.369 ± 0.0343  52.8 ± 23.44
	valid  0.695 ± 0.1103    2.5 ± 0.35
	test   0.568 ± 0.0253    0.3 ± 0.02
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.704
	test   0.541


In [56]:
# CatBoost configuration: 1000 iterations, learning rate 0.05, depth 10, l2_leaf_reg=10.
regressor = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=10,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    random_strength=5,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.375 ± 0.0321  52.5 ± 17.67
	valid  0.693 ± 0.1056    2.5 ± 0.36
	test   0.572 ± 0.0295    0.3 ± 0.01
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.701
	test   0.540


In [57]:
# CatBoost configuration: shallower trees (depth 8) with a larger learning rate (0.1),
# l2_leaf_reg=7, bagging_temperature=0.3 and random_strength=3.
regressor = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=8,
    l2_leaf_reg=7,
    bootstrap_type="Bayesian",
    bagging_temperature=0.3,
    random_strength=3,
)

Mean CV scores
----------------------------
	score            rmse         time
	train  0.355 ± 0.0332  20.3 ± 8.88
	valid  0.695 ± 0.1091   1.9 ± 0.17
	test   0.571 ± 0.0266   0.2 ± 0.03
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.703
	test   0.539


In [7]:
# CatBoost configuration: depth 8, learning rate 0.1, l2_leaf_reg=10,
# bagging_temperature=0.3 and random_strength=3.
regressor = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=8,
    l2_leaf_reg=10,
    bootstrap_type="Bayesian",
    bagging_temperature=0.3,
    random_strength=3,
)

Mean CV scores
----------------------------
	score            rmse         time
	train  0.358 ± 0.0323  21.8 ± 8.43
	valid  0.696 ± 0.1121   2.1 ± 0.34
	test   0.573 ± 0.0251   0.2 ± 0.03
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.705
	test   0.537


## XGBoost

In [59]:
# XGBoost configuration: 200 trees of depth 9, learning rate 0.1, gamma 0.3,
# 40% of the columns sampled per tree.
regressor = XGBRegressor(
    colsample_bytree=0.4,
    eval_metric='rmse',
    gamma=0.3,
    learning_rate=0.1,
    max_cat_threshold=64,
    max_cat_to_onehot=4,
    max_depth=9,
    min_child_weight=3,
    n_estimators=200,
    random_state=0,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.382 ± 0.0389  23.1 ± 11.71
	valid  0.704 ± 0.0868    3.4 ± 0.11
	test   0.597 ± 0.0129    0.4 ± 0.02
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.709
	test   0.548


In [60]:
# XGBoost configuration: 200 trees of depth 9, learning rate 0.05, gamma 0.5,
# with row subsampling (subsample=0.5) and min_child_weight=4.
regressor = XGBRegressor(
    colsample_bytree=0.4,
    eval_metric='rmse',
    gamma=0.5,
    learning_rate=0.05,
    max_cat_threshold=64,
    max_cat_to_onehot=4,
    max_depth=9,
    min_child_weight=4,
    n_estimators=200,
    subsample=0.5,
    random_state=0,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.472 ± 0.0414  23.6 ± 13.18
	valid  0.745 ± 0.0882    3.8 ± 1.37
	test   0.611 ± 0.0189    0.4 ± 0.03
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.750
	test   0.568


In [61]:
# XGBoost configuration: larger model with 500 trees of depth 10, learning rate 0.1,
# gamma 0.4, colsample_bytree=0.7 and min_child_weight=7.
regressor = XGBRegressor(
    colsample_bytree=0.7,
    eval_metric='rmse',
    gamma=0.4,
    learning_rate=0.1,
    max_cat_threshold=64,
    max_cat_to_onehot=4,
    max_depth=10,
    min_child_weight=7,
    n_estimators=500,
    random_state=0,
)

Mean CV scores
----------------------------
	score            rmse          time
	train  0.291 ± 0.0254  92.0 ± 46.48
	valid  0.694 ± 0.0998    9.2 ± 0.73
	test   0.601 ± 0.0176    0.9 ± 0.07
----------------------------
Bagged scores
----------------------------
	score   rmse
	valid  0.701
	test   0.547
