In [1]:
# importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from feature_engine import encoding, creation, transformation, outliers
from sklearn import base, pipeline
from sklearn.model_selection import train_test_split

import xgboost as xgb

from joblib import load, dump

from IPython.display import JSON
import warnings
# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# libraries imported above - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# adjust the style to emulate ggplot
plt.style.use('ggplot')

In [2]:
# Load the processed dataset and discard the 2023 rows.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is a path separator on Windows only).
bicing = (
    pd.read_parquet('../data/processed/bicing_full.parquet')
    # 'year' is moved to the index so the 2023 rows can be dropped by label,
    # then restored as a regular column by reset_index().
    .set_index('year')
    .drop(index=[2023])
    .reset_index()
)
bicing

Unnamed: 0,year,station_id,month,day,hour,ctx_1,ctx_2,ctx_3,ctx_4,post_code,...,min_temp,avg_rel_humidity,acum_precipitation,avg_atm_pressure,global_solar_rad,avg_wind_speed,avg_wind_direction,max_wind_speed,max_streak_wind_direction,percentage_docks_available
0,2019,1,3,28,21,0.150000,0.383333,0.416667,0.466667,8013,...,9.225,51.25,0.0,1006.599976,22.100000,2.166667,131.666672,7.500000,119.000000,0.033333
1,2019,1,3,28,22,0.033333,0.150000,0.383333,0.416667,8013,...,9.225,51.25,0.0,1006.599976,22.100000,2.166667,131.666672,7.500000,119.000000,0.066667
2,2019,1,3,28,23,0.066667,0.033333,0.150000,0.383333,8013,...,9.225,51.25,0.0,1006.599976,22.100000,2.166667,131.666672,7.500000,119.000000,0.033333
3,2019,1,3,29,0,0.033333,0.066667,0.033333,0.150000,8013,...,8.600,57.75,0.0,1006.400024,22.299999,2.533333,73.000000,7.266667,164.333328,0.034483
4,2019,1,3,29,3,0.034483,0.033333,0.066667,0.033333,8013,...,8.600,57.75,0.0,1006.400024,22.299999,2.533333,73.000000,7.266667,164.333328,0.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14292864,2022,519,12,31,19,0.895833,0.916667,0.875000,0.958333,8032,...,12.175,59.50,0.0,1003.966675,8.600000,2.766667,264.333344,11.433333,295.333344,0.750000
14292865,2022,519,12,31,20,0.750000,0.895833,0.916667,0.875000,8032,...,12.175,59.50,0.0,1003.966675,8.600000,2.766667,264.333344,11.433333,295.333344,0.666667
14292866,2022,519,12,31,21,0.666667,0.750000,0.895833,0.916667,8032,...,12.175,59.50,0.0,1003.966675,8.600000,2.766667,264.333344,11.433333,295.333344,0.583333
14292867,2022,519,12,31,22,0.583333,0.666667,0.750000,0.895833,8032,...,12.175,59.50,0.0,1003.966675,8.600000,2.766667,264.333344,11.433333,295.333344,0.583333


In [19]:
# Median of ctx_1 for station 1 at hour 17 during March 2022.
# The mask is built by a callable on the frame being indexed, so it never
# depends on aligning a mask from a differently-filtered copy of `bicing`.
(bicing
 .loc[lambda df_: (df_['year'] == 2022)
                  & (df_['station_id'] == 1)
                  & (df_['month'] == 3)
                  & (df_['hour'] == 17),
      'ctx_1']
 .median()
 )

0.65909094

In [3]:
def tweak_bicing(df_: pd.DataFrame) -> pd.DataFrame:
    """Cast ``station_id`` to categorical and drop a fixed list of columns.

    The input frame is not modified; a new frame is returned. Columns from the
    drop list that are already absent are skipped, so calling the function on
    its own output is a no-op instead of raising ``KeyError``.

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame containing at least a ``station_id`` column.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_`` with ``station_id`` as ``category`` dtype and the
        station-metadata / weather columns listed below removed.
    """
    unused_columns = [
        'post_code', 'lat', 'lon', 'capacity',
        'max_temp', 'min_temp', 'avg_rel_humidity', 'avg_atm_pressure',
        'global_solar_rad', 'avg_wind_direction', 'max_wind_speed',
        'max_streak_wind_direction', 'avg_wind_speed',
    ]

    return (df_
            # original author's note - XGBoost: tree_method='gpu_hist', enable_categorical=True
            .astype({'station_id': 'category'})
            # errors='ignore' keeps the function re-runnable on an already tweaked frame
            .drop(columns=unused_columns, errors='ignore')
            )

In [4]:
# Apply the column clean-up; note that this rebinds `bicing` to the trimmed frame.
bicing = bicing.pipe(tweak_bicing)

In [5]:
# Feature-engineering pipeline; the steps run in the order listed.
bicing_pl = pipeline.Pipeline(steps=[
    # one-hot encode 'season' (drop_last=True: no dummy column for the last category)
    ('cat_oneHot', encoding.OneHotEncoder(drop_last=True, variables=['season'])),
    # base-10 log of (x + C) using the transformer's default C
    ('log1p', transformation.LogCpTransformer(base='10', variables=['altitude', 'acum_precipitation'])),
    # cap both tails; no variables are given, so the transformer's default selection applies
    ('outliers', outliers.Winsorizer(tail='both')),
    # replace each station_id with a target mean (requires y when fitting)
    ('cat_mean', encoding.MeanEncoder(variables=['station_id'])),
    # sin/cos pairs for the periodic calendar fields; the original columns are dropped
    ('cyclic', creation.CyclicalFeatures(drop_original=True, variables=['day', 'month', 'hour', 'weekday'])),
])

In [6]:
# Display the pipeline's repr to check the configured steps.
bicing_pl

In [7]:
def get_rawX_y(df, y_col, drop_years=(2019, 2020)):
    """Split a frame into raw features and target after removing whole years.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``year`` column and the target column ``y_col``.
    y_col : str
        Name of the target column.
    drop_years : iterable of int, default (2019, 2020)
        Years whose rows are excluded. Years that do not occur in ``df`` are
        simply ignored.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        ``X`` without the ``year`` and ``y_col`` columns, and ``y``; both share
        a fresh 0..n-1 index and keep the original row order.
    """
    # Filter with a boolean mask instead of a set_index/drop/reset_index round trip:
    # same rows and order, but no KeyError when a listed year is absent.
    keep_rows = ~df['year'].isin(list(drop_years))
    raw = df.loc[keep_rows].reset_index(drop=True)

    features = raw.drop(columns=['year', y_col])
    target = raw[y_col]
    return features, target

# Raw feature matrix and target (fraction of docks available) used for modelling.
bicing_X, bicing_y = get_rawX_y(df=bicing, y_col='percentage_docks_available')

In [8]:
# Shuffled 80/20 hold-out split with a fixed seed so the split is reproducible.
X_raw_train, X_raw_test, y_train, y_test = train_test_split(
    bicing_X,
    bicing_y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Fit the feature pipeline on the training rows only ...
X_train = bicing_pl.fit_transform(X_raw_train, y_train)
# ... and reuse the fitted pipeline, without refitting, on the held-out rows.
X_test = bicing_pl.transform(X_raw_test)

In [9]:
# Persist the pipeline for the app.
# Forward slashes avoid the invalid '\m' / '\p' escape sequences of the previous
# literal and keep the relative path valid on every OS.
dump(bicing_pl, '../models/pipe_app.joblib')

['..\\models\\pipe_app.joblib']

In [10]:
# Column dtypes of the raw training features, before the pipeline is applied.
X_raw_train.dtypes

station_id            category
month                    uint8
day                      uint8
hour                     uint8
ctx_1                  float32
ctx_2                  float32
ctx_3                  float32
ctx_4                  float32
altitude                uint16
is_holiday                bool
weekday                  uint8
weekend                   bool
season                category
avg_temp               float32
acum_precipitation     float32
dtype: object

In [11]:
# Preview the transformed training features returned by bicing_pl.fit_transform.
X_train

Unnamed: 0,station_id,ctx_1,ctx_2,ctx_3,ctx_4,altitude,is_holiday,weekend,avg_temp,acum_precipitation,...,season_autumn,season_winter,day_sin,day_cos,month_sin,month_cos,hour_sin,hour_cos,weekday_sin,weekday_cos
8009447,0.737278,0.296296,0.296296,0.296296,0.296296,2.086360,False,False,12.225,0.000000,...,0,0,0.724793,0.688967,1.000000,6.123234e-17,9.422609e-01,-0.334880,-4.338837e-01,-0.900969
2545965,0.542589,0.262295,0.322581,0.274194,0.451613,0.845098,False,False,15.200,0.361728,...,1,0,-0.988468,0.151428,-0.500000,8.660254e-01,-1.361666e-01,-0.990686,-4.338837e-01,-0.900969
7016793,0.541850,0.296296,0.407407,0.444444,0.481481,1.176091,True,False,18.350,0.374137,...,1,0,0.201299,0.979530,-0.500000,8.660254e-01,7.308360e-01,0.682553,7.818315e-01,0.623490
7146025,0.716477,0.640000,0.840000,0.960000,0.920000,1.826075,False,True,12.350,0.204120,...,0,0,-0.790776,-0.612106,1.000000,6.123234e-17,1.361666e-01,-0.990686,-2.449294e-16,1.000000
6300607,0.519649,0.520000,0.480000,0.520000,0.040000,0.778151,False,True,18.150,0.000000,...,1,0,-0.299363,-0.954139,-0.866025,5.000000e-01,-2.449294e-16,1.000000,-2.449294e-16,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7204212,0.444107,0.937500,0.843750,0.812500,0.656250,1.230449,False,False,15.725,0.041393,...,0,1,0.201299,0.979530,0.866025,5.000000e-01,-1.361666e-01,-0.990686,7.818315e-01,0.623490
2234489,0.659754,0.333333,0.333333,0.333333,0.333333,1.000000,False,False,20.400,0.000000,...,1,0,0.848644,0.528964,-0.866025,5.000000e-01,8.878852e-01,0.460065,4.338837e-01,-0.900969
4304572,0.542344,0.441176,0.764706,0.705882,0.676471,1.414973,False,False,19.900,0.014240,...,1,0,-0.651372,-0.758758,-0.866025,5.000000e-01,-9.790841e-01,0.203456,9.749279e-01,-0.222521
6550634,0.513879,0.961538,0.961538,0.961538,0.961538,0.954243,False,False,11.900,0.828230,...,0,1,-0.968077,-0.250653,0.500000,8.660254e-01,9.422609e-01,-0.334880,-9.749279e-01,-0.222521


In [12]:
# Column names of the transformed training frame as a NumPy array.
X_train.columns.to_numpy()

array(['station_id', 'ctx_1', 'ctx_2', 'ctx_3', 'ctx_4', 'altitude',
       'is_holiday', 'weekend', 'avg_temp', 'acum_precipitation',
       'season_spring', 'season_autumn', 'season_winter', 'day_sin',
       'day_cos', 'month_sin', 'month_cos', 'hour_sin', 'hour_cos',
       'weekday_sin', 'weekday_cos'], dtype=object)

In [13]:
# Column dtypes after the pipeline transformations.
X_train.dtypes

station_id            float64
ctx_1                 float32
ctx_2                 float32
ctx_3                 float32
ctx_4                 float32
altitude              float32
is_holiday               bool
weekend                  bool
avg_temp              float32
acum_precipitation    float64
season_spring           int32
season_autumn           int32
season_winter           int32
day_sin               float64
day_cos               float64
month_sin             float64
month_cos             float64
hour_sin              float64
hour_cos              float64
weekday_sin           float64
weekday_cos           float64
dtype: object

In [14]:
# Train an XGBoost regressor with its default hyper-parameters (fit returns the estimator).
xg = xgb.XGBRegressor().fit(X_train, y_train)

# Score on the held-out split (R^2, per the scikit-learn regressor API).
xg.score(X_test, y_test)

0.8079798963854897

In [15]:
# Persist the trained regressor for the app.
# Forward slashes avoid the invalid '\m' escape sequence of the previous literal
# and keep the relative path valid on every OS.
dump(xg, '../models/model_app.joblib')

['..\\models\\model_app.joblib']