In [1]:
import pandas as pd

# Read the ride log; the first CSV column is used as the index and parsed as dates.
cycling = pd.read_csv(
    "../datasets/bike_rides.csv",
    index_col=0,
    parse_dates=True,
)
cycling.index.name = ""  # blank out the index label

# Separate the column to predict from the remaining feature columns.
target_name = "power"
target = cycling[target_name]
data = cycling.drop(columns=target_name)
data

Unnamed: 0,heart-rate,cadence,speed,acceleration,slope
,,,,,
2020-08-18 14:43:19,102.0,64.0,4.325,0.0880,-0.033870
2020-08-18 14:43:20,103.0,64.0,4.336,0.0842,-0.033571
2020-08-18 14:43:21,105.0,66.0,4.409,0.0234,-0.033223
2020-08-18 14:43:22,106.0,66.0,4.445,0.0016,-0.032908
2020-08-18 14:43:23,106.0,67.0,4.441,0.1144,0.000000
...,...,...,...,...,...
2020-09-13 14:55:57,130.0,0.0,1.054,0.0234,0.000000
2020-09-13 14:55:58,130.0,0.0,0.829,0.0258,0.000000
2020-09-13 14:55:59,129.0,0.0,0.616,-0.1686,0.000000


In [15]:
import numpy as np

# Cube of the speed column (element-wise).
speed_cubed = data['speed'] ** 3

# Treat `slope` as the tangent of the incline angle: arctan recovers the angle
# and sin projects it. NumPy ufuncs operate on the whole Series at once and
# return a Series with the same index, so no per-row Python lambda is needed.
sin_slope_angle = np.sin(np.arctan(data['slope']))

# Speed weighted by the sine of the slope angle.
speed_sin_angle = data['speed'] * sin_slope_angle

In [16]:
# Floor negative accelerations at zero so only non-negative values remain;
# `clip` does this in one vectorized call and keeps the float dtype.
acceleration = data['acceleration'].clip(lower=0)

# Speed multiplied by the clamped (non-negative) acceleration.
speed_acceleration = data['speed'] * acceleration

In [24]:
# Pair every engineered feature with its column label in one place so the
# ordering of labels and series cannot drift apart.
feature_columns = {
    'speed^3': speed_cubed,
    'speed': data['speed'],
    'speed*sin(alpha)': speed_sin_angle,
    'speed*accel': speed_acceleration,
}

# Stack the series side by side, then apply the labels.
data_matrix = pd.concat(list(feature_columns.values()), axis=1)
data_matrix.columns = list(feature_columns)

In [28]:
# Show the assembled feature matrix (last expression of the cell is displayed).
data_matrix

Unnamed: 0,speed^3,speed,speed*sin(alpha),speed*accel
,,,,
2020-08-18 14:43:19,80.901828,4.325,-0.146402,0.380600
2020-08-18 14:43:20,81.520685,4.336,-0.145482,0.365091
2020-08-18 14:43:21,85.707790,4.409,-0.146398,0.103171
2020-08-18 14:43:22,87.824421,4.445,-0.146198,0.007112
2020-08-18 14:43:23,87.587538,4.441,0.000000,0.508050
...,...,...,...,...
2020-09-13 14:55:57,1.170905,1.054,0.000000,0.024664
2020-09-13 14:55:58,0.569723,0.829,0.000000,0.021388
2020-09-13 14:55:59,0.233745,0.616,0.000000,0.000000


In [30]:
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit, cross_validate
from sklearn.pipeline import make_pipeline

# Four random train/test splits. The seed is fixed so the splits -- and hence
# the scores and fitted coefficients below -- are identical on every run.
cv = ShuffleSplit(n_splits=4, random_state=0)

# Standardize the features, then fit a ridge regression (RidgeCV).
model = make_pipeline(StandardScaler(), RidgeCV())

# Scores are negated mean absolute errors. The fitted pipelines and the
# training scores are kept alongside the test scores for later inspection.
cv_results = cross_validate(
    model,
    data_matrix,
    target,
    cv=cv,
    scoring='neg_mean_absolute_error',
    return_estimator=True,
    return_train_score=True,
)

cv_results

{'fit_time': array([0.04152203, 0.041049  , 0.03296804, 0.04697394]),
 'score_time': array([0.01234174, 0.00365663, 0.0079577 , 0.00417781]),
 'estimator': [Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('ridgecv', RidgeCV(alphas=array([ 0.1,  1. , 10. ])))]),
  Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('ridgecv', RidgeCV(alphas=array([ 0.1,  1. , 10. ])))]),
  Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('ridgecv', RidgeCV(alphas=array([ 0.1,  1. , 10. ])))]),
  Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('ridgecv', RidgeCV(alphas=array([ 0.1,  1. , 10. ])))])],
 'test_score': array([-74.23116539, -72.22753107, -72.95867731, -73.45785631]),
 'train_score': array([-72.2878851 , -72.53268394, -72.44235451, -72.3723377 ])}

In [36]:
# Average of the speed * sin(alpha) feature, reported to four decimals.
mean_speed_sin = data_matrix['speed*sin(alpha)'].mean()
print(f"mean speed*sin(alpha): {mean_speed_sin:.4f}")

mean speed*sin(alpha): -0.0027


In [51]:
# Collect the ridge coefficients from the pipeline fitted on each CV split.
ridge_coefs = [
    fitted_pipeline['ridgecv'].coef_
    for fitted_pipeline in cv_results['estimator']
]
ridge_coefs

[array([ 5.19136111, 33.06703851, 80.25483746, 11.38803929]),
 array([ 6.36685109, 32.37560784, 81.06855419, 11.56531574]),
 array([ 5.02397492, 33.33475832, 80.54871   , 11.57142953]),
 array([ 4.77587651, 33.26326353, 79.94075576, 11.36851742])]