## Load data and imports

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse

import warnings
warnings.filterwarnings('ignore')

In [19]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')
train.head()

Unnamed: 0,timestamp,active_power_calculated_by_converter,active_power_raw,ambient_temperature,generator_speed,generator_winding_temp_max,grid_power10min_average,nc1_inside_temp,nacelle_temp,reactice_power_calculated_by_converter,reactive_power,wind_direction_raw,wind_speed_raw,wind_speed_turbulence,turbine_id,Target
0,2021-02-19 20:18:00,816.636759,834.917206,31.69438,1159.616602,65.954214,917.897085,31.881972,31.504713,141.457644,165.501518,280.864782,7.057,0.544082,Turbine_108,47.582787
1,2021-04-27 04:55:00,419.107829,421.050873,12.894948,928.747996,59.571319,445.55425,32.423705,32.75577,89.186457,113.835236,299.55246,5.474937,0.469031,Turbine_18,46.070328
2,2021-01-25 06:26:00,1303.530558,1337.566142,16.648388,1201.219775,61.270498,1364.716003,11.446849,18.332985,230.622309,281.452253,84.960106,8.092457,0.622318,Turbine_105,39.989236
3,2021-10-30 03:47:00,61.494872,53.481008,28.388141,769.806122,40.674348,14.324897,34.253204,32.662889,66.211015,75.017531,87.261119,4.071032,0.760719,Turbine_15,46.056587
4,2021-03-15 00:39:00,593.514364,611.659108,31.519527,1046.916768,64.341763,599.020172,32.405586,31.466387,137.163938,160.202421,313.724818,6.357943,0.346068,Turbine_01,54.346095


In [20]:
org_cols = train.columns
org_cols

Index(['timestamp', 'active_power_calculated_by_converter', 'active_power_raw',
       'ambient_temperature', 'generator_speed', 'generator_winding_temp_max',
       'grid_power10min_average', 'nc1_inside_temp', 'nacelle_temp',
       'reactice_power_calculated_by_converter', 'reactive_power',
       'wind_direction_raw', 'wind_speed_raw', 'wind_speed_turbulence',
       'turbine_id', 'Target'],
      dtype='object')

In [21]:
power_fts = [
    'active_power_calculated_by_converter',
    'active_power_raw',
    'ambient_temperature',
    'generator_speed',
    'generator_winding_temp_max',
    'grid_power10min_average'
]

temp_fts = [
    'nc1_inside_temp',
    'nacelle_temp',
    'reactice_power_calculated_by_converter',
    'reactive_power',
    'wind_direction_raw',
    'wind_speed_raw',
    'wind_speed_turbulence'
]

In [74]:
from xgboost import XGBRegressor
xgb = XGBRegressor(tree_method='gpu_hist', gpu_id=0, random_state=4567, n_estimators=3000)
print(xgb)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, gamma=None, gpu_id=0,
             grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, n_estimators=3000, n_jobs=None,
             num_parallel_tree=None, predictor=None, random_state=4567,
             reg_alpha=None, reg_lambda=None, ...)


In [23]:
from sklearn.feature_selection import mutual_info_regression
from feature_engine.creation import RelativeFeatures


In [24]:
train = train.where(train.values != 0, 0.000_001)
test = test.where(test.values != 0, 0.000_001)

In [25]:
rf = RelativeFeatures(variables=power_fts, reference=temp_fts, func=['sub', 'div', 'mul'])
target = train.Target
train = rf.fit_transform(train.drop(['Target'], axis=1))
test = rf.transform(test)
train['Target'] = target

train.shape, test.shape

((909604, 142), (303202, 141))

In [26]:
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909604 entries, 0 to 909603
Columns: 142 entries, timestamp to Target
dtypes: float64(140), object(2)
memory usage: 985.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303202 entries, 0 to 303201
Columns: 141 entries, timestamp to grid_power10min_average_mul_wind_speed_turbulence
dtypes: float64(139), object(2)
memory usage: 326.2+ MB


(None, None)

In [27]:
train_std = train.std().sort_values(ascending=False)
train_std

generator_speed_div_reactive_power                         8.625707e+44
grid_power10min_average_div_reactive_power                 7.041014e+44
active_power_raw_div_reactive_power                        4.389797e+44
active_power_calculated_by_converter_div_reactive_power    3.520137e+44
generator_winding_temp_max_div_reactive_power              1.204655e+44
                                                               ...     
generator_winding_temp_max_div_nc1_inside_temp             4.067042e-01
generator_winding_temp_max_div_nacelle_temp                3.400218e-01
wind_speed_turbulence                                      3.175132e-01
ambient_temperature_div_nc1_inside_temp                    2.499078e-01
ambient_temperature_div_nacelle_temp                       2.363909e-01
Length: 140, dtype: float64

In [28]:
(train_std < 1).sum()

6

In [29]:
mi = mutual_info_regression(train.drop(['timestamp','turbine_id' 'target'], axis=1), train['Target'], random_state=4567)
mi = pd.Series(mi, index=train.drop(['timestamp','turbine_id'], axis=1).columns)
mi

active_power_calculated_by_converter                     0.027049
active_power_raw                                         0.021578
ambient_temperature                                      0.285187
generator_speed                                          0.031859
generator_winding_temp_max                               0.106297
                                                          ...    
ambient_temperature_mul_wind_speed_turbulence            0.071666
generator_speed_mul_wind_speed_turbulence                0.018410
generator_winding_temp_max_mul_wind_speed_turbulence     0.027255
grid_power10min_average_mul_wind_speed_turbulence        0.024110
Target                                                  12.441978
Length: 140, dtype: float64

In [31]:
mi.sort_values(ascending=False)

Target                                                                12.441978
ambient_temperature_div_nc1_inside_temp                                0.327660
ambient_temperature_div_nacelle_temp                                   0.304028
nacelle_temp                                                           0.292042
ambient_temperature_mul_nacelle_temp                                   0.287322
                                                                        ...    
grid_power10min_average_div_reactive_power                             0.000008
active_power_raw_div_reactive_power                                    0.000007
active_power_calculated_by_converter_div_reactive_power                0.000005
generator_winding_temp_max_div_reactive_power                          0.000005
grid_power10min_average_div_reactice_power_calculated_by_converter     0.000000
Length: 140, dtype: float64

In [58]:
# 9. Which turbine has most variation in target temp?
turb_grps = train.groupby(['turbine_id']).Target
turb_grps.agg(['std', 'mean', 'min', 'max', 'count']).sort_values(by='std')

Unnamed: 0_level_0,std,mean,min,max,count
turbine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Turbine_15,1.723327,46.388378,29.212526,50.914379,58048
Turbine_123,1.787342,44.642011,32.616718,49.816531,56225
Turbine_19,1.799486,44.463866,26.78309,48.853159,56346
Turbine_103,1.819612,45.604759,33.507112,50.616139,56944
Turbine_108,1.83495,46.549006,32.470976,51.78484,57401
Turbine_14,1.883557,47.722849,29.793389,52.393883,56934
Turbine_97,1.924943,45.834314,29.824291,51.91077,57683
Turbine_139,2.015872,45.643603,25.86532,50.511377,56930
Turbine_158,2.035856,45.563365,30.032194,50.926113,57470
Turbine_18,2.044525,45.178387,36.297964,50.637619,57892


In [39]:
train.timestamp = pd.to_datetime(train.timestamp)
train.timestamp

0        2021-02-19 20:18:00
1        2021-04-27 04:55:00
2        2021-01-25 06:26:00
3        2021-10-30 03:47:00
4        2021-03-15 00:39:00
                 ...        
909599   2021-04-25 19:12:00
909600   2021-02-20 17:37:00
909601   2021-10-22 14:18:00
909602   2021-02-08 22:03:00
909603   2021-04-09 14:28:00
Name: timestamp, Length: 909604, dtype: datetime64[ns]

In [41]:
train['month'] = train.timestamp.dt.month
train.shape

(909604, 143)

In [46]:
month_grps = train.groupby(['month'])

In [54]:
month_grps.Target.agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,71811,43.375493
2,65314,45.817704
3,75301,47.407431
4,73283,48.057015
5,74521,47.840367
6,83492,47.321511
7,83062,46.741961
8,83611,46.295482
9,72233,46.045047
10,71824,46.705162


In [62]:
month_grps.get_group(1).Target

2         39.989236
6         38.088093
19        40.523605
30        44.038269
47        40.909250
            ...    
909535    40.820821
909542    43.153711
909544    46.534601
909550    41.409630
909551    44.771032
Name: Target, Length: 71811, dtype: float64

In [60]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import StandardScaler

cat_cols = ['turbine_id']

tme = TargetEncoder()       # target mean encoder
scaler = StandardScaler()
ct = make_column_transformer(
    (tme, cat_cols),
    remainder = 'passthrough'
)
model_pipe = make_pipeline(
    ct,
    scaler,
    xgb
)
model_pipe

In [64]:
from sklearn.model_selection import train_test_split
for m in range(1, 13):
    print('month:', m)
    X = month_grps.get_group(m)
    X_train, X_val, y_train, y_val = train_test_split(X.drop(['timestamp', 'Target'], axis=1), X.Target)
    model_pipe.fit(X_train, y_train)
    preds = model_pipe.predict(X_val)
    print(mape(preds, y_val), mse(preds, y_val))
    print('-----------------------------------')

month: 1
0.00993129991870626 0.4198157014861962
-----------------------------------
month: 2
0.011166069144330512 0.5532733669930855
-----------------------------------
month: 3
0.012355680646811208 0.6741026065487115
-----------------------------------
month: 4
0.01198951100933056 0.7576356104854106
-----------------------------------
month: 5
0.012545973673864065 0.7774153010115682
-----------------------------------
month: 6
0.011386693134265382 0.5180687302235336
-----------------------------------
month: 7
0.012797795576469425 0.6345474671196344
-----------------------------------
month: 8
0.012038787963377325 0.5702241232702852
-----------------------------------
month: 9
0.012403138974260942 0.7273663975433581
-----------------------------------
month: 10
0.011180733213459705 0.5933244335250335
-----------------------------------
month: 11
0.010120321393013585 0.41269322733282515
-----------------------------------
month: 12
0.009501370790604122 0.3693519948101119
--------------

In [65]:
m_cv = [0.00993129991870626, 0.011166069144330512, 0.012355680646811208, 0.01198951100933056, 0.012545973673864065, 0.011386693134265382 , 0.012797795576469425, 0.012038787963377325 , 0.012403138974260942 , 0.011180733213459705, 0.010120321393013585, 0.009501370790604122 ]
m_cv = np.array(m_cv)
m_cv

array([0.0099313 , 0.01116607, 0.01235568, 0.01198951, 0.01254597,
       0.01138669, 0.0127978 , 0.01203879, 0.01240314, 0.01118073,
       0.01012032, 0.00950137])

In [68]:
m_cv.argmin()+1, m_cv.argmax()+1   # best and worst month

(12, 7)

In [69]:
m_cv.mean()

0.011451447953207758

In [70]:
model_pipe[2].feature_importances_

array([0.21118481, 0.0007314 , 0.00075187, 0.00380395, 0.00080677,
       0.00302866, 0.00484863, 0.02317337, 0.19827779, 0.0017328 ,
       0.00149836, 0.00427588, 0.0021427 , 0.00173286, 0.00155837,
       0.00123059, 0.01495331, 0.00169727, 0.00586544, 0.00171183,
       0.00099609, 0.00120373, 0.01245279, 0.00200463, 0.00888179,
       0.00350236, 0.00188636, 0.00127583, 0.00168946, 0.00136681,
       0.00262313, 0.01017716, 0.00156799, 0.00243574, 0.0032032 ,
       0.00254966, 0.00486151, 0.00581506, 0.00184858, 0.00238085,
       0.00500304, 0.00575856, 0.01026905, 0.00390056, 0.00214282,
       0.00113211, 0.00590185, 0.00134379, 0.0040983 , 0.0010717 ,
       0.00183133, 0.00112838, 0.01043691, 0.00218839, 0.00808138,
       0.00140073, 0.0029983 , 0.00265305, 0.0097741 , 0.01946298,
       0.00899539, 0.00359691, 0.00210869, 0.00194419, 0.01319855,
       0.01637017, 0.00809508, 0.02575278, 0.00238134, 0.00281812,
       0.00575046, 0.00340915, 0.00200689, 0.0029321 , 0.     

In [75]:
model_pipe_3k = make_pipeline(
    ct,
    scaler,
    xgb
)
model_pipe_3k

In [76]:
from sklearn.model_selection import train_test_split
for m in range(1, 13):
    print('month:', m)
    X = month_grps.get_group(m)
    X_train, X_val, y_train, y_val = train_test_split(X.drop(['timestamp', 'Target'], axis=1), X.Target)
    model_pipe_3k.fit(X_train, y_train)
    preds = model_pipe_3k.predict(X_val)
    print(mape(preds, y_val), mse(preds, y_val))
    print('-----------------------------------')

month: 1
0.009953848011079826 0.43967435142016004
-----------------------------------
month: 2
0.010764169464244423 0.5191379879458096
-----------------------------------
month: 3
0.011996619435031166 0.6439889401734509
-----------------------------------
month: 4
0.01174268545751649 0.7539812989378012
-----------------------------------
month: 5
0.012530407667194873 0.7938420971564809
-----------------------------------
month: 6
0.011241744233847615 0.5065866891843583
-----------------------------------
month: 7
0.012671114233577526 0.6379780161247299
-----------------------------------
month: 8
0.012147487185665742 0.5692000814124712
-----------------------------------
month: 9
0.012215480060089287 0.7057539322758911
-----------------------------------
month: 10
0.010949806829297394 0.5805665396391495
-----------------------------------
month: 11
0.01016977400286615 0.4140757104589995
-----------------------------------
month: 12
0.009367425682195962 0.36184875476498946
-------------

In [None]:
# month: 1
# 0.00993129991870626 0.4198157014861962
# -----------------------------------
# month: 2
# 0.011166069144330512 0.5532733669930855
# -----------------------------------
# month: 3
# 0.012355680646811208 0.6741026065487115
# -----------------------------------
# month: 4
# 0.01198951100933056 0.7576356104854106
# -----------------------------------
# month: 5
# 0.012545973673864065 0.7774153010115682
# -----------------------------------
# month: 6
# 0.011386693134265382 0.5180687302235336
# -----------------------------------
# month: 7
# 0.012797795576469425 0.6345474671196344
# -----------------------------------
# month: 8
# 0.012038787963377325 0.5702241232702852
# -----------------------------------
# month: 9
# 0.012403138974260942 0.7273663975433581
# -----------------------------------
# month: 10
# 0.011180733213459705 0.5933244335250335
# -----------------------------------
# month: 11
# 0.010120321393013585 0.41269322733282515
# -----------------------------------
# month: 12
# 0.009501370790604122 0.3693519948101119
# -----------------------------------