In [1]:
import os
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, OneHotEncoder

import shap
import optuna
import category_encoders as ce
import optuna.visualization as vis

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

from pycaret.regression import *
# from pycaret.classification import *

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [15]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [50]:
ig_feats = ['tracking_id', 'datetime']

data_setup = setup(
    data=train,
    target='windmill_generated_power(kW/h)',
    ignore_features=ig_feats,
    feature_selection=True,
    transform_target=True,
    imputation_type='simple',
    numeric_imputation='median',
    categorical_imputation='mode',
    fold_shuffle=True,
)

Unnamed: 0,Description,Value
0,session_id,8225
1,Target,windmill_generated_power(kW/h)
2,Original Data,"(27993, 22)"
3,Missing Values,True
4,Numeric Features,17
5,Categorical Features,2
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(19595, 34)"


AttributeError: 'Simple_Imputer' object has no attribute 'fill_value_categorical'

In [46]:
lgb = create_model('lightgbm')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.3212,0.2725,0.522,0.9625,0.0732,0.0578
1,0.3121,0.2328,0.4825,0.9674,0.0686,0.0584
2,0.3279,0.385,0.6205,0.9473,0.0744,0.0574
3,0.3187,0.2824,0.5314,0.9595,0.075,0.0607
4,0.3383,0.3066,0.5537,0.9594,0.0745,0.061
5,0.3316,0.412,0.6419,0.9412,0.0812,0.0599
6,0.3452,0.4356,0.66,0.9411,0.0856,0.0619
7,0.3198,0.2925,0.5408,0.96,0.07,0.0584
8,0.3297,0.2989,0.5467,0.9589,0.0756,0.0594
9,0.3255,0.2817,0.5308,0.9611,0.0792,0.0621


In [32]:
cat = create_model('catboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.3221,0.2732,0.5227,0.9642,0.0742,0.061
1,0.3101,0.3105,0.5573,0.9568,0.0784,0.0587
2,0.3235,0.295,0.5431,0.9591,0.0762,0.0614
3,0.3119,0.2388,0.4887,0.9673,0.072,0.0597
4,0.3094,0.2587,0.5086,0.9634,0.0738,0.0592
5,0.3225,0.2629,0.5127,0.965,0.0712,0.0584
6,0.3012,0.2713,0.5209,0.9611,0.0697,0.0554
7,0.3172,0.2512,0.5012,0.9646,0.0718,0.0605
8,0.3091,0.2352,0.485,0.9671,0.0684,0.0582
9,0.3107,0.2538,0.5038,0.9657,0.0687,0.0585


In [33]:
evaluate_model(cat)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [39]:
ensembled_model = ensemble_model(cat)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.3222,0.2741,0.5235,0.9641,0.0748,0.0616
1,0.3094,0.2989,0.5467,0.9584,0.077,0.0585
2,0.3282,0.3063,0.5535,0.9575,0.078,0.0626
3,0.3111,0.2424,0.4924,0.9668,0.072,0.0596
4,0.3127,0.2658,0.5156,0.9624,0.0739,0.0598
5,0.3212,0.2628,0.5126,0.965,0.0726,0.0589
6,0.2986,0.2718,0.5213,0.961,0.0697,0.0551
7,0.3181,0.2484,0.4984,0.965,0.0728,0.0616
8,0.3091,0.2401,0.49,0.9665,0.0695,0.0588
9,0.3095,0.2542,0.5042,0.9657,0.0692,0.0588


In [40]:
final_model = finalize_model(ensembled_model)

In [41]:
preds = predict_model(final_model, data=test)

In [42]:
print(preds.shape)
preds.head()

(12086, 22)


Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),Label
0,WM_19817,2019-04-17 08:53:20,94.324266,17.641186,89.714193,51.146788,40.46056,39.594734,1073.202715,66.830037,...,43.756693,445.976992,1664.222023,21.912243,BA,Medium,3.185837,0.403965,25.572431,2.462297
1,WM_18723,2019-03-30 07:43:20,10.08887,13.978119,43.272846,46.516394,40.027788,41.17686,517.43643,37.284163,...,42.728174,499.595287,1165.111992,-35.050093,A,Medium,3.016603,0.444755,24.371823,2.347521
2,WM_34552,2019-08-10 11:33:20,347.15209,31.423035,41.07664,26.931602,43.109122,43.439556,1480.716492,70.010762,...,43.256122,245.432231,1667.720491,27.195302,B2,Medium,2.611941,0.387368,27.654677,3.223367
3,WM_28570,2019-06-26 03:53:20,24.471997,-99.0,14.375078,66.513953,13.741253,15.577472,887.979475,41.445258,...,13.501595,,1329.74474,15.245757,BBB,Low,2.866805,0.450478,24.189426,7.29045
4,WM_36934,2019-08-27 16:43:20,96.997026,33.281836,41.405192,1.843112,121.572907,43.934587,2053.916354,68.007787,...,-99.0,442.425744,691.408996,34.257024,A,Low,3.549672,0.368355,4.88544,3.629213


In [43]:
sub = pd.DataFrame({
    'tracking_id': test['tracking_id'],
    'datetime': test['datetime'],
    'windmill_generated_power(kW/h)': preds['Label'],
})

print(sub.shape)
sub.head()

(12086, 3)


Unnamed: 0,tracking_id,datetime,windmill_generated_power(kW/h)
0,WM_19817,2019-04-17 08:53:20,2.462297
1,WM_18723,2019-03-30 07:43:20,2.347521
2,WM_34552,2019-08-10 11:33:20,3.223367
3,WM_28570,2019-06-26 03:53:20,7.29045
4,WM_36934,2019-08-27 16:43:20,3.629213


In [44]:
sub.to_csv('../submissions/pycaret_catboost_ensemble.csv', index=False)