In [40]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

train = pd.read_csv('data/single_turbine_data/train_reduced_unskewed.csv')
test = pd.read_csv('data/single_turbine_data/test_reduced_unskewed.csv')

label = ['1_Gear oil temperature (°C)']

X_train = train.drop(label, axis=1)
y_train = train[label]
X_test = test.drop(label, axis=1)
y_test = test[label]

# convert to datetime
X_train['# Date and time'] = pd.to_datetime(X_train['# Date and time'])
X_test['# Date and time'] = pd.to_datetime(X_test['# Date and time'])
# y_train['# Date and time'] = pd.to_datetime(y_train['# Date and time'])
# y_test['# Date and time'] = pd.to_datetime(y_test['# Date and time'])

# Setting the index
X_train.set_index('# Date and time', inplace=True)
X_test.set_index('# Date and time', inplace=True)
# y_train.set_index('# Date and time', inplace=True)
# y_test.set_index('# Date and time', inplace=True)

original_cols = ['1_Wind direction (°)',
       '1_Nacelle position (°)', '1_Power (kW)',
       '1_Front bearing temperature (°C)', '1_Rear bearing temperature (°C)',
       '1_Stator temperature 1 (°C)', '1_Nacelle ambient temperature (°C)',
       '1_Nacelle temperature (°C)', '1_Transformer temperature (°C)',
       '1_Generator bearing rear temperature (°C)',
       '1_Generator bearing front temperature (°C)', '1_Temp. top box (°C)',
       '1_Hub temperature (°C)', '1_Ambient temperature (converter) (°C)',
       '1_Rotor bearing temp (°C)', '1_Transformer cell temperature (°C)', '1_Generator RPM (RPM)']
extras = ['month_sin', 'month_cos', 'hour_sin', 'hour_cos', 'curtailed', 'offline']
unskewed = [col for col in train.columns if col.endswith('unsk')]

# Create a set for each list of features
original_cols_set = set(original_cols)
extras_set = set(extras)
unskewed_set = set(unskewed)

# Remove the '_unsk' suffix from the unskewed features
unskewed_without_suffix = {feat.replace('unsk', '') for feat in unskewed_set}

# Subtract the unskewed set (without suffix) from the original set
# This will give you only the features in the original set that don't have an unskewed version
original_cols_without_unskewed = original_cols_set - unskewed_without_suffix

# Now create the final list of features
# This contains all the extra features, the unskewed features, and the original features that don't have an unskewed version
final_features = list(original_cols_without_unskewed | extras_set | unskewed_set)

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import numpy as np
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt

features = original_cols + extras
X_train = X_train[features]
X_test = X_test[features]

#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=False).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.005, 
                                            random_state=1,
                                            fit_intercept=True,
                                            warm_start=True,
                                            max_iter=10000,
                                            ))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.005, 
                                                l1_ratio=.9, 
                                                random_state=3))

KRR = KernelRidge(alpha=0.006, kernel='polynomial', degree=2, coef0=1)


GBoost = GradientBoostingRegressor(n_estimators=10000, learning_rate=0.01,
                                   max_depth=12, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='squared_error', 
                                   random_state =5)

model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.01, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=10000,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)



model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.01, n_estimators=10000,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

model_cat = CatBoostRegressor(
    iterations=10000,
    learning_rate=0.01,
    depth=7,
    loss_function='RMSE',
    random_seed=42,
    verbose=100  # Output every 100th iteration
)

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)  
    
averaged_models = AveragingModels(models = (
                                            # ENet, 
                                            # GBoost, 
                                            model_xgb,
                                            model_cat,
                                            model_lgb,
                                            # KRR, 
                                            # lasso,
                                            ))


In [41]:

averaged_models.fit(X_train.values, y_train)

y_pred = averaged_models.predict(X_test.values)

rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE on test set: {rmse_test}")

Parameters: { "silent" } are not used.

0:	learn: 5.5195085	total: 5.58ms	remaining: 55.8s
100:	learn: 2.5508464	total: 541ms	remaining: 53s
200:	learn: 1.4758835	total: 1.08s	remaining: 52.5s
300:	learn: 1.0858027	total: 1.61s	remaining: 52s
400:	learn: 0.9375821	total: 2.13s	remaining: 51.1s
500:	learn: 0.8553834	total: 2.65s	remaining: 50.3s
600:	learn: 0.7975573	total: 3.19s	remaining: 49.8s
700:	learn: 0.7537241	total: 3.71s	remaining: 49.3s
800:	learn: 0.7182871	total: 4.24s	remaining: 48.7s
900:	learn: 0.6883085	total: 4.78s	remaining: 48.2s
1000:	learn: 0.6642717	total: 5.3s	remaining: 47.7s
1100:	learn: 0.6437271	total: 5.83s	remaining: 47.1s
1200:	learn: 0.6262437	total: 6.35s	remaining: 46.5s
1300:	learn: 0.6107250	total: 6.88s	remaining: 46s
1400:	learn: 0.5965876	total: 7.4s	remaining: 45.4s
1500:	learn: 0.5840892	total: 7.92s	remaining: 44.9s
1600:	learn: 0.5728052	total: 8.45s	remaining: 44.3s
1700:	learn: 0.5625956	total: 8.97s	remaining: 43.8s
1800:	learn: 0.5535807	to

  y = column_or_1d(y, warn=True)


RMSE on test set: 0.5822915161106266


In [None]:
score = rmsle_cv(model_cat)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [20]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 1.0046 (0.0768)



In [21]:
# score = rmsle_cv(KRR)
# print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [38]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Xgboost score: 0.6744 (0.0715)



In [39]:
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)


LGBM score: 0.9437 (0.1279)

