In [17]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

train = pd.read_csv('data/single_turbine_data/train_reduced_unskewed.csv')
test = pd.read_csv('data/single_turbine_data/test_reduced_unskewed.csv')
# test2 = pd.read_csv('data/single_turbine_data/test_reduced_unskewed.csv')

label = ['1_Gear oil temperature (°C)']

X_train = train.drop(label, axis=1)
y_train = train[label]
X_test = test.drop(label, axis=1)
y_test = test[label]

# convert to datetime
X_train['# Date and time'] = pd.to_datetime(X_train['# Date and time'])
X_test['# Date and time'] = pd.to_datetime(X_test['# Date and time'])
# y_train['# Date and time'] = pd.to_datetime(y_train['# Date and time'])
# y_test['# Date and time'] = pd.to_datetime(y_test['# Date and time'])

# Setting the index
X_train.set_index('# Date and time', inplace=True)
X_test.set_index('# Date and time', inplace=True)
# y_train.set_index('# Date and time', inplace=True)
# y_test.set_index('# Date and time', inplace=True)

In [18]:
original_cols = ['1_Wind direction (°)',
       '1_Nacelle position (°)', '1_Power (kW)',
       '1_Front bearing temperature (°C)', '1_Rear bearing temperature (°C)',
       '1_Stator temperature 1 (°C)', '1_Nacelle ambient temperature (°C)',
       '1_Nacelle temperature (°C)', '1_Transformer temperature (°C)',
       '1_Generator bearing rear temperature (°C)',
       '1_Generator bearing front temperature (°C)', '1_Temp. top box (°C)',
       '1_Hub temperature (°C)', '1_Ambient temperature (converter) (°C)',
       '1_Rotor bearing temp (°C)', '1_Transformer cell temperature (°C)', '1_Generator RPM (RPM)']
extras = ['month_sin', 'month_cos', 'hour_sin', 'hour_cos', 'curtailed', 'offline']
unskewed = [col for col in train.columns if col.endswith('unsk')]

# Create a set for each list of features
original_cols_set = set(original_cols)
extras_set = set(extras)
unskewed_set = set(unskewed)

# Remove the '_unsk' suffix from the unskewed features
unskewed_without_suffix = {feat.replace('unsk', '') for feat in unskewed_set}

# Subtract the unskewed set (without suffix) from the original set
# This will give you only the features in the original set that don't have an unskewed version
original_cols_without_unskewed = original_cols_set - unskewed_without_suffix

# Now create the final list of features
# This contains all the extra features, the unskewed features, and the original features that don't have an unskewed version
final_features = list(original_cols_without_unskewed | extras_set | unskewed_set)

In [20]:
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold

leave_out = ['1_Wind speed (m/s', '# Date and time']
features = original_cols + extras
X_test = X_test[features]
X_train = X_train[features]

# combine training and testing data
# X_train = pd.concat([X_train, X_test])
# y_train = pd.concat([y_train, y_test])

# Initialize variables
iterations = 12000
learning_rate = 0.01
depth = 7
kf = KFold(n_splits=5, shuffle=False, 
# random_state=42
)

# Create an empty list to store RMSE scores and models
cv_scores = []
saved_models = []

# Manually implementing k-fold cross-validation
for train_index, val_index in kf.split(X_train):
    print(train_index, val_index)
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    print(len(X_train_fold), len(X_val_fold))

    train_pool = Pool(X_train_fold, y_train_fold)
    val_pool = Pool(X_val_fold, y_val_fold)
    
    model = CatBoostRegressor(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        loss_function='RMSE',
        random_seed=42,
        verbose=100
    )
    
    model.fit(train_pool, eval_set=val_pool, verbose=100)
    
    y_val_pred = model.predict(X_val_fold)
    rmse_val = np.sqrt(mean_squared_error(y_val_fold, y_val_pred))
    
    print(f'Validation RMSE for this fold: {rmse_val}')
    
    # Save model to disk
    model_file_path = Path(f'model_fold_{len(cv_scores) + 1}.cbm')
    model.save_model(model_file_path)
    saved_models.append(model_file_path)
    
    cv_scores.append(rmse_val)

# Calculate mean and std of RMSE scores
mean_rmse = np.mean(cv_scores)
std_rmse = np.std(cv_scores)
print(f'CV RMSE: {mean_rmse} ± {std_rmse}')

import pandas as pd
from pathlib import Path

# Define a path to the results file
results_file = Path('results.csv')

# Create a DataFrame to hold the results
results_df = pd.DataFrame(
    {
        'Model': ['CatBoost'],
        'Training RMSE': [np.nan],
        'Validation RMSE': [mean_rmse],
        'Iterations': [iterations],
        'Learning Rate': [learning_rate],
        'Depth': [depth],
        'Loss Function': ['RMSE'],
        'Features': [features],
        'Folds': [5],
    }
)

# If the results file exists, load it and append the new results
if results_file.exists():
    existing_df = pd.read_csv(results_file)
    results_df = pd.concat([existing_df, results_df])

# Save the results DataFrame to CSV
results_df.to_csv(results_file, index=False)

with open('saved_models.txt', 'w') as f:
    for model_path in saved_models:
        f.write(f'{model_path}\n')

[ 31747  31748  31749 ... 158728 158729 158730] [    0     1     2 ... 31744 31745 31746]
126984 31747
0:	learn: 5.4463074	test: 5.8127739	best: 5.8127739 (0)	total: 5.4ms	remaining: 1m 4s
100:	learn: 2.5255171	test: 2.7895944	best: 2.7895944 (100)	total: 526ms	remaining: 1m 1s
200:	learn: 1.4697841	test: 1.6490316	best: 1.6490316 (200)	total: 1.03s	remaining: 1m
300:	learn: 1.0881774	test: 1.1986937	best: 1.1986937 (300)	total: 1.52s	remaining: 59.1s
400:	learn: 0.9392144	test: 1.0118608	best: 1.0118608 (400)	total: 2s	remaining: 57.9s
500:	learn: 0.8578267	test: 0.9146032	best: 0.9146032 (500)	total: 2.48s	remaining: 57.1s
600:	learn: 0.8001069	test: 0.8522126	best: 0.8522126 (600)	total: 2.97s	remaining: 56.3s
700:	learn: 0.7570551	test: 0.8049970	best: 0.8049970 (700)	total: 3.45s	remaining: 55.6s
800:	learn: 0.7210704	test: 0.7658167	best: 0.7658167 (800)	total: 3.94s	remaining: 55.1s
900:	learn: 0.6919029	test: 0.7378451	best: 0.7378451 (900)	total: 4.43s	remaining: 54.6s
1000:	l

In [21]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from pathlib import Path

# Load the list of saved model paths from the text file
saved_model_paths = []
with open('saved_models.txt', 'r') as f:
    for line in f:
        saved_model_paths.append(Path(line.strip()))

# Initialize an array to store the predictions from each model
ensemble_preds = np.zeros(len(X_test))

# Load each saved model and make predictions on the validation set
for model_path in saved_model_paths:
    model = CatBoostRegressor()
    model.load_model(model_path)
    
    preds = model.predict(X_test)
    ensemble_preds += preds

# Average the predictions to get the ensemble prediction
ensemble_preds /= len(saved_model_paths)

# Calculate RMSE for the ensemble predictions
rmse_ensemble = np.sqrt(mean_squared_error(y_test, ensemble_preds))

print(f'Ensemble RMSE: {rmse_ensemble}')

Ensemble RMSE: 0.53238857340108


In [3]:
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from pathlib import Path

leave_out = ['1_Wind speed (m/s', '# Date and time']
features = [col for col in X_train.columns if (col not in leave_out) and (col.endswith('unsk'))] + ['curtailed_0.001', 'curtailed_True', 'offline_0.001', 'offline_True']
X_test = X_test[features]
X_train = X_train[features]

iterations = 12000
learning_rate = 0.01
depth = 7

# Initialize CatBoostRegressor
model = CatBoostRegressor(
    iterations=iterations,
    learning_rate=learning_rate,
    depth=depth,
    loss_function='RMSE',
    random_seed=42,
    verbose=100
)

# Define the training Pool
train_pool = Pool(X_train, y_train)

# Define the parameters for cross-validation
params = {
    'iterations': iterations,
    'learning_rate': learning_rate,
    'depth': depth,
    'loss_function': 'RMSE',
    'custom_metric': ['RMSE'],
    'random_seed': 42
}

# Perform cross-validation
cv_scores = cv(
    params=params,
    pool=train_pool,
    fold_count=5,  # 5-fold CV
    verbose=100,  # Output every 100th iteration
    stratified=False,
    plot=False,
)

# Extract the RMSE at the best iteration
best_iteration = np.argmin(cv_scores['test-RMSE-mean'])
best_score = cv_scores['test-RMSE-mean'][best_iteration]
print(f'Best validation RMSE score: {best_score}±{cv_scores["test-RMSE-std"][best_iteration]} on step {best_iteration+1}')

# Train the model on the full training set with the best number of iterations
model.fit(train_pool, verbose=100)

# Make predictions on the training set and calculate RMSE
y_train_pred = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f'Train RMSE: {rmse_train}')
y_test_pred = model.predict(X_test)
rmse_val = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'Validation RMSE: {rmse_val}')

import pandas as pd
from pathlib import Path

# Define a path to the results file
results_file = Path('results.csv')

# Create a DataFrame to hold the results
results_df = pd.DataFrame(
    {
        'Model': ['CatBoost'],
        'Training RMSE': [rmse_train],
        'Validation RMSE': [rmse_val],
        'Iterations': [iterations],
        'Learning Rate': [learning_rate],
        'Depth': [depth],
        'Loss Function': ['RMSE'],
        'Features': [features],
        'Folds': [5],
    }
)

# If the results file exists, load it and append the new results
if results_file.exists():
    existing_df = pd.read_csv(results_file)
    results_df = pd.concat([existing_df, results_df])

# Save the results DataFrame to CSV
results_df.to_csv(results_file, index=False)

Training on fold [0/5]
0:	learn: 52.9167084	test: 52.9173808	best: 52.9173808 (0)	total: 67.5ms	remaining: 11m 14s
100:	learn: 19.5332316	test: 19.5408741	best: 19.5408741 (100)	total: 916ms	remaining: 1m 29s
200:	learn: 7.3453843	test: 7.3547553	best: 7.3547553 (200)	total: 1.81s	remaining: 1m 28s
300:	learn: 2.9761358	test: 2.9845520	best: 2.9845520 (300)	total: 2.7s	remaining: 1m 26s
400:	learn: 1.5131018	test: 1.5191095	best: 1.5191095 (400)	total: 3.69s	remaining: 1m 28s
500:	learn: 1.0731996	test: 1.0781731	best: 1.0781731 (500)	total: 4.67s	remaining: 1m 28s
600:	learn: 0.9228568	test: 0.9288213	best: 0.9288213 (600)	total: 5.57s	remaining: 1m 27s
700:	learn: 0.8465893	test: 0.8536446	best: 0.8536446 (700)	total: 6.46s	remaining: 1m 25s
800:	learn: 0.7940164	test: 0.8020664	best: 0.8020664 (800)	total: 7.29s	remaining: 1m 23s
900:	learn: 0.7552644	test: 0.7641273	best: 0.7641273 (900)	total: 8.11s	remaining: 1m 21s
1000:	learn: 0.7242678	test: 0.7338461	best: 0.7338461 (1000)	to

In [7]:
model.save_model('catboost_model_cv.bin')

NameError: name 'loaded_model' is not defined

In [6]:
loaded_model = CatBoostRegressor()
loaded_model.load_model('catboost_model_cv.bin')

In [8]:
# plot y_test against best model predictions
import matplotlib.pyplot as plt

predictions = loaded_model.predict(X_test)
# test['GearOilunsk'] = test['1_Gear oil temperature (°C)'].apply(lambda x: x**(1/0.06))
# predictions = predictions**(0.06)

plt.scatter(predictions, test2['1_Gear oil temperature (°C)'])

In [10]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)


GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)



model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)



In [11]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 1.7530 (0.2705)



In [12]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 1.7529 (0.2704)



In [13]:
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

: 

: 

In [None]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)  

In [None]:
averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))