In [1]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

# Target column, kept as a one-element list so that selecting it yields a DataFrame.
label = ['1_Gear oil temperature (°C)']

# Load the pre-split turbine data.
train = pd.read_csv('data/single_turbine_data/train_reduced.csv')
test = pd.read_csv('data/single_turbine_data/test_reduced.csv')

# Targets keep the default integer index produced by read_csv; only the feature
# frames are re-indexed by timestamp below.
y_train = train[label]
y_test = test[label]

# Features: every column except the target, with the timestamp column parsed to
# datetime and then promoted to the index (which removes it from the columns).
X_train, X_test = (
    frame.drop(columns=label)
    .assign(**{'# Date and time': lambda d: pd.to_datetime(d['# Date and time'])})
    .set_index('# Date and time')
    for frame in (train, test)
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_time_features(df, date_col=None):
    """
    Add hour, day-of-week and month columns derived from a dataframe's timestamps.

    The new columns are written onto ``df`` itself (the caller's object is
    modified) and that same object is returned.

    Parameters:
    df (pandas.DataFrame): Dataframe to extend. When ``date_col`` is None, its
        index must expose ``hour``, ``dayofweek`` and ``month``
        (e.g. a DatetimeIndex).
    date_col (str, optional): Column of ``df`` holding the timestamps. The column
        is passed through ``pd.to_datetime``, so string timestamps are accepted.
        Defaults to None, in which case the index is used.

    Returns:
    pandas.DataFrame: ``df`` with 'hour', 'day_of_week' and 'month' columns added.

    Raises:
    AttributeError: If ``date_col`` is None and the index is not datetime-like.
    KeyError: If ``date_col`` is given but is not a column of ``df``.
    """
    if date_col is None:
        # Original behaviour: read the calendar fields straight off the index.
        stamps = df.index
    else:
        # The .dt accessor exposes the same hour/dayofweek/month attributes
        # for a datetime column as a DatetimeIndex does for the index.
        stamps = pd.to_datetime(df[date_col]).dt

    df['hour'] = stamps.hour
    df['day_of_week'] = stamps.dayofweek  # Monday == 0 ... Sunday == 6
    df['month'] = stamps.month

    return df

In [7]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Columns to keep out of the model. A name that matches no column of X_train is
# silently ignored by the filter below.
# NOTE(review): '1_Wind speed (m/s' has no closing parenthesis, and 'curailed' /
# 'offine' look like misspellings of 'curtailed' / 'offline' — confirm against
# X_train.columns. Left unchanged because the real column names are not visible here.
# '# Date and time' was moved into the index when the data was loaded, so it is
# not a column any more.
leave_out = ['1_Wind speed (m/s', '# Date and time', 'curailed', 'offine']
features = [col for col in X_train.columns if col not in leave_out]

# Train and evaluate on the SAME column subset. Previously only X_test was
# filtered while the model was fit on every column of X_train, so the two frames
# diverged as soon as a leave_out name matched, and the logged feature list did
# not describe what the model actually saw.
X_train_features = X_train[features]
X_test = X_test[features]

# Hyperparameters (kept as plain names because the results log records them).
iterations = 2500
learning_rate = 0.05
depth = 12

# Original author's note: a lower learning rate works better with CatBoost,
# there is no decay.
model = CatBoostRegressor(
    iterations=iterations,
    learning_rate=learning_rate,
    depth=depth,
    loss_function='RMSE',
    random_seed=42,
    verbose=100  # Output every 100th iteration
)

# NOTE(review): with use_best_model=True the number of trees kept is chosen on
# eval_set, and the same (X_test, y_test) is scored below — so the reported
# "Validation RMSE" is optimistic as a held-out estimate. Consider a separate
# validation split for model selection.
model.fit(
    X_train_features,
    y_train,
    eval_set=(X_test, y_test),
    use_best_model=True
)

# RMSE on the training data and on the evaluation data.
y_train_pred = model.predict(X_train_features)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f'Train RMSE: {rmse_train}')
y_test_pred = model.predict(X_test)
rmse_val = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'Validation RMSE: {rmse_val}')

import pandas as pd
from pathlib import Path

# Running experiment log; each execution of this cell contributes one row.
results_file = Path('results.csv')

# Single record for this run: model name, both RMSE figures, the
# hyperparameters, and the list of feature columns.
run_record = {
    'Model': 'CatBoost',
    'Training RMSE': rmse_train,
    'Validation RMSE': rmse_val,
    'Iterations': iterations,
    'Learning Rate': learning_rate,
    'Depth': depth,
    'Loss Function': 'RMSE',
    'Features': features,
}
results_df = pd.DataFrame([run_record])

# Put earlier runs (if a log already exists) in front of the new row.
if results_file.exists():
    existing_df = pd.read_csv(results_file)
    results_df = pd.concat([existing_df, results_df])

# Rewrite the whole log, without the row index.
results_df.to_csv(results_file, index=False)

0:	learn: 5.3175853	test: 5.0450120	best: 5.0450120 (0)	total: 26.6ms	remaining: 1m 6s
100:	learn: 0.6732265	test: 0.7249609	best: 0.7249609 (100)	total: 2.93s	remaining: 1m 9s
200:	learn: 0.5396177	test: 0.6370435	best: 0.6370435 (200)	total: 5.85s	remaining: 1m 6s
300:	learn: 0.4766334	test: 0.6067961	best: 0.6067961 (300)	total: 8.78s	remaining: 1m 4s
400:	learn: 0.4333432	test: 0.5929293	best: 0.5929293 (400)	total: 11.7s	remaining: 1m 1s
500:	learn: 0.4024137	test: 0.5834574	best: 0.5834302 (498)	total: 14.7s	remaining: 58.7s
600:	learn: 0.3780874	test: 0.5785996	best: 0.5785996 (600)	total: 17.7s	remaining: 55.8s
700:	learn: 0.3588520	test: 0.5749937	best: 0.5749905 (698)	total: 20.6s	remaining: 52.9s
800:	learn: 0.3417001	test: 0.5722783	best: 0.5722622 (799)	total: 23.6s	remaining: 50s
900:	learn: 0.3274303	test: 0.5700461	best: 0.5700461 (900)	total: 26.6s	remaining: 47.1s
1000:	learn: 0.3148262	test: 0.5680595	best: 0.5680595 (1000)	total: 29.5s	remaining: 44.2s
1100:	learn: 