In [2]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
warnings.filterwarnings("ignore")

In [17]:
# Load the raw training table, then show its shape, column names and first two rows.
df = pd.read_csv("train.csv")
print(df.shape, df.columns.tolist(), df.head(2), sep="\n")

(909604, 16)
['timestamp', 'active_power_calculated_by_converter', 'active_power_raw', 'ambient_temperature', 'generator_speed', 'generator_winding_temp_max', 'grid_power10min_average', 'nc1_inside_temp', 'nacelle_temp', 'reactice_power_calculated_by_converter', 'reactive_power', 'wind_direction_raw', 'wind_speed_raw', 'wind_speed_turbulence', 'turbine_id', 'Target']
             timestamp  active_power_calculated_by_converter  \
0  2021-02-19 20:18:00                            816.636759   
1  2021-04-27 04:55:00                            419.107829   

   active_power_raw  ambient_temperature  generator_speed  \
0        834.917206            31.694380      1159.616602   
1        421.050873            12.894948       928.747996   

   generator_winding_temp_max  grid_power10min_average  nc1_inside_temp  \
0                   65.954214               917.897085        31.881972   
1                   59.571319               445.554250        32.423705   

   nacelle_temp  reactice_p

In [18]:
# Separate the label column from the predictors; `df` itself is not modified
# because `drop` returns a new frame.
target = "Target"
X, y = df.drop(columns=[target]), df[target]


In [19]:
# Sanity check: first two rows of the full frame and first three label values.
print(df.iloc[:2])
print(f"Target sample: {y.iloc[:3]}")


             timestamp  active_power_calculated_by_converter  \
0  2021-02-19 20:18:00                            816.636759   
1  2021-04-27 04:55:00                            419.107829   

   active_power_raw  ambient_temperature  generator_speed  \
0        834.917206            31.694380      1159.616602   
1        421.050873            12.894948       928.747996   

   generator_winding_temp_max  grid_power10min_average  nc1_inside_temp  \
0                   65.954214               917.897085        31.881972   
1                   59.571319               445.554250        32.423705   

   nacelle_temp  reactice_power_calculated_by_converter  reactive_power  \
0     31.504713                              141.457644      165.501518   
1     32.755770                               89.186457      113.835236   

   wind_direction_raw  wind_speed_raw  wind_speed_turbulence   turbine_id  \
0          280.864782        7.057000               0.544082  Turbine_108   
1          299.55

In [20]:
# 3️⃣ Feature engineering
# NOTE: this cell adds columns to `X` in place and reads the raw "timestamp"
# column, which a later cell drops — rebuild `X` before re-running it.

# --- Time-based ---
# Unparseable timestamps are coerced to NaT, so the derived fields are NaN there.
X["timestamp"] = pd.to_datetime(X["timestamp"], errors="coerce")
_stamp = X["timestamp"].dt
X["hour"] = _stamp.hour
X["dayofyear"] = _stamp.dayofyear

# Cyclical encoding: express each periodic field as a sine/cosine pair.
for _prefix, _column, _period in (("hour", "hour", 24), ("day", "dayofyear", 365)):
    _angle = 2 * np.pi * X[_column] / _period
    X[f"{_prefix}_sin"] = np.sin(_angle)
    X[f"{_prefix}_cos"] = np.cos(_angle)

# --- Physics-based ---
_wind = X["wind_speed_raw"]
X["wind_speed_sq"] = _wind ** 2
X["wind_speed_cu"] = _wind ** 3
# The epsilon keeps the denominator non-zero when the cubed wind speed is exactly 0.
X["power_coefficient"] = X["active_power_raw"] / (X["wind_speed_cu"] + 1e-6)

# --- Thermal / efficiency ---
_ambient = X["ambient_temperature"]
X["temp_diff_nacelle_ambient"] = X["nacelle_temp"] - _ambient
X["temp_diff_generator_ambient"] = X["generator_winding_temp_max"] - _ambient
X["avg_internal_temp"] = (X["nacelle_temp"] + X["nc1_inside_temp"]) / 2



In [21]:
# --- Interaction features ---
_wind = X["wind_speed_raw"]
X["wind_times_gen"] = _wind * X["generator_speed"]
# NOTE(review): the +273.15 offset presumably converts °C to kelvin — confirm units.
X["wind_temp_ratio"] = _wind / (X["ambient_temperature"] + 273.15)

# --- Turbine encoding ---
# The string id column is kept; an integer code per distinct id is added next to it.
X["turbine_id"] = X["turbine_id"].astype(str)
_turbine_codes, _turbine_labels = pd.factorize(X["turbine_id"])
X["turbine_code"] = _turbine_codes

# Drop raw timestamp column (we extracted features)
X = X.drop(columns="timestamp")

print("🔧 Features after engineering:", X.shape[1])

🔧 Features after engineering: 29


In [22]:
# 4️⃣ Train/test split
# 80/20 random hold-out with a fixed seed for reproducibility.
# NOTE(review): rows carry timestamps; a random split may place near-adjacent
# readings in both partitions — confirm this is the intended evaluation setup.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train: {X_train.shape} Test: {X_test.shape}")

Train: (727683, 29) Test: (181921, 29)


In [24]:
# 5️⃣ Preprocessing pipelines
# Column groups are derived from the dtypes of the training split.
numeric_features = list(X_train.select_dtypes(include=np.number).columns)
categorical_features = list(X_train.select_dtypes(exclude=np.number).columns)

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Non-numeric columns: fill gaps with a sentinel label, then one-hot encode;
# categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="__missing__")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Columns not listed in either group are dropped; output is always dense.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", cat_transformer, categorical_features),
    ],
    remainder="drop",
    sparse_threshold=0,
)

In [25]:
def make_pipeline(model):
    """Build a two-step pipeline: preprocessing followed by ``model``.

    Parameters
    ----------
    model : estimator
        Estimator used as the final ``'model'`` step.

    Returns
    -------
    Pipeline
        Pipeline with steps ``'preproc'`` (an independent, unfitted copy of the
        notebook-level ``preprocessor``) and ``'model'``.
    """
    # Pipeline.fit fits its steps in place, so passing the same `preprocessor`
    # object to every pipeline would make them all share one fitted transformer,
    # and a later fit would silently overwrite it for earlier pipelines.
    # Cloning gives each pipeline its own copy with identical parameters.
    return Pipeline([('preproc', clone(preprocessor)), ('model', model)])

In [26]:
# 6️⃣ Baseline models
# Candidate regressors keyed by display name; both use a fixed seed.
models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=200,
        max_depth=20,
        n_jobs=-1,
        random_state=42,
    ),
    "HistGradientBoosting": HistGradientBoostingRegressor(
        max_iter=300,
        max_depth=12,
        random_state=42,
    ),
}

# Short aliases for the same estimator objects.
rf = models["RandomForest"]
hgb = models["HistGradientBoosting"]

In [27]:
# 7️⃣ Train & evaluate
# Empty containers: metrics per model name, and the fitted pipeline per model name.
results, trained_pipes = {}, {}


In [29]:
# Fit each candidate model inside its own pipeline and score it on the held-out
# split. `results` and `trained_pipes` must already exist before this cell runs.
for name, model in models.items():
    print(f"\n🚀 Training {name} ...")
    pipe = make_pipeline(model).fit(X_train, y_train)  # fit() returns the pipeline
    trained_pipes[name] = pipe

    y_pred = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE

    results[name] = {"RMSE": rmse, "MAE": mae, "R2": r2}
    print(f"{name} → RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}")



🚀 Training RandomForest ...
RandomForest → RMSE=0.4823, MAE=0.3138, R2=0.9659

🚀 Training HistGradientBoosting ...
HistGradientBoosting → RMSE=0.7203, MAE=0.5324, R2=0.9240
