In [25]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import joblib
from google.cloud import bigquery, storage
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [23]:
# Connect to BigQuery and load your data
client = bigquery.Client(project='eternal-entity-465517-d6')

# This SQL query selects all the data from your table
# Assuming 'AT', 'V', 'AP', 'RH', 'PE' are your actual column names
sql_query = """
SELECT `AT`, V, AP, RH, PE
FROM `eternal-entity-465517-d6.power_plant_project.hourly_readings`
"""

In [10]:
# Run the query and load the results into a Pandas DataFrame
df = client.query(sql_query).to_dataframe()
print("Successfully loaded data from BigQuery.")
print(df.head())

Successfully loaded data from BigQuery.
      AT      V       AP      RH      PE
0  10.73  25.36  1009.35  100.15  469.43
1  10.73  25.36  1009.35  100.15  469.43
2  10.73  25.36  1009.35  100.15  469.43
3  10.73  25.36  1009.35  100.15  469.43
4  10.73  25.36  1009.35  100.15  469.43


In [11]:
# Define features (X) and the target (y)
# X contains the sensor readings (the inputs to the model)
X = df[['AT', 'V', 'AP', 'RH']]
# y is the energy output we want to predict
y = df['PE']

In [12]:
# Split data into training and testing sets
# This function also randomly shuffles the data, which is very important.
# We use 80% for training and 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")

Data split into 38272 training samples and 9568 testing samples.


In [13]:
# Initialize and train the model
# RandomForestRegressor is a powerful and reliable model for this kind of problem.
model = RandomForestRegressor(n_estimators=100, random_state=42)

print("Training the model...")
model.fit(X_train, y_train)
print("Model training complete.")

Training the model...
Model training complete.


In [14]:
# Make predictions and evaluate the model's performance
# The model makes predictions on the test data it has never seen before.
predictions = model.predict(X_test)

In [15]:
# Result
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f"\nModel Performance on the Test Set:")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f} MW")


Model Performance on the Test Set:
Root Mean Squared Error (RMSE): 0.3056 MW


In [26]:
# Save the model
LOCAL_TEMP_FILENAME = 'temp_model.joblib'
joblib.dump(model, LOCAL_TEMP_FILENAME)
print(f"Model temporarily saved to {LOCAL_TEMP_FILENAME}")


Model temporarily saved to temp_model.joblib


In [27]:
# Upload the local file to GCS
GCS_BUCKET_NAME = 'dinova-projects'
GCS_MODEL_PATH = 'power-plant-project/models/regression_model_20250711.joblib'

client = storage.Client()
bucket = client.bucket(GCS_BUCKET_NAME)
blob = bucket.blob(GCS_MODEL_PATH)

# Start upload
blob.upload_from_filename(LOCAL_TEMP_FILENAME)
print(f"Model uploaded to gs://{GCS_BUCKET_NAME}/{GCS_MODEL_PATH}")

# Delete temp file
os.remove(LOCAL_TEMP_FILENAME)
print(f"Cleaned up local temporary file: {LOCAL_TEMP_FILENAME}")

Model uploaded to gs://dinova-projects/power-plant-project/models/regression_model_20250711.joblib
Cleaned up local temporary file: temp_model.joblib
