In [1]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# from pathlib import Path
# from colorama import Fore, Style
# from dateutil.parser import parse

# from power.params import *
# from power.ml_ops.data import get_data_with_cache, load_data_to_bq, clean_pv_data
# from power.ml_ops.model import init_RNN, init_baseline_mean
# from power.ml_ops.registry import load_model, save_model, save_results
# from power.ml_ops.cross_val import get_X_y_seq

# from tensorflow.keras.callbacks import EarlyStopping

import numpy as np
import pandas as pd

from pathlib import Path
from colorama import Fore, Style
from dateutil.parser import parse

from power.params import *
from power.ml_ops.data import get_data_with_cache, load_data_to_bq, clean_pv_data
from power.ml_ops.model import initialize_model, compile_model, train_model
from power.ml_ops.registry import load_model, save_model, save_results
from power.ml_ops.cross_val import get_X_y_seq

2024-03-08 16:23:20.913953: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-08 16:23:21.369791: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-08 16:23:21.374005: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print(f"{Fore.MAGENTA}\n ⭐️ Use case: preprocess{Style.RESET_ALL}")

# SQL used to pull the raw PV table, ordered by its `_0` column
query = f"""
    SELECT *
    FROM {GCP_PROJECT}.{BQ_DATASET}.raw_pv
    ORDER BY _0
"""

# Fetch the raw rows through `get_data_with_cache`, pointing it at the raw CSV cache path
data_query_cache_path = Path(LOCAL_DATA_PATH) / "raw" / "raw_pv.csv"
data_query = get_data_with_cache(
    gcp_project=GCP_PROJECT,
    query=query,
    cache_path=data_query_cache_path,
    data_has_header=True,
)

# Clean the raw frame with the project helper
data_clean = clean_pv_data(data_query)

# Write the cleaned frame to the `processed_pv` table (truncate=True is passed to the loader)
load_data_to_bq(
    data_clean,
    gcp_project=GCP_PROJECT,
    bq_dataset=BQ_DATASET,
    table='processed_pv',
    truncate=True,
)

print("✅ preprocess() done \n")


[35m
 ⭐️ Use case: preprocess[0m
[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (376944, 8)
# data cleaned
[34m
Save data to BigQuery @ linen-sun-411222.power.processed_pv...:[0m

Write linen-sun-411222.power.processed_pv (376944 rows)
✅ Data saved to bigquery, with shape (376944, 3)
✅ preprocess() done 



In [None]:
# NOTE(review): this cell reads `plt` and `history` from the notebook namespace.
# In the visible notebook the matplotlib import is commented out in the first cell
# and `history` is only assigned inside train(), so both must be provided before
# this cell can run.

# MAE and loss share the same layout: train curve, validation curve, one figure each
for metric_key, axis_label in (('mae', 'MAE'), ('loss', 'Loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history[f'val_{metric_key}'])
    plt.title(f'Model {axis_label}')
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

# R-squared curves carry their own labels, so the legend is built from them
training_curves = history.history
plt.plot(training_curves['r_squared'], label='Train R-squared')
plt.plot(training_curves['val_r_squared'], label='Validation R-squared')
plt.title('Model R-squared')
plt.xlabel('Epochs')
plt.ylabel('R-squared')
plt.legend()

plt.tight_layout()
plt.show()

In [3]:
def train(
        min_date:str = '2009-01-01',
        max_date:str = '2015-01-01',
        split_ratio: float = 0.02, # 0.02 represents ~ 1 month of validation data on a 2009-2015 train set
        learning_rate=0.02,
        batch_size = 32,
        patience = 5
    ) -> float:

    """
    Train the forecasting model on the processed PV data and persist the outcome.

    Steps:
    - Load the processed table through `get_data_with_cache` (query ordered by `utc_time`)
    - Keep the rows before 2020-01-01 and build input/target sequences from `power`
    - Load the latest saved model, or initialize a new one when `load_model()` returns None
    - Compile and fit the model, then store the metrics and the model via the registry helpers

    Args:
        min_date, max_date, split_ratio, patience: accepted for interface
            compatibility; the current implementation does not use them.
        learning_rate: forwarded to `compile_model`.
        batch_size: forwarded to `train_model`.

    Returns:
        The lowest `val_mae` found in the training history, or None when fewer
        than 240 processed rows were retrieved.
    """

    print(Fore.MAGENTA + "\n⭐️ Use case: train" + Style.RESET_ALL)
    print(Fore.BLUE + "\nLoading preprocessed validation data..." + Style.RESET_ALL)

    # Load processed data using `get_data_with_cache` in chronological order
    query = f"""
        SELECT *
        FROM {GCP_PROJECT}.{BQ_DATASET}.processed_pv
        ORDER BY utc_time
    """

    data_processed_cache_path = Path(LOCAL_DATA_PATH).joinpath("processed", "processed_pv.csv")
    data_processed = get_data_with_cache(
        gcp_project=GCP_PROJECT,
        query=query,
        cache_path=data_processed_cache_path,
        data_has_header=True
    )

    # The model works on a column named `power`; timestamps are parsed as tz-aware UTC
    data_processed = data_processed.rename(columns={'electricity': 'power'})
    data_processed['utc_time'] = pd.to_datetime(data_processed['utc_time'], utc=True)

    if data_processed.shape[0] < 240:
        print("❌ Not enough processed data retrieved to train on")
        return None

    # Only rows before 2020-01-01 are used for training; later rows stay out of the fit.
    # (Named `train_df` so the local does not shadow this function's own name.)
    train_df = data_processed.loc[data_processed['utc_time'] < '2020-01-01', ['power']]

    X_train, y_train = get_X_y_seq(train_df,
                                   number_of_sequences=10_000,
                                   input_length=48,
                                   output_length=24)

    # Reuse the latest saved model when there is one, otherwise start from scratch
    model = load_model()

    if model is None:
        model = initialize_model(X_train, y_train, n_unit=24)

    model = compile_model(model, learning_rate=learning_rate)

    # NOTE(review): validation_split is fixed at 0.3 here and `split_ratio` is not
    # applied; `patience` is not forwarded either — confirm which values are intended.
    model, history = train_model(model,
                                 X_train,
                                 y_train,
                                 validation_split=0.3,
                                 batch_size=batch_size,  # honor the caller's value (default 32)
                                 epochs=50
                                 )

    # Best (lowest) validation MAE across all epochs
    val_mae = np.min(history.history['val_mae'])

    params = dict(
        context="train",
        training_set_size='40 years worth of data',
        row_count=len(X_train),
    )

    # Save results using power.ml_ops.registry
    save_results(params=params, metrics=dict(mae=val_mae))

    # Save the trained model using power.ml_ops.registry
    save_model(model=model)

    print("✅ train() done \n")

    return val_mae

In [4]:
# Run the training use case with its default arguments and keep the returned validation MAE
val_mae = train()

[35m
⭐️ Use case: train[0m
[34m
Loading preprocessed validation data...[0m
[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (376944, 3)
[34m
Load latest model from local registry...[0m
[34m
Load latest model from disk...[0m


2024-03-08 16:24:12.855363: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:02:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-08 16:24:12.855970: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-03-08 16:24:13.263772: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/conca

ValueError: Unknown metric function: 'r_squared'. Please ensure you are using a `keras.utils.custom_object_scope` and that this object is included in the scope. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.

In [None]:
from datetime import datetime

In [None]:
# Hours elapsed between a sample timestamp and the 1980-01-01 reference.
# The sample string is parsed directly instead of rebinding `a` from str to
# datetime in a separate cell, so this cell can be re-run without a TypeError.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

a = datetime.strptime('2013-05-08 09:00:00', DATETIME_FORMAT)
reference_datetime = datetime.strptime("1980-01-01 00:00:00", DATETIME_FORMAT)

# Calculate the difference between the two datetime objects
time_difference = a - reference_datetime

# Convert the timedelta to a (float) number of hours
time_difference_hours = time_difference.total_seconds() / 3600

time_difference_hours

In [None]:
# Display X_test.
# NOTE(review): in the visible notebook X_test is only assigned inside train(),
# so it is not available at notebook level — confirm where it should come from.
X_test

In [None]:
# NOTE(review): X_test is only assigned inside train() in the visible notebook,
# and `model` is first assigned at notebook level in a later cell.
X_test

# NOTE(review): predict() is called without any input here; presumably X_test
# was meant to be passed — confirm.
model.predict()

In [None]:
def extract_pv_data(input_date: str, n_days=10, pv_data=None):
    """
    Extract the rows surrounding a given UTC timestamp from the cleaned PV data.

    The window starts up to `24 * n_days` rows before the row whose `utc_time`
    equals `input_date` (clamped at the first row) and ends 24 rows after the
    start of that matching row (i.e. the matching row plus the 23 rows after it).

    Args:
        input_date: timestamp string, interpreted as UTC.
        n_days: number of 24-row blocks to include before the matching row.
        pv_data: frame with `utc_time`, `local_time` and `electricity` columns.
            Defaults to the notebook-level `data_clean` frame.

    Returns:
        dict with the keys 'utc_time', 'local_time' and 'electricity', each
        holding the window's values as a list.

    Raises:
        IndexError: if no row has `utc_time` equal to `input_date`.
    """
    if pv_data is None:
        pv_data = data_clean  # frame produced by the preprocess cell

    input_timestamp = pd.Timestamp(input_date, tz='UTC')

    # Work with the row *position*, not the index label: the window is sliced
    # with .iloc, which is positional, so labels would be wrong for any frame
    # whose index is not a default 0..n-1 range.
    matches = np.flatnonzero((pv_data['utc_time'] == input_timestamp).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No row found with utc_time == {input_timestamp}")
    position = int(matches[0])

    n_rows = 24 * n_days
    start = max(position - n_rows, 0)  # clamp when there is not enough history
    window = pv_data.iloc[start:position + 24]

    return {
        column: window[column].tolist()
        for column in ('utc_time', 'local_time', 'electricity')
    }

In [None]:
# Build a frame from the window around 2013-05-08 09:00 (one 24-row block of history)
df = pd.DataFrame(extract_pv_data(input_date='2013-05-08 09:00:00', n_days=1))

In [None]:
# Display the extracted window
df

In [None]:
# Load a model through the project's registry helper (`load_model` from power.ml_ops.registry)
model = load_model()

In [None]:
# Display the loaded model object
model

In [None]:
# NOTE(review): `df` holds utc_time / local_time / electricity columns, while train()
# fits on sequences built from a single `power` column with input_length=48 —
# this input presumably needs to be reshaped to match; confirm.
model.predict(df)

In [None]:
# Load the processed PV table at notebook level, ordered by `utc_time`
query = f"""
        SELECT *
        FROM {GCP_PROJECT}.{BQ_DATASET}.processed_pv
        ORDER BY utc_time
        """

data_processed_cache_path = Path(LOCAL_DATA_PATH) / "processed" / "processed_pv.csv"

# Fetch, rename the target column to `power`, and parse timestamps as tz-aware UTC
data_processed = (
    get_data_with_cache(
        query=query,
        gcp_project=GCP_PROJECT,
        cache_path=data_processed_cache_path,
        data_has_header=True,
    )
    .rename(columns={'electricity': 'power'})
    .assign(utc_time=lambda frame: pd.to_datetime(frame.utc_time, utc=True))
)