# Imports and Define Paths

In [1]:
import numpy as np
import pandas as pd

import glob
import os
import time
import pickle

from colorama import Fore, Style
from tensorflow import keras
from google.cloud import storage


from pathlib import Path
from colorama import Fore, Style
from dateutil.parser import parse
from typing import Dict, List, Tuple, Sequence
from datetime import datetime

from power.params import *
from power.ml_ops.data import get_data_with_cache, load_data_to_bq, clean_pv_data
from power.ml_ops.model import initialize_model, compile_model, train_model
from power.ml_ops.registry import load_model, save_model, save_results
from power.ml_ops.cross_val import get_Xi_yi, get_X_y_seq

2024-03-09 13:08:17.520719: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-09 13:08:17.619626: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-09 13:08:17.622630: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Package Functions as reference

In [None]:
def preprocess(min_date:str = '2009-01-01', max_date:str = '2015-01-01') -> None:
    """
    Run the "preprocess" use case end to end.

    Steps, in order:
    - Select every row of the `raw_pv` table (ordered by `_0`) through
      `get_data_with_cache`, handing it a CSV cache path located under
      `LOCAL_DATA_PATH/raw`
    - Clean the returned data with `clean_pv_data`
    - Upload the cleaned data to the `processed_pv` table with
      `truncate=True`

    NOTE: `min_date` and `max_date` are accepted but never read by the body;
    the query always selects the whole table.
    """
    print(Fore.MAGENTA + "\n ⭐️ Use case: preprocess" + Style.RESET_ALL)

    # Where the raw query result is cached locally
    raw_cache_csv = Path(LOCAL_DATA_PATH) / "raw" / "raw_pv.csv"

    raw_query = f"""
        SELECT *
        FROM {GCP_PROJECT}.{BQ_DATASET}.raw_pv
        ORDER BY _0
    """

    raw_frame = get_data_with_cache(
        query=raw_query,
        gcp_project=GCP_PROJECT,
        cache_path=raw_cache_csv,
        data_has_header=True,
    )

    # Clean, then push straight to BigQuery; the processed data is not
    # written to a local CSV here.
    load_data_to_bq(
        clean_pv_data(raw_frame),
        gcp_project=GCP_PROJECT,
        bq_dataset=BQ_DATASET,
        table='processed_pv',
        truncate=True,
    )

    print("✅ preprocess() done \n")

# Load Processed data and split into train and test sets

In [None]:
print(Fore.MAGENTA + "\n⭐️ Use case: train" + Style.RESET_ALL)
print(Fore.BLUE + "\nLoading preprocessed validation data..." + Style.RESET_ALL)


# Load processed data using `get_data_with_cache` in chronological order

query = f"""
    SELECT *
    FROM {GCP_PROJECT}.{BQ_DATASET}.processed_pv
    ORDER BY utc_time
"""

data_processed_cache_path = Path(LOCAL_DATA_PATH).joinpath("processed", "processed_pv.csv")
data_processed = get_data_with_cache(
    gcp_project=GCP_PROJECT,
    query=query,
    cache_path=data_processed_cache_path,
    data_has_header=True
)

# The model uses `power` as its feature name, while the stored column is
# called `electricity` -> rename it here (ideally fixed in the raw data).
data_processed = data_processed.rename(columns={'electricity': 'power'})
# The processed data from BQ needs to be converted to tz-aware datetimes.
# Bracket assignment is used because attribute-style assignment is only
# safe for columns that already exist.
data_processed['utc_time'] = pd.to_datetime(data_processed['utc_time'], utc=True)

# Stop here when there is too little data: previously this only printed a
# message and the cell carried on to split and train on the short frame.
if data_processed.shape[0] < 240:
    raise ValueError("❌ Not enough processed data retrieved to train on")


# Split the data into training and testing sets: everything strictly before
# 2020-01-01 is used for training, the remainder for testing.
train = data_processed[data_processed['utc_time'] < '2020-01-01']
test = data_processed[data_processed['utc_time'] >= '2020-01-01']

# Keep only the target series; double brackets keep a DataFrame (not a Series).
train = train[['power']]
test = test[['power']]


In [None]:
# Build the training arrays from the `power` frame: 10_000 sequences are
# requested, with input_length=48 and output_length=24.
X_train, y_train = get_X_y_seq(
    train,
    number_of_sequences=10_000,
    input_length=48,
    output_length=24,
)

# Define Model

In [None]:
import pandas as pd

from tensorflow.keras import models, layers, optimizers, metrics
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda
from tensorflow.keras.callbacks import EarlyStopping

def initialize_model(X_train, y_train, n_unit=24):
    """
    Build an uncompiled sequential network: one LSTM layer followed by a
    linear Dense output layer.

    Args:
        X_train: array whose `shape[1]` and `shape[2]` define the LSTM
            `input_shape` (presumably (samples, timesteps, features) —
            confirm against `get_X_y_seq`).
        y_train: array whose `shape[1]` sets the number of output units.
        n_unit: number of units in the LSTM layer.

    Returns:
        The uncompiled `Sequential` model.
    """
    sequence_shape = (X_train.shape[1], X_train.shape[2])
    n_outputs = y_train.shape[1]

    return models.Sequential([
        # Recurrent layer: only the last step's output is passed on
        layers.LSTM(
            n_unit,
            activation='tanh',
            return_sequences=False,
            input_shape=sequence_shape,
        ),
        # Predictive layer: one linear unit per output value
        layers.Dense(n_outputs, activation='linear'),
    ])

def compile_model(model, learning_rate=0.02):
    """
    Compile `model` with MSE loss, an Adam optimizer and MAE as the tracked
    metric.

    Args:
        model: the model to compile (compiled in place).
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        The same `model` object, now compiled.
    """
    # Drafted R² metric, currently disabled (not passed to `compile`):
    # def r_squared(y_true, y_pred):
    #     ss_res = K.sum(K.square(y_true - y_pred))
    #     ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
    #     return (1 - ss_res/(ss_tot + K.epsilon()))

    model.compile(
        loss='mse',
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        metrics=['mae'],
    )
    return model

def train_model(model,
                X_train,
                y_train,
                validation_split = 0.3,
                batch_size = 32,
                epochs = 50):
    """
    Fit `model` on the training arrays with early stopping on `val_mae`.

    Args:
        model: compiled model; it must report the `mae` metric so that
            `val_mae` is available to the early-stopping callback.
        X_train, y_train: training inputs and targets.
        validation_split: fraction of the data held out for validation.
        batch_size: batch size passed to `fit`.
        epochs: maximum number of epochs.

    Returns:
        Tuple `(model, history)` where `history` is what `model.fit` returned.
    """
    # Stop after 5 epochs without a lower val_mae and roll back to the best
    # weights seen so far.
    early_stopping = EarlyStopping(
        monitor="val_mae",
        mode="min",
        patience=5,
        restore_best_weights=True,
    )

    fit_options = {
        "validation_split": validation_split,
        "shuffle": False,  # keep the samples in the order they were given
        "batch_size": batch_size,
        "epochs": epochs,
        "callbacks": [early_stopping],
        "verbose": 0,  # silent training
    }
    history = model.fit(X_train, y_train, **fit_options)

    return model, history

# Train model

In [None]:
# Build the uncompiled LSTM + Dense network, sized from the training arrays.
model = initialize_model(X_train, y_train, n_unit=24)

In [None]:
# Print the layer-by-layer summary of the freshly built model.
model.summary()

In [None]:
# Compile the model with learning rate 0.02.
model = compile_model(model, learning_rate=0.02)

In [None]:
# Train the model and keep the fit history for later inspection.
model, history = train_model(
    model,
    X_train,
    y_train,
    validation_split=0.3,
    batch_size=32,
    epochs=50,
)

In [None]:
# Lowest validation MAE recorded across the epochs that were run.
val_mae = np.asarray(history.history['val_mae']).min()
val_mae


In [None]:
# Run metadata; `training_set_size` is a hard-coded free-text description,
# `row_count` is the number of training sequences.
params = {
    "context": "train",
    "training_set_size": '40 years worth of data',
    "row_count": len(X_train),
}
params

# Attempt to save & load model

In [None]:
import glob
import os
import time
import pickle

from colorama import Fore, Style
from tensorflow import keras
from google.cloud import storage

from power.params import *

# Local-time stamp (YYYYMMDD-HHMMSS) for this run.
timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime())

timestamp

## 1. Save

In [None]:
# Save model locally as an HDF5 file named after the run timestamp
model_path = os.path.join(LOCAL_REGISTRY_PATH, "models", f"{timestamp}.h5")
# Make sure the target directory exists; saving fails if it is missing,
# e.g. on a fresh checkout where the local registry was never created.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

print("✅ Model saved locally")

## 2. Load

In [None]:
print(Fore.BLUE + "\nLoad latest model from local registry..." + Style.RESET_ALL)

# Collect every entry in the local model directory
local_model_directory = os.path.join(LOCAL_REGISTRY_PATH, "models")
local_model_paths = glob.glob(f"{local_model_directory}/*")

# Fail with an explicit error when the registry is empty; previously this
# printed 'None' and then crashed with an IndexError on the empty list.
if not local_model_paths:
    raise FileNotFoundError(f"No saved model found in {local_model_directory}")

# Model files are named by timestamp (YYYYMMDD-HHMMSS), so the
# lexicographically greatest path is the most recent one.
most_recent_model_path_on_disk = max(local_model_paths)

print(Fore.BLUE + "\nLoad latest model from disk..." + Style.RESET_ALL)

latest_model = keras.models.load_model(most_recent_model_path_on_disk)

print("✅ Model loaded from local disk")

In [None]:
# Print the reloaded model's summary for comparison with the one trained above.
latest_model.summary()

# Test Pred function

In [None]:
def pred(X_pred:str = '2013-05-08 12:00:00') -> np.ndarray:
    """
    Load the latest trained model as the first step of the predict use case.

    NOTE: work in progress. No prediction is computed yet: `X_pred` is not
    used, and the loaded model itself is returned rather than the array the
    return annotation announces.

    Args:
        X_pred: timestamp string ('%Y-%m-%d %H:%M:%S') the prediction is
            meant to start from (currently unused).

    Returns:
        The object returned by `load_model()`.

    Raises:
        RuntimeError: if `load_model()` returns None.
    """

    print("\n⭐️ Use case: predict")

    # TODO: turn `X_pred` into the 48-hour input window. Draft:
    # X_pred = datetime.strptime(X_pred, '%Y-%m-%d %H:%M:%S')
    # reference_datetime = datetime.strptime("1980-01-01 00:00:00", '%Y-%m-%d %H:%M:%S')
    # time_difference = X_pred - reference_datetime
    # time_difference_hours = time_difference.total_seconds() / 3600
    # input_date = X_test[time_difference_hours-47: time_difference_hours+1]

    model = load_model()
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if model is None:
        raise RuntimeError("load_model() returned None: no model available to predict with")

    # TODO: run the actual prediction once the input window is available:
    # X_processed = preprocess_features(X_pred)
    # y_pred = model.predict(X_processed)
    # print("\n✅ prediction done: ", y_pred, y_pred.shape, "\n")
    print("\n✅ prediction done: \n")
    return model

In [None]:
# Fetch a model through the package's registry helper (`power.ml_ops.registry.load_model`).
new_model = load_model()

In [None]:
# Print the summary of the model returned by `load_model()`.
new_model.summary()