In [1]:
import os
import time
from pathlib import Path

import numpy as np
import pandas as pd
from colorama import Fore, Style
from dateutil.parser import parse
from google.cloud import bigquery, storage

from power.ml_ops.data import get_pv_data, clean_pv_data

from power.params import *
# from power.ml_ops.data import get_data_with_cache, clean_data, load_data_to_bq
# from power.ml_ops.model import initialize_model, compile_model, train_model, evaluate_model
# from power.ml_ops.preprocessor import preprocess_features
# from power.ml_ops.registry import load_model, save_model, save_results
# from power.ml_ops.registry import mlflow_run, mlflow_transition_model

## Get Data with Cache

In [2]:


def get_data_with_cache(
        gcp_project:str,
        query:str,
        cache_path:Path,
        data_has_header=True
    ) -> pd.DataFrame:
    """
    Return the result of `query`, reading it from a local CSV cache when possible.

    - If `cache_path` is an existing file, it is read with pandas and BigQuery
      is not contacted.
    - Otherwise `query` is run on BigQuery with a client bound to `gcp_project`,
      and any non-empty result is written to `cache_path` for future calls.

    Args:
        gcp_project: project passed to `bigquery.Client`.
        query: SQL text sent to BigQuery on a cache miss.
        cache_path: location of the CSV cache (a `Path`; plain strings are
            also accepted).
        data_has_header: whether the CSV cache carries a header row; applied
            both when reading and when writing the cache.

    Returns:
        The loaded DataFrame (possibly empty when the query returned no rows).
    """
    # Tolerate plain string paths; a no-op when a Path is already given
    cache_path = Path(cache_path)

    if cache_path.is_file():
        print(Fore.BLUE + "\nLoad data from local CSV..." + Style.RESET_ALL)
        df = pd.read_csv(cache_path, header='infer' if data_has_header else None)
    else:
        print(Fore.BLUE + "\nLoad data from BigQuery server..." + Style.RESET_ALL)
        client = bigquery.Client(project=gcp_project)
        query_job = client.query(query)
        result = query_job.result()
        df = result.to_dataframe()

        # Store as CSV if the BQ query returned at least one row.
        # (The previous `> 1` test silently skipped single-row results.)
        if df.shape[0] > 0:
            # `to_csv` does not create missing directories
            cache_path.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(cache_path, header=data_has_header, index=False)

    print(f"✅ Data loaded, with shape {df.shape}")

    return df


In [9]:
# Announce which use case the following cells cover
print(f"{Fore.MAGENTA}\n ⭐️ Use case: preprocess{Style.RESET_ALL}")

# Select every row of the raw PV table, ordered by its `_0` column
query = f"""
    SELECT *
    FROM {GCP_PROJECT_WAGON}.{BQ_DATASET}.raw_pv
    ORDER BY _0
"""
# Echo the rendered SQL as the cell output
query

[35m
 ⭐️ Use case: preprocess[0m


'\n    SELECT *\n    FROM le-wagon-data-411310.power.raw_pv\n    ORDER BY _0\n'

In [11]:
# Fetch the raw PV rows through `get_data_with_cache`, so an existing local
# CSV at the cache path is reused instead of querying BigQuery again
data_query_cache_path = Path(LOCAL_DATA_PATH) / "raw" / "raw_pv.csv"
data_query = get_data_with_cache(
    gcp_project=GCP_PROJECT,
    query=query,
    cache_path=data_query_cache_path,
    data_has_header=True,
)

[34m
Load data from BigQuery server...[0m
✅ Data loaded, with shape (376944, 8)


## Load Data to BigQuery

In [12]:
def load_data_to_bq(
        data: pd.DataFrame,
        gcp_project:str,
        bq_dataset:str,
        table: str,
        truncate: bool
    ) -> None:
    """
    Save a DataFrame to the BigQuery table `{gcp_project}.{bq_dataset}.{table}`.

    Args:
        data: DataFrame to upload.
        gcp_project: project owning the target table; also used to create the
            BigQuery client.
        bq_dataset: dataset containing the target table.
        table: name of the target table.
        truncate: if True the table is emptied before writing
            (`WRITE_TRUNCATE`), otherwise rows are appended (`WRITE_APPEND`).

    Raises:
        AssertionError: if `data` is not a pandas DataFrame.
    """

    assert isinstance(data, pd.DataFrame), "`data` must be a pandas DataFrame"
    full_table_name = f"{gcp_project}.{bq_dataset}.{table}"
    print(Fore.BLUE + f"\nSave data to BigQuery @ {full_table_name}...:" + Style.RESET_ALL)

    # Bind the client to the requested project (as `get_data_with_cache` does)
    # instead of relying on whatever default project the environment provides
    client = bigquery.Client(project=gcp_project)

    # Define write mode
    write_mode = "WRITE_TRUNCATE" if truncate else "WRITE_APPEND"
    job_config = bigquery.LoadJobConfig(write_disposition=write_mode)

    print(f"\n{'Write' if truncate else 'Append'} {full_table_name} ({data.shape[0]} rows)")

    # Load data and block until the load job has finished
    job = client.load_table_from_dataframe(data, full_table_name, job_config=job_config)
    job.result()

    print(f"✅ Data saved to bigquery, with shape {data.shape}")


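## Clean Data

Inspect the raw PV rows, clean them with `clean_pv_data`, then upload the result to BigQuery with `load_data_to_bq`.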

In [13]:
# Preview the first rows of the raw data returned by the query
data_query.head()


Unnamed: 0,_0-1,_0,local_time,electricity,irradiance_direct,irradiance_diffuse,temperature,source
0,,315532800000,1980-01-01 01:00:00+01:00,0.0,0.0,0.0,-1.296,data/pv_data/1980_pv.csv
1,,315536400000,1980-01-01 02:00:00+01:00,0.0,0.0,0.0,-1.216,data/pv_data/1980_pv.csv
2,,315540000000,1980-01-01 03:00:00+01:00,0.0,0.0,0.0,-1.005,data/pv_data/1980_pv.csv
3,,315543600000,1980-01-01 04:00:00+01:00,0.0,0.0,0.0,-1.063,data/pv_data/1980_pv.csv
4,,315547200000,1980-01-01 05:00:00+01:00,0.0,0.0,0.0,-1.227,data/pv_data/1980_pv.csv


In [14]:
# Clean the raw PV frame with the project helper `clean_pv_data`,
# then display the cleaned frame as the cell output
clean_data = clean_pv_data(data_query)
clean_data

# data cleaned


Unnamed: 0,utc_time,local_time,electricity
0,1980-01-01 00:00:00+00:00,1980-01-01 01:00:00+01:00,0.0
1,1980-01-01 01:00:00+00:00,1980-01-01 02:00:00+01:00,0.0
2,1980-01-01 02:00:00+00:00,1980-01-01 03:00:00+01:00,0.0
3,1980-01-01 03:00:00+00:00,1980-01-01 04:00:00+01:00,0.0
4,1980-01-01 04:00:00+00:00,1980-01-01 05:00:00+01:00,0.0
...,...,...,...
376939,2022-12-31 19:00:00+00:00,2022-12-31 20:00:00+01:00,0.0
376940,2022-12-31 20:00:00+00:00,2022-12-31 21:00:00+01:00,0.0
376941,2022-12-31 21:00:00+00:00,2022-12-31 22:00:00+01:00,0.0
376942,2022-12-31 22:00:00+00:00,2022-12-31 23:00:00+01:00,0.0


In [15]:
# Overwrite the `processed_pv` table with the cleaned data
load_data_to_bq(
    data=clean_data,
    gcp_project=GCP_PROJECT,
    bq_dataset=BQ_DATASET,
    table="processed_pv",
    truncate=True,
)

[34m
Save data to BigQuery @ le-wagon-data-411310.power.processed_pv...:[0m

Write le-wagon-data-411310.power.processed_pv (376944 rows)
✅ Data saved to bigquery, with shape (376944, 3)


## Save model in GCS

In [None]:
# The annotation is a string so that defining this function does not require
# `keras` to be imported in the notebook.
def save_model(model: "keras.Model" = None) -> None:
    """
    Persist a trained model locally, then upload the same file to GCS.

    - The model is saved at f"{LOCAL_REGISTRY_PATH}/models/{timestamp}.h5"
      (the `models` folder is created if it does not exist yet).
    - The saved file is then uploaded, unconditionally, to the bucket
      `BUCKET_NAME` under "models/{timestamp}.h5".

    Args:
        model: object exposing `save(path)`; expected to be a trained Keras
            model.

    Raises:
        ValueError: if `model` is None (the default), since there is nothing
            to save.
    """
    if model is None:
        raise ValueError("save_model() requires a trained model, got None")

    timestamp = time.strftime("%Y%m%d-%H%M%S")

    # Save model locally
    models_dir = os.path.join(LOCAL_REGISTRY_PATH, "models")
    os.makedirs(models_dir, exist_ok=True)
    model_filename = f"{timestamp}.h5"  # e.g. "20230208-161047.h5" for instance
    model_path = os.path.join(models_dir, model_filename)
    model.save(model_path)

    print("✅ Model saved locally")

    # Upload the file that was just written to the GCS bucket
    client = storage.Client()
    bucket = client.bucket(BUCKET_NAME)
    blob = bucket.blob(f"models/{model_filename}")
    blob.upload_from_filename(model_path)

    print("✅ Model saved to GCS")

    return None