In [1]:
import pandas as pd
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from power.ml_ops.data import get_data_with_cache, get_stats_table, postprocess
from power.ml_ops.registry import load_model
from power.ml_ops.model import evaluate_model
from power.ml_ops.cross_val import get_X_y_seq
from power.interface.main import pred #, postprocess

from pathlib import Path
from power.params import *
from power.utils import compress

import datetime
import tensorflow as tf

2024-03-14 11:07:17.214849: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-14 11:07:17.292302: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-14 11:07:17.293439: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Evaluate the stored model on the rows dated from `max_date` onward.
max_date = '2019-12-31 23:00:00'
model = load_model()

query = f"""
    SELECT *
    FROM {GCP_PROJECT}.{BQ_DATASET}.processed_pv
    ORDER BY utc_time
"""

data_processed_cache_path = Path(LOCAL_DATA_PATH).joinpath("processed", "processed_pv.csv")
data_processed = get_data_with_cache(
    gcp_project=GCP_PROJECT,
    query=query,
    cache_path=data_processed_cache_path,
    data_has_header=True
)

# A notebook cell cannot `return`, so fail loudly instead of printing and then
# carrying on into sequence building / evaluation with an empty frame.
if data_processed.shape[0] == 0:
    raise ValueError("❌ No data to evaluate on")

# Keep only the evaluation window and the single target column.
# NOTE(review): this cell does not convert utc_time to datetime before the
# comparison with the `max_date` string — confirm the column dtype makes this
# comparison behave as intended.
test = data_processed[data_processed['utc_time'] >= max_date]
test = test[['electricity']]

# The window sizes below are passed straight to get_X_y_seq; presumably they
# are expressed in hourly steps — TODO confirm against power.ml_ops.cross_val.
X_test, y_test = get_X_y_seq(test,
                             number_of_sequences=1_000,
                             input_length=48,
                             output_length=24,
                             gap_hours=12)

metrics_dict = evaluate_model(model=model, X=X_test, y=y_test)
mae = metrics_dict["mae"]


In [None]:
# Display the shape of the target sequences returned by get_X_y_seq.
y_test.shape

In [2]:
# Load the processed PV table (through get_data_with_cache, using the local
# CSV path below as cache) and parse its timestamp column as UTC datetimes.
query = f"""
    SELECT *
    FROM {GCP_PROJECT}.{BQ_DATASET}.processed_pv
    ORDER BY utc_time
"""
data_processed_cache_path = Path(LOCAL_DATA_PATH) / "processed" / "processed_pv.csv"

data_processed = get_data_with_cache(
    gcp_project=GCP_PROJECT,
    query=query,
    cache_path=data_processed_cache_path,
    data_has_header=True,
)
data_processed["utc_time"] = pd.to_datetime(data_processed["utc_time"], utc=True)

[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (376944, 3)


In [3]:
# Display (rows, columns) of the loaded frame.
data_processed.shape

(376944, 3)

In [6]:
def mean_historical_power(X: pd.DataFrame, input_date: str):
    """
    Hourly mean of `electricity` for the calendar day FOLLOWING `input_date`.

    The date is shifted forward by one day before filtering, so passing
    '2021-06-28' averages every row of `X` that falls on a 29 June, whatever
    its year. (Presumably the shift targets the day being forecast — confirm
    with the caller.)

    Parameters
    ----------
    X : pd.DataFrame
        Must contain a datetime-typed `utc_time` column (the `.dt` accessor is
        used on it) and an `electricity` column.
    input_date : str
        Date formatted as '%Y-%m-%d'.

    Returns
    -------
    numpy.ndarray
        One mean value per hour-of-day present in the matching rows, ordered
        by hour. Empty when no row matches the target month/day.

    Raises
    ------
    ValueError
        If `input_date` does not match the '%Y-%m-%d' format.
    """
    target_day = (datetime.datetime.strptime(input_date, '%Y-%m-%d')
                  + datetime.timedelta(days=1))

    # Match on month and day only, so every year in X contributes one sample.
    same_calendar_day = (
        (X.utc_time.dt.month == target_day.month)
        & (X.utc_time.dt.day == target_day.day)
    )
    df_day = X[same_calendar_day]

    # Average across years for each hour of the day.
    hourly_mean = df_day['electricity'].groupby(df_day.utc_time.dt.hour).mean()
    return hourly_mean.to_numpy()

# Hourly historical mean profile; mean_historical_power adds one day to the
# given date, so this covers the 29 June rows of data_processed.
array = mean_historical_power(data_processed, '2021-06-28')

In [8]:
# Wrap the array in a Series so it displays with one indexed row per value.
pd.Series(array)

0     0.000000
1     0.000000
2     0.000000
3     0.012442
4     0.048395
5     0.110093
6     0.238907
7     0.375279
8     0.479395
9     0.528279
10    0.550535
11    0.540209
12    0.496605
13    0.425000
14    0.328814
15    0.225047
16    0.124279
17    0.053977
18    0.017698
19    0.000000
20    0.000000
21    0.000000
22    0.000000
23    0.000000
dtype: float64

In [None]:
def get_stats_table(
    years_df: pd.DataFrame,
    capacity: bool = False,
    min_date: str = '2020-01-01 00:00:00',
    max_date: str = '2022-12-29 23:00:00') -> pd.DataFrame:
    """
    Build per-hour-of-year statistics for electricity or the capacity factor.

    Rows with `utc_time` strictly before `min_date` are grouped by their
    "MMDDHH" key (month, day, hour), so each year contributes one sample per
    key and the result has at most 8784 rows (366 days * 24 hours).

    Input:
      - years_df: cleaned df with a datetime-typed `utc_time` column plus
        `electricity` (or `cap_fac` when `capacity` is True). It should span
        several years, because every year is one sample for the statistics.
        The caller's frame is not modified.
      - capacity: when True, aggregate `cap_fac` instead of `electricity`.
      - min_date: exclusive upper bound on `utc_time` for the rows used.
      - max_date: currently unused; kept for interface compatibility.
    Output:
      - df indexed by `hour_of_year`, with two-level columns
        (value column, statistic) for mean, median, std, skew, min, max, count.
    """
    history_df = years_df[years_df['utc_time'] < min_date]
    print(history_df.shape)

    # `.assign` returns a new frame, so the key column is never written onto a
    # filtered slice of the caller's data (avoids SettingWithCopyWarning).
    history_df = history_df.assign(
        hour_of_year=history_df['utc_time'].dt.strftime("%m%d%H"))

    value_col = 'cap_fac' if capacity else 'electricity'
    stats_df = (
        history_df[['hour_of_year', value_col]]
        .groupby(['hour_of_year'])
        .agg(['mean', 'median', 'std', 'skew', 'min', 'max', 'count'])
    )
    return stats_df

In [None]:
# Electricity statistics with default arguments. Once the cell defining
# get_stats_table in this notebook has run, that definition shadows the one
# imported from power.ml_ops.data.
get_stats_table(data_processed)