In [1]:
import pandas as pd
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from power.ml_ops.data import get_data_with_cache, get_stats_table, postprocess
from power.ml_ops.registry import load_model
from power.interface.main import pred #, postprocess

from pathlib import Path
from power.params import *
from power.utils import compress

import datetime
import tensorflow as tf

2024-03-13 17:16:30.692738: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 17:16:31.756607: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 17:16:31.773463: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_processed_cache_path = Path(LOCAL_DATA_PATH).joinpath("processed", f"processed_pv.csv")
query = f"""
    SELECT *
    FROM {GCP_PROJECT}.{BQ_DATASET}.processed_pv
    ORDER BY utc_time
"""

data_processed = get_data_with_cache(
    gcp_project=GCP_PROJECT,
    query=query,
    cache_path=data_processed_cache_path,
    data_has_header=True
)
data_processed.utc_time = pd.to_datetime(data_processed.utc_time,utc=True)

[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (376944, 3)


In [3]:
data_processed.shape

(376944, 3)

In [4]:
def mean_historical_power(X: pd.DataFrame, input_date: str):
        """
        takes a date as a string input
        returns the mean power produced on that day.
        Mean over the 40 years of training
        should remove 24 data points
        """
        input_date_dt = datetime.datetime.strptime(input_date, '%Y-%m-%d') + datetime.timedelta(days=1)
        #filter by month
        df_month = X[X.utc_time.dt.month == input_date_dt.month]
        #filter by day
        df_day = df_month[df_month.utc_time.dt.day == input_date_dt.day].reset_index()
        array = df_day['electricity'].groupby(df_day.utc_time.dt.hour).mean().to_numpy()
        tensor = tf.convert_to_tensor(array)
        tensor = tf.expand_dims(tensor, axis=0)
        return tensor

mean_historical_power(data_processed, '2021-07-28')

<tf.Tensor: shape=(1, 24), dtype=float64, numpy=
array([[0.        , 0.        , 0.        , 0.00109302, 0.03118605,
        0.08644186, 0.20974419, 0.33897674, 0.44625581, 0.51509302,
        0.55332558, 0.53672093, 0.48744186, 0.42188372, 0.3274186 ,
        0.22039535, 0.1114186 , 0.03918605, 0.00402326, 0.        ,
        0.        , 0.        , 0.        , 0.        ]])>

In [12]:
def get_stats_table(
  years_df: pd.DataFrame,
  capacity= False,
  min_date = '2020-01-01 00:00:00',
  max_date = '2022-12-29 23:00:00') -> pd.DataFrame:
  """
  Creates a table with statistics for electricity and optional capacity factor
  for every hour of the year (8784).
  Input:
    - Cleaned df that contains at least electricity
    as column. The df should span several years, because every
    year is one sample for the statictics.
    - Optional flag for capacity factor
  Output:
    - df with 8784 hours of the years (including leap years) as rows. The df has
    multilevel index because statistics are returned for electricity and
    capacity factor.
  """
  years_df =  years_df[years_df['utc_time'] < min_date]
  print(years_df.shape)
  years_df['hour_of_year'] = years_df.utc_time.\
                           apply(lambda x: x.strftime("%m%d%H"))
  if capacity:
    stats_df = years_df[['hour_of_year', 'cap_fac']]\
                    .groupby(['hour_of_year']).agg(['mean','median','std',
                                                    'skew','min','max','count'])
  else:
    stats_df = years_df[['hour_of_year', 'electricity']]\
                    .groupby(['hour_of_year']).agg(['mean','median','std',
                                                    'skew','min','max','count'])
  return stats_df

In [13]:
get_stats_table(data_processed)

(350640, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  years_df['hour_of_year'] = years_df.utc_time.\


Unnamed: 0_level_0,electricity,electricity,electricity,electricity,electricity,electricity,electricity
Unnamed: 0_level_1,mean,median,std,skew,min,max,count
hour_of_year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
010100,0.0,0.0,0.0,0.0,0.0,0.0,40
010101,0.0,0.0,0.0,0.0,0.0,0.0,40
010102,0.0,0.0,0.0,0.0,0.0,0.0,40
010103,0.0,0.0,0.0,0.0,0.0,0.0,40
010104,0.0,0.0,0.0,0.0,0.0,0.0,40
...,...,...,...,...,...,...,...
123119,0.0,0.0,0.0,0.0,0.0,0.0,40
123120,0.0,0.0,0.0,0.0,0.0,0.0,40
123121,0.0,0.0,0.0,0.0,0.0,0.0,40
123122,0.0,0.0,0.0,0.0,0.0,0.0,40
