In [1]:
from auxiliary.download_data import download_from_switch
from auxiliary.read_gries_data import read_gries_txt_data
from auxiliary.read_gletsch_data import read_gletsch_csv_data
from auxiliary.read_swissgrid_data import read_spot_price_swissgrid, read_balancing_price_swissgrid, read_fcr_price_swissgrid, read_frr_price_swissgrid
from auxiliary.auxiliary import read_pyarrow_data
import polars as pl
import tqdm
from sqlalchemy import create_engine
from schema.schema import Irradiation, WindSpeed, Temperature, DischargeFlow, MarketPrice
from federation.generate_forecast import generate_dataframe_forecast
from sqlalchemy import create_engine
from schema.schema import Base
from uuid import uuid4
import polars as pl
import os
from datetime import datetime
import tqdm
import polars.selectors as cs
# from federation.generate_scenarios import initialize_time_series, generate_scenarios, query_time_series_data, fill_null_remove_outliers
from schema.schema import (
    DischargeFlow,
    Irradiation,
    WindSpeed,
    Temperature,
    MarketPrice,
)
import polars as pl
import tqdm
from sqlalchemy import create_engine
from schema.schema import Irradiation, WindSpeed, Temperature, DischargeFlow, MarketPrice
from federation_2.generate_scenarios import generate_scenarios, fill_null_remove_outliers
from federation_2.generate_forecast import day_ahead_forecast_arima_with_lag
from datetime import timedelta
import warnings
import pmdarima as pm
import polars as pl
from pmdarima.pipeline import Pipeline
# from federation.generate_forecast import generate_dataframe_forecast
from datetime import timedelta
from auxiliary.plot_results import plot_forecast_results

In [2]:
"""
Initialize price signals
"""
# Location of the cached SQLite database and the schema table analysed in this notebook.
db_cache_file = r".cache/interim/time_series_schema.db"
table = MarketPrice

# The connection is left open on purpose: it is reused further down to read other tables.
engine = create_engine(f"sqlite+pysqlite:///{db_cache_file}", echo=False)
con = engine.connect()

# Sampling step shared by the cleaning, scenario and forecast stages.
d_time = timedelta(hours=1)
total_data_forecast_df: pl.DataFrame = pl.DataFrame()

# Name of the column that distinguishes the individual series inside the selected table.
table_name = table.__tablename__
if table_name == "MarketPrice":
    type_col = "market"
elif table_name == "DischargeFlow":
    type_col = "river"
else:
    type_col = "alt"

# The membership test is made on *table names*. The previous list contained "river",
# which is the type column of DischargeFlow and can never equal a table name, so
# discharge flows were never flagged as non-negative.
non_negative = table_name in ["Irradiation", "WindSpeed", "DischargeFlow"]
# Outlier threshold (in standard deviations) handed to the cleaning step.
z_score = 4 if table_name in ["Irradiation", "Temperature"] else 10

with tqdm.tqdm(total=1, ncols=100, desc="Read {} table in sqlite database".format(table_name)) as pbar:
    # Load the whole table once; later cells only filter this frame.
    raw_data_df = pl.read_database(query="""SELECT {f1}.* FROM {f1}""".format(f1=table_name), connection=con)
    pbar.update()


Read MarketPrice table in sqlite database: 100%|██████████████████████| 1/1 [00:05<00:00,  5.46s/it]


In [3]:

# Only the "DA price" series is processed here; the loop over every series is disabled:
# for type_value in tqdm.tqdm(raw_data_df[type_col].unique(), desc="Clean data and generate scenarios of " + table_name):
type_value = "DA price"

# Clean the selected series (null filling + outlier removal), then derive its scenarios.
cleaned_data_df: pl.DataFrame = fill_null_remove_outliers(
    raw_data_df.filter(pl.col(type_col) == type_value),
    z_score=z_score,
    d_time=d_time,
)
data_scenarios_df, fill_null = generate_scenarios(cleaned_data_df, d_time=d_time)


In [4]:

# Day-ahead forecast computed from the scenario frame.
data_forecast_df: pl.DataFrame = day_ahead_forecast_arima_with_lag(
    data_scenarios_df,
    non_negative=non_negative,
    d_time=d_time,
)

# Plot scenarios against the forecast; the file name combines table and series name.
plot_forecast_results(
    file_name=f"{table_name}_{type_value}",
    data_forecast_df=data_forecast_df,
    data_scenarios_df=data_scenarios_df,
)

In [5]:
# Inspect the cleaned frame (rendered through the notebook's rich display).
cleaned_data_df

timestamp,year,week,time_step,value,delta_t
datetime[μs],i64,i64,i64,f64,duration[μs]
2015-01-01 00:00:00,2015,1,73,44.94,1h
2015-01-01 01:00:00,2015,1,74,43.43,1h
2015-01-01 02:00:00,2015,1,75,38.08,1h
2015-01-01 03:00:00,2015,1,76,35.47,1h
2015-01-01 04:00:00,2015,1,77,30.83,1h
2015-01-01 05:00:00,2015,1,78,28.26,1h
2015-01-01 06:00:00,2015,1,79,25.36,1h
2015-01-01 07:00:00,2015,1,80,26.93,1h
2015-01-01 08:00:00,2015,1,81,24.95,1h
2015-01-01 09:00:00,2015,1,82,26.71,1h


In [6]:
# Clean the "FCR" series step by step: truncate to the sampling step, drop outliers,
# fill gaps and index every sample by (year, week, time_step).
type_value = "FCR"
time_step_per_day: int = int(timedelta(days=1)/d_time)

# Filter into a dedicated name instead of overwriting `raw_data_df`: the full table
# stays available, so cells selecting another series can be re-run after this one.
fcr_raw_df = raw_data_df.filter(pl.col(type_col) == type_value)

# Parse the timestamps, truncate them to the sampling step and average duplicates.
cleaned_data_df = fcr_raw_df.select([
    pl.col("timestamp").str.strptime(dtype=pl.Datetime, format="%Y-%m-%d %H:%M:%S%.6f").dt.truncate(d_time),
    pl.col("value")
]).sort("timestamp")\
.group_by("timestamp", maintain_order=True).agg(pl.col("value").mean())

# Remove outliers using a z-score test
cleaned_data_df = cleaned_data_df\
.with_columns(
    (((pl.col("value") - pl.col("value").mean()) / pl.col("value").std()).abs()).alias("z_score")
).filter(pl.col("z_score") < z_score)

# Fill missing values with the value of the previous day, next day, previous week and
# next week (in that order), then forward-fill whatever is still missing.
cleaned_data_df = cleaned_data_df.sort("timestamp")\
.upsample(time_column="timestamp", every=d_time, maintain_order=True)\
.with_columns(
    pl.col("value").fill_null(pl.col("value").shift(n = time_step_per_day))
    .fill_null(pl.col("value").shift(n = -time_step_per_day))
    .fill_null(pl.col("value").shift(n = 7*time_step_per_day))
    .fill_null(pl.col("value").shift(n = -7*time_step_per_day))
    .fill_null(strategy="forward"),
    pl.col("timestamp").dt.year().cast(pl.Int64).alias("year"),
    pl.col("timestamp").dt.week().cast(pl.Int64).alias("week"),
    pl.col("timestamp").dt.weekday().cast(pl.Int64).alias("weekday"),
    pl.col("timestamp").dt.month().cast(pl.Int64).alias("month"),
)

# With several years of data: drop week 53 and move the days of week 52 falling in
# January (resp. week 1 falling in December) to the previous (resp. next) year, so
# that every week belongs to a single year.
if cleaned_data_df["year"].unique().shape[0] != 1:
    cleaned_data_df = cleaned_data_df\
    .filter(pl.col("week") <= 52)\
    .with_columns(
        pl.when((pl.col("week") == 52) & (pl.col("month") == 1))
        .then(pl.col("year") - 1)
        .when((pl.col("week") == 1) & (pl.col("month") == 12))
        .then(pl.col("year") + 1)
        .otherwise(pl.col("year")).alias("year")
    )
else:
    cleaned_data_df = cleaned_data_df.sort(["year","week",  "weekday","timestamp"])

# Define the time step: position of the sample inside its day, offset by the number
# of full days elapsed since the start of the week (weekday is 1-based).
# The counter is a window expression, so it is computed per (year, week, weekday)
# group for each row directly and does not depend on the groups being contiguous
# in row order (the former group_by/explode result was attached by position).
cleaned_data_df = cleaned_data_df.with_columns(
    (
        pl.int_range(0, pl.len()).over(["year", "week", "weekday"])
        + (pl.col("weekday") - 1)*time_step_per_day
    ).alias("time_step")
).filter(pl.struct(["year","week", "time_step"]).is_first_distinct()).sort(["year","week", "time_step"])


In [7]:

# Reduce the frame to its final layout (index columns as Int64, constant sampling
# step in `delta_t`) and order it by year, week and time step.
cleaned_data_df = (
    cleaned_data_df
    .select(
        pl.col("timestamp"),
        pl.col("year", "week", "time_step").cast(pl.Int64),
        pl.col("value"),
        pl.lit(d_time).alias("delta_t"),
    )
    .sort("year", "week", "time_step")
)

In [8]:
# NOTE(review): this cell does not run on a fresh kernel. Its recorded output is
# `NameError: name 'data_scenarios_dict' is not defined`, and `d`, `predict_years`,
# `step_per_day` and `days_to_predict` are not defined in the visible cells either.
# It also rebinds `raw_data_df`, the name that holds the table read from the database.
raw_data_df = data_scenarios_dict[d] 
scenario_list = raw_data_df["scenario"].unique().to_list()

# Wide layout: one row per (week, time_step), one column per scenario.
raw_data_df = raw_data_df.pivot(values="value", index=["week", "time_step"], columns="scenario").sort(["week", "time_step"])

# Replace every scenario column by the output of `predict_years` for that column.
predicted_data_df = raw_data_df.with_columns([
    predict_years(values=raw_data_df[scenario], step_per_day=step_per_day, days_to_predict=days_to_predict, lag=4).alias(scenario)
    for scenario in scenario_list
])




NameError: name 'data_scenarios_dict' is not defined

In [None]:
# Inspect the value currently bound to `d_time`.
d_time

In [None]:

# Reduce the frame to its final layout: timestamp, Int64 index columns, value and
# the constant sampling step stored in `delta_t` (row order is left untouched).
cleaned_data_df = cleaned_data_df.select(
    pl.col("timestamp"),
    pl.col("year", "week", "time_step").cast(pl.Int64),
    pl.col("value"),
    pl.lit(d_time).alias("delta_t"),
)
    

In [None]:
# for key, values in data.items():
#     data[key] = data[key].with_columns(pl.col("week").cast(pl.UInt32))



    # data_forecast[d] = generate_dataframe_forecast(data_scenarios[d], d_time="1h", non_negative=non_negative)
#     data_scenarios[d] = data_scenarios[d].select(["week", "time_step", "scenario", "delta_t", "value"]).with_columns([pl.lit(d).alias(additional_column), pl.lit("RT").alias("horizon")])
#     data_forecast[d] = data_forecast[d].select(["week", "time_step", "scenario", "delta_t", "value"]).with_columns([pl.lit(d).alias(additional_column), pl.lit("DA").alias("horizon")])
#     result_data = pl.concat([result_data, data_scenarios[d], data_forecast[d]])
# result_data.write_database(table_name=table_schema_name + "Norm", connection=f"sqlite+pysqlite:///{db_cache_file}", if_exists="replace", engine="sqlalchemy")

In [None]:
# Cleaning of the series stored under key `d` of `data`.
# NOTE(review): `data` and `d` are not defined in the visible cells of this notebook.
d_time = timedelta(hours=1)
z_score = 10

# Parse the timestamps, truncate them to the sampling step and average duplicates.
cleaned_data_df = (
    data[d]
    .select([
        pl.col("timestamp")
        .str.strptime(dtype=pl.Datetime, format="%Y-%m-%d %H:%M:%S%.6f")
        .dt.truncate(d_time),
        pl.col("value"),
    ])
    .group_by("timestamp", maintain_order=True)
    .agg(pl.col("value").mean())
)

# Drop every sample lying `z_score` or more standard deviations away from the mean.
cleaned_data_df = (
    cleaned_data_df
    .with_columns(
        (((pl.col("value") - pl.col("value").mean()) / pl.col("value").std()).abs()).alias("z_score")
    )
    .filter(pl.col("z_score") < z_score)
)

# Fill gaps with the value one day before, one day after, one week before and one
# week after (in that order), then forward-fill the remainder. Calendar columns are
# added in the same pass; `weekday` is shifted to start at 0 here.
cleaned_data_df = (
    cleaned_data_df
    .sort("timestamp")
    .upsample(time_column="timestamp", every=d_time, maintain_order=True)
    .with_columns(
        pl.col("value")
        .fill_null(pl.col("value").shift(n=int(timedelta(days=1)/d_time)))
        .fill_null(pl.col("value").shift(n=-int(timedelta(days=1)/d_time)))
        .fill_null(pl.col("value").shift(n=int(timedelta(days=7)/d_time)))
        .fill_null(pl.col("value").shift(n=-int(timedelta(days=7)/d_time)))
        .fill_null(strategy="forward"),
        pl.col("timestamp").dt.year().alias("year"),
        pl.col("timestamp").dt.week().alias("week"),
        (pl.col("timestamp").dt.weekday() - 1).alias("weekday"),
        pl.col("timestamp").dt.month().alias("month"),
    )
)

# Drop week 53 and reassign the year of week-52 days in January (previous year) and
# week-1 days in December (next year) so that each week belongs to a single year.
cleaned_data_df = (
    cleaned_data_df
    .filter(pl.col("week") <= 52)
    .with_columns(
        pl.when((pl.col("week") == 52) & (pl.col("month") == 1))
        .then(pl.col("year") - 1)
        .when((pl.col("week") == 1) & (pl.col("month") == 12))
        .then(pl.col("year") + 1)
        .otherwise(pl.col("year"))
        .alias("year")
    )
)


In [None]:
# Inspect the cleaned frame produced by the preceding cleaning steps.
cleaned_data_df

In [None]:

# define time step

In [None]:
# Inspect whatever frame is currently bound to `raw_data_df`.
raw_data_df

In [None]:
# Read the complete DischargeFlow table through the open connection and display it.
rivers_df = pl.read_database(
    query=f"SELECT {DischargeFlow.__tablename__}.* FROM {DischargeFlow.__tablename__}",
    connection=con,
)
rivers_df

In [None]:
# Yearly mean of the "Short price" series, ordered from the lowest to the highest mean.
# NOTE(review): `cleaned_data` is not defined in the visible cells of this notebook.
yearly_mean_val = (
    cleaned_data["Short price"]
    .group_by("year")
    .agg(pl.col("value").mean())
    .sort("value")
)

# "Median" year: the year whose mean has the smallest squared deviation from the
# average of all yearly means.
year_median = (
    yearly_mean_val
    .with_columns((pl.col("value") - pl.mean("value")) ** 2)
    .sort("value")
    .get_column("year")[0]
)

# Map the years with the lowest / highest / most central mean to scenario names.
year_mapping_df = pl.DataFrame(
    {
        "year": [
            yearly_mean_val.get_column("year")[0],
            yearly_mean_val.get_column("year")[-1],
            year_median,
        ],
        "scenario": ["min", "max", "median"],
    }
).with_columns(pl.col("year").cast(pl.Int64))
year_mapping_df

In [None]:

# Keep only the rows of the three selected years and tag them with their scenario name.
stat_data_by_year = cleaned_data["Short price"].join(
    year_mapping_df,
    how="inner",
    on="year",
)

stat_data_by_year

In [None]:
# Inspect the cleaned frame stored under key `d`.
# NOTE(review): `cleaned_data` and `d` are not defined in the visible cells.
cleaned_data[d]

In [None]:
# Per (week, time_step) statistics over all years, reshaped to long format with one
# row per scenario.
# NOTE(review): the column named "median" is computed with `mean()` — confirm intent.
stat_data_by_time_step = (
    cleaned_data["Short price"]
    .group_by(["week", "time_step"], maintain_order=True)
    .agg([
        pl.col("value").mean().alias("median"),
        pl.col("value").max().alias("max"),
        pl.col("value").min().alias("min"),
        pl.col("delta_t").first(),
    ])
    .melt(
        id_vars=["week", "time_step", "delta_t"],
        value_vars=["median", "max", "min"],
        variable_name="scenario",
        value_name="time_step_value",
    )
    .sort(["week", "time_step"])
)
stat_data_by_time_step

In [None]:
# Generate scenario using yearly stat data with null filled with main set stat
scenario_data_df = stat_data_by_time_step.join(stat_data_by_year, on=["week", "time_step", "scenario"], how="outer")\
    .select([
        pl.col(["week", "time_step", "delta_t", "scenario"]),
        pl.col("value").fill_null(pl.col("time_step_value")).alias("value")
    ])
scenario_data_df

In [None]:
# Sanity check: for every cleaned series show the number of samples per year,
# followed by the number of samples per week of 2015.
for df in cleaned_data.values():
    display(
        df.group_by("year", maintain_order=True).agg(pl.col("value").count()),
        df.filter(pl.col("year") == 2015).group_by("week", maintain_order=True).agg(pl.col("value").count()),
    )

In [None]:
# Resample on a 15-minute grid, fill gaps from the neighbouring rows selected by the
# shifts below, and add calendar columns.
# NOTE(review): the grid step is hard-coded to 15 minutes while the shifts are
# expressed in `d_time` steps; they only span one day if `d_time` is 15 minutes.
cleaned_data_df = (
    cleaned_data_df
    .upsample(time_column="timestamp", every=timedelta(minutes=15), maintain_order=True)
    .with_columns(
        pl.col("value")
        .fill_null(pl.col("value").shift(n=int(timedelta(days=1)/d_time)))
        .fill_null(pl.col("value").shift(n=-int(timedelta(days=1)/d_time))),
        pl.col("timestamp").dt.year().alias("year"),
        pl.col("timestamp").dt.week().alias("week"),
    )
)
# Remaining nulls per column, then the frame itself.
print(cleaned_data_df.null_count())
cleaned_data_df

In [None]:

# Z-score filtering and upsampling of the "Long price" series.
# NOTE(review): `data` is not defined in the visible cells, and this cell rebinds
# `raw_data_df`, the name that holds the table read from the database.
raw_data_df = data["Long price"].select(["timestamp", "value"])

# Mean and standard deviation of the value column.
avg = raw_data_df.mean()["value"][0]
std = raw_data_df.std()["value"][0]

# Keep the samples whose z-score lies strictly inside (-z_score, z_score).
raw_data_df = raw_data_df.with_columns(((pl.col("value") - avg) / std).alias("z_score"))
raw_data_df = raw_data_df.filter((pl.col("z_score") > -z_score) & (pl.col("z_score") < z_score))

# Parse the timestamps, declare them sorted and insert the missing time steps.
raw_data_df = (
    raw_data_df
    .with_columns(pl.col("timestamp").str.strptime(dtype=pl.Datetime, format="%Y-%m-%d %H:%M:%S%.6f"))
    .set_sorted("timestamp", descending=False)
    .upsample(time_column="timestamp", every=d_time)
)
raw_data_df

In [None]:
# Scenario frame under study and its sampling step given as a string ("<n>h" or "<n>m").
# NOTE(review): `data_scenarios` and `d` are not defined in the visible cells, and
# `d_time` is rebound here from a timedelta to a string.
data_df = data_scenarios[d]
d_time = "1h"

# Sampling step expressed in hours. An unsupported unit now raises; the previous
# conditional expression silently bound the RuntimeError class to `d_time_int`.
if "h" in d_time:
    d_time_int = int(d_time.split("h")[0])
elif "m" in d_time:
    d_time_int = int(d_time.split("m")[0]) / 60
else:
    raise RuntimeError(f"Unsupported time step {d_time!r}: expected '<n>h' or '<n>m'")
def generate_arbitrary_year(year_col: pl.Expr) -> pl.Expr:
    """Project a datetime expression onto the fixed calendar year 2030.

    The timestamps are formatted with the literal year "2030" in place of their own
    year and parsed back into datetimes.

    Args:
        year_col: Datetime expression to project (annotated as ``pl.Expr``;
            ``pl.col`` is an expression factory, not a type).

    Returns:
        A datetime expression aliased ``"arbitrary_year"``. Parsing is non-strict,
        so strings that are not valid dates in 2030 (presumably 29 February) should
        come back as null rather than raising — TODO confirm.
    """
    return year_col.dt.strftime("2030-%m-%d %H:%M:%S")\
        .str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S", strict=False).alias("arbitrary_year")

# result_df = pl.DataFrame(schema={"timestamp": pl.Datetime(time_unit="ns"), "value": pl.Float64, "scenario": pl.Utf8})
# Preview the scenario frame with the extra "arbitrary_year" column (result is only displayed, not stored).
data_df.with_columns(generate_arbitrary_year(pl.col("timestamp")))

In [None]:
# Per-scenario forecast through pandas, accumulated into `result_df`.
# NOTE(review): this cell relies on `arbitrary_year` and on a pre-existing `result_df`,
# neither of which is defined by an active statement in the visible cells (the only
# `result_df` initialisation visible is commented out) — confirm before running.
for scen in data_df["scenario"].unique().to_list():
    # One scenario at a time: pandas frame indexed by timestamp, holding only the value column.
    df_temp = data_df.filter(pl.col("scenario")==scen).with_columns(arbitrary_year).sort("timestamp").select(["timestamp", "value"]).to_pandas().set_index("timestamp", drop=True)
    # Drop duplicated timestamps and forward-fill onto a regular hourly index.
    df_temp = df_temp[~df_temp.index.duplicated()].asfreq('1h', method = 'ffill')
    result_temp = day_ahead_forecast_arima_with_lag(df_temp, non_negative=non_negative)
    # Back to polars, tagged with the scenario name.
    data_forecast_temp = pl.from_dict({"timestamp": df_temp.index, "value": result_temp}).with_columns(pl.lit(scen).alias("scenario"))
    print(result_df.head())
    print(data_forecast_temp.head())
    result_df = pl.concat([result_df, data_forecast_temp])
result_df = result_df.with_columns([arbitrary_year.dt.week().alias("week"), pl.col("timestamp").dt.year().alias("year")])
# define time step: running index of each sample inside its (week, year, scenario) group
result_df = result_df.group_by(["week", "year", "scenario"], maintain_order=True).agg([pl.col("timestamp"), pl.col("value")]).with_columns(pl.col("value").map_elements(lambda x: range(len(x))).alias("time_step"))
result_df = result_df.explode(["timestamp", "time_step", "value"]).select(["timestamp", "year", "week", "time_step", "value", "scenario"])
# remove first and end weeks to have consistent years (168 = hours per week)
result_df = result_df.filter((pl.col("week") < 53) & (pl.col("time_step") < int(168 / d_time_int))).with_columns(pl.lit(d_time_int).alias("delta_t").cast(pl.Float64))

In [None]:
# Inspect the scenario frame stored under key `d`.
# NOTE(review): `data_scenarios` and `d` are not defined in the visible cells.
data_scenarios[d]