In [None]:
import os
import numpy as np
import pandas as pd
import datetime as dt
import pytz
import time
import calendar
import numpy as np
from statistics import mean

from uce_resources import get_mms_data, get_applied_forecast, get_current_forecast

In [None]:
from settings.sites import ceg as sites_list

# Reporting period processed by this notebook.
target_year = 2023
target_month = 1
# NOTE(review): not referenced by any cell visible here; kept for compatibility.
forecasts_types = ['real']

# Per-period output directory, e.g. 'data/results/2023-01/' (month zero-padded).
target_folder = f'data/results/{target_year}-{target_month:0>2}/'
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(target_folder, exist_ok=True)

# sites_list = ['Pohrebyshche']

# Data preparation section

In [None]:
from sqlalchemy import MetaData, create_engine
from sqlalchemy.pool import NullPool
from sqlalchemy.sql import and_, select
from settings.db import DO_URL, WAREHOSUE_URL

# Source database: NullPool disables connection pooling, so each connect()
# opens a fresh connection and closing it really closes it.
engine_source = create_engine(DO_URL, poolclass=NullPool)

# Reflect the source schema once so later cells can look tables up by name
# through metadata_source.tables[...].
metadata_source = MetaData()
metadata_source.reflect(bind=engine_source)

# Warehouse database (the name WAREHOSUE_URL is spelled as exported by settings.db).
engine_warehouse = create_engine(WAREHOSUE_URL, poolclass=NullPool)

In [None]:
from sqlalchemy.sql import text

# Warehouse lookup with the site name passed as a bound parameter, so a name
# containing a quote can neither break the statement nor inject SQL.
grid_capacity_query = text(
    'SELECT grid_capacity FROM dim_site WHERE site_name = :site_name'
)

# One DataFrame per site; concatenated by a later cell.
sites_data = list()

# Both engines are created with NullPool, so every connect() opens a new
# database connection: open each one once and reuse it for all sites.
with engine_source.connect() as connection, \
        engine_warehouse.connect() as connection_warehouse:

    # Loop-invariant: the reflected table and the columns read for every site.
    sites_table = metadata_source.tables['sites']
    list_to_select = [
        sites_table.c.id,
        sites_table.c.legal_entity,
        sites_table.c.location,
        sites_table.c.region,
        sites_table.c.cluster,
        sites_table.c.installed_capacity_dc,
        sites_table.c.grid_capacity,
    ]

    for site in sites_list:
        start = time.perf_counter()

        # --- Static site attributes from the source database ---
        query = select(list_to_select).where(sites_table.c.displayable_name == site)
        site_id_response = connection.execute(query).first()
        if site_id_response is None:
            # Same exception type the former `.fetchall()[0]` raised, now with context.
            raise IndexError(f"site {site!r} not found in table 'sites'")

        # The last selected column (sites.grid_capacity) is read but superseded
        # by the warehouse value fetched just below.
        (site_id, legal_entity_id, location, region, cluster,
         capacity_dc, _source_grid_capacity) = site_id_response

        response = connection_warehouse.execute(
            grid_capacity_query, {'site_name': site}
        ).first()
        if response is None:
            raise IndexError(f"site {site!r} not found in warehouse table 'dim_site'")
        grid_capacity = response[0]

        # `location` is parsed as two comma-separated floats with optional parentheses.
        latitude, longitude = map(float, location.replace('(', '').replace(')', '').split(','))

        # --- Measured yield ---
        mms_data, mms_version = get_mms_data(site_id,
                                             target_year, target_month,
                                             connection, metadata_source.tables['mms_data'], include_prev=True,)
        mms_data.columns = ['yield']

        # --- Forecasts: applied ones plus the three days after the last applied timestamp ---
        applied_forecast = get_applied_forecast(site_id, target_year, target_month,
                                                connection=connection, db_table=metadata_source.tables['forecasts_applied'])
        applied_forecast.columns = ['forecast']

        current_forecast_dates = [applied_forecast.index.max() + dt.timedelta(days=x) for x in range(1, 4)]
        current_forecast = get_current_forecast(site_id, current_forecast_dates, connection, metadata_source.tables['forecasts_applied']).to_frame()
        current_forecast.columns = ['forecast']

        forecast = pd.concat([applied_forecast, current_forecast])
        # Outer join keeps timestamps present in only one of the two sources;
        # yield rows older than the first forecast timestamp are dropped.
        forecast_data = pd.concat(
            [forecast, mms_data.loc[mms_data.index >= forecast.index.min()]],
            axis=1, join='outer',
        ).reindex(columns=['yield', 'forecast'])

        # --- Broadcast the per-site constants over the forecast index ---
        # Dict order defines the column order of the resulting frame.
        site_attributes = {
            'site': site,
            'latitude': latitude,
            'longitude': longitude,
            'region': region,
            'cluster': cluster,
            'capacity_dc': capacity_dc,
            'grid_capacity': grid_capacity,
        }
        site_data = pd.concat(
            {
                name: pd.Series(index=forecast_data.index, data=value)
                for name, value in site_attributes.items()
            },
            axis=1,
        )
        site_data['date'] = site_data.index.strftime('%Y-%m-%d')
        site_data['hour'] = site_data.index.hour + 1
        site_data['datetime'] = site_data.index.strftime('%Y-%m-%dT%H:%M')
        # The naive index is localized as UTC and rendered in Kyiv local time.
        site_data['datetime_tz'] = site_data.index.tz_localize(pytz.utc).tz_convert(pytz.timezone('europe/kiev')).strftime('%Y-%m-%dT%H:%M%z')

        site_data = pd.concat([site_data, forecast_data], axis=1)

        # --- Error metrics (vectorised) ---
        # clip() yields a clean 0.0 where the former `x * (x >= 0)` produced -0.0
        # for negative errors; NaN errors stay NaN and are labelled 'positive',
        # exactly as the former `x < 0` test did.
        error = site_data['yield'] - site_data['forecast']
        site_data['error'] = error
        site_data['error_positive'] = error.clip(lower=0)
        site_data['error_negative'] = error.clip(upper=0)
        site_data['error_abs'] = error.abs()
        site_data['error_type'] = np.where(error < 0, 'negative', 'positive')

        sites_data.append(site_data)
        end = time.perf_counter()

        print('{}: ок! Processing took {} seconds'.format(site, round(end - start, 2)))

In [None]:
# Stack the per-site frames into one table with a fresh integer index, replace
# every missing value with 0, then drop rows that are exact duplicates.
combined = pd.concat(sites_data, ignore_index=True)
data = combined.fillna(0).drop_duplicates(keep='first')

# Persist the result next to the other outputs for the target period.
csv_path = f'{target_folder}mart_operative_forecasting_results.csv'
data.to_csv(csv_path, index=False)

In [None]:
# Ad-hoc inspection: rows of a single site where the yield is non-positive
# while the forecast is negative. The working copy can be narrowed to a single
# day first, e.g. data.loc[data.date == '2022-03-26'].
data_26 = data.copy()
site = 'Porohy'

suspicious_rows = data_26.loc[
    (data_26.site == site) & (data_26['yield'] <= 0) & (data_26['forecast'] < 0)
]
print(suspicious_rows)
# Cell output: mean yield and mean forecast over the selected rows.
suspicious_rows[['yield', 'forecast']].mean()

### Inserting data into the data mart

In [None]:
from sqlalchemy.sql import text

# Columns of the unique key named in ON CONFLICT; every other column is
# overwritten with the incoming value when a row for that key already exists.
columns_unique = ['site', 'date', 'hour']
# Rows sent per INSERT statement: keeps statements bounded while avoiding one
# round trip per row.
chunk_size = 500

df_full = data.copy()
columns = list(df_full.columns)
columns_update = [column for column in columns if column not in columns_unique]

# dtype=object is expected to box numpy scalars into plain Python objects that
# the DB driver can bind — TODO confirm against the installed driver.
rows = df_full.to_numpy(dtype=object).tolist()

# Statement head and tail are fixed; only the VALUES placeholders vary per chunk.
query_1 = 'INSERT INTO mart_operative_forecasting_result({}) VALUES '.format(','.join(columns))
query_2 = 'ON CONFLICT ({})\nDO UPDATE SET\n{}'.format(
    ', '.join(columns_unique),
    ',\n'.join('{0} = excluded.{0}'.format(column) for column in columns_update),
)

# engine.begin() wraps all chunks in one transaction: committed on success,
# rolled back if any chunk fails. An empty frame sends no statement at all.
with engine_warehouse.begin() as connection:
    for offset in range(0, len(rows), chunk_size):
        chunk = rows[offset:offset + chunk_size]

        # One "(:r0c0, :r0c1, ...)" group per row; values travel as bound
        # parameters, so quotes or brackets inside strings are passed through
        # untouched instead of being spliced into the SQL text.
        placeholders = ', '.join(
            '(' + ', '.join(f':r{i}c{j}' for j in range(len(columns))) + ')'
            for i in range(len(chunk))
        )
        parameters = {
            f'r{i}c{j}': value
            for i, row in enumerate(chunk)
            for j, value in enumerate(row)
        }
        connection.execute(text(query_1 + placeholders + '\n' + query_2), parameters)
