In [None]:
import os
import io
import pytz
import pandas as pd
import datetime as dt
import calendar
import numpy as np
from sqlalchemy import create_engine, MetaData, desc
from sqlalchemy.sql import select, and_, or_, not_
from sqlalchemy.pool import NullPool

from settings.db import DO_URL
from settings.sites import ceg as all_sites

In [None]:
# Show the working directory: the relative paths used below (e.g. 'data/datahub/')
# are resolved against it. `os` is already imported in the import cell above.
os.getcwd()

In [None]:
# Container that will hold the reflected table definitions.
metadata = MetaData()

# NullPool: connections are opened on demand and not kept in a pool.
engine = create_engine(DO_URL, poolclass=NullPool)

# Load the definitions of the existing database tables into `metadata.tables`.
metadata.reflect(bind=engine)

In [None]:
def get_record_index(site_id, date, table, connection):
    """Return the row id to write a site's monthly record to.

    Searches ``table`` for a row whose ``site``, ``year`` and ``month`` match
    ``site_id`` and ``date``.

    Returns:
        tuple: ``(record_id, True)`` with the id of the first matching row when
        one exists, otherwise ``(next_id, False)`` where ``next_id`` is
        ``MAX(id) + 1`` (or ``1`` when the table has no ids yet).
    """
    same_site_and_month = and_(
        table.c.site == site_id,
        table.c.year == date.year,
        table.c.month == date.month,
    )
    # `completed` is fetched alongside the id but only the id is used below.
    lookup = select([table.c.id, table.c.completed]).where(same_site_and_month)
    matches = connection.execute(lookup).fetchall()

    if not matches:
        # No record for this site/month yet: derive the next id on the client.
        # NOTE(review): MAX(id) + 1 is not safe against concurrent writers.
        max_id_rows = list(connection.execute('SELECT MAX(id) FROM public.{0}'.format(table)))
        current_max = max_id_rows[0][0]
        next_id = 1 if current_max is None else int(current_max) + 1
        return next_id, False

    return matches[0][0], True


def get_site_id(site_name, connection):
    """Return the ``id`` of the ``public.sites`` row whose ``displayable_name`` is ``site_name``.

    The name is sent as a bound parameter instead of being formatted into the
    SQL string, so names containing quotes neither break the statement nor
    allow SQL injection.

    Args:
        site_name: value to match against ``displayable_name``.
        connection: object exposing ``execute(statement, parameters)``.

    Returns:
        The first column of the first matching row.

    Raises:
        IndexError: if no row matches (same exception type as before, now with
            an explanatory message).
    """
    # Local import: the notebook's import cell does not bring `text` into scope.
    from sqlalchemy import text

    query = text('SELECT id FROM public.sites WHERE displayable_name = :site_name')
    row = connection.execute(query, {'site_name': site_name}).first()
    if row is None:
        raise IndexError('No site with displayable_name {0!r} in public.sites'.format(site_name))
    return row[0]

def get_zv_code(site_name, connection):
    """Return the ``zv_code`` of the ``public.sites`` row whose ``displayable_name`` is ``site_name``.

    The name is sent as a bound parameter instead of being formatted into the
    SQL string, so names containing quotes neither break the statement nor
    allow SQL injection.

    Args:
        site_name: value to match against ``displayable_name``.
        connection: object exposing ``execute(statement, parameters)``.

    Returns:
        The first column of the first matching row.

    Raises:
        IndexError: if no row matches (same exception type as before, now with
            an explanatory message).
    """
    # Local import: the notebook's import cell does not bring `text` into scope.
    from sqlalchemy import text

    query = text('SELECT zv_code FROM public.sites WHERE displayable_name = :site_name')
    row = connection.execute(query, {'site_name': site_name}).first()
    if row is None:
        raise IndexError('No site with displayable_name {0!r} in public.sites'.format(site_name))
    return row[0]

def get_time_index(start_date, last_data_day, timezone='utc'):
    """Build the hourly timestamp index for one month of data.

    The index runs in Kyiv local time from 00:30 on the first day of
    ``start_date``'s month to 23:30 on ``last_data_day`` of that month, one
    entry per elapsed hour (so a DST-change day contributes 23 or 25 entries).

    Args:
        start_date: any object with ``year`` and ``month`` attributes.
        last_data_day: day of the month of the last day covered by the data.
        timezone: ``'utc'`` or ``'europe/kiev'`` (matched case-insensitively).

    Returns:
        A timezone-naive ``DatetimeIndex`` expressed in the requested zone, or
        ``None`` when ``timezone`` is not one of the supported values.

    Raises:
        ValueError: if ``last_data_day`` is not a valid day of that month.
    """
    start = dt.datetime(year=start_date.year, month=start_date.month, day=1, hour=0, minute=30)
    # The last slot is always 23:30 of the last data day, whether or not that
    # day is the last calendar day of the month.
    end = dt.datetime(year=start_date.year, month=start_date.month, day=last_data_day, hour=23, minute=30)

    # Canonical zone key: the lowercase spelling only resolves where the tz
    # lookup happens to be case-insensitive.
    index_in_kyiv = pd.date_range(start=start, end=end, freq='1h', tz='Europe/Kiev')

    requested = timezone.lower() if isinstance(timezone, str) else timezone
    if requested == 'utc':
        return index_in_kyiv.tz_convert('UTC').tz_localize(None)
    if requested == 'europe/kiev':
        return index_in_kyiv.tz_localize(None)
    return None


def get_datahub_data(filename, data_version):
    """Load a DataHub Excel export and split it into per-ZV-code series.

    The sheet must contain the columns "ЕІС-код", "Коротка назва", "Джерело"
    and "Сума за період"; every other column is treated as an interval column
    whose header starts with a ``dd.mm.YYYY`` date.
    NOTE(review): presumably one column per hour, in chronological order -
    values are matched to the timestamps purely by column position.

    Args:
        filename: path of the Excel file to read.
        data_version: suffix for the ``generation_*``, ``consumption_*`` and
            ``total_*`` keys; ``completed`` can only be true for ``"v3"``.

    Returns:
        dict mapping each ZV code (first space-separated token of "ЕІС-код")
        to a dict with the keys ``timestamps_utc``, ``year``, ``month``,
        ``generation_<version>``, ``consumption_<version>``,
        ``total_<version>`` and ``completed``.
    """
    datahub_data = pd.read_excel(filename)

    # "ЕІС-код" is split on spaces: first token = ZV code; a last token of
    # "OUT" marks a consumption row, anything else a generation row.
    eic_tokens = datahub_data["ЕІС-код"].astype(str).str.split(" ")
    datahub_data["zv"] = eic_tokens.str[0]
    datahub_data["direction"] = (eic_tokens.str[-1] == "OUT").map({True: "consumption", False: "generation"})
    datahub_data["site"] = datahub_data["Коротка назва"].astype(str).str.split("-").str[-1]
    datahub_data = datahub_data.drop(columns=["ЕІС-код", "Коротка назва", "Джерело", "Сума за період"])

    # The three helper columns were appended last, so everything before them
    # is an interval column.
    interval_headers = datahub_data.columns[:-3].to_list()
    interval_dates = [
        dt.datetime.strptime(header.split(" ")[0], "%d.%m.%Y").date()
        for header in interval_headers
    ]
    index_in_utc = get_time_index(min(interval_dates), max(interval_dates).day, timezone='utc').tolist()

    # Values are taken positionally: the first len(index_in_utc) columns.
    interval_values = datahub_data.iloc[:, :len(index_in_utc)]

    # Identical for every ZV code, so computed once.
    year = interval_dates[-1].year
    month = interval_dates[-1].month
    covers_whole_month = max(index_in_utc).day == calendar.monthrange(year, month)[1]
    completed = covers_whole_month and data_version == "v3"

    def values_for(zv_code, direction):
        # Flattened integer values of all rows of one ZV code and direction.
        rows = interval_values.loc[(datahub_data["zv"] == zv_code) & (datahub_data["direction"] == direction)]
        return rows.fillna(0).astype(int).values.flatten().tolist()

    prepared_data = dict()
    for zv_code in datahub_data["zv"].unique():
        generation = values_for(zv_code, "generation")
        consumption = values_for(zv_code, "consumption")

        # A ZV code with rows for only one direction used to get an empty
        # total (zip stops at the shorter list); treat the missing direction
        # as zero instead.
        if generation and not consumption:
            total = list(generation)
        elif consumption and not generation:
            total = [-cons for cons in consumption]
        else:
            total = [gen - cons for gen, cons in zip(generation, consumption)]

        prepared_data[zv_code] = {
            # Per-site copy so the records do not share one list object.
            "timestamps_utc": list(index_in_utc),
            "year": year,
            "month": month,
            f"generation_{data_version}": generation,
            f"consumption_{data_version}": consumption,
            f"total_{data_version}": total,
            "completed": completed,
        }
    return prepared_data

In [None]:
# --- Run configuration ---
# Month being loaded, formatted as YYYY-MM.
target_period = '2024-05'
# Suffix of the generation/consumption/total keys produced by get_datahub_data.
mms_data_version = "v1"
# Folder that is scanned for the DataHub .xlsx export.
mms_yield_data_dir = 'data/datahub/'
# NOTE(review): presumably the day of March on which DST starts in each year.
# Not referenced by the cells visible here, and the 2023-2025 entries are one
# day earlier than the last Sunday of March - confirm before relying on it.
march_dlst_days = {
    2019: 31,
    2020: 29,
    2021: 28,
    2022: 27,
    2023: 25,
    2024: 30,
    2025: 29,
}

# First instant of the target month; its year and month drive the record lookup.
date = dt.datetime.strptime(target_period, '%Y-%m')
print(date)


In [None]:
# Pick the export to process. Candidates are sorted so the choice does not
# depend on the arbitrary order of os.listdir(); Excel lock files ("~$...") and
# names that merely end in "xlsx" without the dot are ignored.
xlsx_candidates = sorted(
    name
    for name in os.listdir(mms_yield_data_dir)
    if name.lower().endswith('.xlsx')
    and not name.startswith('~$')
    and os.path.isfile(os.path.join(mms_yield_data_dir, name))
)
if not xlsx_candidates:
    raise FileNotFoundError('No .xlsx file found in {0!r}'.format(mms_yield_data_dir))
if len(xlsx_candidates) > 1:
    print('Several .xlsx files found, using the first of: {0}'.format(xlsx_candidates))

data_file = xlsx_candidates[0]
print(data_file)

In [None]:
# Parse the selected export. os.path.join keeps the path valid even if
# mms_yield_data_dir is configured without a trailing slash.
datahub_data = get_datahub_data(os.path.join(mms_yield_data_dir, data_file), mms_data_version)

In [None]:
# Write one `mms_data` record per site, then archive the processed file.
table = metadata.tables['mms_data']

for site in all_sites:
    print('-'*30)
    print(site)

    # One explicit transaction per site: committed when the block exits
    # normally, rolled back if anything inside raises.
    with engine.begin() as connection:
        zv_code = get_zv_code(site, connection)
        site_data = datahub_data[zv_code]

        # The record lookup below uses `date` (from target_period) while the
        # written year/month come from the file. Refuse to continue if they
        # disagree, otherwise another month's record would be overwritten.
        if (site_data['year'], site_data['month']) != (date.year, date.month):
            raise ValueError(
                'File {0!r} holds data for {1}-{2:02d} but target_period is {3!r}'.format(
                    data_file, site_data['year'], site_data['month'], target_period
                )
            )

        site_data['site'] = get_site_id(site, connection)
        site_data['id'], to_update = get_record_index(site_data['site'], date, table, connection)

        # Visual sanity check: series of the loaded version should all have
        # the same length as the timestamps.
        print('Columns length: {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |'.format(
            len(site_data.get("timestamps_utc", [])),
            len(site_data.get("generation_v1", [])),
            len(site_data.get("generation_v2", [])),
            len(site_data.get("generation_v3", [])),
            len(site_data.get("consumption_v1", [])),
            len(site_data.get("consumption_v2", [])),
            len(site_data.get("consumption_v3", [])),
            len(site_data.get("total_v1", [])),
            len(site_data.get("total_v2", [])),
            len(site_data.get("total_v3", []))
        ))

        print('Ok!')

        if to_update:
            update_statement = table.update().values(**site_data).where(table.c.id == site_data['id'])
            connection.execute(update_statement)
            print('Database record updated!')
        else:
            insert_statement = table.insert().values(**site_data)
            connection.execute(insert_statement)
            print('Data inserted to database')

# Reached only when every site was written: move the file out of the inbox so
# it is not picked up again.
archive_dir = os.path.join(mms_yield_data_dir, 'archived')
os.makedirs(archive_dir, exist_ok=True)
processed_at = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%d_%H%M%S')
os.rename(
    os.path.join(mms_yield_data_dir, data_file),
    os.path.join(archive_dir, f"datahub_file_processed_on_{processed_at}.xlsx"),
)