In [None]:
import calendar
import datetime as dt
import io
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, MetaData, desc, text
from sqlalchemy.pool import NullPool
from sqlalchemy.sql import select, and_, or_, not_

from settings.db import DO_URL

In [None]:
# Database engine for the URL configured in settings.db. NullPool disables
# connection pooling, so every engine.connect() opens its own connection.
engine = create_engine(DO_URL, poolclass=NullPool)
# Reflect the existing database schema so tables can be looked up by name
# through metadata.tables[...].
metadata = MetaData()
metadata.reflect(bind=engine)


def clean_file(file_name):
    """Rewrite a raw export file into an in-memory text stream that is easier to parse.

    Every non-blank line is transformed character-wise:

    * characters 6-9, 2-5 and 0-1 are moved to the front in that order, which
      turns a leading ``DD.MM.YYYY`` into ``YYYY.MM.DD`` (assumes each line
      starts with a ``DD.MM.YYYY HH:MM`` stamp - TODO confirm against the
      export format);
    * character 14 (the tens digit of the minutes under that layout) is
      forced to ``'3'``;
    * characters 16-23 are removed;
    * every ``'.'`` becomes ``'-'`` and the characters U+00A0 (no-break space)
      and U+0412 are stripped from the whole line.

    Blank (whitespace-only) lines are dropped instead of crashing the
    conversion, so a trailing empty line in the export is harmless.

    Parameters
    ----------
    file_name : str
        Path of the UTF-8 encoded source file.

    Returns
    -------
    io.StringIO
        Stream holding the rewritten lines, positioned at the start.

    Raises
    ------
    IndexError
        If a non-blank line is shorter than 15 characters and therefore
        cannot contain the expected stamp.
    """
    output_file = io.StringIO()
    with open(file_name, encoding='utf8') as read_file:
        # Stream the file line by line; there is no need to hold it twice in memory.
        for line_number, line in enumerate(read_file, start=1):
            if not line.strip():
                continue
            if len(line) < 15:
                raise IndexError(
                    'Line {0} of {1} is too short to contain a date stamp: {2!r}'.format(
                        line_number, file_name, line))
            line = (
                line[6:10] + line[2:6] + line[0:2]  # reorder the date parts
                + line[10:14]
                + '3'                               # overwrite character 14
                + line[15:16]
                + line[24:]                         # skip characters 16-23
            )
            line = line.replace('.', '-').replace('\u00A0', '').replace('\u0412', '')
            output_file.write(line)
    output_file.seek(0)
    return output_file


def get_record_index(site_id, date, table, connection):
    """Find the row id to use for a site's record of a given month.

    Looks in ``table`` for rows whose ``site``, ``year`` and ``month`` columns
    match ``site_id``, ``date.year`` and ``date.month``.

    Returns
    -------
    tuple
        ``(id, True)`` with the id of the first matching row when one exists,
        otherwise ``(next_id, False)`` where ``next_id`` is ``MAX(id) + 1`` of
        the table (``1`` when the table has no ids yet).
    """
    same_site_and_month = and_(table.c.site == site_id,
                               table.c.year == date.year,
                               table.c.month == date.month)
    lookup = select([table.c.id, table.c.completed]).where(same_site_and_month)
    existing_rows = connection.execute(lookup).fetchall()

    if existing_rows:
        # Only the id of the first match is used; the selected `completed`
        # value is not read here.
        return existing_rows[0][0], True

    max_id_rows = list(connection.execute('SELECT MAX(id) FROM public.{0}'.format(table)))
    current_max = max_id_rows[0][0]
    if current_max is None:
        next_id = 1
    else:
        next_id = int(current_max) + 1
    return next_id, False


def get_site_id(site_name, connection):
    """Return the id of the site whose ``displayable_name`` equals ``site_name``.

    Parameters
    ----------
    site_name : str
        Display name to look up in ``public.sites``.
    connection
        Open SQLAlchemy connection used to run the query.

    Returns
    -------
    The ``id`` column of the first matching row.

    Raises
    ------
    IndexError
        If no row in ``public.sites`` has that display name.
    """
    # Bind the name as a parameter instead of formatting it into the SQL text:
    # a name containing an apostrophe would otherwise break the statement
    # (or allow SQL injection).
    query = text('SELECT id FROM public.sites WHERE displayable_name = :site_name')
    rows = list(connection.execute(query, {'site_name': site_name}))
    if not rows:
        raise IndexError(
            'No site with displayable_name {0!r} found in public.sites'.format(site_name))
    return rows[0][0]

def get_time_index(start_date, last_data_day, timezone='utc'):
    """Build the hourly timestamp index for one month of data.

    The index is generated in Kyiv local time, from 00:30 on the first day of
    ``start_date``'s month up to 23:30 on day ``last_data_day`` of that month,
    one entry per elapsed hour. Because the range is timezone-aware, days with
    a daylight-saving switch contribute 23 or 25 entries.

    Parameters
    ----------
    start_date : datetime-like
        Only ``.year`` and ``.month`` are used.
    last_data_day : int
        Day of the month of the last data row (1 .. number of days in month).
    timezone : str, default 'utc'
        ``'utc'`` returns the index converted to UTC; ``'europe/kiev'`` (or
        ``'europe/kyiv'``) returns Kyiv wall-clock times. Matching is
        case-insensitive.

    Returns
    -------
    pandas.DatetimeIndex or None
        Timezone-naive index in the requested zone, or ``None`` when
        ``timezone`` is not one of the supported values.

    Raises
    ------
    ValueError
        If ``last_data_day`` is not a valid day of the month.
    """
    start = dt.datetime(year=start_date.year, month=start_date.month, day=1, hour=0, minute=30)
    # The last hourly slot of the last data day; this is the same instant as
    # "00:30 of the following day minus one hour", without month/year rollover
    # arithmetic.
    end = dt.datetime(year=start_date.year, month=start_date.month,
                      day=int(last_data_day), hour=23, minute=30)
    # Canonical zone key and a timedelta frequency keep this independent of
    # case-insensitive zone lookup and of the deprecated 'H' frequency alias.
    index_in_kyiv = pd.date_range(start=start, end=end, freq=dt.timedelta(hours=1), tz='Europe/Kiev')

    requested = timezone.lower() if isinstance(timezone, str) else timezone
    if requested == 'utc':
        return index_in_kyiv.tz_convert('UTC').tz_localize(None)
    if requested in ('europe/kiev', 'europe/kyiv'):
        return index_in_kyiv.tz_localize(None)
    return None


def get_site_data(filename, date):
    """Load one site's monthly export into a dict of column lists.

    The file is passed through ``clean_file`` and parsed as a ``;``-separated
    table with ``,`` as decimal mark; the first line is skipped and the values
    ``'f'`` / ``'ff'`` are read as missing. Rows are indexed by the UTC
    timestamps from ``get_time_index`` and rows with no values at all are
    dropped.

    Returns
    -------
    dict
        * ``generation_v1..v3`` / ``consumption_v1..v3``: values cast to int,
          or an empty list when the column has any missing value;
        * ``total_v1..v3``: element-wise generation minus consumption;
        * ``timestamps_utc``: the row timestamps as ``datetime`` objects;
        * ``year`` / ``month``: taken from ``date``;
        * ``completed``: True when the latest timestamp falls on the last
          calendar day of the month and ``total_v3`` is not empty.
    """
    versions = ('v1', 'v2', 'v3')
    value_columns = ['generation_' + version for version in versions] \
        + ['consumption_' + version for version in versions]

    frame = pd.read_csv(clean_file(filename), sep=';', decimal=',', encoding='utf8',
                        skiprows=1, header=None, names=['datetime'] + value_columns,
                        na_values=['f', 'ff'])
    # Characters 8-9 of the cleaned stamp hold the day of the month.
    frame['day'] = frame['datetime'].apply(lambda stamp: int(stamp[8:10]))
    frame.index = get_time_index(date, frame['day'].max())
    frame.index.name = 'timestamp_utc'
    frame = frame.drop(['datetime', 'day'], axis=1)
    frame = frame.dropna(how='all')

    site_data = dict()
    for column in value_columns:
        series = frame[column]
        if series.isnull().values.any():
            # A column with gaps is reported as empty rather than partially.
            site_data[column] = list()
        else:
            site_data[column] = list(series.astype(int))

    for version in versions:
        generated = site_data['generation_' + version]
        consumed = site_data['consumption_' + version]
        site_data['total_' + version] = [gen - cons for gen, cons in zip(generated, consumed)]

    site_data['timestamps_utc'] = list(frame.index.to_pydatetime())
    site_data['year'] = date.year
    site_data['month'] = date.month

    days_in_month = calendar.monthrange(date.year, date.month)[1]
    reaches_month_end = max(site_data['timestamps_utc']).day == days_in_month
    has_v3_totals = bool(site_data['total_v3'])
    site_data['completed'] = reaches_month_end and has_v3_totals
    return site_data

In [None]:
# Name of the batch folder under data/mms/; the part before '_' is the
# year-month of the data.
target_period = '2022-04_1-4'
mms_yield_data_dir = 'data/mms/' + target_period + '/'
# NOTE(review): looks like the day of March with the daylight-saving switch
# for each year; it is not referenced by the code visible here - confirm.
march_dlst_days = {2019: 31, 2020: 29, 2021: 28, 2022: 27}

# First day of the target month, parsed from the 'YYYY-MM' prefix.
date = dt.datetime.strptime(target_period.partition('_')[0], '%Y-%m')
print(date)


In [None]:
# Collect the CSV files of the target period, keyed by site name. The site
# name is the part of the file name before the first '_', with '-' replaced
# by '_'; when several files map to the same site the last one listed wins.
data_files = [f for f in os.listdir(mms_yield_data_dir)
              if os.path.isfile(os.path.join(mms_yield_data_dir, f)) and f[-3:] == 'csv']
mms_yield_files = dict()

for data_file in data_files:
    site = data_file.split('_')[0].replace('-', '_')
    mms_yield_files.update({site: data_file})
print(len(mms_yield_files.keys()))

# Parse every site file and insert or update its monthly record in mms_data.
for site in mms_yield_files.keys():
    print('-'*30)
    print(site)

    site_data = get_site_data(mms_yield_data_dir + mms_yield_files[site], date)

    # One placeholder per reported list, so the three total_* lengths are
    # printed as well instead of being silently dropped by str.format.
    print('Columns length:| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |'.format(
        len(site_data['timestamps_utc']),
        len(site_data['generation_v1']),
        len(site_data['generation_v2']),
        len(site_data['generation_v3']),
        len(site_data['consumption_v1']),
        len(site_data['consumption_v2']),
        len(site_data['consumption_v3']),
        len(site_data['total_v1']),
        len(site_data['total_v2']),
        len(site_data['total_v3'])))
    print('Ok!')

    with engine.connect() as connection:
        table = metadata.tables['mms_data']
        site_data['site'] = get_site_id(site, connection)
        # Reuse the id of an existing record for this site and month,
        # otherwise take the next free id.
        site_data['id'], to_update = get_record_index(site_data['site'], date, table, connection)

        if to_update:
            update_statement = table.update().values(**site_data).where(table.c.id == site_data['id'])
            updated_id = connection.execute(update_statement)
            print('Database record updated!')
        else:
            insert_statement = table.insert().values(**site_data)
            inserted_id = connection.execute(insert_statement)
            # The original message had no placeholder, so its .format() call
            # was a no-op; report the id that was actually written.
            print('Data inserted to database (id={})'.format(site_data['id']))

