In [None]:
##%load water.md
from IPython.display import Markdown, display

# Render the notebook's introductory markdown.
# NOTE(review): Markdown receives the plain string "water.md"; presumably IPython
# resolves it as a file path when that file exists in the working directory — confirm.
intro = Markdown("water.md")
display(intro)

## Set-Up

In [None]:
from os import environ
from urllib.parse import quote

from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine

# Read the local .env file (located starting from the current working directory)
# and let its values override any variables already present in the environment.
_ = load_dotenv(find_dotenv(usecwd=True), override=True)

# Connection settings, each falling back to a local-development default.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "MA")

# Percent-encode the credentials so characters such as '@', ':', '/' or '%' in a
# username or password cannot be mistaken for URL delimiters. safe="" encodes
# every reserved character, and '%20' (rather than '+') is used for spaces.
url_username = quote(username, safe="")
url_password = quote(password, safe="")

#establish database connection for Transform queries and Loads
cnx = create_engine(
    f'postgresql://{url_username}:{url_password}@{ipaddress}:{port}/{dbname}'
)

In [None]:
import pandas as pd
import numpy as np

In [None]:
data_dir = 'water/'
bills_path = data_dir + '20230523-Water bills FOIA request with parcels.xlsx'

# The first two spreadsheet rows are skipped on read. The 2023-05-23 export also
# carries one extra row at the end, which is dropped here.
bills = pd.read_excel(bills_path, skiprows=2).iloc[:-1, :]

In [None]:
def _sum_per_bill(rows, keys, value_col, out_col):
    """Sum ``value_col`` within each ``keys`` group of ``rows``.

    Returns a flat frame holding the key columns plus the total, named ``out_col``.
    """
    # The string 'sum' selects the pandas reduction directly; passing the Python
    # builtin is deprecated in recent pandas releases.
    summed = rows.groupby(keys).agg({value_col: 'sum'})
    summed.columns = [out_col]
    return summed.reset_index()


def normalize_water_bills(bills):
    """Collapse per-service bill lines into one row per account and read date.

    Rows are classified by 'Service Code': 10000-19999 is summed as water,
    20000-29999 as sewer, and 30000 or above as admin. 'Billed Usage' is summed
    for water and sewer; 'Total Amounts' is summed for each class and overall.

    Parameters
    ----------
    bills : pandas.DataFrame
        Needs the columns 'Account Number', 'Previous Read Date',
        'Current Read Date', 'Service Code', 'Billed Usage' and 'Total Amounts'.
        'UNKNOWN' cells are treated as missing, and missing read dates are
        forward-filled from the row above. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (account, date) with the columns account, date (datetime),
        water_usage and sewer_usage (integers), water_cost, sewer_cost,
        admin_cost and amount (the total over all service codes). Missing
        values are reported as 0.

    Raises
    ------
    AssertionError
        If a row with service code 20000 repeats the account, read date and
        amount of an earlier row.
    """
    #assert(((bills.duplicated(['Account Number','Current Read Date','Total Billed']))&(bills['Service Code']==20000)).any()==False)
    repeated = bills.duplicated(['Account Number', 'Current Read Date', 'Total Amounts'])
    assert not (repeated & (bills['Service Code'] == 20000)).any(), \
        'duplicate service-code-20000 rows for the same account, read date and amount'

    keys = ['Account Number', 'Current Read Date']

    # replace() returns a new frame, so the assignments below never touch the
    # frame the caller passed in.
    bills = bills.replace({'UNKNOWN': np.nan})
    for col in ['Previous Read Date', 'Current Read Date']:
        bills[col] = bills[col].ffill()

    codes = bills['Service Code']
    water_rows = bills[codes.between(10000, 19999)]
    sewer_rows = bills[codes.between(20000, 29999)]
    admin_rows = bills[codes >= 30000]

    ##usage
    water_usage = _sum_per_bill(water_rows, keys, 'Billed Usage', 'water_usage')
    sewer_usage = _sum_per_bill(sewer_rows, keys, 'Billed Usage', 'sewer_usage')
    usage = water_usage.merge(sewer_usage, how='outer', on=keys)
    print('Accounts: ','water',len(water_usage),'sewer',len(sewer_usage),'combo',len(usage))

    ##bill amount
    water_cost = _sum_per_bill(water_rows, keys, 'Total Amounts', 'water_cost')
    sewer_cost = _sum_per_bill(sewer_rows, keys, 'Total Amounts', 'sewer_cost')
    admin_cost = _sum_per_bill(admin_rows, keys, 'Total Amounts', 'admin_cost')
    # Total over every service code, including any outside the three ranges.
    amount = _sum_per_bill(bills, keys, 'Total Amounts', 'amount')

    cost = (
        water_cost
        .merge(sewer_cost, how='outer', on=keys)
        .merge(admin_cost, how='outer', on=keys)
        .merge(amount, how='outer', on=keys)
    )

    df = (
        usage
        .merge(cost, how='outer', on=keys)
        .rename(columns={'Account Number': 'account', 'Current Read Date': 'date'})
    )

    # Only the summed columns can be missing after the outer merges (groupby
    # drops missing keys), so fill just those and leave the key columns alone.
    value_cols = ['water_usage', 'sewer_usage', 'water_cost',
                  'sewer_cost', 'admin_cost', 'amount']
    df = df.fillna(dict.fromkeys(value_cols, 0))
    for col in ['water_usage', 'sewer_usage']:
        df[col] = df[col].astype(int)

    df['date'] = pd.to_datetime(df['date'])
    return df




In [None]:
nbills = normalize_water_bills(bills)

water_used = nbills['water_usage']
sewer_used = nbills['sewer_usage']
readings_differ = water_used.ne(sewer_used)

# Where the water and sewer readings differ, report their sum; where they are
# equal, report the water reading alone.
nbills['usage'] = 0
nbills.loc[readings_differ, 'usage'] = (water_used + sewer_used)[readings_differ]
nbills.loc[~readings_differ, 'usage'] = water_used[~readings_differ]

output_columns = ['account', 'date', 'usage', 'amount']
updated_bills = nbills[output_columns]

In [None]:
# Keep only the last row for each account; surviving rows keep their order.
updated_bills = updated_bills.drop_duplicates('account', keep='last')

# Display the retained bills whose date is not 2023-02-28, ordered by date.
other_dates = updated_bills.loc[updated_bills['date'] != '2023-02-28']
other_dates.sort_values('date')

In [None]:
# Set every retained row's date to 2023-02-28. assign() builds a new frame, so
# nothing is written through .loc into a frame produced by filtering (which
# raises SettingWithCopyWarning), and the column stays a datetime rather than
# depending on how pandas coerces a string into it.
updated_bills = updated_bills.assign(date=pd.Timestamp('2023-02-28'))

In [None]:
# Append the per-account rows to infrastructure.water_bills; the DataFrame
# index is not written as a column.
updated_bills.to_sql(
    name='water_bills',
    con=cnx,
    schema='infrastructure',
    if_exists='append',
    index=False,
)


## Needs work

1. Update accounts
2. Find new PIDs
3. Need to truncate; should be just updates

## Load

In [None]:
# Number of consecutive bills summed for the trailing totals and used as the lag
# for the year-over-year change.
# NOTE(review): presumably bills are quarterly, so four bills span twelve
# months — confirm the billing cadence.
BILLS_PER_WINDOW = 4

query = """
    SELECT * FROM infrastructure.water_bills ORDER BY account, date
"""

water = pd.read_sql_query(query, cnx)

# Rolling sums within each account, in the row order returned by the query.
# Dropping only the group level keeps each row's original index label, so the
# assignments below align by label instead of relying on the groupby's account
# ordering happening to match the SQL ORDER BY.
ttm = (
    water.groupby('account')[['usage', 'amount']]
    .rolling(BILLS_PER_WINDOW)
    .sum()
    .reset_index(level=0, drop=True)
)
water['usage_ttm'] = ttm['usage']
water['amount_ttm'] = ttm['amount']

# Fractional change versus the bill BILLS_PER_WINDOW rows earlier for the same account.
water['usage_chg_YoY'] = water.groupby('account')['usage'].pct_change(BILLS_PER_WINDOW)
water['amount_chg_YoY'] = water.groupby('account')['amount'].pct_change(BILLS_PER_WINDOW)


In [None]:
# Append the frame to infrastructure.water_bills without writing its index.
# NOTE(review): `water` was read from this same table, so appending adds its
# rows again alongside the existing ones — confirm the table is emptied first.
water.to_sql(
    name='water_bills',
    con=cnx,
    schema='infrastructure',
    if_exists='append',
    index=False,
)