### Set-Up

In [None]:
from os import environ
from urllib.parse import quote

from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine

# Read the local .env file (located via find_dotenv(usecwd=True)) and let its
# values override any variables that already exist in the process environment.
_ = load_dotenv(find_dotenv(usecwd=True), override=True)

# Connection settings. Each one falls back to the literal default shown here
# when the corresponding environment variable is not set.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "ArlingtonMA")

# Percent-encode the credentials before placing them in the URL: a raw '@',
# ':', '/' or '%' inside the user name or password would otherwise be read as
# URL syntax and the engine would connect with the wrong values (or fail).
# safe="" makes quote() escape '/' as well.
user_enc = quote(username, safe="")
password_enc = quote(password, safe="")

# Establish the database connection used by the Transform queries and the Load.
cnx = create_engine(
    f'postgresql://{user_enc}:{password_enc}@{ipaddress}:{port}/{dbname}'
)


### Extract

In [None]:
import pandas as pd

# Download the workbook straight from the URL below and parse it with
# pandas' default read_excel settings.
url = (
    'https://files-cdn.masscec.com/uploads/'
    'Residential%20ASHP%20Data_For%20Website%20%282%29.xlsx'
)
data = pd.read_excel(url)

### Transform

In [None]:
# Export the Arlington projects, with short column names, to a TSV file.
df = (
    data.loc[data['Project Town'] == 'ARLINGTON']
    .copy()
    .rename(columns={
        'Date of Installation': 'date',
        'Total Costs': 'total',
        'Current Heating Fuel': 'fuel',
        'Rebate Amount $': 'rebate',
        'Current A/C': 'ac',
        'Current Heating $': 'heating_costs',
        '# Units (# of Outdoor Units)': 'outdoor_units',
        'Total # of indoor units': 'indoor_units',
        'Installed Capacity at 5°F': 'capacity',
        'Project Town': 'town',
        # NOTE(review): this key starts with two spaces; it presumably mirrors
        # the header exactly as it appears in the workbook - confirm.
        '  Footage': 'footage',
    })
)

# Renamed columns first, followed by the columns kept under their original
# workbook headers.
cols = [
    'date', 'town', 'total', 'Occupants', 'footage', 'fuel', 'rebate', 'ac',
    'heating_costs', 'outdoor_units', 'indoor_units', 'capacity',
    'Primary Installer Company Name',
    'Backup heat for heat pump (if applicable)',
    'New Construction or Retrofit?',
    'Project Street Adress',
    'Manufacturer',
    'Received Date',
]
df[cols].to_csv('heatpumps2.tsv', sep='\t', index=False)

In [None]:
# Build a per-month summary (keyed by the 'Received Date' month, formatted
# YYYYMM) of installs, costs, rebates, unit counts and capacity.
import numpy as np  # not needed by this cell any more; kept in case other cells use `np`

# Some 'Total Costs' entries are text (these should be fixed at the source);
# keep only the rows whose cost is a plain Python int or float.
mask = data['Total Costs'].apply(type).isin([int, float])
heatpumps = data[mask].copy()
heatpumps = heatpumps.rename(columns={
    'Received Date': 'date',
    'Total Costs': 'total',
    'Current Heating Fuel': 'fuel',
    'Rebate Amount $': 'rebate',
    'Current A/C': 'ac',
    'Current Heating $': 'heating_costs',
    '# Units (# of Outdoor Units)': 'outdoor_units',
    'Total # of indoor units': 'indoor_units',
    'Installed Capacity at 5°F': 'capacity',
    'Project Town': 'town',
    # NOTE(review): this key starts with two spaces; it presumably mirrors the
    # header exactly as it appears in the workbook - confirm.
    '  Footage': 'footage',
})

# Use the site city where one was supplied, otherwise the system owner's city.
mask = ~pd.isnull(heatpumps['Site City (if different)'])
heatpumps['city'] = heatpumps['System Owner City']
heatpumps.loc[mask, 'city'] = heatpumps.loc[mask, 'Site City (if different)']

cols = ['date', 'town', 'city', 'total', 'Occupants', 'footage', 'fuel', 'rebate',
        'ac', 'heating_costs', 'outdoor_units', 'indoor_units', 'capacity']
# Take an explicit copy: the frame is assigned into below, and writing into a
# column-slice of another frame can raise SettingWithCopyWarning.
heatpumps = heatpumps[cols].copy()

# Print the median of each column, ignoring any entries that are strings.
for col in ['footage', 'total', 'rebate', 'heating_costs', 'capacity']:
    not_text = heatpumps[col].apply(type) != str
    print(col, heatpumps.loc[not_text, col].median())

# A rebate recorded as text is counted as no rebate.
heatpumps.loc[heatpumps.rebate.apply(type) == str, 'rebate'] = 0

# Known free-text spellings of the indoor-unit count, mapped to integers.
fixit = {
    'one': 1,
    '3 (this field shows as $s / error)': 3,
    'two': 2,
    'Two': 2,
    'four': 4,
    'One': 1,
    'one (1)': 1,
    '3': 3,
    '-': 0,
}
heatpumps['indoor_units'] = heatpumps['indoor_units'].replace(fixit)

# The aggregated columns are still object dtype at this point. Convert them to
# real numbers so sum/mean/min/max are well defined; any text that is still
# left over becomes NaN and is simply skipped by the aggregations.
numeric_cols = ['total', 'rebate', 'indoor_units', 'outdoor_units', 'capacity']
heatpumps[numeric_cols] = heatpumps[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Aggregations are given by name ('sum', 'mean', ...) rather than as the
# builtins / NumPy functions, which newer pandas warns about. 'size' counts
# every row in the group, exactly as len did.
month = heatpumps['date'].dt.strftime('%Y%m')
heatpumps = heatpumps.groupby(month).agg({
    'town': 'size',
    'total': ['sum', 'mean'],
    'rebate': ['sum', 'mean'],
    'indoor_units': ['max', 'mean'],
    'outdoor_units': ['max', 'mean'],
    'capacity': ['min', 'max', 'mean'],
})
# Flatten the two-level column index; the order matches the agg spec above.
heatpumps.columns = [
    'installs', 'total', 'avg_cost', 'rebates', 'avg_rebate',
    'indoor_units_max', 'avg_indoor_units',
    'outdoor_units_max', 'avg_outdoor_units',
    'capacity_min', 'capacity_max', 'avg_capacity',
]
heatpumps = heatpumps.reset_index()

# Keep months up to and including 2019-03 (YYYYMM strings compare in
# chronological order), zero-fill missing values and cast everything,
# including the YYYYMM key, to int. The averages are truncated by the cast;
# that loss of precision is accepted here.
heatpumps = heatpumps[heatpumps.date <= '201903'].fillna(0).astype(int)

In [None]:
# Load the tab-separated heat pump file from the working directory.
# NOTE(review): the export cell above writes 'heatpumps2.tsv', while this reads
# 'heatpumps.tsv'; presumably a separately prepared file - confirm.
hp = pd.read_csv(
    'heatpumps.tsv',
    sep='\t',
)

In [None]:
# Look up a pid for each heat pump row by joining on street name, street
# number and unit against the 2018 assessment rows.
assess = pd.read_sql(
    'select streetname,streetnum,unit,pid from property.assessments where year = 2018;',
    con=cnx,
)
# Align the street-number column name with the one used in hp.
assess = assess.rename(columns={'streetnum': 'streetNum'})

# Build the join keys on hp: "<streetName> <streetSuffix>" with surrounding
# whitespace stripped, and the street number as a string.
hp['streetname'] = (hp['streetName'] + ' ' + hp['streetSuffix']).str.strip()
hp['streetNum'] = hp['streetNum'].astype(str)

# Left join so every hp row is kept, then write address/pid pairs to disk.
matched = hp.merge(assess, how='left', on=['streetname', 'streetNum', 'unit'])
matched = matched.sort_values(['streetName', 'streetNum'])
matched[['address', 'pid']].to_csv('/data/foo.tsv', sep='\t', index=False)


### Load

In [None]:
from sqlalchemy import text

# DDL that drops and recreates property.heatpumps together with its two
# indexes (on pid and on installed).
table_create_query = \
"""
        DROP TABLE IF EXISTS property.heatpumps;
        CREATE TABLE property.heatpumps (
            "received" DATE NOT NULL,
            "installed" DATE NOT NULL,
            "total" REAL NOT NULL,
            "occupants" SMALLINT,
            "footage" SMALLINT,           
            "fuel" VARCHAR(50),
            "rebate" REAL ,
            "ac" VARCHAR(50),

            "heating_costs" INTEGER,
            "outdoor_units" SMALLINT,
            "indoor_units" SMALLINT,
            "capacity" INTEGER,
            
            "installer" VARCHAR(255) ,
            "backup" VARCHAR(255) ,
            "address" VARCHAR(255) ,
            "manufacturer" VARCHAR(255) ,
            "streetNum" SMALLINT,
            "streetName" VARCHAR(255),
            "streetSuffix" VARCHAR(20),
            "unit" VARCHAR(5),
            "pid" CHAR(17)
        );
        
    CREATE INDEX heatpumps_idx 
        ON property.heatpumps(pid);
    CREATE INDEX heatpumps_date_idx 
        ON property.heatpumps(installed);
    """

# Run the DDL on an explicit connection inside a transaction. Calling
# Engine.execute() with a raw string is legacy behaviour that newer SQLAlchemy
# releases no longer provide; begin() commits on success and rolls back if any
# statement fails. The script contains no ':' so text() sees no bind parameters.
with cnx.begin() as connection:
    connection.execute(text(table_create_query))


In [None]:

# Append the rows of hp to property.heatpumps; the DataFrame index is not
# written.
# NOTE(review): hp is written with whatever columns it carries at this point;
# verify they match the columns of the table being appended to.
hp.to_sql(
    name='heatpumps',
    con=cnx,
    schema='property',
    if_exists='append',
    index=False,
)