In [None]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import plotly.express as px

In [None]:
# Download the US county boundary GeoJSON from the plotly datasets repository and parse it
# into `counties`. Each feature's `id` is the county FIPS code.
import json
from urllib.request import urlopen

with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as geojson_response:
    counties = json.loads(geojson_response.read())

# Notebook Outline
This notebook uses the annual aggregate dataset from the Comstock commercial building energy consumption 'End-Use Load Profiles' (https://www.nrel.gov/buildings/end-use-load-profiles.html) to calculate the emissions intensity in absolute terms, per capita, and per employee for a subset of commercial industry sectors. The steps involved are as follows:
1. Prepare comstock and employment datasets
2. Calculate emissions
3. Compute emissions per capita and emissions per employee

Note that all energy consumption fields are specified in kWh.

# 1 Prepare datasets

## 1.1 Prepare Comstock data

NREL uses Comstock to publish 'End-Use Load Profiles for the U.S. Building Stock' (https://www.nrel.gov/buildings/end-use-load-profiles.html). These are timeseries load profiles for all building types, derived using the Comstock and Restock models. To get these timeseries profiles, total estimates of fuel usage of different commercial buildings in different counties is derived and fed into the timeseries models. These estimates are what we need for our analysis, and are available in the 'metadata' file in the Comstock data lake (overall data lake here: https://data.openei.org/submissions/4520. Readme for comstock datafiles here, including changelog: https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/README.md. Metadata file here: https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=nrel-pds-building-stock%2Fend-use-load-profiles-for-us-building-stock%2F2021%2Fcomstock_amy2018_release_1%2Fmetadata%2F). Comstock data is for 2018. Note that in 2022 a series of revisions and updates were made to the Comstock metadata files - should be sure to use most up-to-date files. See Readme for changes.

### 1.1.1. Read in and clean data

In [None]:
# Import annual energy usage data ('metadata')
metadata = pq.read_table('../Input/metadata.parquet').to_pandas()

# Read in data dictionary
data_dictionary = pd.read_csv('../Input/data_dictionary.tsv', delimiter='\t')

# Isolate relevant data from metadata: qualitative building descriptors plus the
# fuel/end-use consumption fields and sample weight needed for the emissions calculations
all_cols = metadata.columns
qual_cols = ['in.county', 'in.state_abbreviation',
             'in.building_type', 'in.building_subtype', 'in.sqft']
emissions_calcs_cols = ['in.hvac_system_type', 'in.heating_fuel', 'in.service_water_heating_fuel',
                        'out.electricity.heating.energy_consumption', 'out.electricity.water_systems.energy_consumption',
                        'out.electricity.total.energy_consumption', 'out.natural_gas.heating.energy_consumption',
                        'out.natural_gas.interior_equipment.energy_consumption',
                        'out.natural_gas.water_systems.energy_consumption', 'out.other_fuel.heating.energy_consumption',
                        'out.other_fuel.water_systems.energy_consumption', 'out.district_cooling.total.energy_consumption',
                        'out.district_heating.total.energy_consumption', 'out.natural_gas.total.energy_consumption',
                        'out.other_fuel.total.energy_consumption', 'weight']

# Take an explicit copy: new columns are added below, and assigning onto a column
# selection of `metadata` triggers pandas' SettingWithCopyWarning.
comstock_raw = metadata[qual_cols + emissions_calcs_cols].copy()

# Build the 5-character county FIPS from in.county: characters 1-2 followed by characters
# 4 up to (not including) the last one. The first two FIPS characters are the state FIPS.
# NOTE(review): assumes in.county is a fixed-width string code - confirm against the data dictionary.
county_code = comstock_raw['in.county']
comstock_raw['FIPS'] = county_code.str[1:3] + county_code.str[4:-1]
comstock_raw['STATEFIPS'] = comstock_raw['FIPS'].str[:2]

#  Read in county fips labels [NOTE: this file has been edited from that used in other code to ensure county names and their formatting match]
fips = pd.read_excel('../Temp/fips_edited.xlsx',
                     usecols=[1, 2, 3, 4],
                     header=None,
                     names=['FIPS', 'County', 'State Name', 'State'],
                     dtype={'FIPS': str}
                     )
fips['County'] = fips['County'].str.lower()

#  Merge county names onto FIPS (left join keeps every Comstock row)
comstock_raw = pd.merge(comstock_raw, fips[['FIPS', 'County']], how='left', on='FIPS')

#  Drop and rename fields
comstock_raw = comstock_raw.drop(columns=['in.county']).rename(
    columns={'in.state_abbreviation': 'State'})

# Replace FIPS 02270 Wade hampton (AK) with 02158 Kusilvak, and 46113 Shannon with 46102 Oglala lakota,
# according to July 2015 changes. Done with boolean masks so it does not depend on the
# frame's index labels and does not iterate row by row.
fips_updates = {
    '02270': ('02158', 'kusilvak', 'AK'),
    '46113': ('46102', 'oglala lakota', 'SD'),
}
for old_fips, (new_fips, county_name, state_abbr) in fips_updates.items():
    is_old_fips = comstock_raw['FIPS'] == old_fips
    comstock_raw.loc[is_old_fips, 'County'] = county_name
    comstock_raw.loc[is_old_fips, 'State'] = state_abbr
    comstock_raw.loc[is_old_fips, 'FIPS'] = new_fips

#  Check that all county names were correctly merged
county_missing = comstock_raw['County'].isna()
if not county_missing.any():
    print('Successful merge. Continue.')
else:
    print('Error in merge. FIPS affected:',
          comstock_raw.loc[county_missing, 'FIPS'].unique())

### 1.1.2 Determine NAICS-building type crosswalk to use when attributing employment data to emissions data

Comstock indicates industry through building type rather than industry classification (e.g. NAICS); however, employment data is given per NAICS industry. Therefore, we manually constructed 8 commercial categories that span the building types included in Comstock, and determined the corresponding NAICS codes that fit into each of these categories. Note that while some categories map directly to NAICS codes (e.g. hospital), others (e.g. office) are simply building type categories, because these building types cannot feasibly be mapped to any specific industry.

In [None]:
# Load the crosswalk that assigns each Comstock building type an industry category
# (ind_code is read as text)
comstock_ind_crosswalk = pd.read_excel('../Temp/comstock_ind_crosswalk.xlsx',
                                       dtype={'ind_code': str})

# Attach the industry category to every building row; the left join keeps all Comstock rows
comstock = comstock_raw.merge(comstock_ind_crosswalk, on='in.building_type', how='left')

# 2 Calculate commercial emissions

Comstock gives energy consumption in kWh. Use the following emissions factors to determine the associated CO2e emissions for each fuel type:
- Natural gas: 116.65 lbCO2e per MMBtu (https://www.eia.gov/environment/emissions/co2_vol_mass.php)
- Electricity: Use eGRID 2016 electricity carbon intensity at the eGRID subregion level (see '~/industrial/Scripts/2023-06-27_countyElecIntensity.ipynb')
- District heating: Assume heating comes from gas-fired CHP, therefore use natural gas emissions factor.
- District cooling: Assume district cooling system uses an electric chiller, therefore use electricity emissions factor.
- Other fuel: Assume to be distillate fuel oil (aka heating fuel), 163.45 lbCO2e per MMBtu (https://www.eia.gov/environment/emissions/co2_vol_mass.php)

## 2.1 From electricity

### 2.1.1 Specify emissions factors based on eGRID subregions

In [None]:
# Load the county-level electricity carbon intensity table, drop the saved index column,
# divide SRC2ERTA by 1000 and rename it to lbCO2e_perkWh_elec ('id' becomes 'FIPS').
# NOTE(review): the /1000 presumably converts a per-MWh rate to per-kWh - confirm the source units.
counties_elec_intensity = (
    pd.read_csv('../../industrial/Temp/counties_elec_intensity.csv',
                dtype={'id': str, 'STATE': str, 'COUNTY': str})
    .drop(columns='Unnamed: 0')
    .assign(SRC2ERTA=lambda intensity: intensity['SRC2ERTA'] / 1000)
    .rename(columns={'id': 'FIPS', 'SRC2ERTA': 'lbCO2e_perkWh_elec'})
)

# Attach eGRID subregion and electricity emissions factor to each Comstock row by county FIPS
comstock = pd.merge(
    comstock,
    counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perkWh_elec']],
    on='FIPS',
    how='left',
)

### 2.1.2 Perform calculation

Should calculate electricity consumption for heating, for water heating, and in total (which includes heating + water heating + general electricity consumption).

In [None]:
# General electricity use = total electricity minus the heating and water-system shares
comstock['derived.electricity.general.energy_consumption'] = (
    comstock['out.electricity.total.energy_consumption']
    - comstock['out.electricity.heating.energy_consumption']
    - comstock['out.electricity.water_systems.energy_consumption']
)

# Emissions for each electricity end use = consumption x county electricity emissions factor.
# Pairs are (new emissions column, consumption column); creation order is kept as-is.
elec_emission_sources = [
    ('lbCO2e_elec_heat', 'out.electricity.heating.energy_consumption'),
    ('lbCO2e_elec_water', 'out.electricity.water_systems.energy_consumption'),
    ('lbCO2e_elec_general', 'derived.electricity.general.energy_consumption'),
    ('lbCO2e_elec_total', 'out.electricity.total.energy_consumption'),
]
for emissions_col, consumption_col in elec_emission_sources:
    comstock[emissions_col] = comstock[consumption_col] * comstock['lbCO2e_perkWh_elec']

## 2.2 From natural gas

In [None]:
# Natural gas emissions factor expressed per kWh
ef_ng = 116.65 / 293.07107  # lbCO2e/MMBTU / kWh/MMBtu

# Emissions for each natural gas end use = consumption x ef_ng.
# Pairs are (new emissions column, consumption column); creation order is kept as-is.
ng_emission_sources = [
    ('lbCO2e_ng_heat', 'out.natural_gas.heating.energy_consumption'),
    ('lbCO2e_ng_water', 'out.natural_gas.water_systems.energy_consumption'),
    ('lbCO2e_ng_intequip', 'out.natural_gas.interior_equipment.energy_consumption'),
    ('lbCO2e_ng_total', 'out.natural_gas.total.energy_consumption'),
]
for emissions_col, consumption_col in ng_emission_sources:
    comstock[emissions_col] = comstock[consumption_col] * ef_ng

## 2.3 District heating/cooling and other fuels
Emissions for NG water heating taken from https://www.researchgate.net/publication/291951427_Greenhouse_gas_emissions_from_domestic_hot_water_Heat_pumps_compared_to_most_commonly_used_systems.
Used base case for a storage gas heater (98% of U.S. water heating had storage tanks in 2009).

In [None]:
# District heating uses the natural gas factor (ef_ng); district cooling uses the
# county electricity factor.
comstock['lbCO2e_dist_heat'] = ef_ng * comstock['out.district_heating.total.energy_consumption']
comstock['lbCO2e_dist_cool'] = (
    comstock['out.district_cooling.total.energy_consumption'] * comstock['lbCO2e_perkWh_elec']
)

# 'Other fuel' factor components. The multiply-then-divide order is kept so results are
# numerically unchanged.
other_fuel_lb_per_mmbtu = 163.45  # lbCO2e/MMBTU
kwh_per_mmbtu = 293.07107  # kWh/MMBtu
other_fuel_sources = [
    ('lbCO2e_other_heat', 'out.other_fuel.heating.energy_consumption'),
    ('lbCO2e_other_water', 'out.other_fuel.water_systems.energy_consumption'),
    ('lbCO2e_other_total', 'out.other_fuel.total.energy_consumption'),
]
for emissions_col, consumption_col in other_fuel_sources:
    comstock[emissions_col] = comstock[consumption_col] * other_fuel_lb_per_mmbtu / kwh_per_mmbtu

## 2.4 Compute weighted emissions and totals

Each building in the dataset has a weight, which is meant to represent the number of 'identical' buildings in that county. Therefore, all emissions figures should be multiplied by the weight to obtain a representative value of the total emissions for a given building type in a given county.<br>
Additionally, it might be interesting to understand the extent to which emissions can be attributed to heating, general electricity consumption, water heating etc (i.e. by energy use), as well as the extent to which the emissions are from natural gas vs electricity vs fuel oil etc. consumption (i.e. by fuel type). Should compute these totals here.

In [None]:
# For each lbCO2e column, calculate a corresponding weighted emissions column, denoted by '_w'.
# The source columns are listed by name rather than taken as "the last 13 columns": a
# positional slice picks the wrong columns as soon as the cell is re-run (the '_w' columns
# become the last ones) or any column is added upstream.
emissions_cols = [
    'lbCO2e_elec_heat', 'lbCO2e_elec_water', 'lbCO2e_elec_general', 'lbCO2e_elec_total',
    'lbCO2e_ng_heat', 'lbCO2e_ng_water', 'lbCO2e_ng_intequip', 'lbCO2e_ng_total',
    'lbCO2e_dist_heat', 'lbCO2e_dist_cool',
    'lbCO2e_other_heat', 'lbCO2e_other_water', 'lbCO2e_other_total',
]
weighted_col_names = [col + '_w' for col in emissions_cols]

for col, weighted_col in zip(emissions_cols, weighted_col_names):
    comstock[weighted_col] = comstock[col] * comstock['weight']

# Already have by fuel type, but should now compute total by end use
comstock['lbCO2e_heat_total_w'] = comstock['lbCO2e_elec_heat_w'] + \
    comstock['lbCO2e_ng_heat_w'] + comstock['lbCO2e_dist_heat_w'] + comstock['lbCO2e_other_heat_w']
comstock['lbCO2e_water_total_w'] = comstock['lbCO2e_elec_water_w'] + \
    comstock['lbCO2e_ng_water_w'] + comstock['lbCO2e_other_water_w']

# Finally, compute total co2e per row as the sum over fuel types
# (electricity, natural gas, district heating, district cooling, other fuel)
comstock['lbCO2e_total_w'] = comstock['lbCO2e_elec_total_w'] + comstock['lbCO2e_ng_total_w'] + comstock['lbCO2e_dist_heat_w'] + \
    comstock['lbCO2e_dist_cool_w'] + comstock['lbCO2e_other_total_w']
comstock['tonCO2e_total_w'] = comstock['lbCO2e_total_w'] / 2000  # 2000 lb per ton

## 2.5 Perform different groupings of comstock data

As in the industrial analyses, should try and group data at different granularities so that different analyses are possible. Building subtype, HVAC system type and square footage should be aggregated up in all cases, as this info is too granular, and all geographic granularity should be kept at the county level. Should create a dictionary with the following dataframes:
1. Full energy usage data for each comstock building type, heating fuel, water heating fuel etc.
2. Grouped energy usage data per comstock building type, aggregating usage/emissions across different heating fuels etc.
3. Energy usage data using employment building categories (i.e. 'ind_name' and 'ind_code').
4. Energy usage data for the entire commercial sector (i.e. no building-type specification, just aggregate at county level).

### 2.5.1 Construct dataframe 1 (full data)

In [None]:
# Group by all categorical fields except building subtype, square footage and hvac system type,
# summing the numeric columns within each group.
# numeric_only=True keeps non-numeric leftovers (e.g. building subtype, HVAC type) out of the
# sum; without it recent pandas versions concatenate strings or raise on categoricals.
# NOTE: groupby drops rows whose key columns contain NaN (pandas default dropna=True), so rows
# lacking an industry mapping or an electricity-intensity match do not appear in the result.
comstock_grouped = comstock.groupby(by=['State', 'in.building_type', 'in.heating_fuel',
                                        'in.service_water_heating_fuel', 'FIPS',
                                        'STATEFIPS', 'County', 'ind_name', 'ind_code',
                                        'SUBRGN', 'lbCO2e_perkWh_elec'],
                                    as_index=False).sum(numeric_only=True)

### 2.5.2 Construct dataframe 2 (per building type)

In [None]:
# Group dataframe 1 by building type (per county), summing across heating and water-heating fuels.
# numeric_only=True stops the fuel-type label columns, which are no longer grouping keys,
# from being string-concatenated into the result by .sum().
comstock_buildtype = comstock_grouped.groupby(
    by=['State', 'in.building_type', 'FIPS', 'STATEFIPS', 'County',
        'ind_name', 'ind_code', 'SUBRGN', 'lbCO2e_perkWh_elec'],
    as_index=False
).sum(numeric_only=True)
# comstock_buildtype.to_csv('../Temp/comstock_buildtype.csv')

### 2.5.3 Construct dataframe 3 (per employment building category)

In [None]:
# Group dataframe 1 by 'ind_name' and 'ind_code' (per county).
# numeric_only=True stops the building-type and fuel-type label columns, which are no longer
# grouping keys, from being string-concatenated into the result by .sum().
comstock_indtype = comstock_grouped.groupby(
    by=['State', 'FIPS', 'STATEFIPS', 'County', 'ind_name',
        'ind_code', 'SUBRGN', 'lbCO2e_perkWh_elec'],
    as_index=False
).sum(numeric_only=True)

### 2.5.4 Construct dataframe 4 (total per county)

In [None]:
# Group dataframe 1 by county, giving one row per county for the whole commercial sector.
# numeric_only=True stops the industry, building-type and fuel-type label columns, which are
# no longer grouping keys, from being string-concatenated into the result by .sum().
comstock_all = comstock_grouped.groupby(
    by=['State', 'FIPS', 'STATEFIPS', 'County',
        'SUBRGN', 'lbCO2e_perkWh_elec'],
    as_index=False
).sum(numeric_only=True)

### 2.5.5 Hold all dataframes in a dictionary

In [None]:
# Collect the four aggregation levels under short keys:
# full detail, per building type, per employment category, and county total.
comm_dfs = dict(
    full=comstock_grouped,
    buildtype=comstock_buildtype,
    indtype=comstock_indtype,
    total=comstock_all,
)

# 3 Compute emissions per capita and emissions per employee

## 3.1 Per capita

For non-census years, the U.S. Census Bureau's Population Estimates Program uses a cohort-component method to estimate the change in county-level population by tallying recorded births, deaths and migrations since the last population estimate and using them to derive a new county-level estimate. More details here https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html, documentation here: https://www2.census.gov/programs-surveys/popest/technical-documentation/methodology/2010-2019/natstcopr-methv2.pdf.<br>
This data can be accessed via the Census Data API. An API pull script has been constructed in another notebook ('CensusBureauPopEstimates.ipynb'), and will be called here to extract the population data.

In [None]:
# Run CensusBureauPopEstimates.ipynb to define function get_pop_estimate() needed to extract Census Population Estimates for a given year data
%run ../../empData/Scripts/CensusBureauPopEstimates.ipynb

# Use get_pop_estimate() function to extract 2016 population estimates for each county
county_pop_2018 = get_pop_estimate(2018, 'county')
county_pop_2018['FIPS'] = county_pop_2018['state'] + county_pop_2018['county']

# Merge population data onto all comm_dfs, and compute total emissions per capita
comm_dfs_pop = comm_dfs.copy()

for df in comm_dfs_pop.keys():

    # Merge population data onto dataframe
    comm_dfs_pop[df] = pd.merge(comm_dfs_pop[df], county_pop_2018[[
                                'FIPS', 'POP']], how='left', on='FIPS')

    # Compute total emissions per capita, and log10 for visualization purposes
    comm_dfs_pop[df]['lbCO2e_percapita_w'] = comm_dfs_pop[df]['lbCO2e_total_w'] / \
        comm_dfs_pop[df]['POP']
    comm_dfs_pop[df]['lbCO2e_percapita_w_log10'] = np.log10(comm_dfs_pop[df]['lbCO2e_percapita_w'])

    comm_dfs_pop[df]['tonCO2e_percapita_w'] = comm_dfs_pop[df]['tonCO2e_total_w'] / \
        comm_dfs_pop[df]['POP']
    comm_dfs_pop[df]['tonCO2e_percapita_w_log10'] = np.log10(
        comm_dfs_pop[df]['tonCO2e_percapita_w'])

## 3.2 Per employee

### 3.2.1 Read in LEHD data

Using the LEHD API pull script we can pull LEHD employment data for each of the NAICS codes we consider in our analysis. The mapping of building types to NAICS codes used for this analysis is below. Any other NAICS codes are not considered in this analysis.
- <b>Office</b>: NAICS 51-55, 92, 561, 425
- <b>Retail</b>: NAICS 44-45
- <b>Warehousing & storage</b>: NAICS 493, 423, 424
- <b>Restaurants</b>: NAICS 722
- <b>Accommodation</b>: NAICS 721
- <b>Schools</b>: NAICS 6111
- <b>Hospitals</b>: NAICS 622
- <b>Outpatient</b>: NAICS 621

In [None]:
# LEHD employment is read from CSVs that were previously written by getLEHDemp(), which is
# defined by running the API pull notebook:
#   %run ../../empData/Scripts/LEHD_API_pull.ipynb
# To regenerate a file, call getLEHDemp as in the original pulls, e.g.
#   lehd_hosp = getLEHDemp('2018', '3', '622', write_to_csv = True)
# (arguments appear to be year, number of NAICS digits, NAICS code - confirm in that notebook).

# Identifier columns that must stay as text
_LEHD_STR_DTYPES = {'state': str, 'county': str, 'FIPS': str,
                    'sex': str, 'year': str, 'industry': str}


def _read_lehd_csv(naics, year='2018'):
    """Read one cached LEHD employment CSV for a single NAICS code.

    Parameters
    ----------
    naics : str
        NAICS code as text (e.g. '622'). The number of digits in the file name is taken
        from the length of this string, matching the existing file naming.
    year : str, default '2018'
        Year embedded in the file name.

    Returns
    -------
    pandas.DataFrame
        Contents of '../../empData/Temp/emp_ovr_<naics>_<n>dig_<year>.csv' with the
        identifier columns read as strings and the saved index column 'Unnamed: 0' dropped.
    """
    csv_path = '../../empData/Temp/emp_ovr_{}_{}dig_{}.csv'.format(naics, len(naics), year)
    return pd.read_csv(csv_path, dtype=_LEHD_STR_DTYPES).drop(columns=['Unnamed: 0'])


#  Hospitals and outpatient
lehd_hosp = _read_lehd_csv('622')
lehd_outp = _read_lehd_csv('621')

#  Schools
lehd_school = _read_lehd_csv('6111')

#  Accommodation
lehd_accom = _read_lehd_csv('721')

#  Restaurants
lehd_rest = _read_lehd_csv('722')

#  Warehousing & storage
lehd_ware1 = _read_lehd_csv('493')
lehd_ware2 = _read_lehd_csv('423')
lehd_ware3 = _read_lehd_csv('424')
lehd_ware = pd.concat([lehd_ware1, lehd_ware2, lehd_ware3], ignore_index=True)

#  Retail
# NOTE(review): only the '44' file is read although retail is documented as NAICS 44-45 -
# confirm that this pull covers both codes.
lehd_retail = _read_lehd_csv('44')

#  Office
lehd_off1 = _read_lehd_csv('51')
lehd_off2 = _read_lehd_csv('52')
lehd_off3 = _read_lehd_csv('53')
lehd_off4 = _read_lehd_csv('54')
lehd_off5 = _read_lehd_csv('55')
lehd_off6 = _read_lehd_csv('92')
lehd_off7 = _read_lehd_csv('561')
lehd_off8 = _read_lehd_csv('425')

lehd_off = pd.concat([lehd_off1, lehd_off2, lehd_off3, lehd_off4, lehd_off5,
                     lehd_off6, lehd_off7, lehd_off8], ignore_index=True)
# Same as lehd_off but without NAICS 425 (lehd_off8)
lehd_off_excl_wholesale = pd.concat(
    [lehd_off1, lehd_off2, lehd_off3, lehd_off4, lehd_off5, lehd_off6, lehd_off7],
    ignore_index=True)

# Note that, if not disaggregating into subsectors, can just use 42 instead of separately pulling 423, 424 and 425.
lehd_wholesale = _read_lehd_csv('42')

# Same is true for accommodation and restaurants (72 instead of 721 and 722)
lehd_accom_food = _read_lehd_csv('72')

In [None]:
# Stack the per-NAICS employment frames (before any per-county aggregation) and save them
# to csv for the decomposition analysis
naics_level_frames = [
    lehd_off1, lehd_off2, lehd_off3, lehd_off4,
    lehd_off5, lehd_off6, lehd_off7, lehd_off8,
    lehd_hosp, lehd_outp, lehd_school, lehd_accom,
    lehd_rest, lehd_ware, lehd_retail,
]
lehd_naics = pd.concat(naics_level_frames, ignore_index=True)
lehd_naics.to_csv('../Temp/lehd_naics.csv')

In [None]:
# Add ind_name to each dataframe, and group dataframes by FIPS code if they have employment
# from multiple NAICS codes.
# The label is assigned as a scalar (broadcast to every row) instead of a row-wise apply,
# which is slow and does not return a column-shaped result for an empty frame.
#  Hospital and outpatient
lehd_hosp['ind_name'] = 'hospital'
lehd_outp['ind_name'] = 'outpatient'

#  Schools
lehd_school['ind_name'] = 'school'

#  Accommodation
lehd_accom['ind_name'] = 'accommodation'

#  Restaurants
lehd_rest['ind_name'] = 'restaurant'

#  Warehousing & storage: sum the constituent NAICS codes per county.
#  numeric_only=True keeps the text identifier columns (state, county, sex, ...) from being
#  string-concatenated by .sum().
lehd_ware = lehd_ware.groupby(by='FIPS', as_index=False).sum(numeric_only=True)
lehd_ware['ind_name'] = 'warehouse_storage'

#  Retail
lehd_retail['ind_name'] = 'retail'

#  Office: sum the constituent NAICS codes per county (numeric columns only, as above)
lehd_off = lehd_off.groupby(by='FIPS', as_index=False).sum(numeric_only=True)
lehd_off['ind_name'] = 'office'

In [None]:
# Concatenate all employment data into one dataframe (one row per county and category)
lehd_dfs = [lehd_hosp, lehd_outp, lehd_school, lehd_accom,
            lehd_rest, lehd_ware, lehd_retail, lehd_off]
lehd_emp = pd.concat(lehd_dfs, ignore_index=True)[['FIPS', 'Emp', 'ind_name']]

# Alternative set of frames for county totals.
# NOTE(review): lehd_emp_totals is built here but is not used below - lehd_total is computed
# from lehd_emp. This list also contains no wholesale employment (lehd_wholesale is not in it).
# Confirm which employment base the county total is meant to use; behaviour is left unchanged.
lehd_dfs_totals = [lehd_hosp, lehd_outp, lehd_school,
                   lehd_accom_food, lehd_ware1, lehd_retail, lehd_off_excl_wholesale]
lehd_emp_totals = pd.concat(lehd_dfs_totals, ignore_index=True)[['FIPS', 'Emp', 'ind_name']]

# Compute total commercial employment per county (to merge onto totals dataframe).
# numeric_only=True sums only 'Emp'; otherwise the 'ind_name' labels of a county would be
# string-concatenated into a meaningless column that is carried into the merge.
lehd_total = lehd_emp.groupby(by='FIPS', as_index=False).sum(numeric_only=True)

# lehd_emp.to_csv('../Temp/lehd_comm_buildtypes.csv')

### 3.2.2 Compute emissions per employee

In [None]:
# Start from a shallow copy of the per-capita dictionary, then left-join employment:
# per county and category for 'indtype', per county for 'total'
comm_dfs_final = comm_dfs_pop.copy()

comm_dfs_final['indtype'] = comm_dfs_final['indtype'].merge(
    lehd_emp, how='left', on=['FIPS', 'ind_name'])
comm_dfs_final['total'] = comm_dfs_final['total'].merge(lehd_total, how='left', on=['FIPS'])

# Report how much of the 'indtype' data has no usable employment figure.
# A row counts as missing when `Emp >= 0` is not true (NaN fails this comparison).
indtype_rows = comm_dfs_final['indtype']
rows_missing_emp = indtype_rows[~(indtype_rows['Emp'] >= 0)]
n_indtype_rows = len(indtype_rows)
n_missing_emp = len(rows_missing_emp)

print('Total number of datapoints:', n_indtype_rows)
print('Number of datapoints with NaN employment:', n_missing_emp)
print('Percent of datapoints with NaN employment:',
      np.round(n_missing_emp / n_indtype_rows * 100, 2),
      '%')
print('Percent of emissions with NaN employment:',
      np.round(rows_missing_emp['tonCO2e_total_w'].sum()
               / indtype_rows['tonCO2e_total_w'].sum() * 100, 2),
      '%')

In [None]:
# Calculate emissions per employee.
# Vectorised column operations replace the row-wise apply calls; values and column order
# are unchanged.
for key in ['indtype', 'total']:
    frame = comm_dfs_final[key]

    #  Set employment in counties where it is reported to be 0 to NaN, as there must be
    #  employment there if there are emissions
    frame['Emp'] = frame['Emp'].mask(frame['Emp'] == 0)
    has_emp = frame['Emp'] > 0  # False for NaN (and any non-positive) employment

    #  Emissions per employee; rows without usable employment get the sentinel -1
    for unit in ['lbCO2e', 'tonCO2e']:
        frame[unit + '_peremp_w'] = (frame[unit + '_total_w'] / frame['Emp']).where(has_emp, -1)

    #  log10 for visualization. Sentinel rows are masked to NaN first: log10(-1) is NaN
    #  anyway, and masking avoids numpy's invalid-value RuntimeWarning.
    for unit in ['lbCO2e', 'tonCO2e']:
        frame[unit + '_peremp_w_log10'] = np.log10(frame[unit + '_peremp_w'].where(has_emp))

In [None]:
# Add log10 of the total weighted tonCO2e to every dataframe in the dictionary
# (used for visualization)
for level, level_frame in comm_dfs_final.items():
    level_frame['tonCO2e_total_w_log10'] = np.log10(level_frame['tonCO2e_total_w'])

# 5 Write final dataframe to csv for overall analysis

In [None]:
# Save the county-total frame (emissions with per-capita and per-employee columns) to the Output folder
comm_dfs_final['total'].to_csv(path_or_buf='../Output/comm_totalCO2_final.csv')