# 0 Set-up

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# Load in a GeoJSON file containing the geometry information for US counties, where feature.id is a FIPS code.
from urllib.request import urlopen
import json
# Download the county GeoJSON and parse the raw response body into `counties`.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as geojson_response:
    counties = json.loads(geojson_response.read())

# Read in the NREL IET dataset, forcing the code columns to be read as text
county_energy_estimates_IEDB = pd.read_csv(
    '../Input/county_energy_estimates_IEDB.gzip',
    dtype={'FIPSTATE': str, 'COUNTY_FIPS': str, 'MECS_FT': str, 'NAICS': str},
)

# Strip the trailing two characters from each code and restore a leading zero on the
# short ones (presumably the raw values look like '1.0' / '1001.0' -- TODO confirm).
county_energy_estimates_IEDB['FIPSTATE'] = county_energy_estimates_IEDB['FIPSTATE'].map(
    lambda code: code[:-2] if len(code) != 3 else '0' + code[:-2])
county_energy_estimates_IEDB['COUNTY_FIPS'] = county_energy_estimates_IEDB['COUNTY_FIPS'].map(
    lambda code: code[:-2] if len(code) != 6 else '0' + code[:-2])

# Use 'FIPS' as the county key name and keep only the 2016 entries
county_energy_estimates_IEDB = (
    county_energy_estimates_IEDB
    .rename(columns={'COUNTY_FIPS': 'FIPS'})
    .loc[lambda frame: frame['YEAR'] == 2016]
)

# Read in county FIPS labels and lower-case the county names.
# [NOTE: this file has been edited from that used in other code to ensure county names and their formatting match]
fips = pd.read_csv(
    '../Temp/fips_edited.csv',
    dtype={'FIPS': str},
    names=['FIPS', 'County', 'State Name', 'State'],
    usecols=[1, 2, 3, 4],
    encoding='windows-1252',
).assign(County=lambda labels: labels['County'].str.lower())

# Read in NAICS codebook from https://www.census.gov/naics/?48967 (use 2012 codes as data is from 2016).
# The row labelled 0 is discarded and the two columns get short names.
NAICS_codebook = (
    pd.read_excel(
        '../Input/2-digit_2012_Codes.xls',
        header=0,
        usecols=[1, 2],
        dtype={'2012 NAICS US   Code': str},
    )
    .drop(index=0)
    .rename(columns={'2012 NAICS US   Code': 'NAICS',
                     '2012 NAICS US Title': 'NAICS_desc'})
)

# Attach the county name to every energy record (left join keeps unmatched FIPS rows)
county_energy_estimates_IEDB = county_energy_estimates_IEDB.merge(
    fips[['FIPS', 'County']],
    on='FIPS',
    how='left',
)

# Replace FIPS 02270 Wade hampton (AK) with 02158 Kusilvak, and 46113 Shannon with 46102 Oglala lakota,
# according to July 2015 changes. Rows already coded 46102 also get the 'oglala lakota' label.
#
# This is done with boolean masks instead of a per-row Python loop: the loop issued several
# scalar .loc lookups for every row and only worked when the index was exactly 0..n-1.
# Both masks are computed from the FIPS values *before* any recoding.
wade_hampton_rows = county_energy_estimates_IEDB['FIPS'] == '02270'
oglala_lakota_rows = county_energy_estimates_IEDB['FIPS'].isin(['46113', '46102'])

county_energy_estimates_IEDB.loc[wade_hampton_rows, 'FIPS'] = '02158'
county_energy_estimates_IEDB.loc[wade_hampton_rows, 'County'] = 'kusilvak'

county_energy_estimates_IEDB.loc[oglala_lakota_rows, 'FIPS'] = '46102'
county_energy_estimates_IEDB.loc[oglala_lakota_rows, 'County'] = 'oglala lakota'

In [None]:
# Count rows that repeat an earlier FIPS-NAICS-fuel combination
duplicate_key_cols = ['FIPS', 'NAICS', 'MECS_FT']
init_datapoints = len(county_energy_estimates_IEDB)
n_duplicates_before = int(county_energy_estimates_IEDB.duplicated(subset=duplicate_key_cols).sum())
print('There are', n_duplicates_before,
      'FIPS-NAICS-FUEL combos with more than entry for MMBTU consumed. Need to group these.')

# Collapse rows that agree on every column except MMBTU_TOTAL, summing their energy use.
# NOTE(review): groupby drops rows whose key contains NaN by default, so any row with a
# missing key value (e.g. an unmatched County) would also be counted as "removed" below
# -- confirm there are none.
grouping_cols = [col for col in county_energy_estimates_IEDB.columns if col != 'MMBTU_TOTAL']
county_energy_estimates_IEDB = (
    county_energy_estimates_IEDB
    .groupby(by=grouping_cols, as_index=False)
    .sum()
    .reset_index(drop=True)
)
final_datapoints = len(county_energy_estimates_IEDB)
print('Duplicate datapoints removed:', init_datapoints - final_datapoints)

# Re-count duplicates on the grouped data
n_duplicates_after = int(county_energy_estimates_IEDB.duplicated(subset=duplicate_key_cols).sum())
if n_duplicates_after != 0:
    print('After grouping, there are still', n_duplicates_after,
          'duplicated datapoints. Reassess.')
else:
    print('After grouping, there are no more duplicated datapoints. Proceed.')

In [None]:
# Display every row involved in a remaining FIPS-NAICS-fuel duplicate (all copies, not just the repeats)
county_energy_estimates_IEDB.loc[
    county_energy_estimates_IEDB.duplicated(subset=['FIPS', 'NAICS', 'MECS_FT'], keep=False)
]

It seems that there were some errors in the Mining datapoints, where the wrong state was listed for a given FIPS. Manually overwrite these.

In [None]:
# Row labels of the mis-stated entries and the state name each one should carry.
# NOTE(review): these labels refer to the index produced by the grouping step, so they
# are only valid while the input data and row order stay unchanged.
overwrite_indices = [320348, 320349, 502288, 854619]
correct_states = ['TEXAS', 'TEXAS', 'TEXAS', 'PENNSYLVANIA']

# The state code is the first two characters of the county FIPS
county_energy_estimates_IEDB.loc[overwrite_indices, 'FIPSTATE'] = (
    county_energy_estimates_IEDB.loc[overwrite_indices, 'FIPS'].str[:2]
)
county_energy_estimates_IEDB.loc[overwrite_indices, 'STATE'] = correct_states

county_energy_estimates_IEDB.loc[overwrite_indices, :]

In [None]:
# Group data again now that the state fields have been corrected
regroup_cols = [col for col in county_energy_estimates_IEDB.columns if col != 'MMBTU_TOTAL']
county_energy_estimates_IEDB = (
    county_energy_estimates_IEDB
    .groupby(by=regroup_cols, as_index=False)
    .sum()
    .reset_index(drop=True)
)
print('Duplicate datapoints removed:', final_datapoints - len(county_energy_estimates_IEDB))

# Reassess duplicates
n_duplicates_left = int(
    county_energy_estimates_IEDB.duplicated(subset=['FIPS', 'NAICS', 'MECS_FT']).sum())
if n_duplicates_left != 0:
    print('After grouping, there are still', n_duplicates_left,
          'duplicated datapoints. Reassess.')
else:
    print('After grouping, there are no more duplicated datapoints. Proceed.')

In [None]:
# Make the state code agree with the county code: FIPSTATE is the first two characters of FIPS
county_energy_estimates_IEDB['FIPSTATE'] = county_energy_estimates_IEDB['FIPS'].str[:2]

# For FIPSTATE 34 -> 35 conversion, rename the STATE field manually:
# every row whose state code is now '35' is labelled 'NEW MEXICO'.
county_energy_estimates_IEDB['STATE'] = county_energy_estimates_IEDB['STATE'].mask(
    county_energy_estimates_IEDB['FIPSTATE'] == '35', 'NEW MEXICO')

# One dataframe per industrial sector covered, each re-indexed from zero
county_IEDB_mf = county_energy_estimates_IEDB.loc[
    county_energy_estimates_IEDB['IND_SECTOR'].eq('Manufacturing')
].reset_index(drop=True)

county_IEDB_mining = county_energy_estimates_IEDB.loc[
    county_energy_estimates_IEDB['IND_SECTOR'].eq('Mining')
].reset_index(drop=True)

county_IEDB_agri = county_energy_estimates_IEDB.loc[
    county_energy_estimates_IEDB['IND_SECTOR'].eq('Agriculture')
].reset_index(drop=True)

county_IEDB_constr = county_energy_estimates_IEDB.loc[
    county_energy_estimates_IEDB['IND_SECTOR'].eq('Construction')
].reset_index(drop=True)

# 1 Agriculture energy consumption

In [None]:
# Roll agricultural NAICS codes up to coarser granularities.
#
# Slicing a code to its first N characters leaves codes that are already N characters
# or shorter unchanged, so no separate "number of digits" helper column is needed.
#
# numeric_only=True is passed explicitly: the aggregation is only meant to sum the
# numeric columns. Older pandas dropped the string columns (e.g. NAICS, County)
# silently; pandas >= 2.0 would concatenate them instead.

# Aggregate to 4-digit granularity
county_IEDB_agri['NAICS_4dig'] = county_IEDB_agri['NAICS'].str[:4]
county_IEDB_agri_4dig = county_IEDB_agri.groupby(
    by=['FIPSTATE', 'FIPS', 'MECS_FT', 'YEAR', 'STATE', 'IND_SECTOR', 'NAICS_4dig'],
    as_index=False,
).sum(numeric_only=True)

# Aggregate to 3-digit granularity as well
county_IEDB_agri['NAICS_3dig'] = county_IEDB_agri['NAICS'].str[:3]
county_IEDB_agri_3dig = county_IEDB_agri.groupby(
    by=['FIPSTATE', 'FIPS', 'MECS_FT', 'YEAR', 'STATE', 'IND_SECTOR', 'NAICS_3dig'],
    as_index=False,
).sum(numeric_only=True)

# 2 Calculate emissions from agricultural energy consumption
Make sure to use 6-digit NAICS codes for calculation so that emissions can be aggregated later. At the end of this section, should have 9 dataframes:
1. Dataframe of the CO2e emissions from a given fuel type for a given 6-digit NAICS code in a given county.
2. Dataframe of the Scope 1 and 2 (i.e. on-site and electricity consumption) CO2e emissions for a given 6-digit NAICS code in a given county.
3. Dataframe of the total CO2e emissions from a given 6-digit NAICS code in a given county.
4. Dataframe of the CO2e emissions from a given fuel type for 3-/4-digit NAICS granularity--depending on 'sectors_to_aggregate'--for a given county.
5. Dataframe of the Scope 1 and 2 CO2e emissions for 3-/4-digit NAICS granularity--depending on 'sectors_to_aggregate'--for a given county.
6. Dataframe of the total CO2e emissions for 3-/4-digit NAICS granularity--depending on 'sectors_to_aggregate'--for a given county.
7. Dataframe of the CO2e emissions from a given fuel type for all of agriculture (NAICS 11) in a given county.
8. Dataframe of the Scope 1 and 2 CO2e emissions for all of agriculture in a given county.
9. Dataframe of the total CO2e emissions for all agriculture in a given county.

Calculations for dataframes 2-9 should be conducted using dataframe 1 to minimize error propagation.


<b>Naming convention:</b> county_ag_[NAICS_GRANULARITY]_['fuels'/'scopes'/'totals']_CO2e. E.g. for 6-digit Scope 1 and 2 emissions: county_ag_6dig_scopes_CO2e. For aggregated total emissions: county_ag_agg_totals_CO2e.

## 2.1 Calculate emissions at highest granularity possible

IEDB gives energy consumption in MMBTU. Can use the following emissions factors to determine the associated CO2e emissions for each fuel type:
- Oil (assumed crude oil): 74.47 million metric tons CO2 per quadrillion Btu (https://www.eia.gov/environment/emissions/co2_vol_mass.php - scroll to bottom and download "Detailed factors" Excel sheet)
- Natural gas: 116.65 lbCO2e per MMBtu (https://www.eia.gov/environment/emissions/co2_vol_mass.php)
- Coal: 211.87 lbCO2e per MMBtu (https://www.eia.gov/environment/emissions/co2_vol_mass.php)
- Diesel: 163.45 lbCO2e per MMBtu (https://www.eia.gov/environment/emissions/co2_vol_mass.php)
- LPG_NGL (interpret this as 'propane'): 138.63 lbCO2e per MMBtu (https://www.eia.gov/environment/emissions/co2_vol_mass.php)
- Residual fuel oil: 165.55 lbCO2e per MMBtu (https://www.eia.gov/environment/emissions/co2_vol_mass.php)
- Coke and breeze: Use 'Coke' under coals by type: 250.59 lbCO2e per MMBtu (https://www.eia.gov/environment/emissions/co2_vol_mass.php)
- 'Other' - assume to be LPG as done in the NREL IET documentation (https://www.nrel.gov/docs/fy19osti/71990.pdf, pg. 14)
- Net electricity: Use eGRID 2016 electricity carbon intensity at the eGRID subregion level (see 'countyElecIntensity.ipynb')

In [None]:
# Load the county-level electricity carbon intensity table, keyed by 'FIPS'
counties_elec_intensity = (
    pd.read_csv(
        '../Temp/counties_elec_intensity.csv',
        dtype={'id': str, 'STATE': str, 'COUNTY': str},
    )
    .drop(columns='Unnamed: 0')
    .rename(columns={'id': 'FIPS'})
)

# Attach each county's subregion and electricity intensity to the agricultural energy data
county_ag_6dig_fuels_CO2e = (
    county_IEDB_agri
    .merge(
        counties_elec_intensity[['FIPS', 'SUBRGN', 'SRC2ERTA']],
        on='FIPS',
        how='left',
    )
    .rename(columns={'SRC2ERTA': 'lbCO2e_perMWh_elec'})
)

# Emissions factors for every non-electricity fuel type, used to calculate emissions
# for each county-NAICS-fuel combo (i.e. each row).
# Values are in lbCO2e per MMBtu; 'Other' carries the same factor as 'LPG_NGL'.
ef_dict = dict(
    Diesel=163.45,
    LPG_NGL=138.63,
    Natural_gas=116.65,
    Other=138.63,
    Residual_fuel_oil=165.55,
    Coal=211.87,
    Coke_and_breeze=250.59,
)
#  Define a function that reads the fuel type of each entry and performs the appropriate calculation
def calculate_emissions(fuel_type, ef_dict, mmbtu_total, elec_ef):
    """Return the lbCO2e emitted by consuming `mmbtu_total` MMBtu of `fuel_type`.

    Parameters
    ----------
    fuel_type : fuel label; 'Net_electricity' is handled separately, any other
        value must be a key of `ef_dict`.
    ef_dict : mapping from fuel label to its emissions factor (multiplied by MMBtu).
    mmbtu_total : energy consumed, in MMBtu.
    elec_ef : electricity emissions factor per MWh; only used for 'Net_electricity'.

    Raises
    ------
    KeyError
        If `fuel_type` is neither 'Net_electricity' nor a key of `ef_dict`.
    """
    if fuel_type != 'Net_electricity':
        return mmbtu_total * ef_dict[fuel_type]
    # 0.29307107 converts MMBtu to MWh so the per-MWh factor can be applied
    return mmbtu_total * elec_ef * 0.29307107


#  Evaluate the emissions function once per row, then convert pounds to tons (2000 lb per ton)
county_ag_6dig_fuels_CO2e['lbCO2e'] = [
    calculate_emissions(fuel_type, ef_dict, mmbtu_total, elec_ef)
    for fuel_type, mmbtu_total, elec_ef in zip(
        county_ag_6dig_fuels_CO2e['MECS_FT'],
        county_ag_6dig_fuels_CO2e['MMBTU_TOTAL'],
        county_ag_6dig_fuels_CO2e['lbCO2e_perMWh_elec'],
    )
]
county_ag_6dig_fuels_CO2e['tonCO2e'] = county_ag_6dig_fuels_CO2e['lbCO2e'].div(2000)

### 2.1.1 Add field for tonCO2e_[fueltype] for each fuel type
This will allow us to determine the percent of the total emissions attributable to each fuel type, for all sorts of different aggregations that we perform later. This will be particularly useful for calculating the economic burden later on, as the incidence of a carbon tax will depend on the fuel used to produce the carbon.


In [None]:
# Build the group-by key lists for each aggregation level.
# Key order follows the column order of county_ag_6dig_fuels_CO2e at this point.
non_key_cols = {'YEAR', 'SUBRGN', 'lbCO2e_perMWh_elec', 'MECS_FT', 'IND_SECTOR',
                'MMBTU_TOTAL', 'lbCO2e', 'tonCO2e'}
rolled_up_naics_cols = {'NAICS_4dig', 'NAICS_3dig'}
all_naics_cols = {'NAICS', 'NAICS_4dig', 'NAICS_3dig'}

# 6-digit level: totals are the base list; scopes add 'scope', fuels add 'MECS_FT'
group_cols_6dig_totals = [
    col for col in county_ag_6dig_fuels_CO2e.columns if col not in non_key_cols]
group_cols_6dig_scopes = [
    col for col in group_cols_6dig_totals if col != 'MECS_FT'] + ['scope']
group_cols_6dig_fuels = group_cols_6dig_totals + ['MECS_FT']

# Aggregated (3-/4-digit) level: same keys without the helper roll-up columns
group_cols_agg_fuels = [
    col for col in group_cols_6dig_fuels if col not in rolled_up_naics_cols]
group_cols_agg_scopes = [
    col for col in group_cols_6dig_scopes if col not in rolled_up_naics_cols]
group_cols_agg_totals = [
    col for col in group_cols_6dig_totals if col not in rolled_up_naics_cols]

# Whole-sector level: no NAICS column of any granularity
group_cols_2dig_fuels = [
    col for col in group_cols_6dig_fuels if col not in all_naics_cols]
group_cols_2dig_scopes = [
    col for col in group_cols_6dig_scopes if col not in all_naics_cols]
group_cols_2dig_totals = [
    col for col in group_cols_6dig_totals if col not in all_naics_cols]

# Create one column per fuel type holding that row's tonCO2e when the row uses the
# fuel, and NaN otherwise
for fuel in county_ag_6dig_fuels_CO2e['MECS_FT'].unique():
    uses_this_fuel = county_ag_6dig_fuels_CO2e['MECS_FT'] == fuel
    county_ag_6dig_fuels_CO2e[f'tonCO2e_{fuel}'] = (
        county_ag_6dig_fuels_CO2e['tonCO2e'].where(uses_this_fuel)
    )

# Drop unnecessary columns
county_ag_6dig_fuels_CO2e = county_ag_6dig_fuels_CO2e.drop(
    columns=['YEAR', 'lbCO2e_perMWh_elec'])

## 2.2 Scope 1 and 2 emissions at 6-digit NAICS granularity

In [None]:
# Group emissions into Scope 1 and Scope 2
#  Label each row: net electricity is Scope 2, every other fuel is Scope 1
county_ag_6dig_fuels_CO2e['scope'] = county_ag_6dig_fuels_CO2e['MECS_FT'].map(
    lambda fuel_type: 'scope2' if fuel_type == 'Net_electricity' else 'scope1')

#  Sum the numeric columns within each scope.
#  numeric_only=True makes explicit that the remaining string columns are not part
#  of the result; pandas >= 2.0 would otherwise concatenate them.
county_ag_6dig_scopes_CO2e = county_ag_6dig_fuels_CO2e.groupby(
    by=group_cols_6dig_scopes, as_index=False).sum(numeric_only=True)

## 2.3 Total emissions at 6-digit NAICS granularity

In [None]:
# Total emissions per county and 6-digit NAICS code, summed over all fuels.
# numeric_only=True keeps only the numeric columns; pandas >= 2.0 would otherwise
# concatenate the remaining string columns (fuel type, scope, ...).
county_ag_6dig_totals_CO2e = county_ag_6dig_fuels_CO2e.groupby(
    by=group_cols_6dig_totals, as_index=False).sum(numeric_only=True)

## 2.4 Emissions for different fuel types at 3-/4-digit NAICS granularity

In [None]:
sectors_to_aggregate = list(county_ag_6dig_fuels_CO2e['NAICS_3dig'].unique())

# Aggregate energy consumption and emissions to the NAICS_4dig code, which is then
# exposed under the name 'NAICS'.
# numeric_only=True is required for this rename to be safe: without it pandas >= 2.0
# keeps a (string-concatenated) original 'NAICS' column, and the rename would leave
# two columns called 'NAICS'.
county_ag_agg_fuels_CO2e = county_ag_6dig_fuels_CO2e.groupby(
    by=['FIPSTATE', 'FIPS', 'STATE', 'County', 'MECS_FT', 'NAICS_4dig', 'scope'],
    as_index=False,
).sum(numeric_only=True).rename(columns={'NAICS_4dig': 'NAICS'})

## 2.5 Scope 1 and 2 emissions at 3-/4-digit NAICS granularity

In [None]:
# Scope 1 / Scope 2 emissions at the aggregated NAICS level, summed over fuels.
# numeric_only=True keeps the fuel-type string column out of the result; pandas >= 2.0
# would otherwise concatenate it.
county_ag_agg_scopes_CO2e = county_ag_agg_fuels_CO2e.groupby(
    by=group_cols_agg_scopes, as_index=False).sum(numeric_only=True)

## 2.6 Total emissions at 3-/4-digit NAICS granularity

In [None]:
# Total emissions at the aggregated NAICS level, summed over scopes.
# numeric_only=True keeps the 'scope' string column out of the result; pandas >= 2.0
# would otherwise concatenate it.
county_ag_agg_totals_CO2e = county_ag_agg_scopes_CO2e.groupby(
    by=group_cols_agg_totals, as_index=False).sum(numeric_only=True)

# log10 versions of the totals
county_ag_agg_totals_CO2e['lbCO2e_log10'] = np.log10(county_ag_agg_totals_CO2e['lbCO2e'])
county_ag_agg_totals_CO2e['tonCO2e_log10'] = np.log10(county_ag_agg_totals_CO2e['tonCO2e'])

# write the scope-level frame to csv for decomposition analysis
county_ag_agg_scopes_CO2e.to_csv('../Temp/county_ag_agg_scopes_CO2e.csv')

## 2.7 Emissions for different fuel types for all agriculture

In [None]:
# Emissions per county and fuel type for all agriculture.
# numeric_only=True keeps only the numeric columns; pandas >= 2.0 would otherwise
# concatenate the string columns that are not group keys (NAICS codes, scope, ...).
county_ag_2dig_fuels_CO2e = county_ag_6dig_fuels_CO2e.groupby(
    by=group_cols_2dig_fuels,
    as_index=False
).sum(numeric_only=True)

## 2.8 Scope 1 and 2 emissions for all agriculture

In [None]:
# Scope 1 / Scope 2 emissions per county for all agriculture.
# numeric_only=True keeps only the numeric columns; pandas >= 2.0 would otherwise
# concatenate the string columns that are not group keys.
county_ag_2dig_scopes_CO2e = county_ag_6dig_fuels_CO2e.groupby(
    by=group_cols_2dig_scopes, as_index=False).sum(numeric_only=True)

## 2.9 Total emissions for all agriculture

In [None]:
# Total emissions per county for all agriculture.
# numeric_only=True keeps only the numeric columns; pandas >= 2.0 would otherwise
# concatenate every string column that is not a group key (FIPSTATE, NAICS, scope, ...).
county_ag_2dig_totals_CO2e = county_ag_6dig_fuels_CO2e.groupby(
    by=['FIPS', 'County', 'STATE'], as_index=False).sum(numeric_only=True)

## 2.10 Group all emissions dataframes together in a dictionary

In [None]:
# Collect all nine emissions dataframes under short keys.
# The aggregated ('agg_*') frames additionally get the NAICS description joined on.
county_ag_CO2e_dict = {
    '6dig_fuels': county_ag_6dig_fuels_CO2e,
    '6dig_scopes': county_ag_6dig_scopes_CO2e,
    '6dig_totals': county_ag_6dig_totals_CO2e,
    'agg_fuels': county_ag_agg_fuels_CO2e.merge(NAICS_codebook, on='NAICS', how='left'),
    'agg_scopes': county_ag_agg_scopes_CO2e.merge(NAICS_codebook, on='NAICS', how='left'),
    'agg_totals': county_ag_agg_totals_CO2e.merge(NAICS_codebook, on='NAICS', how='left'),
    '2dig_fuels': county_ag_2dig_fuels_CO2e,
    '2dig_scopes': county_ag_2dig_scopes_CO2e,
    '2dig_totals': county_ag_2dig_totals_CO2e,
}

# 3 Agriculture Employment

## 3.1 QCEW agriculture employment data

In [None]:
## Run QCEW_data to define function getAnnualCountyIndustryData() needed to extract QCEW data
%run ../../empData/Scripts/QCEW_data.ipynb

## Total 2016 agricultural employment employment, as given by QCEW
total_US_ag_emp = getAnnualCountyIndustryData(
    '2016', 
    '11', 
    print_results=False, 
    return_total_US_emp=True)
print('Total 2016 agriculture employment (QCEW):', total_US_ag_emp)

## Use QCEW script to get total agriculture employment per county
qcew_county_2dig = getAnnualCountyIndustryData('2016', '11')
qcew_county_2dig = qcew_county_2dig[qcew_county_2dig['disclosure_code'] != 'N'].rename(
    columns = {'area_fips': 'FIPS', 'annual_avg_emplvl': 'Emp'})

## 3.2 LEHD agriculture employment data
Pull both 3-digit and 2-digit LEHD data.

### 3.2.1 3-digit

In [None]:
# Run LEHD API pull script to call function that pulls LEHD employment data
# %run ../../empData/Scripts/LEHD_API_pull.ipynb

# Run API pull, or read from CSV
# lehd_4dig_ag_2016 = getLEHDemp(
#     '2016', '4', '11', write_to_csv = True
# ).rename(columns={'industry': 'NAICS'})

# Read the cached 3-digit pull; identifier columns are kept as text
lehd_3dig_ag_2016 = (
    pd.read_csv(
        '../../empData/Temp/emp_ovr_11_3dig_2016.csv',
        dtype=dict.fromkeys(['state', 'county', 'FIPS', 'sex', 'year', 'industry'], str),
    )
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'industry': 'NAICS'})
)

### 3.2.2 2-digit

In [None]:
# Run API pull, or read from CSV
# lehd_2dig_ag_2016 = getLEHDemp('2016', '2', '11', write_to_csv = True)

# Read the cached 2-digit pull; identifier columns are kept as text
lehd_2dig_ag_2016 = (
    pd.read_csv(
        '../../empData/Temp/emp_ovr_11_2dig_2016.csv',
        dtype=dict.fromkeys(['state', 'county', 'FIPS', 'sex', 'year', 'industry'], str),
    )
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'industry': 'NAICS'})
)

# 4 Compute emissions per capita and per employee

## 4.1 Emissions per capita

For non-census years, the U.S. Census Bureau's Population Estimates Program uses a cohort-component method to estimate the change in county-level population by tallying recorded births, deaths and migrations since the last population estimate and using them to derive a new county-level estimate. More details here https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html, documentation here: https://www2.census.gov/programs-surveys/popest/technical-documentation/methodology/2010-2019/natstcopr-methv2.pdf.<br>
This data can be accessed via the Census Data API. An API pull script has been constructed in another notebook ('CensusBureauPopEstimates.ipynb'), and will be called here to extract the population data.

In [None]:
## Run CensusBureauPopEstimates.ipynb to define function get_pop_estimate() needed to extract Census Population Estimates for a given year data
%run ../../empData/Scripts/CensusBureauPopEstimates.ipynb

## Use get_pop_estimate() function to extract 2016 population estimates for each county
county_pop_2016 = get_pop_estimate(2016, 'county')
county_pop_2016['FIPS'] = county_pop_2016['state'] + county_pop_2016['county']

## Merge population estimates onto emissions dataframes and calculate emissions per capita
county_ag_CO2e_percapita_dict = county_ag_CO2e_dict.copy()

for df in county_ag_CO2e_percapita_dict.keys():
    # Merge 
    county_ag_CO2e_percapita_dict[df] = pd.merge(county_ag_CO2e_percapita_dict[df], county_pop_2016[['FIPS', 'POP']], how = 'left', on = 'FIPS')
    
    # Calculate emissions per capita
    county_ag_CO2e_percapita_dict[df]['tonCO2e_percapita'] = county_ag_CO2e_percapita_dict[df]['tonCO2e'] / county_ag_CO2e_percapita_dict[df]['POP']
    county_ag_CO2e_percapita_dict[df]['lbCO2e_percapita'] = county_ag_CO2e_percapita_dict[df]['lbCO2e'] / county_ag_CO2e_percapita_dict[df]['POP']
    
    # Calculate log10 of this
    county_ag_CO2e_percapita_dict[df]['tonCO2e_percapita_log10'] = np.log10(county_ag_CO2e_percapita_dict[df]['tonCO2e_percapita'])
    county_ag_CO2e_percapita_dict[df]['lbCO2e_percapita_log10'] = np.log10(county_ag_CO2e_percapita_dict[df]['lbCO2e_percapita'])

## 4.2 Emissions per employee

In [None]:
## Set up dictionary for per employee data: start from the per-capita frames of the
## aggregated and whole-sector levels (the 6-digit frames are not carried over)
county_ag_CO2e_peremp_dict = {
    frame_name: county_ag_CO2e_percapita_dict[frame_name]
    for frame_name in ('agg_fuels', 'agg_scopes', 'agg_totals',
                       '2dig_fuels', '2dig_scopes', '2dig_totals')
}

### 4.2.1 Emissions intensities for entire agriculture sector

#### 4.2.1.1 Calculation

In [None]:
# Per-employee emissions for the whole agriculture sector (the three '2dig_*' frames).
# The frames are selected by key prefix rather than by position in the dictionary.
for key in [name for name in county_ag_CO2e_peremp_dict if name.startswith('2dig')]:
    # Merge LEHD data onto per employee data
    with_emp = pd.merge(
        county_ag_CO2e_peremp_dict[key],
        lehd_2dig_ag_2016.rename(columns={'Emp': 'LEHD_Emp'})[['FIPS', 'LEHD_Emp']],
        how='left',
        on='FIPS'
    )
    # Merge QCEW data onto per employee data
    with_emp = pd.merge(
        with_emp,
        qcew_county_2dig.rename(columns={'Emp': 'QCEW_Emp'})[
            ['FIPS', 'QCEW_Emp', 'total_annual_wages', 'annual_avg_estabs']],
        how='left',
        on='FIPS'
    )

    # Use LEHD employment when it is positive; otherwise (NaN or zero) fall back to QCEW
    lehd_emp = with_emp['LEHD_Emp']
    with_emp['Emp'] = lehd_emp.where(lehd_emp > 0, with_emp['QCEW_Emp'])
    with_emp = with_emp.drop(columns=['LEHD_Emp', 'QCEW_Emp'])

    # Set employment in counties where it is reported to be 0 to NaN, as there must be
    # employment there if there are emissions. This happens BEFORE any division so that
    # a zero head-count yields NaN rather than an infinite wage or intensity.
    with_emp['Emp'] = with_emp['Emp'].replace(0, np.nan)

    #  Compute average annual wages
    with_emp['avg_annual_wage_peremployee'] = with_emp['total_annual_wages'] / with_emp['Emp']

    #  Calculate emissions per employee; NaN employment propagates to NaN.
    #  (The previous `x.Emp != np.nan` guard was always True, because NaN never
    #  compares equal to anything, so it never filtered a row.)
    with_emp['tonCO2e_peremp'] = with_emp['tonCO2e'] / with_emp['Emp']

    #  Calculate log10 of this. This used np.log1p (natural log of 1 + x) before, which
    #  matched neither the column name nor the subsector calculation.
    with_emp['tonCO2e_peremp_log10'] = np.log10(with_emp['tonCO2e_peremp'])

    county_ag_CO2e_peremp_dict[key] = with_emp

### 4.2.2 Emissions intensities for NAICS subsectors

In [None]:
# Set up dummy variable to check whether a datapoint was merged from LEHD or not after the merge
lehd_3dig_ag_2016['LEHD?'] = 1.0

# Merge employment data onto the aggregated ('agg_*') dataframes.
# NOTE(review): the 'NAICS' key of the aggregated frames comes from NAICS_4dig, while this
# LEHD table is the 3-digit pull -- confirm the codes actually line up.
lehd_subsector_emp = lehd_3dig_ag_2016[['NAICS', 'Emp', 'FIPS', 'LEHD?']]

for key in [name for name in county_ag_CO2e_peremp_dict if name.startswith('agg')]:
    with_emp = county_ag_CO2e_peremp_dict[key].merge(
        lehd_subsector_emp,
        on=['FIPS', 'NAICS'],
        how='left',
    )

    # Treat emp values of zero as NaN
    with_emp['Emp'] = with_emp['Emp'].replace(0, np.nan)

    # Emissions per employee (NaN wherever employment is missing)
    with_emp['tonCO2e_peremp'] = with_emp['tonCO2e'] / with_emp['Emp']

    # log10 of this for visualization
    with_emp['tonCO2e_peremp_log10'] = np.log10(with_emp['tonCO2e_peremp'])

    county_ag_CO2e_peremp_dict[key] = with_emp

# 5 Save final dataframe to csv for overall analysis

In [None]:
# Write every per-employee frame, then every per-capita frame, to ../Output
for key, peremp_frame in county_ag_CO2e_peremp_dict.items():
    peremp_frame.to_csv(f'../Output/ag_{key}_peremp_final.csv')

for key, percapita_frame in county_ag_CO2e_percapita_dict.items():
    percapita_frame.to_csv(f'../Output/ag_{key}_percapita_final.csv')