In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from functools import reduce
import math
from scipy.stats.mstats import winsorize
import os
import glob

# Load in a GeoJSON file containing the geometry information for US counties, where feature.id is a FIPS code.
from urllib.request import urlopen
import json

# GeoJSON with the geometry of US counties; each feature.id is a county FIPS code.
county_geojson_url = "https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json"
with urlopen(county_geojson_url) as geojson_stream:
    counties = json.loads(geojson_stream.read())


# 1 Consolidate emissions data from other analyses

In [None]:
# State lookup table. FIPS codes are read as strings; the file's original "State" column
# is discarded and "Abbr" / "FIPS" are exposed as "State" / "FIPSTATE" for later merges.
stateFIPS = pd.read_csv("../Temp/stateFIPS.csv", dtype={"FIPS": str})
stateFIPS = stateFIPS.drop(columns="State")
stateFIPS = stateFIPS.rename(columns={"Abbr": "State", "FIPS": "FIPSTATE"})

## 1.1 Industrial energy data


### 1.1.1 Read in all industrial energy csv files, store in dictionary


In [None]:
# Dictionary holding every industrial dataframe, keyed by the file name with the
# "_peremp_final.csv" suffix removed (e.g. "<industry>_<granularity>_<category>").
emissions_raw = {}

# Build the folder path with os.path.join so it works on any OS (the previous
# hard-coded backslashes only worked on Windows and were invalid escape sequences).
path = os.path.join(os.getcwd(), "..", "..", "..", "Data", "industrial", "Output")
file_suffix = "_peremp_final.csv"

# Sort the matches so the load order (and printed log) is deterministic.
csv_files = sorted(glob.glob(os.path.join(path, "*" + file_suffix)))

# loop over the list of csv files
for f in csv_files:

    # Key = bare file name without directory or suffix
    filename = os.path.basename(f)[: -len(file_suffix)]
    print(filename)

    # Read the csv file, keeping identifier codes as strings, and drop the saved index column
    df = pd.read_csv(
        f,
        dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str},
    ).drop(columns="Unnamed: 0")

    # add the dataframe to the emissions dictionary
    emissions_raw[filename] = df

### 1.1.2 Define function to extract a specific dataframe from this dictionary


In [None]:
def getIndustrialData(industry, granularity, category):
    """
    Fetch one of the industrial dataframes previously loaded into ``emissions_raw``.

    The dataframes were derived in the analysis scripts from NREL's Industrial Energy
    DataBook dataset. Each one holds absolute, per capita (including population) and
    per employee (including no. employees) data.
    -----------------------------------
    Parameters:
        industry (str): one of 'ag' (agriculture), 'cn' (construction), 'mn' (mining)
                    or 'mf' (manufacturing).
        granularity (str): one of '2dig', 'agg', or '6dig'. NAICS subsector granularity
                    required. 'agg' is generally 3- or 4-digit granularity, depending
                    on the dataframe.
        category (str): one of 'fuels', 'scopes', or 'totals'. 'fuels' returns data for
                    each fuel source, 'scopes' splits data into scope 1 and scope 2
                    emissions, and 'totals' returns total emissions (scope 1 + scope 2).

    Returns:
        The dataframe stored in ``emissions_raw`` under the key
        "<industry>_<granularity>_<category>".

    Raises:
        KeyError: if no dataframe was loaded under that key.
    """
    lookup_key = "{}_{}_{}".format(industry, granularity, category)
    return emissions_raw[lookup_key]

## 1.2 Commercial sector data


In [None]:
# Commercial-sector emissions output. The FIPS-type columns are read as strings so the
# codes are not parsed as integers; the saved index column is dropped.
comm_raw = pd.read_csv(
    "../../../Data/comm/Output/comm_totalCO2_final.csv",
    dtype=dict(FIPS=str, FIPSTATE=str, STATEFIPS=str),
)
comm_raw = comm_raw.drop(columns="Unnamed: 0")

## 1.3 Power plants data


In [None]:
# Power-sector outputs: the totals file (pwr_raw) and the 2019 plant emissions file
# (pwr_plants_raw). Both are read with string FIPS-type columns and lose their saved
# index column.
pwr_raw = pd.read_csv(
    "../../../Data/pwr/Output/pwr_totalCO2_final.csv",
    dtype=dict(FIPS=str, FIPSTATE=str, STATEFIPS=str),
)
pwr_raw = pwr_raw.drop(columns="Unnamed: 0")

pwr_plants_raw = pd.read_csv(
    "../../../Data/pwr/Output/powerplant_emissions2019.csv",
    dtype=dict(FIPS=str, FIPSTATE=str, STATEFIPS=str),
)
pwr_plants_raw = pwr_plants_raw.drop(columns="Unnamed: 0")

In [None]:
# Build two NERC-region crosswalks from the power plant data:
#   * fipstate_nerc_crosswalk: one NERC region per state (the region covering most of its counties)
#   * fips_nerc_crosswalk: one row per county, with gaps filled from the state-level crosswalk,
#     written to ../Temp/fips_nerc_crosswalk.csv

# Extract crosswalk of counties to NERC regions, to be used later when assigning demand elasticities for different fuels used for power gen.
fips_nerc_crosswalk = pwr_raw[["FIPS", "NERC Region"]]

# Create a crosswalk of states to NERC regions using fips_nerc_crosswalk. Note that some states have counties in several NERC regions
# - use the NERC region with the greatest number of counties
#  Add FIPSTATE column (first two characters of the county FIPS code)
df = fips_nerc_crosswalk.copy()
df['FIPSTATE'] = df.apply(lambda x: x.FIPS[:2], axis=1)
fipstate_nerc_crosswalk = df.copy()

#  Reduce df to the NERC Regions attributable to each state
fipstate_nerc_crosswalk = fipstate_nerc_crosswalk[['FIPSTATE', 'NERC Region']
                                                  ].drop_duplicates(ignore_index=True)

#  Add count of number of counties in a given NERC Region for each state
#  NOTE: the boolean mask is built from `df` but applied to `fips_nerc_crosswalk`; this works because
#  `df` is a copy of `fips_nerc_crosswalk` and therefore shares its index.
fipstate_nerc_crosswalk['no_counties'] = fipstate_nerc_crosswalk.apply(
    lambda x: len(fips_nerc_crosswalk[(df.FIPSTATE == x.FIPSTATE) &
                                      (df['NERC Region'] == x['NERC Region'])]),
    axis=1
)

#  Sort by this count, and drop duplicated FIPSTATE rows leaving only NERC region with most counties per state
#  (ascending sort + keep='last' keeps the highest count; when two regions tie, the one that sorts last is kept)
fipstate_nerc_crosswalk = fipstate_nerc_crosswalk.sort_values(
    by=['FIPSTATE', 'no_counties'], ignore_index=True)
fipstate_nerc_crosswalk = fipstate_nerc_crosswalk[[
    'FIPSTATE', 'NERC Region']].drop_duplicates(subset='FIPSTATE', keep='last', ignore_index=True)

# Where there are counties missing in fips_nerc_crosswalk, use the NERC region of the state from fipstate_nerc_crosswalk
# Read in dataframe of all US counties
# NOTE(review): `names=` is given without `header=`, so the first line of the file is read as data -
# confirm that fips.csv has no header row.
fips = pd.read_csv('../Temp/fips.csv', encoding='unicode_escape',
                   names=['FIPS', 'County', 'State name', 'State'], dtype={'FIPS': str})

# Merge missing counties onto fips_nerc_crosswalk (outer join, so counties absent from pwr_raw
# appear with a missing NERC Region)
fips_nerc_crosswalk = pd.merge(
    fips_nerc_crosswalk,
    fips['FIPS'],
    how='outer',
    on='FIPS'
)

# Populate missing NERC fields with state-level NERC data
fips_nerc_crosswalk['FIPSTATE'] = fips_nerc_crosswalk.apply(lambda x: x.FIPS[:2], axis=1)
fips_nerc_crosswalk = pd.merge(
    fips_nerc_crosswalk,
    fipstate_nerc_crosswalk,
    how='left',
    on='FIPSTATE'
)
# After the merge, 'NERC Region_x' is the county-level value and 'NERC Region_y' the state-level one.
# Keep the county value when it is a string; otherwise (e.g. missing) fall back to the state value.
fips_nerc_crosswalk['NERC Region_x'] = fips_nerc_crosswalk.apply(
    lambda x: x['NERC Region_x'] if type(x['NERC Region_x']) == str else x['NERC Region_y'],
    axis=1
)
fips_nerc_crosswalk = fips_nerc_crosswalk.rename(
    columns={'NERC Region_x': 'NERC Region'}).drop(columns=['NERC Region_y', 'FIPSTATE'])

# Write to csv for future use (the dataframe index is written as an unnamed first column)
fips_nerc_crosswalk.to_csv('../Temp/fips_nerc_crosswalk.csv')

## 1.4 Fossil fuel production data


### 1.4.1 Oil & Gas


In [None]:
# Read in aggregated WellDatabase data (identifier codes kept as strings, saved index dropped)
og_raw = pd.read_csv(
    "../../../Data/og/Output/og_totalCO2_final.csv",
    dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str},
).drop(columns="Unnamed: 0")

# Merge county names used in other analyses onto og_raw for consistency
#  Read in county fips labels [NOTE: this file has been edited from that used in other code to ensure county names and their formatting match]
#  NOTE: this rebinds the notebook-level name `fips`; later cells that use `fips` see this edited, lower-cased version.
fips = pd.read_csv("../Temp/fips_edited.csv",
                   encoding="windows-1252",
                   usecols=[1, 2, 3, 4],
                   names=["FIPS", "County", "State Name", "State"],
                   dtype={"FIPS": str},
                   )
fips["County"] = fips["County"].str.lower()

#  Drop existing county names
og_raw = og_raw.drop(columns="County")

#  Merge on new county names
og_raw = pd.merge(og_raw, fips[["FIPS", "County"]], how="left", on="FIPS")

# Convert everything into tonCO2e (2000 lb per short ton)
lb_to_ton_cols = {
    "em_gas_county (lbCO2e)": "tonCO2e_gas",
    "em_oil_county (lbCO2e)": "tonCO2e_oil",
    "em_total_county (lbCO2e)": "tonCO2e",
}
cols_to_convert = list(lb_to_ton_cols)
for lb_col, ton_col in lb_to_ton_cols.items():
    og_raw[ton_col] = og_raw[lb_col] / 2000
og_raw = og_raw.drop(columns=cols_to_convert)

# Rename necessary columns
og_raw = og_raw.rename(columns={"annual_avg_emplvl": "Emp",
                                "Population": "POP",
                                "short ton CO2/employee": "tonCO2e_peremp"}
                       )

# State FIPS = first two characters of the county FIPS (vectorised string slice
# instead of a Python-level call per row)
og_raw["FIPSTATE"] = og_raw["FIPS"].str[:2]

# Add column containing 'scope3' for all entries (scalar assignment broadcasts to every row)
og_raw["scope"] = "scope3"

# If Emp == 0 but tonCO2e > 0, set Emp to NaN
emp_is_implausible_zero = (og_raw["tonCO2e"] > 0) & (og_raw["Emp"] == 0)
og_raw["Emp"] = og_raw["Emp"].mask(emp_is_implausible_zero)

### 1.4.2 Coal


In [None]:
# Read in county-level coal emissions (identifier codes kept as strings)
coal_raw = pd.read_csv(
    "../../../Data/coal/Output/county_emissions_coal.csv",
    dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str},
)

# Merge state abbreviation and updated county name onto dataframe
#  (`fips` is the county label table defined in an earlier cell of this notebook)
coal_raw = pd.merge(
    coal_raw.drop(columns=["Mine County", "Mine State"]),
    fips[["FIPS", "County"]],
    how="left",
    on="FIPS",
)
#  State FIPS = first two characters of the county FIPS (vectorised string slice
#  instead of a Python-level call per row)
coal_raw["FIPSTATE"] = coal_raw["FIPS"].str[:2]
coal_raw = pd.merge(coal_raw, stateFIPS, how="left", on="FIPSTATE")

# Rename necessary columns
coal_raw = coal_raw.rename(columns={"short ton CO2": "tonCO2e",
                                    "Average Employees": "Emp",
                                    "short ton CO2/employee": "tonCO2e_peremp",
                                    "Population": "POP",
                                    "short ton CO2/county pop": "tonCO2e_percapita"}
                           )

# Add column containing 'scope3' for all entries (scalar assignment broadcasts to every row)
coal_raw["scope"] = "scope3"

# 2 Create dataframes for analysis and plotting


## 2.1 Read in and consolidate data from each of the sectoral analyses


In [None]:
# Columns retained from each raw dataframe, keyed by sector code.
# ag / cn / mf start with the same identifier columns, so that prefix is written once.
_shared_industrial_cols = ["FIPS", "FIPSTATE", "County", "STATE", "scope", "tonCO2e", "Emp"]

cols_to_keep = {}
cols_to_keep["ag"] = [
    *_shared_industrial_cols,
    "tonCO2e_Diesel", "tonCO2e_LPG_NGL", "tonCO2e_Net_electricity", "tonCO2e_Other",
    "tonCO2e_Residual_fuel_oil", "tonCO2e_Natural_gas", "tonCO2e_Coal",
]
cols_to_keep["cn"] = [
    *_shared_industrial_cols,
    "tonCO2e_Diesel", "tonCO2e_LPG_NGL", "tonCO2e_Natural_gas", "tonCO2e_Net_electricity",
]
cols_to_keep["comm"] = [
    "State", "FIPS", "STATEFIPS", "County", "lbCO2e_elec_total_w", "tonCO2e_total_w",
    "POP", "Emp", "lbCO2e_ng_total_w", "lbCO2e_other_total_w", "lbCO2e_dist_heat_w",
    "lbCO2e_dist_cool_w",
]
cols_to_keep["mf"] = [
    *_shared_industrial_cols,
    "tonCO2e_Coal", "tonCO2e_Coke_and_breeze", "tonCO2e_Diesel", "tonCO2e_LPG_NGL",
    "tonCO2e_Natural_gas", "tonCO2e_Net_electricity", "tonCO2e_Other", "tonCO2e_Residual_fuel_oil",
]
cols_to_keep["mn"] = [
    "FIPS", "FIPSTATE", "NAICS", "County", "scope", "tonCO2e",
    "Emp", "tonCO2e_Coal", "tonCO2e_Diesel", "tonCO2e_Natural_gas", "tonCO2e_Net_electricity",
    "tonCO2e_Other", "tonCO2e_Residual_fuel_oil", "tonCO2e_LPG_NGL",
]
cols_to_keep["pwr"] = [
    "FIPS", "County", "State", "Tons of CO2 Emissions",
    "emp_new", "tonCO2e_PET", "tonCO2e_GAS", "tonCO2e_COAL", "tonCO2e_other",
]
cols_to_keep["og"] = [
    "FIPS", "FIPSTATE", "County", "scope", "tonCO2e",
    "Emp", "tonCO2e_gas", "tonCO2e_oil",
]
cols_to_keep["coal"] = ["FIPS", "FIPSTATE", "County", "scope", "tonCO2e", "Emp"]

In [None]:
def _prefer_scope3_emp(scopes_df, scope3_df):
    """Return scopes_df with 'Emp' replaced by the scope 3 source's employment where available.

    scope3_df's 'Emp' is left-merged on 'FIPS'. For each row the merged value is used
    when it is greater than 0; otherwise (zero, negative or missing) the row's original
    'Emp' is kept. The helper columns 'Emp_x' / 'Emp_y' are dropped before returning.
    """
    merged = pd.merge(scopes_df, scope3_df[["FIPS", "Emp"]], how="left", on="FIPS")
    merged["Emp"] = np.where(merged["Emp_y"] > 0, merged["Emp_y"], merged["Emp_x"])
    return merged.drop(columns=["Emp_x", "Emp_y"])


# Isolate O&G extraction and coal mining scope 1 and scope 2 emissions from NAICS subsector dataframe
#  Read in subsector data from NREL IET. Take an explicit copy so the column edits below act on an
#  independent frame rather than on a selection of the dataframe cached in emissions_raw.
df = getIndustrialData("mn", "agg", "scopes")[cols_to_keep["mn"]].copy()
df["tonCO2e_dsl"] = df["tonCO2e_Diesel"] + df["tonCO2e_Other"]  # treat 'other' as diesel
df = df.drop(columns=["tonCO2e_Diesel", "tonCO2e_Other"])
df = df.rename(columns={"tonCO2e_LPG_NGL": "tonCO2e_lpg",
                        "tonCO2e_Net_electricity": "tonCO2e_elec",
                        "tonCO2e_Residual_fuel_oil": "tonCO2e_residfuel",
                        "tonCO2e_Natural_gas": "tonCO2e_ng",
                        "tonCO2e_Coal": "tonCO2e_coal"}
               )

#  Split up dataframe: NAICS "2111" rows go to O&G, "2121" rows to coal, everything else to the remainder
is_og = df["NAICS"] == "2111"
is_coal = df["NAICS"] == "2121"
og_scope12 = df[is_og]
coal_scope12 = df[is_coal]
mn_rem_scope12_subsect = df[~(is_og | is_coal)]

# Concatenate O&G extraction and coal mining scope 1/2 emissions dataframes with scope 3 dataframes
coal_scopes = pd.concat([coal_raw[cols_to_keep["coal"]], coal_scope12], ignore_index=False)
#  For scope 3 rows the coal emissions are the row total; scope 1/2 rows keep their own coal figure.
#  np.where works positionally, so the duplicate index labels left by the concat are harmless.
coal_scopes["tonCO2e_coal"] = np.where(coal_scopes["scope"] == "scope3",
                                       coal_scopes["tonCO2e"],
                                       coal_scopes["tonCO2e_coal"])
og_scopes = pd.concat([og_raw[cols_to_keep["og"]].rename(columns={"tonCO2e_gas": "tonCO2e_ng", "tonCO2e_oil": "tonCO2e_crude"}),
                       og_scope12], ignore_index=False)

# Merge employment data from scope 3 dataframes onto this new dataframe, as the employment data for coal mining and O&G extraction is more accurate
#  Keep employment data from scope 3, or scope 1/2 if no scope 3 employment available
coal_scopes = _prefer_scope3_emp(coal_scopes, coal_raw)
og_scopes = _prefer_scope3_emp(og_scopes, og_raw)

# Fixed isolated error cases
# NOTE(review): these are hard-coded positional row labels of og_scopes after the merge above; they
# silently point at different rows if the input files or their row order change. Consider selecting
# the affected rows by FIPS/scope instead.
og_scopes.loc[132, 'Emp'] = og_scopes.loc[1723, 'Emp']
og_scopes.loc[25, 'Emp'] = og_scopes.loc[2072, 'Emp']
og_scopes.loc[315, 'Emp'] = og_scopes.loc[2108, 'Emp']

# Add sector column
coal_scopes["sector"] = "coal"
og_scopes["sector"] = "og"

# Add state column
coal_scopes = pd.merge(coal_scopes, stateFIPS, how="left", on="FIPSTATE")
og_scopes = pd.merge(og_scopes, stateFIPS, how="left", on="FIPSTATE")

# Concatenate these dataframes together
ff_scopes = pd.concat([coal_scopes, og_scopes]).drop(columns="NAICS")
ff_scopes.to_csv('../Temp/ff_scopes.csv')

#  Isolate employment data from this final dataframe: one row per FIPS with a column per sector
#  (pivot_table aggregates with its default function, the mean, when a FIPS/sector pair has several rows)
ff_scopes_emp = ff_scopes[["FIPS", "Emp", "sector"]]
ff_scopes_emp = pd.pivot_table(ff_scopes_emp, index="FIPS", columns="sector", values="Emp"
                               ).reset_index()

In [None]:
# Build the dataframe of mining emissions that belong to neither O&G extraction nor coal mining.
#  Sum the remaining mining subsectors per county and scope, then attach the state abbreviation.
mn_rem_scope12 = (
    mn_rem_scope12_subsect
    .drop(columns="Emp")
    .groupby(by=["FIPS", "scope", "FIPSTATE", "County"], as_index=False)
    .sum(numeric_only=True)
    .merge(stateFIPS, how="left", on="FIPSTATE")
)
mn_rem_scope12["sector"] = "mn_rest"

#  Employment for the remaining subsectors = total mining employment minus the employment
#  already attributed to O&G extraction and coal mining.
mn_rem_scope12_emp = getIndustrialData("mn", "2dig", "scopes")[["FIPS", "Emp"]]
mn_rem_scope12_emp = mn_rem_scope12_emp.drop_duplicates(ignore_index=True)
mn_rem_emp_final = (
    mn_rem_scope12_emp
    .merge(ff_scopes_emp, how="left", on="FIPS")
    .rename(columns={"Emp": "mn_total"})
)

#  Missing coal / O&G employment counts as 0 in the subtraction.
emp_remaining = (
    mn_rem_emp_final["mn_total"].fillna(0)
    - mn_rem_emp_final["coal"].fillna(0)
    - mn_rem_emp_final["og"].fillna(0)
)
#  The result is only meaningful where total mining employment is known and the
#  difference is not negative; everything else becomes NaN.
emp_is_usable = mn_rem_emp_final["mn_total"].notna() & ~(emp_remaining < 0)
mn_rem_emp_final["Emp"] = emp_remaining.where(emp_is_usable)

#  Attach the calculated employment to the emissions of the remaining mining subsectors
mn_rem_scope12 = mn_rem_scope12.merge(
    mn_rem_emp_final[["FIPS", "Emp"]],
    how="left",
    on="FIPS",
)

In [None]:
# Define dictionary to hold all 'scope' dataframes: scp_dict[scope][sector] -> dataframe
scp_dict = {"scope1": {}, "scope2": {}, "scope3": {}}

# For mining data, split into 3 sectors: coal mining, O&G extraction, and remaining mining activities. The first two will have scope 1, 2, 3 emissions, the last will have just scope 1 and 2
for scope in ["scope1", "scope2", "scope3"]:
    scp_dict[scope]["coal"] = ff_scopes[(ff_scopes["sector"] == "coal") &
                                        (ff_scopes["scope"] == scope)]
    scp_dict[scope]["og"] = ff_scopes[(ff_scopes["sector"] == "og") &
                                      (ff_scopes["scope"] == scope)]
    scp_dict[scope]["mn_rest"] = mn_rem_scope12[(mn_rem_scope12["sector"] == "mn_rest") &
                                                (mn_rem_scope12["scope"] == scope)]

# Fuel-emission columns every industrial dataframe must have before the fuels are regrouped
# (defined once, outside the loop, since the list does not depend on the industry)
fuel_em_cols = ["tonCO2e_Coal", "tonCO2e_Coke_and_breeze", "tonCO2e_Diesel", "tonCO2e_LPG_NGL",
                "tonCO2e_Natural_gas", "tonCO2e_Net_electricity", "tonCO2e_Other", "tonCO2e_Residual_fuel_oil"]

# Read in necessary remaining industrial data, reformat, and add to dictionary
for ind in ["ag", "cn", "mf"]:
    df = getIndustrialData(ind, "2dig", "scopes")[cols_to_keep[ind]]
    df = pd.merge(df, stateFIPS, how="left", on="FIPSTATE").drop(columns="STATE")
    df["sector"] = ind

    #  Add any fuel column this industry lacks, filled with zeros
    for col in fuel_em_cols:
        if col not in df.columns:
            df[col] = 0.0

    #  Assign fuel emissions that don't fit into one of the main categories to main categories
    df["tonCO2e_dsl"] = df["tonCO2e_Diesel"] + df["tonCO2e_Other"]  # treat 'other' as diesel
    df["tonCO2e_coal"] = df["tonCO2e_Coal"] + \
        df["tonCO2e_Coke_and_breeze"]  # treat coke and breeze as coal

    #  Drop unnecessary columns and rename to match other sectors
    df = df.drop(columns=["tonCO2e_Diesel",
                          "tonCO2e_Other",
                          "tonCO2e_Coal",
                          "tonCO2e_Coke_and_breeze",
                          ]
                 )
    df = df.rename(columns={"tonCO2e_LPG_NGL": "tonCO2e_lpg",
                            "tonCO2e_Net_electricity": "tonCO2e_elec",
                            "tonCO2e_Residual_fuel_oil": "tonCO2e_residfuel",
                            "tonCO2e_Natural_gas": "tonCO2e_ng",
                            }
                   )

    #  Split dataframe into scope 1 and scope 2, and assign to corresponding dictionary.
    #  The 'sector' column set above is carried through the filter, so it is not re-assigned here
    #  (assigning on the filtered slice triggered SettingWithCopyWarning); .copy() makes each stored
    #  frame independent of df.
    scp_dict["scope1"][ind] = df[df["scope"] == "scope1"].copy()
    scp_dict["scope2"][ind] = df[df["scope"] == "scope2"].copy()

In [None]:
# Reformat commercial data and add to dictionary
#  Start from the subset of raw commercial columns needed here
df = comm_raw[cols_to_keep["comm"]].copy()

#  Convert the per-fuel emissions from lb to short tons (2000 lb) using the same
#  naming as the other sectors
df = df.assign(
    tonCO2e_ng1=df["lbCO2e_ng_total_w"] / 2000,
    tonCO2e_elec1=df["lbCO2e_elec_total_w"] / 2000,
    tonCO2e_heatoil=df["lbCO2e_other_total_w"] / 2000,  # treat 'other' as heating oil
    tonCO2e_ng2=df["lbCO2e_dist_heat_w"] / 2000,  # treat as natural gas consumption
    tonCO2e_elec2=df["lbCO2e_dist_cool_w"] / 2000,  # treat as electricity consumption
)
df = df.assign(
    tonCO2e_ng=df["tonCO2e_ng1"] + df["tonCO2e_ng2"],
    tonCO2e_elec=df["tonCO2e_elec1"] + df["tonCO2e_elec2"],
    sector="comm",
)

#  One working copy per scope
comm_scope1 = df.copy()
comm_scope2 = df.copy()

# Scope 1
#  Scope 1 = total emissions minus electricity emissions (electricity is scope 2)
comm_scope1["tonCO2e"] = comm_scope1["tonCO2e_total_w"] - comm_scope1["tonCO2e_elec"]
#  Electricity must not be counted in the scope 1 dataframe
comm_scope1["tonCO2e_elec"] = 0.0

# Scope 2
#  Scope 2 = electricity emissions only; all other fuel emissions are zeroed
comm_scope2["tonCO2e"] = comm_scope2["tonCO2e_elec"]
comm_scope2["tonCO2e_heatoil"] = 0.0
comm_scope2["tonCO2e_ng"] = 0.0

#  Intermediate / raw columns that are not carried into the scope dictionary
comm_helper_cols = [
    "tonCO2e_total_w",
    "lbCO2e_elec_total_w",
    "POP",
    "lbCO2e_ng_total_w",
    "lbCO2e_other_total_w",
    "lbCO2e_dist_heat_w",
    "lbCO2e_dist_cool_w",
    "tonCO2e_ng1",
    "tonCO2e_ng2",
    "tonCO2e_elec1",
    "tonCO2e_elec2",
]

#  Rename the state FIPS column to match the other sectors, drop helpers, and store
for scope_name, comm_frame in (("scope1", comm_scope1), ("scope2", comm_scope2)):
    scp_dict[scope_name]["comm"] = (
        comm_frame
        .rename(columns={"STATEFIPS": "FIPSTATE"})
        .drop(columns=comm_helper_cols)
    )


In [None]:
# Reformat power plant data and add to dictionary (only scope 1, as assumed no scope 2)
#  Work on an explicit copy so that adding a column does not act on a selection of pwr_raw
#  (which raised SettingWithCopyWarning).
pwr_scope1 = pwr_raw[cols_to_keep["pwr"]].copy()

#  treat 'other' emissions as natural gas
pwr_scope1["tonCO2e_ng"] = pwr_scope1["tonCO2e_other"] + pwr_scope1["tonCO2e_GAS"]

#  Attach the state FIPS code via the state abbreviation and rename columns to match the other
#  sectors. Assume plants that use petroleum use residual fuel oil.
pwr_scope1 = pd.merge(pwr_scope1,
                      stateFIPS,
                      how="left",
                      on="State"
                      ).rename(
    columns={"Tons of CO2 Emissions": "tonCO2e",
             "emp_new": "Emp",
             "tonCO2e_PET": "tonCO2e_residfuel",
             "tonCO2e_COAL": "tonCO2e_coal",
             }
)

#  clean up columns, add sector column
pwr_scope1 = pwr_scope1.drop(columns=["tonCO2e_other", "tonCO2e_GAS"])
pwr_scope1["sector"] = "pwr"

scp_dict["scope1"]["pwr"] = pwr_scope1

In [None]:
# For each scope, merge into a single dataframe
for scope in scp_dict.keys():
    scp_dict[scope] = pd.concat(scp_dict[scope].values(), ignore_index=True)
    scp_dict[scope]["scope"] = scope

# Finally, concatenate each 'scope' entry in dictionary to obtain final master dataframe
sector_scope = pd.concat(scp_dict.values(), ignore_index=True)

# Fill NaNs in tonCO2e_{fuel} fields with 0s (as the reason they're NaN is because there was no consumption/production of that fuel)
tonCO2e_perfuel_cols = ["tonCO2e_coal", "tonCO2e_ng", "tonCO2e_elec",
                        "tonCO2e_residfuel", "tonCO2e_lpg", "tonCO2e_dsl", "tonCO2e_crude", "tonCO2e_heatoil"]
sector_scope[tonCO2e_perfuel_cols] = sector_scope[tonCO2e_perfuel_cols].fillna(0)

# Read in total county employment as extracted from LEHD
%run ../../../Data/empData/Scripts/LEHD_API_pull.ipynb
# total_county_emp_lehd = getLEHDemp('2018', '2', '00', write_to_csv = True)
total_county_emp_lehd = pd.read_csv(
    '../../../Data/empData/Temp/emp_ovr_00_2dig_2018.csv', 
    dtype={'state': str, 'county': str, 'FIPS': str, 'sex': str, 'year': str, 'industry': str}
).drop(columns=['Unnamed: 0'])
total_county_emp_lehd = total_county_emp_lehd[['FIPS', 'Emp']]

In [None]:
# There are several counties that have changed over the 2010s, and some of the datasources
# use old county names while the overall employment data we use is up to date. Therefore,
# need to ensure they are consistent. Should keep only the counties that have defined emissions
# data - i.e. if these are the old county geographies, should use these ahead of updating and
# losing data coverage for those areas.


def _backcast_old_county_emp(emp_df, old_fips, successor_fips, extra_emp=0):
    """Re-create the employment row of a former county from its successor counties.

    Parameters:
        emp_df (DataFrame): employment table with 'FIPS' and 'Emp' columns.
        old_fips (str): FIPS code of the former county to add.
        successor_fips (list of str): FIPS codes whose 'Emp' is summed into the
            former county; these rows are removed from the result.
        extra_emp (number): additional employment added to the former county's
            total (e.g. a share of a county that is not removed). Defaults to 0.

    Returns:
        DataFrame: emp_df plus one appended row for old_fips, without the
        successor rows. Successor codes missing from emp_df contribute 0.
    """
    successor_emp = emp_df.loc[emp_df["FIPS"].isin(successor_fips), "Emp"].sum()
    old_row = pd.DataFrame([{"FIPS": old_fips, "Emp": successor_emp + extra_emp}])
    combined = pd.concat([emp_df, old_row], ignore_index=True)
    return combined[~combined["FIPS"].isin(successor_fips)]


# Wrangell-Petersburg Census Area (02280) in Alaska split into Wrangell City and Borough (02275) and
# Petersburg Census Area (02195). Emissions data is for 02280, therefore should back-calculate
# employment for old county geography
total_county_emp_lehd = _backcast_old_county_emp(
    total_county_emp_lehd, '02280', ['02275', '02195'])

# Bedford City (51515) was incorporated into Bedford County (51019). Have emissions data for both,
# so incorporate Bedford City's emissions into Bedford County
is_bedford_city = sector_scope['FIPS'] == '51515'
sector_scope.loc[is_bedford_city, 'County'] = 'bedford'
sector_scope.loc[is_bedford_city, 'FIPS'] = '51019'
# NOTE(review): groupby drops rows whose key columns contain NaN (pandas default dropna=True), so any
# row lacking a County or State label is removed here - confirm that is intended.
sector_scope = sector_scope.groupby(
    by=['FIPS', 'FIPSTATE', 'County', 'State', 'scope', 'sector'],
    as_index=False
).sum(numeric_only=True)
#  The sum turns all-missing employment into 0; record it as missing again
sector_scope['Emp'] = sector_scope.Emp.replace(0, np.nan)

# Skagway-Hoonah-Angoon Census Area (02232) was split into Skagway Municipality (02230) and
# Hoonah-Angoon Census Area (02105). Emissions data is for Skagway-Hoonah-Angoon Census Area,
# so back-calculate employment for old county geography.
# (Previously the concatenated result was discarded, so 02232 was never added while its
# successors were still removed.)
total_county_emp_lehd = _backcast_old_county_emp(
    total_county_emp_lehd, '02232', ['02230', '02105'])

# Valdez-Cordova Census Area (02261) was split into Chugach Census Area (02063) and Copper
# River Census Area (02066). Emissions data is for Valdez-Cordova Census Area, therefore
# back-calculate employment for old county geography
total_county_emp_lehd = _backcast_old_county_emp(
    total_county_emp_lehd, '02261', ['02063', '02066'])

# Prince of Wales-Outer Ketchikan Census Area (02201) was dissolved - part was absorbed into
# Ketchikan Gateway Borough (02130) as "Outer Ketchikan" and the remainder formed Prince of
# Wales-Hyder Census Area (02198). Emissions data is for Prince of Wales-Outer Ketchikan
# Census Area, so back-calculate employment for old county geography.
# Assume that employment in Outer Ketchikan is proportional to population.
# NOTE(review): the Outer Ketchikan share is added to 02201 but not subtracted from 02130 -
# confirm whether 02130 should be reduced accordingly.
pop_outer_ketch, pop_02130 = 5729, 13754
emp_outer_ketch = pop_outer_ketch / pop_02130 * \
    total_county_emp_lehd[total_county_emp_lehd.FIPS == '02130'].Emp.sum()

total_county_emp_lehd = _backcast_old_county_emp(
    total_county_emp_lehd, '02201', ['02198'], extra_emp=emp_outer_ketch)

## 2.2 Calculate tax burdens


### 2.2.1 Read and reformat PED, price, and passthrough data necessary to perform burden calculations

In [None]:
# Tax passthrough rates by fuel type (columns A:P of the elasticities workbook)
fuel_elast = pd.read_excel("../Temp/fuel_elasticities_FINAL.xlsx", usecols="A:P")

# Price data by fuel type and sector: the 'Data' sheet holds one price column per fuel/sector,
# the 'Pivot_keys' sheet describes each of those columns via its 'col_name'.
price_data = pd.read_excel(
    '../Temp/EIA_price_data/price_data_clean.xlsx',
    sheet_name='Data',
    dtype={'FIPSTATE': str},
    usecols='A:P',
)
price_pivot_keys = pd.read_excel(
    '../Temp/EIA_price_data/price_data_clean.xlsx',
    sheet_name='Pivot_keys',
    usecols='A:C',
)

# Reshape to one row per state and price column, attach the pivot-key attributes, then the
# state's NERC region.
price_data = (
    price_data
    .melt(
        id_vars=['STATE', 'FIPSTATE'],
        value_vars=list(price_pivot_keys.col_name),
        var_name='col_name',
        value_name='price',
    )
    .merge(price_pivot_keys, how='left', on='col_name')
    .merge(fipstate_nerc_crosswalk, how='left', on='FIPSTATE')
)

# Prices are per physical unit and need converting to a price per tonCO2e. For fuels this uses
# the emissions factors in ef_dict below (from https://www.eia.gov/environment/emissions/co2_vol_mass.php);
# for electricity it uses the carbon intensity by county (derived from eGRID subregion carbon
# intensities in other analyses), averaged to state level and merged onto the price dataframe.

# County electricity carbon intensity -> unweighted state mean -> merged onto price_data
counties_elec_intensity = pd.read_csv(
    '../../../Data/industrial/Temp/counties_elec_intensity.csv',
    dtype={'id': str, 'STATE': str, 'COUNTY': str},
).drop(columns='Unnamed: 0')
counties_elec_intensity = counties_elec_intensity[['id', 'SRC2ERTA']].rename(
    columns={'id': 'FIPS', 'SRC2ERTA': 'lbCO2e_perMWh_elec'})
counties_elec_intensity['FIPSTATE'] = [
    county_fips[:2] for county_fips in counties_elec_intensity['FIPS']
]
state_elec_intensity = counties_elec_intensity[['FIPSTATE', 'lbCO2e_perMWh_elec']
                                               ].groupby(by='FIPSTATE', as_index=False).mean()

price_data = price_data.merge(state_elec_intensity, how='left', on='FIPSTATE')

# Carbon emission intensities for the remaining energy products, in short tonCO2e per physical unit
ef_dict = {
    # short tonCO2e/short ton coal, assume bituminous
    'coal': 4933.59/2000,
    # short tonCO2e/Mcf NG
    'ng': 120.96/2000,
    # tonCO2e/bbl = 1.10231 tonCO2e/tonneCO2e * tonneCO2e/Bbtu * 0.0058Bbtu/bbl
    'crude': 1.10231 * 74.47 * 0.0058,
    # short tonCO2e/gallon diesel
    'dsl': 22.45/2000,
    # same as diesel
    'heatoil': 22.45/2000,
    # short tonCO2e/gallon lpg
    'lpg': 12.68/2000,
    # short tonCO2e/gallon residual fuel oil
    'residfuel': 24.78/2000,
}

# Define a function that reads fuel type of each entry and performs the appropriate calculation
def calc_price_pertonCO2e(fuel_type, ef_dict, price, lbCO2e_perMWh):
    """
    Convert a price per physical unit of energy into a price per short ton of CO2e.

    Parameters:
        fuel_type (str): 'elec' for electricity, otherwise a key of ef_dict.
        ef_dict (dict): emissions factors in short tonCO2e per physical unit, keyed by fuel.
        price (float): price per physical unit. For 'elec' the factor of 10 below converts
                    the price to $/MWh - this assumes electricity prices are quoted in
                    cents/kWh (TODO confirm against the price workbook).
        lbCO2e_perMWh (float): electricity carbon intensity in lbCO2e/MWh; only used
                    when fuel_type is 'elec'.

    Returns:
        float: price per short tonCO2e. NaN for electricity when the carbon intensity
        is missing or not positive (no meaningful price per ton exists).

    Raises:
        KeyError: if fuel_type is not 'elec' and is missing from ef_dict.
    """
    if fuel_type == 'elec':
        tonCO2e_perMWh = lbCO2e_perMWh / 2000  # 2000 lb per short ton
        if not tonCO2e_perMWh > 0:  # also catches NaN
            return float('nan')
        # ($/MWh) / (tonCO2e/MWh) = $/tonCO2e. The intensity must divide the price,
        # exactly as the emissions factor does for the other fuels.
        return price * 10 / tonCO2e_perMWh
    return price / ef_dict[fuel_type]  # ($/physunit) / (tonCO2e/physunit) = $/tonCO2e

# Compute the price per tonCO2e row by row from each entry's fuel type, price and state
# electricity intensity, then discard the intensity column (only needed for this conversion)
price_data['price_pertonCO2e'] = [
    calc_price_pertonCO2e(fuel, ef_dict, price, intensity)
    for fuel, price, intensity in zip(
        price_data['fuel'], price_data['price'], price_data['lbCO2e_perMWh_elec']
    )
]
price_data = price_data.drop(columns='lbCO2e_perMWh_elec')

### 2.2.2 Burden on fossil fuel producers
Fossil fuel producers will face different effective tax rates depending on the sector their customers are in (e.g. commercial consumption of natural gas is more elastic than industrial consumption, therefore we would expect the producer to bear a higher tax incidence when selling to commercial consumers than to industrial consumers). However, if we consider that all producers sell into the same national market, then we can average out the effective tax rates on producers, by summing the total quantity of a given fuel that sees the full carbon tax for each type of consumer (using that consumer's tax incidence) and dividing this by the total amount of fuel production. Perform this calculation for each fossil fuel produced. Note that the effective incidences discussed below are producer incidences (i.e. the burden on the fossil fuel producer = tax \* incidences used here).

<b>Coal</b><br>
Use the EIA's Coal Data Browser to identify the total annual consumption of coal across sectors (https://www.eia.gov/coal/data/browser/#/topic/20?agg=0,1&geo=g&sec=gs&linechart=COAL.CONS_TOT.US-98.A&columnchart=COAL.CONS_TOT.US-98.A&map=COAL.CONS_TOT.US-9.A&freq=A&start=2020&end=2021&ctype=map&ltype=pin&rtype=s&pin=&rse=0&maptype=0). Use 2020 data as that is the year used in the emissions calculations. Assumptions:

- Treat all coal consumption that is not for power generation as 'industrial'
- Assume that import/export elasticities of demand are the same as domestic industrial elasticities of demand

<b>Natural gas</b><br>
We can consider that natural gas produced in the US is either consumed by industry, commercial sectors, power generators, or is exported. Use the EIA's data for 2020 to determine quantities. Assumptions used for this calculation:

- All natural gas that is not consumed commercially, industrially or for power generation is exported.
- The PED of exported natural gas = PED of natural gas in international markets

<b>Oil</b><br>
We can consider oil produced in the US to be either exported as crude or sent to domestic refineries, where it is processed into different products (we will consider gasoline (for transport sector, not covered by our analysis), petroleum (for power generation), heating oil, and diesel). To determine the effective tax pass-through rate for all oil produced in the US, we need to make some assumptions about the price elasticities of demand for these products, as well as how taxes are passed through the refineries. The assumptions used for this calculation are as follows:

- Price elasticity of demand for crude oil at US refineries is equal to the price elasticity of demand for crude oil on the global market.
- Price elasticity of supply of each of the refined products is equal to the price elasticity of supply for gasoline.
- Price elasticity of demand for crude oil exports = price elasticity of demand for oil on the global market

Because we assume all oil is either exported or sent to US refineries, and we assume the PED of crude on the international market is the same as that at US refineries, the effective pass-through rate on US oil producers = the pass-through rate of oil on producers in the global market. To calculate the effective pass-through rate, we determine the total amount of crude (in tonCO2e) sent to US refineries and multiply this by the pass-through rate between oil producers and oil refineries. Note that the final result will be the percentage of any tax passed down to refineries - refineries will then pass down a percentage of that tax to consumers.

In [None]:
# EFFECTIVE PASSTHROUGH RATE FOR COAL PRODUCERS
# Total US coal production (short tons coal)
prod_coal = 535434354

# Industrial coal consumption (short tons coal)
cons_coal_ind = 793111 + 14413596 + 25659546

# Power plant coal consumption (short tons coal). The national total is split between NERC
# regions in proportion to each region's share of coal-attributable power plant emissions.
cons_coal_pwr_total = 435826849
cons_coal_by_nerc = (
    pwr_raw.loc[:, ["NERC Region", "Tons of CO2 Emissions", "tonCO2e_COAL"]]
    .groupby("NERC Region", as_index=False)
    .sum(numeric_only=True)
    .assign(
        coal_frac=lambda regions: regions["tonCO2e_COAL"] / regions["tonCO2e_COAL"].sum(),
        cons_coal=lambda regions: regions["coal_frac"] * cons_coal_pwr_total,
    )
)

# Passthrough rates for coal, plus the average retail electricity passthrough rate (all sectors)
coal_elast = fuel_elast.loc[fuel_elast["fuel"] == "coal"].copy()
rho_elec = fuel_elast.loc[
    (fuel_elast.sector == "all") & (fuel_elast.fuel == "elec"), "rho_avg"
].mean()

# Attach each NERC region's coal consumption to the matching passthrough row
coal_cons_df = coal_elast.merge(
    cons_coal_by_nerc[["NERC Region", "coal_frac", "cons_coal"]],
    how="left",
    left_on="nerc",
    right_on=["NERC Region"],
)

# Row 0 is filled with the industrial consumption (it has no NERC match); rows 1 and 6
# (FRCC and SPP, which are NaNs) are removed. NOTE: these are positional and therefore
# depend on the row order of fuel_elast.
coal_cons_df.loc[0, "cons_coal"] = cons_coal_ind
coal_cons_df = coal_cons_df.drop(index=[1, 6]).reset_index(drop=True)


def _taxed_coal_quantity(row):
    """Effective quantity of coal in this row that sees the full carbon tax at the mine."""
    if row.sector == 'pwr':
        # Power sector: (1-Id_pwr)*(1-rho_elec)*Q
        return (1 - row.Id_pwr_avg) * (1 - rho_elec) * row.cons_coal
    if row.sector in ["comm", 'ind', 'res']:
        # Retail sectors: (1-rho)*Q
        return (1 - row.rho_avg) * row.cons_coal
    return np.nan


coal_cons_df["taxed_coal"] = coal_cons_df.apply(_taxed_coal_quantity, axis=1)

# Effective producer passthrough on coal producers = effective quantity of US coal production
# that sees the full carbon tax / total US coal production
rho_prod_coal_eff = coal_cons_df["taxed_coal"].sum() / prod_coal
rho_prod_coal_eff

In [None]:
# EFFECTIVE PASSTHROUGH RATE FOR NATURAL GAS PRODUCERS
# NG production (units MMcf) (https://www.eia.gov/dnav/ng/ng_prod_sum_a_EPG0_FGW_mmcf_a.htm)
prod_ng = 40613767

# NG consumption by sector, from EIA consumption data: https://www.eia.gov/dnav/ng/ng_cons_sum_dcu_nus_a.htm
cons_ng_res = 4674461
cons_ng_comm = 3169955
cons_ng_pwr = 11631723
# Industrial is taken as the remainder once residential, commercial and power are removed
cons_ng_ind = 30513453 - cons_ng_res - cons_ng_comm - cons_ng_pwr

# Passthrough rates for natural gas (rho_elec, the retail electricity passthrough rate,
# is expected to already exist from the coal calculation)
ng_elast = fuel_elast.loc[fuel_elast["fuel"] == "ng"].copy()

# Split the power-sector NG consumption between NERC regions in proportion to each region's
# share of gas-attributable power plant emissions
cons_ng_by_nerc = (
    pwr_raw.loc[:, ["NERC Region", "Tons of CO2 Emissions", "tonCO2e_GAS"]]
    .groupby("NERC Region", as_index=False)
    .sum(numeric_only=True)
    .assign(
        ng_frac=lambda regions: regions["tonCO2e_GAS"] / regions["tonCO2e_GAS"].sum(),
        cons_ng=lambda regions: regions["ng_frac"] * cons_ng_pwr,
    )
)

# One row per sector / NERC region with its passthrough rates and NG consumption
ng_cons_df = ng_elast.merge(
    cons_ng_by_nerc[["NERC Region", "ng_frac", "cons_ng"]],
    how="left",
    right_on=["NERC Region"],
    left_on="nerc",
)

# Remove the NaN rows (positions 3 and 8), then fill rows 0-2 with the commercial, residential
# and industrial consumption. NOTE: positional, so this depends on the row order of fuel_elast.
ng_cons_df = ng_cons_df.drop(index=[3, 8]).reset_index(drop=True)
for row_label, sector_consumption in enumerate([cons_ng_comm, cons_ng_res, cons_ng_ind]):
    ng_cons_df.loc[row_label, "cons_ng"] = sector_consumption


def _taxed_ng_quantity(row):
    """Effective quantity of NG in this row that sees the full carbon tax at the producer."""
    if row.sector == "pwr":
        # Power sector: (1-Id_pwr)*(1-rho_elec)*Q
        return (1 - row.Id_pwr_avg) * (1 - rho_elec) * row.cons_ng
    if row.sector in ["comm", "ind", "res"]:
        # Retail sectors: (1-rho)*Q
        return (1 - row.rho_avg) * row.cons_ng
    return 0


ng_cons_df["taxed_ng"] = ng_cons_df.apply(_taxed_ng_quantity, axis=1)

# Total effective taxed quantity, and from it the effective producer passthrough rate
total_taxed = ng_cons_df["taxed_ng"].sum()
rho_prod_ng_eff = total_taxed / prod_ng
rho_prod_ng_eff

In [None]:
# Conversion factor used to express every consumption figure in the same unit (thousand gallons)
GAL_PER_BBL = 42

# Production data
# Crude oil (https://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_a.htm). Units: thousand barrels
crude_prod = 4142277

# Refined product production data below from: https://www.eia.gov/dnav/pet/pet_cons_psup_dc_nus_mbbl_a.htm. Units: thousand barrels.
gas_prod = 2946014  # Finished motor gasoline
distfuel_prod = 1497761  # Distillate Fuel Oil
jetfuel_prod = 636335  # Kerosene-Type Jet Fuel
residfuel_prod = 76104  # Residual fuel oil
lpg_prod = 1104945  # liquefied petroleum gases
other_prod = 6656043 - gas_prod - distfuel_prod - residfuel_prod - \
    jetfuel_prod - lpg_prod  # Total Crude Oil and Petroleum Products - the above
refprod_prod = [gas_prod, distfuel_prod, jetfuel_prod, residfuel_prod, lpg_prod, other_prod]

# Share of total refined product output (by volume) for each product
refprod_df = pd.DataFrame({"refprod": ["gas", "distfuel", "jetfuel", "residfuel", "lpg", "other"],
                           "prod": refprod_prod})
refprod_df["frac"] = refprod_df["prod"] / refprod_df["prod"].sum()

# Consumption data. ALL cons_* quantities below are in thousand gallons, because they are later
# summed together and compared against crude_prod * 42 (thousand gallons).
cons_crude_ref = 5201596 * GAL_PER_BBL  # crude oil input into refineries: https://www.eia.gov/dnav/pet/pet_pnp_inpt_dc_nus_mbbl_a.htm, 'Crude Oil'. Units: thousand barrels * 42 = thousand gallons
cons_gas = 19307.7 * 365  # refiner motor gasoline sales volumes: https://www.eia.gov/dnav/pet/pet_cons_refmg_d_nus_VTR_mgalpd_a.htm, units: thousand GALLONS per day * 365 days per year

# Distillate & residual fuel oil consumption from EIA's adjusted distillate fuel oil and kerosene sales by end use: https://www.eia.gov/dnav/pet/pet_cons_821usea_dcu_nus_a.htm
# units: thousand gallons. Assume all distillate fuel oil used for transport is diesel.
cons_dsl_transport = 45086223
# units: thousand gallons. Assume all distillate fuel oil used for commercial sector is heating oil.
cons_heatoil_comm = 2013097
# units: thousand gallons. Assume all distillate fuel oil used for residential sector is heating oil.
cons_heatoil_res = 3073435
# EIA's 'industrial' + 'Farm' + 'Oil Company' + 'Military' + 'Off-Highway'. units: thousand gallons. Assume all distillate fuel oil used for industrial/agricultural sectors is diesel.
cons_dsl_ind = 1471229 + 3630350 + 347185 + 147224 + 2087610
cons_dsl_pwr = 338384  # Assume all distillate fuel oil used in power generation is diesel.
cons_residfuel_transport = 2625945
cons_residfuel_ind = 214449 + 16  # EIA's 'industrial' + 'oil company'
cons_residfuel_pwr = 382880  # units: thousand gallons

# Other products. These are derived from the production figures above, which are in thousand
# barrels, so they must be converted to thousand gallons to be consistent with the other cons_*
# quantities (previously they were left in thousand barrels and therefore under-weighted by 42x).
# NOTE(review): the import/export figures are assumed to be thousand barrels as well - confirm.
cons_lpg = (lpg_prod + 63175 - 771065) * GAL_PER_BBL  # + imports - exports
cons_jetfuel = (jetfuel_prod + 54787 - 35296) * GAL_PER_BBL  # + imports - exports
# assume consumption of all other products ~ production of said products
cons_other = other_prod * GAL_PER_BBL

In [None]:
# Passthrough rates for crude and every refined product; the effective rates onto refineries,
# power plants and crude producers are added as extra columns below
rho_fuels = fuel_elast[
    fuel_elast["fuel"].isin(["residfuel", "heatoil", "dsl", "crude", "jetfuel", "gas", "lpg"])
].copy()
# Share of firm burden borne by the refinery, taken from the crude rows
Id_ref = rho_fuels.loc[rho_fuels["fuel"] == "crude", "Id_avg"].mean()

retail_sectors = ["ind", "transport", "res", "comm"]


def _refinery_rate(row):
    """Effective passthrough onto refineries for one fuel/sector row."""
    if row.sector in retail_sectors:
        return Id_ref * (1 - row["rho_avg"])
    if row.sector == "pwr":
        # Extra supply-chain layer: only the part passed upstream of the power plant counts
        return Id_ref * (1 - row.Id_pwr_avg) * (1 - rho_elec)
    return np.nan


def _power_plant_rate(row):
    """Effective passthrough onto the power plant; only defined for power-sector rows."""
    return row.Id_pwr_avg * (1 - rho_elec) if row.sector == "pwr" else np.nan


def _crude_producer_rate(row):
    """Effective passthrough onto upstream crude producers for one fuel/sector row."""
    if row.sector in retail_sectors:
        return (1 - Id_ref) * (1 - row["rho_avg"])
    if row.sector == "pwr":
        return (1 - Id_ref) * (1 - row.Id_pwr_avg) * (1 - rho_elec)
    return np.nan


def _passthrough_total(row):
    """Sum of the passthrough shares along the supply chain (expected to equal 1)."""
    if row.sector != "pwr":
        return row["rho_ref_eff_avg"] + row["rho_avg"] + row["rho_prod_eff_avg"]
    return rho_elec + row["rho_pwr_eff_avg"] + row["rho_ref_eff_avg"] + row["rho_prod_eff_avg"]


rho_fuels["rho_ref_eff_avg"] = rho_fuels.apply(_refinery_rate, axis=1)
rho_fuels["rho_pwr_eff_avg"] = rho_fuels.apply(_power_plant_rate, axis=1)
rho_fuels["rho_prod_eff_avg"] = rho_fuels.apply(_crude_producer_rate, axis=1)
# Diagnostic column only: nothing below asserts on it
rho_fuels["check"] = rho_fuels.apply(_passthrough_total, axis=1)


def _mean_producer_rate(fuel, sector=None):
    """Mean rho_prod_eff_avg over the rho_fuels rows for a fuel (optionally one sector)."""
    selected = rho_fuels.fuel == fuel
    if sector is not None:
        selected = selected & (rho_fuels.sector == sector)
    return rho_fuels.loc[selected, "rho_prod_eff_avg"].mean()


# Effective amount of US crude production that sees the full carbon tax, per refined product
taxed_gas_prod = cons_gas * _mean_producer_rate("gas")
taxed_dsl_prod_ind = cons_dsl_ind * _mean_producer_rate("dsl", "ind")
taxed_dsl_prod_transport = cons_dsl_transport * _mean_producer_rate("dsl", "transport")
taxed_heatoil_prod_res = cons_heatoil_res * _mean_producer_rate("heatoil", "res")
taxed_heatoil_prod_comm = cons_heatoil_comm * _mean_producer_rate("heatoil", "comm")
taxed_residfuel_prod_ind = cons_residfuel_ind * _mean_producer_rate("residfuel", "ind")
taxed_residfuel_prod_transport = cons_residfuel_transport * _mean_producer_rate("residfuel", "transport")
taxed_residfuel_prod_pwr = cons_residfuel_pwr * _mean_producer_rate("residfuel", "pwr")
taxed_jetfuel_prod = cons_jetfuel * _mean_producer_rate("jetfuel", "transport")
# assume passthrough rate for other fuels = that of industrial diesel
taxed_other_prod = cons_other * _mean_producer_rate("dsl", "ind")

# Total effective amount of US crude production that sees the full carbon tax
total_taxed_production = (
    taxed_gas_prod
    + taxed_dsl_prod_ind
    + taxed_dsl_prod_transport
    + taxed_heatoil_prod_res
    + taxed_heatoil_prod_comm
    + taxed_residfuel_prod_ind
    + taxed_residfuel_prod_transport
    + taxed_residfuel_prod_pwr
    + taxed_jetfuel_prod
    + taxed_other_prod
)

# Effective producer passthrough rate on oil producers (crude_prod is in thousand barrels;
# * 42 converts it to thousand gallons)
rho_eff_oil = total_taxed_production / (crude_prod * 42)
rho_eff_oil

In [None]:
# For the burden calculations below, use the passthrough rates in rho_fuels to work out, for each
# refined product, how much of it effectively sees the full carbon tax (a) at the refinery and
# (b) at the final consumer.
def _mean_fuel_rate(rate_column, fuel, sector=None):
    """Mean of rate_column over the rho_fuels rows for a fuel (optionally one sector)."""
    selected = rho_fuels.fuel == fuel
    if sector is not None:
        selected = selected & (rho_fuels.sector == sector)
    return rho_fuels.loc[selected, rate_column].mean()


# (a) Refinery side: consumption * effective passthrough rate onto refineries
taxed_gas_ref = cons_gas * _mean_fuel_rate("rho_ref_eff_avg", "gas")
taxed_dsl_ref_ind = cons_dsl_ind * _mean_fuel_rate("rho_ref_eff_avg", "dsl", "ind")
taxed_dsl_ref_transport = cons_dsl_transport * _mean_fuel_rate("rho_ref_eff_avg", "dsl", "transport")
taxed_heatoil_ref_res = cons_heatoil_res * _mean_fuel_rate("rho_ref_eff_avg", "heatoil", "res")
taxed_heatoil_ref_comm = cons_heatoil_comm * _mean_fuel_rate("rho_ref_eff_avg", "heatoil", "comm")
taxed_residfuel_ref_ind = cons_residfuel_ind * _mean_fuel_rate("rho_ref_eff_avg", "residfuel", "ind")
taxed_residfuel_ref_transport = cons_residfuel_transport * \
    _mean_fuel_rate("rho_ref_eff_avg", "residfuel", "transport")
taxed_residfuel_ref_pwr = cons_residfuel_pwr * _mean_fuel_rate("rho_ref_eff_avg", "residfuel", "pwr")
taxed_jetfuel_ref = cons_jetfuel * _mean_fuel_rate("rho_ref_eff_avg", "jetfuel")
# assume passthrough rate for other fuels = that of industrial diesel
taxed_other_ref = cons_other * _mean_fuel_rate("rho_ref_eff_avg", "dsl", "ind")

# US total amount of refined product that sees the full carbon tax at the refinery
total_taxed_refinery = (
    taxed_gas_ref
    + taxed_dsl_ref_ind
    + taxed_dsl_ref_transport
    + taxed_heatoil_ref_res
    + taxed_heatoil_ref_comm
    + taxed_residfuel_ref_ind
    + taxed_residfuel_ref_transport
    + taxed_residfuel_ref_pwr
    + taxed_jetfuel_ref
    + taxed_other_ref
)

# (b) Consumer side: consumption * consumer passthrough rate (rho_avg)
taxed_gas_cons = cons_gas * _mean_fuel_rate("rho_avg", "gas")
taxed_dsl_cons_ind = cons_dsl_ind * _mean_fuel_rate("rho_avg", "dsl", "ind")
taxed_dsl_cons_transport = cons_dsl_transport * _mean_fuel_rate("rho_avg", "dsl", "transport")
taxed_heatoil_cons_res = cons_heatoil_res * _mean_fuel_rate("rho_avg", "heatoil", "res")
taxed_heatoil_cons_comm = cons_heatoil_comm * _mean_fuel_rate("rho_avg", "heatoil", "comm")
taxed_residfuel_cons_ind = cons_residfuel_ind * _mean_fuel_rate("rho_avg", "residfuel", "ind")
taxed_residfuel_cons_transport = cons_residfuel_transport * \
    _mean_fuel_rate("rho_avg", "residfuel", "transport")
# Residual fuel burned in power plants reaches consumers as electricity, so use rho_elec
taxed_residfuel_cons_pwr = cons_residfuel_pwr * rho_elec
taxed_jetfuel_cons = cons_jetfuel * _mean_fuel_rate("rho_avg", "jetfuel")
# assume passthrough rate for other fuels = that of industrial diesel
taxed_other_cons = cons_other * _mean_fuel_rate("rho_avg", "dsl", "ind")

# US total amount of refined product consumption that sees the full carbon tax
total_taxed_cons = (
    taxed_gas_cons
    + taxed_dsl_cons_ind
    + taxed_dsl_cons_transport
    + taxed_heatoil_cons_res
    + taxed_heatoil_cons_comm
    + taxed_residfuel_cons_ind
    + taxed_residfuel_cons_transport
    + taxed_residfuel_cons_pwr
    + taxed_jetfuel_cons
    + taxed_other_cons
)

In [None]:
# Collect the effective producer incidences for each fossil fuel, keyed by fuel name
rho_ffprod_eff = dict(
    coal=rho_prod_coal_eff,
    ng=rho_prod_ng_eff,
    crude=rho_eff_oil,
)
rho_ffprod_eff

### 2.2.3 Define functions to perform calculations necessary to determine burden for a given datapoint
These functions calculate the following:
1. rho_eff: The effective passthrough rate of a carbon tax to energy consumers/producers for a given fuel and sector in a given county.
2. Id_pwr: The portion of firm burden in the electricity supply chain that is borne by the power plant (i.e. the 'downstream retailer' firm).
3. burden: The change in consumer/producer surplus resulting from the carbon tax for a given fuel in a given sector, in a given county.

In [None]:
# Build a per-county table for NAICS 324 (taken to be oil refining) holding the refinery emissions
# and the estimated amount of refined product handled there that sees the full carbon tax.
#
# The quantity of refined product per county is not available (only the scope 1 and 2 emissions of
# the refining process), so product handled is assumed to be directly proportional to refinery
# emissions: a county's share of total US NAICS 324 emissions (rel_ref_em_frac) is used as its
# share of the US total refined product that sees the full carbon tax (total_taxed_refinery).
# NOTE(review): the result is stored as 'tonCO2e_ref', but total_taxed_refinery is assembled
# upstream from fuel consumption volumes - confirm the units before using it as tonCO2e.
taxed_refined_percounty = (
    getIndustrialData("mf", "agg", "totals")
    .copy()
    .loc[lambda industrial: industrial["NAICS"] == "324"]
    .assign(
        rel_ref_em_frac=lambda refineries: refineries["tonCO2e"] / refineries["tonCO2e"].sum(),
        tonCO2e_ref=lambda refineries: refineries["rel_ref_em_frac"] * total_taxed_refinery,
    )
    .loc[:, ["FIPS", "tonCO2e", "tonCO2e_ref"]]
)

# Persist for use by other analyses
taxed_refined_percounty.to_csv("../Temp/taxed_refined_percounty.csv")

In [None]:
# Define function to calculate the effective passthrough rate for a given county-sector-scope-fuel combo.
def calc_rho_eff(scope=str, fuel=str, sector_code=str, nerc=str, min_max_avg='avg'):
    '''
    Return the effective passthrough rate of a carbon tax (rho_eff) for a given
    scope / fuel / sector / NERC region combination.

    Parameters:
        scope (str): 'scope1', 'scope2' or 'scope3'. Scope of the emissions the rate
            will be applied to.
        fuel (str): Fuel type the emissions are attributable to.
        sector_code (str): 'ind', 'comm' or 'pwr'.
        nerc (str): NERC region of the county.
        min_max_avg (str): 'avg' or 'min' select the average / minimum estimate; any
            other value returns the maximum estimate.

    How the rate is obtained (rates are read from the module-level fuel_elast table):
        * scope3 (fossil fuel producers): the effective producer passthrough rate stored
          in rho_ffprod_eff[fuel]; min, max and avg are all the same value.
        * scope1 in the 'ind' or 'comm' sector (end use, not power generation): the mean
          rho of the fuel_elast rows matching the sector and fuel.
        * scope1 in the 'pwr' sector (power plants): Id_pwr * (1 - rho_elec), where
          rho_elec is the all-sector electricity passthrough rate in the NERC region and
          Id_pwr (from calc_Id_pwr) is the share of firm burden borne by the power plant.
          The minimum pairs Id_pwr_min with rho_elec_max, the maximum pairs Id_pwr_max
          with rho_elec_min.
        * scope2 (electricity consumption): the mean electricity rho for the given
          sector and NERC region.

    Raises:
        Exception: if the scope / sector combination matches none of the cases above.
        KeyError: for scope3 when fuel is not a key of rho_ffprod_eff.
    '''
    def _choose(lowest, highest, average):
        # Same selection rule for every branch: 'avg', then 'min', otherwise the maximum
        if min_max_avg == 'avg':
            return average
        if min_max_avg == 'min':
            return lowest
        return highest

    def _mean_rhos(rows):
        # (min, max, avg) consumer passthrough rates averaged over the selected rows
        return rows["rho_min"].mean(), rows["rho_max"].mean(), rows["rho_avg"].mean()

    # Fossil fuel producers: a single effective rate derived earlier in the notebook
    if scope == "scope3":
        producer_rate = rho_ffprod_eff[fuel]
        return _choose(producer_rate, producer_rate, producer_rate)

    # End-use combustion in the industrial / commercial sectors (NOT power generation)
    if scope == "scope1" and sector_code in ("ind", "comm"):
        end_use_rows = fuel_elast[(fuel_elast["sector"] == sector_code) &
                                  (fuel_elast["fuel"] == fuel)]
        return _choose(*_mean_rhos(end_use_rows))

    # Every remaining case relies on the electricity passthrough rates
    elec_rows = fuel_elast[fuel_elast["fuel"] == "elec"]

    # Power plants: Id_pwr * (1 - rho_elec), using the all-sector electricity rate
    if scope == "scope1" and sector_code == "pwr":
        regional_rows = elec_rows[(elec_rows["sector"] == "all") &
                                  (elec_rows["nerc"] == nerc)]
        elec_min, elec_max, elec_avg = _mean_rhos(regional_rows)

        plant_share_min = calc_Id_pwr(fuel=fuel, nerc=nerc, min_max_avg='min')
        plant_share_max = calc_Id_pwr(fuel=fuel, nerc=nerc, min_max_avg='max')
        plant_share_avg = calc_Id_pwr(fuel=fuel, nerc=nerc, min_max_avg='avg')

        return _choose(
            plant_share_min * (1 - elec_max),
            plant_share_max * (1 - elec_min),
            plant_share_avg * (1 - elec_avg),
        )

    # Electricity consumers: the sector's electricity passthrough rate in the region
    if scope == "scope2":
        consumer_rows = elec_rows[(elec_rows["sector"] == sector_code) &
                                  (elec_rows["nerc"] == nerc)]
        return _choose(*_mean_rhos(consumer_rows))

    raise Exception("Error: sector not found??")

In [None]:
# Define function to extract the portion of total firm burden that is borne by power plants
def calc_Id_pwr(fuel=str, nerc=str, min_max_avg='avg'):
    '''
    Look up Id_pwr, the portion of the total firm burden from electricity generation
    that is borne by power plants, for one generation fuel and NERC region.

    Parameters:
        fuel (str): Fuel type used for power generation, matched against
            fuel_elast.fuel (one of 'coal', 'ng', 'residfuel').
        nerc (str): NERC region of the county in question, matched against
            fuel_elast.nerc.
        min_max_avg (str): 'avg' or 'min' select the corresponding elasticity-based
            estimate; any other value selects the 'max' estimate.

    Returns:
        The mean of the selected Id_pwr_* column over the rows of fuel_elast whose
        sector is 'pwr' and whose fuel and nerc match the arguments.
    '''
    # Anything that is not 'avg' or 'min' falls through to the maximum estimate.
    if min_max_avg == 'avg':
        column = 'Id_pwr_avg'
    elif min_max_avg == 'min':
        column = 'Id_pwr_min'
    else:
        column = 'Id_pwr_max'

    is_match = ((fuel_elast.sector == 'pwr') &
                (fuel_elast.fuel == fuel) &
                (fuel_elast.nerc == nerc))
    return fuel_elast.loc[is_match, column].mean()

In [None]:
# Define function to calculate the tax burden for the emissions from consumption/production of a given fuel
def calc_burden(rho_eff, SCC, Q):
    '''
    Compute the social-cost burden of the consumption/production of a given energy
    product as rho_eff * SCC * Q.

    Parameters:
        rho_eff (float): Effective passthrough rate of social cost, for the fuel and
            sector in question.
        SCC (float or int): Social cost of carbon in $/short tonCO2e.
        Q (float or int): Quantity of emissions in tonCO2e.

    Returns:
        The burden, i.e. the product of the three inputs. The inputs are only
        multiplied together, so anything supporting `*` (including pandas Series,
        as used by the callers in this notebook) yields an element-wise result.
    '''
    return rho_eff * SCC * Q


In [None]:
# Define function that calculates the tax burden borne by oil refineries from the consumption of refined products.
def calc_ref_burden(tax, sector=str, fuel=str, fips=str, scope=str, return_tonCO2e_eff=False):
    '''
    Calculate the tax burden borne by oil refineries resulting from the downstream
    consumption of refined products.

    Uses the dataframe taxed_refined_percounty, which holds per county (FIPS) the
    equivalent amount of emissions (tonCO2e_ref) produced during the consumption of
    refined products that is borne by refineries, i.e. the amount of refined product
    that 'sees' the full carbon tax. The refinery burden is that quantity * the tax
    rate. Optionally returns the emissions quantity itself instead of the burden.

    Because this function is applied to a melted dataframe indexed by FIPS, sector,
    scope and fuel, returning the refinery burden on every row of an oil-refining
    county would count it once per sector/scope/fuel combination. The burden is
    therefore attributed to exactly one row per oil-refining county: scope 1
    emissions from diesel ('dsl') consumption in the manufacturing ('mf') sector.
    Every other row gets 0.

    Parameters:
        tax (float or int): Tax rate multiplied by tonCO2e_ref to obtain the burden.
            Not used when return_tonCO2e_eff is True.
        sector (str): Sector in question (note: unencoded version of sector_code).
        fuel (str): Fuel of the datapoint in question.
        fips (str): County FIPS code in question.
        scope (str): Scope of emissions in question.
        return_tonCO2e_eff (bool, default False): Returns the tonCO2e figure if
            True, otherwise proceeds with burden calculation.

    Returns:
        0 for any datapoint that is not the scope1/'dsl'/'mf' row of a county listed
        in taxed_refined_percounty; otherwise the county's mean tonCO2e_ref, either
        as-is (return_tonCO2e_eff=True) or multiplied by tax.

    Note:
        The defaults of sector, fuel, fips and scope are the `str` type object itself
        (kept unchanged for backward compatibility). An omitted argument therefore
        never matches, and the function returns 0.
    '''
    # Cheap scalar checks first: the vast majority of rows are not the designated
    # scope1/diesel/manufacturing datapoint, so they never touch the lookup table.
    if scope != 'scope1' or fuel != 'dsl' or sector != 'mf':
        return 0

    # A single filter tells us whether the county refines oil and gives its rows.
    county_rows = taxed_refined_percounty[taxed_refined_percounty["FIPS"] == fips]
    if county_rows.empty:
        return 0

    # Total refined product (in tonCO2e) for this county that sees the full carbon tax.
    tonCO2e_ref = county_rows["tonCO2e_ref"].mean()
    if return_tonCO2e_eff:
        return tonCO2e_ref
    return tonCO2e_ref * tax

### 2.2.4 Create final dataframe to perform burden calculations on

In [None]:
# Attach each county's NERC region to the master dataframe and drop the aggregate
# 'tonCO2e' column; totals are recalculated after the burden calculation.
sector_scope_fuel = (
    pd.merge(sector_scope, fips_nerc_crosswalk, how="left", on="FIPS")
    .rename(columns={"NERC Region": "NERC"})
    .drop(columns=['tonCO2e'])
)

# Columns that identify a datapoint; every remaining column holds per-fuel emissions
id_vars = ['FIPS', 'County', 'FIPSTATE', 'State', 'NERC', 'sector', 'scope', 'Emp']

# Strip the leading 8 characters (the 'tonCO2e_' prefix) so that each emissions
# column is named after its fuel type only
prefix_len = len('tonCO2e_')
old_col_names = [name for name in sector_scope_fuel.columns if name not in id_vars]
new_col_names = [name[prefix_len:] for name in old_col_names]

# Rename, melt to narrow format (one row per sector, scope and fuel), and keep only
# the rows where the fuel actually has positive emissions
sector_scope_fuel = (
    sector_scope_fuel
    .rename(columns=dict(zip(old_col_names, new_col_names)))
    .melt(id_vars=id_vars,
          value_vars=new_col_names,
          var_name='fuel',
          value_name='tonCO2e')
    .loc[lambda melted: melted['tonCO2e'] > 0]
    .reset_index(drop=True)
)

sector_scope_fuel.to_csv('../Temp/sector_scope_fuel.csv')

### 2.2.5 Perform burden calculations

In [None]:
# Set the tax that you want to use.
# The scenario values are in $ per metric tonne, while emissions in this notebook are
# tracked in short tons (cf. the lb = ton * 2000 and tonne = ton * 0.9071847
# conversions used elsewhere in this notebook). A short ton is smaller than a metric
# tonne (1 short ton = 0.9071847 tonne), so the price per short ton is LOWER than the
# price per tonne:
#   $/short ton = ($/metric tonne) * (0.9071847 metric tonnes / short ton)
# (Multiplying by 1.10231 short tons/tonne, as was done previously, inverts the
# conversion and overstates the tax.)
TONNES_PER_SHORT_TON = 0.9071847
SCC_dict = {
    'SCC1': 120 * TONNES_PER_SHORT_TON,
    'SCC2': 190 * TONNES_PER_SHORT_TON,
    'SCC3': 340 * TONNES_PER_SHORT_TON,
}
SCC = SCC_dict['SCC2']

In [None]:
# Work on a copy so the melted emissions dataframe itself is left untouched.
# (To debug on a single county, filter this copy by FIPS, e.g. '06037'.)
sector_scope_fuel_burdens = sector_scope_fuel.copy()

# First, create column containing sector_code (needed for some of the functions):
# all industrial sub-sectors collapse to 'ind', every other sector keeps its own name
industrial_sectors = ["ag", "cn", "mf", "mn_rest", "coal", "og"]
sector_scope_fuel_burdens['sector_code'] = sector_scope_fuel_burdens['sector'].where(
    ~sector_scope_fuel_burdens['sector'].isin(industrial_sectors), 'ind')

# Next, calculate the refinery burden for each oil-refining county
sector_scope_fuel_burdens['burden_ref'] = sector_scope_fuel_burdens.apply(
    lambda row: calc_ref_burden(tax=SCC,
                                fuel=row.fuel,
                                fips=row.FIPS,
                                sector=row.sector,
                                scope=row.scope),
    axis=1
)

# Calculate the minimum, maximum, and average passthrough rate for each datapoint.
for min_max_avg in ['min', 'max', 'avg']:
    rho_eff_col = f'rho_eff_{min_max_avg}'
    sector_scope_fuel_burdens[rho_eff_col] = sector_scope_fuel_burdens.apply(
        lambda row: calc_rho_eff(scope=row.scope,
                                 fuel=row.fuel,
                                 sector_code=row.sector_code,
                                 nerc=row.NERC,
                                 min_max_avg=min_max_avg),
        axis=1
    )

In [None]:
# Calculate the minimum, maximum, and average burden for each datapoint as
# rho_eff * SCC * tonCO2e, plus the refinery burden attributed to that datapoint.
for min_max_avg in ['min', 'max', 'avg']:
    passthrough_burden = calc_burden(
        rho_eff=sector_scope_fuel_burdens[f'rho_eff_{min_max_avg}'],
        SCC=SCC,
        Q=sector_scope_fuel_burdens['tonCO2e'],
    )
    sector_scope_fuel_burdens[f'burden_{min_max_avg}'] = (
        passthrough_burden + sector_scope_fuel_burdens['burden_ref']
    )

# burden_ref is now folded into every burden_* column, so it is no longer needed
sector_scope_fuel_burdens = sector_scope_fuel_burdens.drop(columns='burden_ref')

In [None]:
# Refinery-borne emissions (tonCO2e) attributed to each datapoint. This is non-zero only
# for the scope1 / 'dsl' / 'mf' row of an oil-refining county, so `fuel` MUST be passed:
# without it calc_ref_burden falls back to its default and always returns 0, which
# silently leaves the refinery emissions out (while burden_* already includes the
# refinery burden). The value does not depend on min/max/avg, so compute it once.
# `tax` is not used when return_tonCO2e_eff=True.
ref_tonCO2e_eff = sector_scope_fuel_burdens.apply(
    lambda x: calc_ref_burden(tax=SCC,
                              fuel=x.fuel,
                              fips=x.FIPS,
                              sector=x.sector,
                              scope=x.scope,
                              return_tonCO2e_eff=True),
    axis=1)

# Calculate minimum, maximum and average *effective* emissions for each datapoint, as the
# effective passthrough rate * the actual emissions, plus the tonCO2e_eff from refineries.
for min_max_avg in ['min', 'max', 'avg']:
    sector_scope_fuel_burdens[f'tonCO2e_eff_{min_max_avg}'] = (
        sector_scope_fuel_burdens[f'rho_eff_{min_max_avg}'] * sector_scope_fuel_burdens['tonCO2e']
        + ref_tonCO2e_eff
    )

# Create new columns that only keep the emission/burden values of the row if the Emp field
# is present and positive (NaN otherwise). These columns are the ones used to compute
# tonCO2e_peremp/burden_peremp.
has_emp = sector_scope_fuel_burdens['Emp'] > 0
for min_max_avg in ['min', 'max', 'avg']:
    sector_scope_fuel_burdens[f'tonCO2e_eff_{min_max_avg}_emp'] = \
        sector_scope_fuel_burdens[f'tonCO2e_eff_{min_max_avg}'].where(has_emp)
    sector_scope_fuel_burdens[f'burden_{min_max_avg}_emp'] = \
        sector_scope_fuel_burdens[f'burden_{min_max_avg}'].where(has_emp)

In [None]:
# Sanity check: the number of distinct counties before and after the burden calculation
county_coverage = [
    ('No. counties covered before calculating burden:', sector_scope_fuel),
    ('No. counties covered after calculating burden:', sector_scope_fuel_burdens),
]
for coverage_label, coverage_frame in county_coverage:
    print(coverage_label, coverage_frame.FIPS.nunique())

## 2.3 Finalize sector-scope, sector, scope, and total dataframes

### 2.3.1 Aggregate to get sector-scope emissions

In [None]:
# Create aggregated dataframe, aggregating across fuels
groupby_cols = ['FIPS', 'County', 'FIPSTATE', 'State', 'NERC', 'sector', 'scope', 'Emp']

final_sector_scope = sector_scope_fuel_burdens.groupby(by=groupby_cols,
                                                       as_index=False,
                                                       dropna=False).sum(numeric_only=True)

# Overwrite effective passthrough rate columns (as they have now been aggregated), and calculate effective tax rate
for min_max_avg in ['min', 'max', 'avg']:
    final_sector_scope[f'rho_eff_{min_max_avg}'] = final_sector_scope[f'tonCO2e_eff_{min_max_avg}'] / \
        final_sector_scope['tonCO2e']
    # final_sector_scope[f'tax_eff_{min_max_avg}'] = final_sector_scope[f'rho_eff_{min_max_avg}'] * CO2tax

# Run CensusBureauPopEstimates.ipynb to define function get_pop_estimate() needed to extract Census Population Estimates for a given year data
%run ../../../Data/empData/Scripts/CensusBureauPopEstimates.ipynb

# Use get_pop_estimate() function to extract 2018 population estimates for each county
county_pop_2018 = get_pop_estimate(2018, "county")
county_pop_2018["FIPS"] = county_pop_2018["state"] + county_pop_2018["county"]

# Merge population onto final_sector_scope
final_sector_scope = pd.merge(final_sector_scope, county_pop_2018[["FIPS", "POP"]], 
                              how="left", 
                              on="FIPS")

In [None]:
# Derive log10, per-capita and per-employee variants of the effective emissions and the
# burden at sector-scope level. Use *effective* tonCO2e values to avoid double counting.
for min_max_avg in ['min', 'max', 'avg']:
    eff_tons = final_sector_scope[f'tonCO2e_eff_{min_max_avg}']
    burden_usd = final_sector_scope[f'burden_{min_max_avg}']

    # log10 of absolute effective carbon emissions
    final_sector_scope[f'tonCO2e_eff_{min_max_avg}_log10'] = np.log10(eff_tons)

    # Per capita effective emissions in tonCO2e and lbCO2e (ton * 2000), with log10
    # values; rows without positive emissions are left as NaN
    eff_tons_percapita = eff_tons.div(final_sector_scope['POP']).where(eff_tons > 0)
    final_sector_scope[f'tonCO2e_eff_percapita_{min_max_avg}'] = eff_tons_percapita
    final_sector_scope[f'tonCO2e_eff_percapita_{min_max_avg}_log10'] = np.log10(eff_tons_percapita)
    eff_lb_percapita = eff_tons_percapita * 2000
    final_sector_scope[f'lbCO2e_eff_percapita_{min_max_avg}'] = eff_lb_percapita
    final_sector_scope[f'lbCO2e_eff_percapita_{min_max_avg}_log10'] = np.log10(eff_lb_percapita)

    # Per employee effective emissions (based on the *_emp column), with log10 value
    eff_tons_peremp = final_sector_scope[f'tonCO2e_eff_{min_max_avg}_emp'] / final_sector_scope['Emp']
    final_sector_scope[f'tonCO2e_eff_peremp_{min_max_avg}'] = eff_tons_peremp
    final_sector_scope[f'tonCO2e_eff_peremp_{min_max_avg}_log10'] = np.log10(eff_tons_peremp)

    # log10 of absolute burden
    final_sector_scope[f"burden_{min_max_avg}_log10"] = np.log10(burden_usd)

    # Per capita burden (NaN unless the burden is positive), with log10 value
    burden_percapita = burden_usd.div(final_sector_scope['POP']).where(burden_usd > 0)
    final_sector_scope[f"burden_percapita_{min_max_avg}"] = burden_percapita
    final_sector_scope[f"burden_percapita_{min_max_avg}_log10"] = np.log10(burden_percapita)

    # Per employee burden (NaN unless employment is positive), with log10 value
    burden_peremp = burden_usd.div(final_sector_scope['Emp']).where(final_sector_scope['Emp'] > 0)
    final_sector_scope[f"burden_peremp_{min_max_avg}"] = burden_peremp
    final_sector_scope[f"burden_peremp_{min_max_avg}_log10"] = np.log10(burden_peremp)

final_sector_scope.head()

### 2.3.2 Aggregate to get sectoral emissions/burdens

In [None]:
# Sum final_sector_scope across scopes to get one row per county and sector.
# Only *effective* emissions should be used from here on, to avoid double counting.
groupby_cols = ['FIPS', 'County', 'FIPSTATE', 'State', 'NERC', 'sector', 'Emp', 'POP']
final_sector = (
    final_sector_scope
    .groupby(by=groupby_cols, as_index=False, dropna=False)
    .sum(numeric_only=True)
)

# The rho_eff_* columns were summed by the aggregation, so overwrite them with the
# ratio of effective to actual emissions. (The effective tax rate, rho_eff * CO2tax,
# is not computed here.)
for min_max_avg in ['min', 'max', 'avg']:
    final_sector[f'rho_eff_{min_max_avg}'] = \
        final_sector[f'tonCO2e_eff_{min_max_avg}'].div(final_sector['tonCO2e'])

# Drop 'tonCO2e' (which double counts emissions after the grouping) to avoid confusion
# with tonCO2e_eff
final_sector = final_sector.drop(columns='tonCO2e')

In [None]:
# Derive log10, per-capita and per-employee variants of the effective emissions and the
# burden at sector level. Use *effective* tonCO2e values to avoid double counting.
for min_max_avg in ['min', 'max', 'avg']:
    eff_tons = final_sector[f'tonCO2e_eff_{min_max_avg}']
    burden_usd = final_sector[f'burden_{min_max_avg}']

    # log10 of absolute effective carbon emissions
    final_sector[f'tonCO2e_eff_{min_max_avg}_log10'] = np.log10(eff_tons)

    # Per capita effective emissions in tonCO2e and lbCO2e (ton * 2000), with log10
    # values; rows without positive emissions are left as NaN
    eff_tons_percapita = eff_tons.div(final_sector['POP']).where(eff_tons > 0)
    final_sector[f'tonCO2e_eff_percapita_{min_max_avg}'] = eff_tons_percapita
    final_sector[f'tonCO2e_eff_percapita_{min_max_avg}_log10'] = np.log10(eff_tons_percapita)
    eff_lb_percapita = eff_tons_percapita * 2000
    final_sector[f'lbCO2e_eff_percapita_{min_max_avg}'] = eff_lb_percapita
    final_sector[f'lbCO2e_eff_percapita_{min_max_avg}_log10'] = np.log10(eff_lb_percapita)

    # Per employee effective emissions (based on the *_emp column), with log10 value
    eff_tons_peremp = final_sector[f'tonCO2e_eff_{min_max_avg}_emp'] / final_sector['Emp']
    final_sector[f'tonCO2e_eff_peremp_{min_max_avg}'] = eff_tons_peremp
    final_sector[f'tonCO2e_eff_peremp_{min_max_avg}_log10'] = np.log10(eff_tons_peremp)

    # log10 of absolute burden
    final_sector[f"burden_{min_max_avg}_log10"] = np.log10(burden_usd)

    # Per capita burden (NaN unless the burden is positive), with log10 value
    burden_percapita = burden_usd.div(final_sector['POP']).where(burden_usd > 0)
    final_sector[f"burden_percapita_{min_max_avg}"] = burden_percapita
    final_sector[f"burden_percapita_{min_max_avg}_log10"] = np.log10(burden_percapita)

    # Per employee burden (NaN unless employment is positive), with log10 value
    burden_peremp = burden_usd.div(final_sector['Emp']).where(final_sector['Emp'] > 0)
    final_sector[f"burden_peremp_{min_max_avg}"] = burden_peremp
    final_sector[f"burden_peremp_{min_max_avg}_log10"] = np.log10(burden_peremp)

final_sector.head()

In [None]:
# One row per distinct (FIPS, sector, Emp) combination: sectoral employment per county,
# kept for use in the construction of later dataframes
sectoral_emp = (
    final_sector
    .loc[:, ["FIPS", "sector", "Emp"]]
    .drop_duplicates(ignore_index=True)
)

### 2.3.3 Aggregate to get scope emissions/burdens

In [None]:
# Sum final_sector_scope across sectors to get one row per county and scope.
# Only *effective* emissions should be used from here on, to avoid double counting.
groupby_cols = ['FIPS', 'County', 'FIPSTATE', 'State', 'NERC', 'scope', 'POP']
final_scope = (
    final_sector_scope
    .groupby(by=groupby_cols, as_index=False, dropna=False)
    .sum(numeric_only=True)
)

# Replace the summed 'Emp' with total county employment: the summed field only covers
# employment in sectors that have emissions of the given scope, meaning that all scope 3
# datapoints would only record employment in fossil fuel extraction rather than total
# county employment.
final_scope = final_scope.drop(columns='Emp').merge(
    total_county_emp_lehd,
    how='left',
    on='FIPS'
)

# The rho_eff_* columns were summed by the aggregation, so overwrite them with the
# ratio of effective to actual emissions.
for min_max_avg in ['min', 'max', 'avg']:
    final_scope[f'rho_eff_{min_max_avg}'] = \
        final_scope[f'tonCO2e_eff_{min_max_avg}'].div(final_scope['tonCO2e'])

final_scope.head()

In [None]:
# Derive log10, per-capita and per-employee variants of the effective emissions and the
# burden at scope level. Use *effective* tonCO2e values to avoid double counting.
for min_max_avg in ['min', 'max', 'avg']:
    eff_tons = final_scope[f'tonCO2e_eff_{min_max_avg}']
    burden_usd = final_scope[f'burden_{min_max_avg}']

    # log10 of absolute effective carbon emissions
    final_scope[f'tonCO2e_eff_{min_max_avg}_log10'] = np.log10(eff_tons)

    # Per capita effective emissions in tonCO2e and lbCO2e (ton * 2000), with log10
    # values; rows without positive emissions are left as NaN
    eff_tons_percapita = eff_tons.div(final_scope['POP']).where(eff_tons > 0)
    final_scope[f'tonCO2e_eff_percapita_{min_max_avg}'] = eff_tons_percapita
    final_scope[f'tonCO2e_eff_percapita_{min_max_avg}_log10'] = np.log10(eff_tons_percapita)
    eff_lb_percapita = eff_tons_percapita * 2000
    final_scope[f'lbCO2e_eff_percapita_{min_max_avg}'] = eff_lb_percapita
    final_scope[f'lbCO2e_eff_percapita_{min_max_avg}_log10'] = np.log10(eff_lb_percapita)

    # Per employee effective emissions, with log10 value.
    # NOTE: the *_emp column is NOT used here, because the denominator is total county
    # employment, so there should be no datapoints without a corresponding Emp value.
    eff_tons_peremp = eff_tons / final_scope['Emp']
    final_scope[f'tonCO2e_eff_peremp_{min_max_avg}'] = eff_tons_peremp
    final_scope[f'tonCO2e_eff_peremp_{min_max_avg}_log10'] = np.log10(eff_tons_peremp)

    # log10 of absolute burden
    final_scope[f"burden_{min_max_avg}_log10"] = np.log10(burden_usd)

    # Per capita burden (NaN unless the burden is positive), with log10 value
    burden_percapita = burden_usd.div(final_scope['POP']).where(burden_usd > 0)
    final_scope[f"burden_percapita_{min_max_avg}"] = burden_percapita
    final_scope[f"burden_percapita_{min_max_avg}_log10"] = np.log10(burden_percapita)

    # Per employee burden (NaN unless employment is positive), with log10 value
    burden_peremp = burden_usd.div(final_scope['Emp']).where(final_scope['Emp'] > 0)
    final_scope[f"burden_peremp_{min_max_avg}"] = burden_peremp
    final_scope[f"burden_peremp_{min_max_avg}_log10"] = np.log10(burden_peremp)

final_scope.head()

### 2.3.4 Aggregate to get total emissions/burdens

In [None]:
# Sum final_sector_scope across sectors AND scopes to get one row per county.
# Only *effective* emissions should be used from here on, to avoid double counting.
# The summed sectoral 'Emp' is double counted by the grouping too, so it is dropped
# and county employment is merged back on below.
groupby_cols = ['FIPS', 'County', 'FIPSTATE', 'State', 'NERC', 'POP']
final_total = (
    final_sector_scope
    .groupby(by=groupby_cols, as_index=False, dropna=False)
    .sum(numeric_only=True)
    .drop(columns='Emp')
)

# Merge county employment onto the final dataframe
final_total = final_total.merge(total_county_emp_lehd, how='left', on='FIPS')

# The rho_eff_* columns were summed by the aggregation, so overwrite them with the
# ratio of effective to actual emissions. (The effective tax rate, rho_eff * CO2tax,
# is not computed here.)
for min_max_avg in ['min', 'max', 'avg']:
    final_total[f'rho_eff_{min_max_avg}'] = \
        final_total[f'tonCO2e_eff_{min_max_avg}'].div(final_total['tonCO2e'])

# Drop 'tonCO2e' (which double counts emissions after the grouping) to avoid confusion
# with tonCO2e_eff
final_total = final_total.drop(columns='tonCO2e')

final_total.head()

In [None]:
# Derive log10, per-capita and per-employee variants of the effective emissions and the
# burden at county level. Use *effective* tonCO2e values to avoid double counting.
for min_max_avg in ['min', 'max', 'avg']:
    eff_tons = final_total[f'tonCO2e_eff_{min_max_avg}']
    burden_usd = final_total[f'burden_{min_max_avg}']

    # log10 of absolute effective carbon emissions
    final_total[f'tonCO2e_eff_{min_max_avg}_log10'] = np.log10(eff_tons)

    # Per capita effective emissions in tonCO2e and lbCO2e (ton * 2000), with log10
    # values; rows without positive emissions are left as NaN
    eff_tons_percapita = eff_tons.div(final_total['POP']).where(eff_tons > 0)
    final_total[f'tonCO2e_eff_percapita_{min_max_avg}'] = eff_tons_percapita
    final_total[f'tonCO2e_eff_percapita_{min_max_avg}_log10'] = np.log10(eff_tons_percapita)
    eff_lb_percapita = eff_tons_percapita * 2000
    final_total[f'lbCO2e_eff_percapita_{min_max_avg}'] = eff_lb_percapita
    final_total[f'lbCO2e_eff_percapita_{min_max_avg}_log10'] = np.log10(eff_lb_percapita)

    # Per employee effective emissions, with log10 value.
    # NOTE: the *_emp column is NOT used here, because the denominator is total county
    # employment, so there should be no datapoints without a corresponding Emp value.
    eff_tons_peremp = eff_tons / final_total['Emp']
    final_total[f'tonCO2e_eff_peremp_{min_max_avg}'] = eff_tons_peremp
    final_total[f'tonCO2e_eff_peremp_{min_max_avg}_log10'] = np.log10(eff_tons_peremp)

    # log10 of absolute burden
    final_total[f"burden_{min_max_avg}_log10"] = np.log10(burden_usd)

    # Per capita burden (NaN unless the burden is positive), with log10 value
    burden_percapita = burden_usd.div(final_total['POP']).where(burden_usd > 0)
    final_total[f"burden_percapita_{min_max_avg}"] = burden_percapita
    final_total[f"burden_percapita_{min_max_avg}_log10"] = np.log10(burden_percapita)

    # Per employee burden (NaN unless employment is positive), with log10 value
    burden_peremp = burden_usd.div(final_total['Emp']).where(final_total['Emp'] > 0)
    final_total[f"burden_peremp_{min_max_avg}"] = burden_peremp
    final_total[f"burden_peremp_{min_max_avg}_log10"] = np.log10(burden_peremp)



### 2.3.5 Derive ECFs for fossil fuel extraction vs non-extractive sectors
Use total county employment in the denominator.

In [None]:
# Flag each sector as fossil fuel extraction ('og', 'coal' -> 1) or not (0), then sum the
# sectoral dataframe within each county by that flag. Sectoral 'Emp' is dropped first
# because total county employment is merged on afterwards.
final_sector_extract = final_sector.drop(columns='Emp')
final_sector_extract['ff_extract'] = (
    final_sector_extract['sector'].isin(['og', 'coal']).astype('int64')
)
# NOTE(review): unlike the other aggregations in this notebook, this groupby does not
# pass dropna=False, so rows with a missing key value (e.g. POP) are dropped here.
# Confirm that this is intended.
extract_groupby_cols = ['FIPS', 'County', 'FIPSTATE', 'State', 'POP', 'ff_extract']
final_sector_extract = (
    final_sector_extract
    .groupby(by=extract_groupby_cols, as_index=False)
    .sum(numeric_only=True)
)

# Merge overall county employment onto dataframe
final_sector_extract = final_sector_extract.merge(
    total_county_emp_lehd,
    how='left',
    on='FIPS'
)

# Recompute the log10 and per employee values from the summed *effective* emissions and
# burdens (effective values avoid double counting).
# NOTE: the *_emp columns are NOT used here, because the denominator is total county
# employment, so there should be no datapoints without a corresponding Emp value.
for min_max_avg in ['min', 'max', 'avg']:
    eff_tons = final_sector_extract[f'tonCO2e_eff_{min_max_avg}']
    burden_usd = final_sector_extract[f'burden_{min_max_avg}']

    # log10 of absolute effective carbon emissions
    final_sector_extract[f'tonCO2e_eff_{min_max_avg}_log10'] = np.log10(eff_tons)

    # Per employee effective emissions, with log10 value
    eff_tons_peremp = eff_tons / final_sector_extract['Emp']
    final_sector_extract[f'tonCO2e_eff_peremp_{min_max_avg}'] = eff_tons_peremp
    final_sector_extract[f'tonCO2e_eff_peremp_{min_max_avg}_log10'] = np.log10(eff_tons_peremp)

    # log10 of absolute burden
    final_sector_extract[f"burden_{min_max_avg}_log10"] = np.log10(burden_usd)

    # Per employee burden (NaN unless employment is positive), with log10 value
    burden_peremp = burden_usd.div(final_sector_extract['Emp']).where(final_sector_extract['Emp'] > 0)
    final_sector_extract[f"burden_peremp_{min_max_avg}"] = burden_peremp
    final_sector_extract[f"burden_peremp_{min_max_avg}_log10"] = np.log10(burden_peremp)

# Remove the per capita columns and the *_emp helper columns carried over from
# final_sector; after the sum above they are not recomputed in this cell.
percapita_cols = [
    col_name
    for stat in ['min', 'max', 'avg']
    for col_name in (
        f'tonCO2e_eff_percapita_{stat}',
        f'tonCO2e_eff_percapita_{stat}_log10',
        f'lbCO2e_eff_percapita_{stat}',
        f'lbCO2e_eff_percapita_{stat}_log10',
        f'burden_percapita_{stat}',
        f'burden_percapita_{stat}_log10',
    )
]
emp_helper_cols = [
    col_name
    for stat in ['min', 'max', 'avg']
    for col_name in (f'tonCO2e_eff_{stat}_emp', f'burden_{stat}_emp')
]
final_sector_extract = final_sector_extract.drop(columns=percapita_cols + emp_helper_cols)

## 2.4 Winsorize outliers

There are several datapoints that skew the ECF distributions in our data by being several orders of magnitude higher or lower than is reasonable. We will therefore winsorize our ECF and burden per employee fields to the 0.4th and 99.7th percentiles.


In [None]:
# Add metric-tonne versions of the average per employee effective emissions (and their
# log10 values) by multiplying the ton-based figures by 0.9071847.
SHORT_TON_IN_TONNES = 0.9071847

# final_total additionally gets the absolute effective emissions in metric tonnes
final_total['tonneCO2e_eff_peremp_avg'] = final_total['tonCO2e_eff_peremp_avg'] * SHORT_TON_IN_TONNES
final_total['tonneCO2e_eff_avg'] = final_total['tonCO2e_eff_avg'] * SHORT_TON_IN_TONNES
final_total['tonneCO2e_eff_peremp_avg_log10'] = np.log10(final_total['tonneCO2e_eff_peremp_avg'])

for ecf_frame in (final_sector_scope, final_sector, final_sector_extract, final_scope):
    ecf_frame['tonneCO2e_eff_peremp_avg'] = ecf_frame['tonCO2e_eff_peremp_avg'] * SHORT_TON_IN_TONNES
    ecf_frame['tonneCO2e_eff_peremp_avg_log10'] = np.log10(ecf_frame['tonneCO2e_eff_peremp_avg'])

In [None]:
# Split the sectoral ECFs into fossil fuel extraction (ff_extract == 1) and
# non-extraction (ff_extract == 0) dataframes, each with a fresh index
final_sector_ffextract, final_sector_noextract = (
    final_sector_extract[final_sector_extract['ff_extract'] == extract_flag].reset_index(drop=True)
    for extract_flag in (1, 0)
)

# One dataframe per emissions scope, each with a fresh index
final_scope1, final_scope2, final_scope3 = (
    final_scope[final_scope['scope'] == scope_name].reset_index(drop=True)
    for scope_name in ('scope1', 'scope2', 'scope3')
)

In [None]:
from scipy.stats.mstats import winsorize

# Fraction of observations clipped at the lower and upper tail, respectively
# (i.e. the 0.4th and 99.7th percentiles).
limits=[0.004, 0.003]


def _winsorize_ignoring_nan(values, limits):
    """Winsorize the non-missing entries of ``values``; keep missing entries missing.

    ``scipy.stats.mstats.winsorize`` with its default ``nan_policy='propagate'``
    sorts NaNs after the largest values, so NaNs can be overwritten with the
    upper clip value or absorb the upper-tail clipping. Winsorizing only the
    non-missing entries avoids both effects. When ``values`` has no missing
    entries the result equals ``winsorize(np.array(values), limits=limits)``.

    Parameters
    ----------
    values : array-like
        One-dimensional data (e.g. a DataFrame column).
    limits : sequence of two floats
        Lower and upper fractions to clip, passed through to ``winsorize``.

    Returns
    -------
    numpy.ndarray
        Copy of ``values`` with the non-missing entries winsorized. An input
        with no non-missing entries (including an empty one) is returned as is.
    """
    clipped = np.array(values)
    present = ~pd.isna(clipped)
    if present.any():
        clipped[present] = np.ma.getdata(winsorize(a=clipped[present], limits=limits))
    return clipped


# Per-employee burden and ECF columns (raw and log10) to winsorize.
_winsorized_columns = [
    'burden_peremp_min',
    'burden_peremp_min_log10',
    'burden_peremp_max',
    'burden_peremp_max_log10',
    'burden_peremp_avg',
    'burden_peremp_avg_log10',
    'tonCO2e_eff_peremp_min',
    'tonCO2e_eff_peremp_min_log10',
    'tonCO2e_eff_peremp_max',
    'tonCO2e_eff_peremp_max_log10',
    'tonCO2e_eff_peremp_avg',
    'tonCO2e_eff_peremp_avg_log10',
    'tonneCO2e_eff_peremp_avg',
    'tonneCO2e_eff_peremp_avg_log10'
]

# Each frame is winsorized independently, against its own distribution.
_winsorized_frames = [
    final_total,
    final_sector,
    final_sector_ffextract,
    final_sector_noextract,
    final_scope1,
    final_scope2,
    final_scope3,
]

for col in _winsorized_columns:
    for df in _winsorized_frames:
        df[col] = _winsorize_ignoring_nan(df[col], limits)

## 2.5 Write key dataframes to csv files


In [None]:
# Calculate z-scores (for visualization purposes)
def _log_ecf_zscore(frame, reference=None):
    """Return z-scores of the log10 per-employee ECF column of ``frame``.

    The mean and the standard deviation (``np.std`` with its default ddof) are
    taken from ``reference`` when one is given, otherwise from ``frame`` itself.
    """
    column = 'tonneCO2e_eff_peremp_avg_log10'
    basis = frame if reference is None else reference
    return (frame[column] - basis[column].mean()) / np.std(basis[column])


final_total['ECF_std_dev'] = _log_ecf_zscore(final_total)
final_sector_scope['ECF_std_dev'] = _log_ecf_zscore(final_sector_scope)

# One frame per sector, each standardised against its own sector's distribution.
final_sector_dfs = {}
for sector in final_sector.sector.unique():
    _sector_frame = final_sector[final_sector.sector == sector].reset_index(drop=True)
    _sector_frame['ECF_std_dev'] = _log_ecf_zscore(_sector_frame)
    final_sector_dfs[sector] = _sector_frame

# Rebuild the combined sector frame so it carries the per-sector z-scores.
final_sector = pd.concat(final_sector_dfs.values(), ignore_index=True)

# for extraction dataframes, we want to share the color scale of the overall ecf plots. Therefore, use final_total means and stds.
final_sector_ffextract['ECF_std_dev'] = _log_ecf_zscore(final_sector_ffextract, reference=final_total)
final_sector_noextract['ECF_std_dev'] = _log_ecf_zscore(final_sector_noextract, reference=final_total)

# Scope frames are each standardised against their own distribution.
for _scope_frame in (final_scope1, final_scope2, final_scope3):
    _scope_frame['ECF_std_dev'] = _log_ecf_zscore(_scope_frame)

In [None]:
# Save final results as csv files (written in the order listed).
_output_frames = {
    "../Output/ECF_sector_scope.csv": final_sector_scope,
    "../Output/ECF_sector.csv": final_sector,
    "../Output/ECF_sector_ffextract.csv": final_sector_ffextract,
    "../Output/ECF_sector_noextract.csv": final_sector_noextract,
    "../Output/ECF_total.csv": final_total,
    "../Output/ECF_scope1.csv": final_scope1,
    "../Output/ECF_scope2.csv": final_scope2,
    "../Output/ECF_scope3.csv": final_scope3,
}
for _path, _frame in _output_frames.items():
    _frame.to_csv(_path)

# Per-sector frames go to Temp, together with the rows whose z-score is missing.
for sector, _sector_frame in final_sector_dfs.items():
    _sector_frame.to_csv(f'../Temp/ECF_sector_{sector}.csv')
    _missing_zscore = _sector_frame[_sector_frame.ECF_std_dev.isna()]
    _missing_zscore.to_csv(f'../Temp/ECF_sector_{sector}_nans.csv')

# A1 Summary statistics for paper

In [None]:
# Annual direct CO2 emissions (not including other GHGs) from 2016-2020, from https://cfpub.epa.gov/ghgdata/inventoryexplorer/#iallsectors/allsectors/carbondioxide/select/all (U.S. EPA Greenhouse Gas Inventory Data Explorer)
co2_2016_2020 = [5251.758, 5210.958, 5376.657, 5259.144, 4715.691]
# Annual direct CO2 emissions (not including other GHGs) from 2016-2020 from transport sector
co2_transport_2016_2020 = [1757.638, 1779.977, 1812.761, 1813.755, 1572.034]
# Annual direct CO2e emissions from fossil fuel combustion from 2016-2020
co2_ff_2016_2020 = [4909.609, 4853.299, 4989.308, 4852.330, 4342.659]

# Five-year averages of each series.
co2_avg = np.mean(co2_2016_2020)
co2_transport_avg = np.mean(co2_transport_2016_2020)
co2_ff_avg = np.mean(co2_ff_2016_2020)

# Report each average (and the two "excluding transportation" differences), rounded to 2 decimals.
for _label, _value in [
    ('Average annual direct CO2 emissions between 2016-2020:', co2_avg),
    ('Average annual direct CO2 emissions from transportation between 2016-2020:', co2_transport_avg),
    ('Average annual direct CO2 emissions excluding transportation between 2016-2020:', co2_avg - co2_transport_avg),
    ('Average annual CO2e emissions from fossil fuel combustion between 2016-2020:', co2_ff_avg),
    ('Average annual CO2e emissions from fossil fuel combustion (excluding transportation) between 2016-2020:', co2_ff_avg - co2_transport_avg),
]:
    print(_label, np.round(_value, 2), 'million metric tonnes')

In [None]:
# Total covered direct emissions in our data: scope 1 total, scaled to millions
# and divided by 1.10231 (presumably short tons per metric tonne - TODO confirm).
_scope1_rows = final_scope.loc[final_scope['scope'] == 'scope1']
direct_co2_covered = _scope1_rows['tonCO2e'].sum() / 1e6 / 1.10231

print('Direct CO2 emissions covered by our analysis:', np.round(direct_co2_covered,2), 'million metric tonnes')

# Coverage as a percentage of each national baseline computed above.
for _label, _baseline in [
    ('Percent of total direct CO2 emissions covered by our analysis:', co2_avg),
    ('Percent of total direct CO2 emissions (excluding transport) covered by our analysis:', (co2_avg - co2_transport_avg)),
    ('Percent of total CO2e emissions from fossil fuel combustion covered by our analysis:', (co2_ff_avg)),
    ('Percent of total CO2e emissions from fossil fuel combustion (excluding transport) covered by our analysis:', (co2_ff_avg - co2_transport_avg)),
]:
    print(_label, np.round(direct_co2_covered / _baseline * 100, 2), '%')

In [None]:
# Total excluded employees according to QCEW (data taken from 2018)
emp_excl = 21084389

# Total employment between 2016-2020, according to BLS CPS (https://www.bls.gov/cps/tables.htm).
# The figures are multiplied by 1e3 below, i.e. presumably reported in thousands.
emp_total_2016_2020 = [151436, 153337, 155761, 157538, 147795]
emp_total_avg = np.mean(emp_total_2016_2020)*1e3

# Total employment in the sectors our analysis covers
emp_covered_sectors = emp_total_avg - emp_excl

# Total employment actually recorded in our analysis
emp_actual = final_sector.Emp.sum()

# Label typo fixed ('Averge' -> 'Average'); the values printed are unchanged.
print('Average total employment between 2016-2020:', emp_total_avg/1e6, 'million')
print('Percent of total employment between 2016-2020 covered by the sectors we consider:', np.round(emp_covered_sectors/emp_total_avg*100, 2), '%')
print('Percent of total employment between 2016-2020 corresponding to actual data in our analysis:', np.round(emp_actual/emp_total_avg*100, 2), '%')

In [None]:
# No. counties vs population with higher-than-average ECF
# ("average" is the mean of the log10 per-employee ECF across all rows of final_total).
_log_ecf = final_total['tonCO2e_eff_peremp_avg_log10']
df = final_total.loc[_log_ecf > _log_ecf.mean()]

_county_share = len(df) / len(final_total) * 100
_population_share = df['POP'].sum() / final_total['POP'].sum() * 100

print('No. counties with above-average ECFs:', len(df))
print('Percent of counties with above-average ECFs:', np.round(_county_share, 2), '%')
print('Percent of population with above-average ECFs:', np.round(_population_share,2), '%')

# A2 Calculations for Tableau figures
Tableau maps are color coded by logarithmic standard deviation (i.e. z-scores of the log10 values). We therefore need to manually compute the ECFs corresponding to each z-score to input into the legend. In all instances, round to 2 significant figures for interpretability.

In [None]:
# County-level legend: summary statistics of the log10 ECF and burden columns,
# then the ECF/burden value corresponding to each z-score tick.
_log_ecf = final_total['tonneCO2e_eff_peremp_avg_log10']
_log_burden = final_total['burden_peremp_avg_log10']

ECF_std_dev = np.std(_log_ecf)
burden_std_dev = np.std(_log_burden)
ECF_mean = _log_ecf.mean()
burden_mean = _log_burden.mean()

print("Logarithmic mean:", ECF_mean, " Std dev:", ECF_std_dev, ' ECF at log mean:', 10**ECF_mean, ' Burden at log mean:', 10**burden_mean)
print("Min ECF:", 10**_log_ecf.min(), ' Max ECF:', 10**_log_ecf.max())
print("Min burden:", 10**_log_burden.min(), ' Max burden:', 10**_log_burden.max())
print('Min z-score:', final_total['ECF_std_dev'].min(), 'Max z-score:', final_total['ECF_std_dev'].max(),)

# Ticks from -2 upwards in steps of (4 + 2) / n; the stop value 5 is exclusive.
n = 6
step = (4 + 2) / n
for i in np.arange(-2, 5, step):
    ECF_log10_i = ECF_mean + ECF_std_dev * i
    burden_log10_i = burden_mean + burden_std_dev * i
    ECF_i = 10**(ECF_log10_i)
    burden_i = 10**(burden_log10_i)
    print('Z-score:', i, ' ECF:', ECF_i, ' Burden:', burden_i)

In [None]:
# Create state-level dataframe: sum the absolute county-level quantities per state,
# then recompute the per-employee ratios and their log10 from the state totals.
_state_columns = ['FIPSTATE', 'State', 'tonneCO2e_eff_avg', 'burden_avg', 'POP', 'Emp']
final_total_state = (
    final_total[_state_columns]
    .groupby(by=['FIPSTATE', 'State'], as_index=False)
    .sum(numeric_only=True)
)

final_total_state['tonneCO2e_eff_peremp_avg'] = (
    final_total_state['tonneCO2e_eff_avg'] / final_total_state['Emp']
)
final_total_state['tonneCO2e_eff_peremp_avg_log10'] = np.log10(
    final_total_state['tonneCO2e_eff_peremp_avg'])
final_total_state['burden_peremp_avg'] = final_total_state['burden_avg'] / final_total_state['Emp']
final_total_state['burden_peremp_avg_log10'] = np.log10(final_total_state['burden_peremp_avg'])

# z-score of the state-level log10 ECF, standardised across states.
mean = np.mean(final_total_state.tonneCO2e_eff_peremp_avg_log10)
std = np.std(final_total_state.tonneCO2e_eff_peremp_avg_log10)
# Vectorised column arithmetic instead of a row-wise apply: same values, no
# per-row Python call, and it also works when the frame has no rows.
final_total_state['ECF_std_dev'] = (
    final_total_state['tonneCO2e_eff_peremp_avg_log10'] - mean
) / std

final_total_state.to_csv('../Output/ECF_total_state.csv')

In [None]:
# State-level legend: statistics of the state frame's own log10 columns, then
# the ECF/burden value at integer z-scores -2..3.
_log_ecf = final_total_state['tonneCO2e_eff_peremp_avg_log10']
_log_burden = final_total_state['burden_peremp_avg_log10']

ECF_std_dev = np.std(_log_ecf)
burden_std_dev = np.std(_log_burden)
ECF_mean = _log_ecf.mean()
burden_mean = _log_burden.mean()

print("Logarithmic mean:", ECF_mean, " Std dev:", ECF_std_dev, ' ECF at log mean:', 10**ECF_mean, ' Burden at log mean:', 10**burden_mean)
print("Min ECF:", 10**_log_ecf.min(), ' Max ECF:', 10**_log_ecf.max())
print("Min burden:", 10**_log_burden.min(), ' Max burden:', 10**_log_burden.max())
print('Min z-score:', final_total_state['ECF_std_dev'].min(), 'Max z-score:', final_total_state['ECF_std_dev'].max(),)

for i in np.arange(-2, 4, 1):
    ECF_log10_i = ECF_mean + ECF_std_dev * i
    burden_log10_i = burden_mean + burden_std_dev * i
    ECF_i = 10**(ECF_log10_i)
    burden_i = 10**(burden_log10_i)
    print('Z-score:', i, ' ECF:', ECF_i, ' Burden:', burden_i)

In [None]:
# Fossil fuel extraction legend. Mean and std come from final_total (not from
# the extraction frame), matching how this frame's ECF_std_dev was standardised.
df = final_sector_ffextract.copy()
_reference_log_ecf = final_total['tonneCO2e_eff_peremp_avg_log10']
_reference_log_burden = final_total['burden_peremp_avg_log10']

ECF_std_dev = np.std(_reference_log_ecf)
burden_std_dev = np.std(_reference_log_burden)
ECF_mean = _reference_log_ecf.mean()
burden_mean = _reference_log_burden.mean()

print("Logarithmic mean:", ECF_mean, " Std dev:", ECF_std_dev, ' ECF at log mean:', 10**ECF_mean, ' Burden at log mean:', 10**burden_mean)
print("Min Z-score:", df['ECF_std_dev'].min(), ' Max Z-score:', df['ECF_std_dev'].max())
print("Min ECF:", 10**df['tonneCO2e_eff_peremp_avg_log10'].min(), ' Max ECF:', 10**df['tonneCO2e_eff_peremp_avg_log10'].max())
print("Min burden:", 10**df['burden_peremp_avg_log10'].min(), ' Max burden:', 10**df['burden_peremp_avg_log10'].max())
print('Min z-score:', df['ECF_std_dev'].min(), 'Max z-score:', df['ECF_std_dev'].max(),)

# Ticks start at -2.709 in steps of 2.709/3; the stop 2.719 sits just above
# 2.709 (presumably so that +2.709 is included as the last tick).
for i in np.arange(-2.709, 2.719, 2.709/3):
    ECF_log10_i = ECF_mean + ECF_std_dev * i
    burden_log10_i = burden_mean + burden_std_dev * i
    ECF_i = 10**(ECF_log10_i)
    burden_i = 10**(burden_log10_i)
    print('Z-score:', i, ' ECF:', ECF_i, ' Burden:', burden_i)

In [None]:
# Non-extraction legend.
df = final_sector_noextract.copy()
# The ECF_std_dev column of final_sector_noextract is standardised with the
# final_total mean and standard deviation (shared colour scale with the overall
# ECF plots). The z-score -> ECF conversion below must therefore use the
# final_total statistics as well, as the fossil fuel extraction legend does;
# using this frame's own statistics would not match the plotted z-scores.
ECF_std_dev, burden_std_dev = np.std(final_total.tonneCO2e_eff_peremp_avg_log10), np.std(final_total.burden_peremp_avg_log10)
ECF_mean, burden_mean = final_total.tonneCO2e_eff_peremp_avg_log10.mean(), final_total.burden_peremp_avg_log10.mean()
print("Logarithmic mean:", ECF_mean, " Std dev:", ECF_std_dev, ' ECF at log mean:', 10**ECF_mean, ' Burden at log mean:', 10**burden_mean)
# Observed ranges still come from the non-extraction frame itself.
print("Min Z-score:", df.ECF_std_dev.min(), ' Max Z-score:', df.ECF_std_dev.max())
print("Min ECF:", 10**df.tonneCO2e_eff_peremp_avg_log10.min(), ' Max ECF:', 10**df.tonneCO2e_eff_peremp_avg_log10.max())
print("Min burden:", 10**df.burden_peremp_avg_log10.min(), ' Max burden:', 10**df.burden_peremp_avg_log10.max())
print('Min z-score:', df.ECF_std_dev.min(), 'Max z-score:', df.ECF_std_dev.max(),)
# ECF / burden value at integer z-scores -2..3.
for i in np.arange(-2,4,1):
    ECF_log10_i, burden_log10_i = ECF_mean + ECF_std_dev * i, burden_mean + burden_std_dev * i
    ECF_i, burden_i = 10**(ECF_log10_i), 10**(burden_log10_i)
    print('Z-score:', i, ' ECF:', ECF_i, ' Burden:', burden_i)

In [None]:
# Scope 3 legend: statistics of the scope 3 frame's own log10 columns, then the
# ECF/burden value at each z-score tick.
df = final_scope3.copy()
_log_ecf = df['tonneCO2e_eff_peremp_avg_log10']
_log_burden = df['burden_peremp_avg_log10']

ECF_std_dev = np.std(_log_ecf)
burden_std_dev = np.std(_log_burden)
ECF_mean = _log_ecf.mean()
burden_mean = _log_burden.mean()

print("Logarithmic mean:", ECF_mean, " Std dev:", ECF_std_dev, ' ECF at log mean:', 10**ECF_mean, ' Burden at log mean:', 10**burden_mean)
print("Min Z-score:", df['ECF_std_dev'].min(), ' Max Z-score:', df['ECF_std_dev'].max())
print("Min ECF:", 10**_log_ecf.min(), ' Max ECF:', 10**_log_ecf.max())
print("Min burden:", 10**_log_burden.min(), ' Max burden:', 10**_log_burden.max())
print('Min z-score:', df['ECF_std_dev'].min(), 'Max z-score:', df['ECF_std_dev'].max(),)

# Ticks from -2.4 in steps of 0.8, stopping before 3.
for i in np.arange(-2.4, 3, 0.8):
    ECF_log10_i = ECF_mean + ECF_std_dev * i
    burden_log10_i = burden_mean + burden_std_dev * i
    ECF_i = 10**(ECF_log10_i)
    burden_i = 10**(burden_log10_i)
    print('Z-score:', i, ' ECF:', ECF_i, ' Burden:', burden_i)

In [None]:
# Legend for the 'og' sector frame: statistics of that frame's own log10
# columns, then the ECF/burden value at integer z-scores -3..3.
df = final_sector_dfs['og'].copy()
_log_ecf = df['tonneCO2e_eff_peremp_avg_log10']
_log_burden = df['burden_peremp_avg_log10']

ECF_std_dev = np.std(_log_ecf)
burden_std_dev = np.std(_log_burden)
ECF_mean = _log_ecf.mean()
burden_mean = _log_burden.mean()

print("Logarithmic mean:", ECF_mean, " Std dev:", ECF_std_dev, ' ECF at log mean:', 10**ECF_mean, ' Burden at log mean:', 10**burden_mean)
print("Min Z-score:", df['ECF_std_dev'].min(), ' Max Z-score:', df['ECF_std_dev'].max())
print("Min ECF:", 10**_log_ecf.min(), ' Max ECF:', 10**_log_ecf.max())
print("Min burden:", 10**_log_burden.min(), ' Max burden:', 10**_log_burden.max())
print('Min z-score:', df['ECF_std_dev'].min(), 'Max z-score:', df['ECF_std_dev'].max(),)

for i in np.arange(-3, 4, 1):
    ECF_log10_i = ECF_mean + ECF_std_dev * i
    burden_log10_i = burden_mean + burden_std_dev * i
    ECF_i = 10**(ECF_log10_i)
    burden_i = 10**(burden_log10_i)
    print('Z-score:', i, ' ECF:', ECF_i, ' Burden:', burden_i)