In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from matplotlib import ticker as mticker
%matplotlib inline
# Matplotlib text settings: typeset figure text with LaTeX in a serif face.
params = {
    "text.usetex": True,
    "font.family": "serif",
    "font.serif": ["Computer Modern Serif"],
}
plt.rcParams.update(params)

# Load county polygons and expose the GeoJSON `id` field as `FIPS`.
import geopandas as gpd

counties = gpd.read_file(
    'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
).rename(columns={'id': 'FIPS'})

# Store each county's centroid coordinates (y -> latitude, x -> longitude).
# NOTE(review): if the layer is in a geographic CRS, geopandas warns that
# centroids computed this way are approximate -- confirm that is acceptable.
_county_centroids = counties.geometry.centroid
counties['centroid_lat'] = _county_centroids.y
counties['centroid_long'] = _county_centroids.x

In [None]:
# Lookup from sector code to the full sector name used as a display label.
# The labels contain LaTeX markup, so any label holding a backslash is written
# as a raw string (a plain "\&" is an invalid escape sequence in Python).
sector_dict = {
    "ovr": r"\textbf{Overall ECF}",
    "ag": "Agriculture",
    "cn": "Construction",
    "mf": "Manufacturing",
    "mn_rest": "Mining (excl. fossil fuel extraction)",
    "pwr": "Fossil fuel power generation",
    "og": r"Oil \& gas extraction",
    "coal": "Coal mining",
    "comm": "Commercial",
}

# Lookup from scope code to display label.
scope_dict = {"scope1": "Scope 1", "scope2": "Scope 2", "scope3": "Scope 3"}

# Create a dictionary for renaming fuel emissions columns to short fuel codes
tonCO2e_fuel_rename_dict = {
    'tonCO2e_Diesel': 'dsl',
    'tonCO2e_LPG_NGL': 'lpg',
    'tonCO2e_Natural_gas': 'ng',
    'tonCO2e_Net_electricity': 'elec',
    'tonCO2e_Coal': 'coal',
    'tonCO2e_Residual_fuel_oil': 'residfuel'
}

# Read in fips_nerc_crosswalk (FIPS kept as a string to preserve leading zeros)
fips_nerc_crosswalk = pd.read_csv(
    '../../overallFootprintCalc/Temp/fips_nerc_crosswalk.csv',
    index_col=0,
    dtype={'FIPS': str}
)

# Read in grid carbon intensities. The county id becomes `FIPS` and the
# `SRC2ERTA` column becomes `lbCO2e_perMWh_elec` in a single rename.
# The final `replace` recodes two values ('02270' -> '02158', '46113' -> '46102').
# NOTE(review): presumably these are county FIPS recodes; `replace` is applied
# to every column of the frame, not only `FIPS` -- confirm that is intended.
counties_elec_intensity = (
    pd.read_csv(
        '../../../Data/industrial/Temp/counties_elec_intensity.csv',
        dtype={'id': str, 'STATE': str, 'COUNTY': str}
    )
    .drop(columns='Unnamed: 0')
    .rename(columns={'id': 'FIPS', 'SRC2ERTA': 'lbCO2e_perMWh_elec'})
    .replace(to_replace=['02270', '46113'], value=['02158', '46102'])
)

# 1 Prepare data

## 1.1 Read in ECF dataframes and geographic & demographic variables

In [None]:
# Import ECF dataframes from OverallCarbonFootprint analysis.
# All five files are read the same way: FIPS codes stay strings, and the saved
# index column plus `POP` are dropped.
ECF_sector, ECF_scope1, ECF_scope2, ECF_scope3, ECF_total = (
    pd.read_csv(csv_path, dtype={"FIPS": str, "FIPSTATE": str}).drop(columns=["Unnamed: 0", 'POP'])
    for csv_path in (
        "../../overallFootprintCalc/Output/ECF_sector.csv",
        "../../overallFootprintCalc/Output/ECF_scope1.csv",
        "../../overallFootprintCalc/Output/ECF_scope2.csv",
        "../../overallFootprintCalc/Output/ECF_scope3.csv",
        "../../overallFootprintCalc/Output/ECF_total.csv",
    )
)

# Shorter names for the overall-footprint columns
_ecf_total_renames = {
    'tonCO2e_eff_peremp_avg_log10': 'ECF_log10',
    'tonCO2e_eff_peremp_avg': 'ECF',
    'tonCO2e_eff_peremp_avg_ctrlelec': 'ECF_ctrlelec',
    'tonCO2e_eff_peremp_avg_ctrlelec_log10': 'ECF_ctrlelec_log10',
}
ECF_total = ECF_total.rename(columns=_ecf_total_renames)

# Read in county land area data, downloaded from https://www.census.gov/library/publications/2011/compendia/usa-counties-2011.html#LND
# Rename the key columns and keep only the county code and the `LND110210D`
# measurement (described in the original notebook as the 2010 measurement).
landarea = (
    pd.read_excel('../Input/CensusBureau_USCounties/LND01.xls', dtype={'STCOU': str})
    .rename(columns={'STCOU': 'FIPS', 'LND110210D': 'land_area_sq_mi'})
    [['FIPS', 'land_area_sq_mi']]
)

In [None]:
# Read in ACS county-level data, as pulled from ACF_data_ECF.R
ACS_COUNTY_CLEAN = (
    pd.read_csv("../Input/ACS/ACS_COUNTY_CLEAN.csv", dtype={"GEOID": str})
    .rename(columns={"GEOID": "FIPS"})
    .drop(columns="Unnamed: 0")
)
_acs_pop = ACS_COUNTY_CLEAN['POP']

# Percent minority = 100 * (1 - share of non-Latin whites in the county)
_pct_minority = (1 - ACS_COUNTY_CLEAN['RACE_NOLATIN_WHITE'] / _acs_pop) * 100
ACS_COUNTY_CLEAN['RACE_PERCENT_MINORITY'] = _pct_minority
# log10 of the percentage where it is positive; every other row (zero,
# negative or missing) receives the fallback value 1.
# NOTE(review): a fallback of 1 equals log10(10%) -- confirm this is intended.
ACS_COUNTY_CLEAN['RACE_PERCENT_MINORITY_log10'] = np.log10(
    _pct_minority.where(_pct_minority > 0)
).fillna(1)

# Percent of population whose highest education level is high school or
# lower, and percent with at least some tertiary education
ACS_COUNTY_CLEAN['ED_PERCENT_HSORLESS'] = (
    ACS_COUNTY_CLEAN['ED_LESSHS'] + ACS_COUNTY_CLEAN['ED_HS']
) / _acs_pop * 100
ACS_COUNTY_CLEAN['ED_PERCENT_TERTIARY'] = (
    ACS_COUNTY_CLEAN['ED_SOMECOLLEGE']
    + ACS_COUNTY_CLEAN['ED_BACH']
    + ACS_COUNTY_CLEAN['ED_GRAD']
) / _acs_pop * 100

# Unemployment and poverty rates, both relative to total population
ACS_COUNTY_CLEAN['UNEMP_RATE'] = ACS_COUNTY_CLEAN['UNEMP'] / _acs_pop * 100
ACS_COUNTY_CLEAN['POV_RATE'] = ACS_COUNTY_CLEAN['POV_TOT'] / _acs_pop * 100

# Migrant population metrics.
# NOTE(review): unlike the metrics above, MIG_PERCENT is not multiplied by 100,
# so it holds a fraction -- confirm before comparing it with other percentages.
# The log10 columns are -inf wherever MIG_TOT is 0.
ACS_COUNTY_CLEAN['MIG_PERCENT'] = ACS_COUNTY_CLEAN['MIG_TOT'] / _acs_pop
ACS_COUNTY_CLEAN['MIG_TOT_log10'] = np.log10(ACS_COUNTY_CLEAN['MIG_TOT'])
ACS_COUNTY_CLEAN['MIG_PERCENT_log10'] = np.log10(ACS_COUNTY_CLEAN['MIG_PERCENT'])

In [None]:
def _classify_vote_margin(votes, level):
    """Label each row of a vote table by winning party and margin.

    Parameters
    ----------
    votes : pandas.DataFrame
        Must contain the columns ``{level}_percent_D``, ``{level}_percent_R``
        and ``{level}_preferred_party``.
    level : str
        Column prefix, ``'state'`` or ``'county'``.

    Returns
    -------
    numpy.ndarray
        One label per row, chosen by the first rule that matches:
        ``'narrowly D'`` when 0 < D - R < 0.05, ``'narrowly R'`` when
        0 < R - D < 0.05, ``'D'`` when the preferred party is ``'DEMOCRAT'``,
        otherwise ``'R'``.
    """
    dem_lead = votes[f'{level}_percent_D'] - votes[f'{level}_percent_R']
    rep_lead = votes[f'{level}_percent_R'] - votes[f'{level}_percent_D']
    conditions = [
        np.asarray((dem_lead > 0) & (dem_lead < 0.05), dtype=bool),
        np.asarray((rep_lead > 0) & (rep_lead < 0.05), dtype=bool),
        np.asarray(votes[f'{level}_preferred_party'] == 'DEMOCRAT', dtype=bool),
    ]
    # np.select takes the first matching condition per row, which reproduces
    # the if / elif / else ordering of the rules in the docstring.
    return np.select(conditions, ['narrowly D', 'narrowly R', 'D'], default='R')


# Read in 2020 state presidential voting trends, downloaded and manipulated from MIT Election Lab https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/42MVDX
state_pres_vote_2020 = pd.read_excel(
    '../Input/MITElectionLab/state_pres_votes/2020_state_pres_votes.xlsx',
    sheet_name='2020_state_pres_votes',
    dtype={'FIPSTATE': str})
# NOTE(review): the 0.05 threshold assumes the percent columns are fractions
# in [0, 1] -- confirm against the workbook.
state_pres_vote_2020['state_color'] = _classify_vote_margin(state_pres_vote_2020, 'state')

# Read in 2020 county presidential voting trends, downloaded and manipulated from MIT Election Lab https://electionlab.mit.edu/data
county_pres_vote_2020 = pd.read_excel(
    '../Input/MITElectionLab/county_pres_votes/2020_county_pres_votes.xlsx',
    sheet_name='2020_county_pres_votes',
    dtype={'FIPS': str})
county_pres_vote_2020['county_color'] = _classify_vote_margin(county_pres_vote_2020, 'county')

## 1.2 Merge demographic and geographic variables onto ECF dataframes of interest


In [None]:
# Attach demographics and land area to every ECF table, then derive
# population density and the log10 of density and population.
dfs = [ECF_total, ECF_sector, ECF_scope1, ECF_scope2, ECF_scope3]

for i, ecf_frame in enumerate(dfs):
    merged = (
        ecf_frame
        .merge(ACS_COUNTY_CLEAN, how='left', on='FIPS')
        .merge(landarea, how='left', on='FIPS')
    )

    # Population density (people per square mile) and its log10
    density = merged['POP'] / merged['land_area_sq_mi']
    merged['pop_per_sqmi'] = density
    merged['pop_per_sqmi_log10'] = np.log10(density)

    merged['POP_log10'] = np.log10(merged['POP'])

    dfs[i] = merged

totalECF_demo, sectorECF_demo, scope1ECF_demo, scope2ECF_demo, scope3ECF_demo = dfs

# Only the overall table also receives the state and county voting data
totalECF_demo = (
    totalECF_demo
    .merge(state_pres_vote_2020, how='left', on='FIPSTATE')
    .merge(county_pres_vote_2020, how='left', on='FIPS')
)

# Persist the merged tables
totalECF_demo.to_csv('../Temp/totalECF_demo.csv')
scope1ECF_demo.to_csv('../Temp/scope1ECF_demo.csv')
scope2ECF_demo.to_csv('../Temp/scope2ECF_demo.csv')
scope3ECF_demo.to_csv('../Temp/scope3ECF_demo.csv')

# One file per sector, plus a companion file holding the rows of that sector
# whose per-employee footprint is missing
for sector in sectorECF_demo.sector.unique():
    sector_rows = sectorECF_demo[sectorECF_demo.sector == sector]
    sector_rows.to_csv(f'../Temp/{sector}ECF_demo.csv')
    sector_rows[sector_rows.tonCO2e_eff_peremp_avg.isna()].to_csv(f'../Temp/{sector}ECF_demo_nans.csv')
sectorECF_demo.to_csv('../Temp/sectorECF_demo.csv')

## 1.3 Read in other sets of independent variables

In [None]:
def _read_noaa_annual_normal(url):
    """Read one NOAA annual degree-day normals file into a station table.

    Each value arrives as text with a one-character flag appended. The flag is
    split off into ``FLAG``, the remainder is parsed as a float, and negative
    values are replaced by NaN.

    Parameters
    ----------
    url : str
        Location of the fixed-width normals file.

    Returns
    -------
    pandas.DataFrame
        Columns ``STNID``, ``VALUE`` (float) and ``FLAG``.
    """
    normals = pd.read_fwf(url, header=None)
    normals.columns = ['STNID', 'VALUE']
    raw_value = normals['VALUE']
    normals['FLAG'] = raw_value.str[-1]
    numeric_value = raw_value.str[:-1].astype(float)
    # NOTE(review): negative entries look like NOAA sentinel codes (for example
    # -9999 missing, -7777 "non-zero value that rounds to zero"); all are
    # treated as missing here -- confirm against the NOAA readme.
    normals['VALUE'] = numeric_value.mask(numeric_value < 0)
    return normals


def _station_to_county_mean(station_values, station_zip, zip_county):
    """Average station-level values within each county.

    Stations are mapped to ZIP codes, ZIP codes to counties, and the numeric
    columns are averaged per county. The county code is returned as ``FIPS``.
    """
    return (
        station_values
        .merge(station_zip[['STNID', 'ZIP']], on='STNID', how='left')
        .merge(zip_county[['ZIP', 'COUNTY']], on='ZIP', how='left')
        .groupby(by='COUNTY', as_index=False)
        .mean(numeric_only=True)
        .rename(columns={'COUNTY': 'FIPS'})
    )


# Read in heating degree day (HDD) and cooling degree day (CDD) data,
# from NOAA's 30-year Climate Normals (https://www.ncdc.noaa.gov/cdo-web/datasets, FTP: https://www.ncei.noaa.gov/pub/data/normals/1981-2010/)
CDD = _read_noaa_annual_normal(
    'https://www.ncei.noaa.gov/pub/data/normals/1981-2010/products/temperature/ann-cldd-normal.txt')
HDD = _read_noaa_annual_normal(
    'https://www.ncei.noaa.gov/pub/data/normals/1981-2010/products/temperature/ann-htdd-normal.txt')

# Read in station to zip code crosswalk from NOAA
STNID_ZIP = pd.read_fwf(
    'https://www.ncei.noaa.gov/pub/data/normals/1981-2010/station-inventories/zipcodes-normals-stations.txt',
    header=None,
    dtype={1: str}
)
STNID_ZIP.columns = ['STNID', 'ZIP', 'PO_NAME']

# Read in zip code to county crosswalk from https://www.huduser.gov/apps/public/uspscrosswalk/home
ZIP_COUNTY = pd.read_excel(
    '../../../Data/other/Input/ZIP_COUNTY_032011.xlsx',
    dtype={'ZIP': str, 'COUNTY': str}
)
# Keep one county per ZIP code: the one with the largest TOT_RATIO, i.e. the
# county where the majority of the ZIP code lies
ZIP_COUNTY = (
    ZIP_COUNTY
    .sort_values(by='TOT_RATIO', ascending=False)
    .drop_duplicates(subset='ZIP', keep='first')
    .reset_index(drop=True)
)

# Group CDD and HDD data at county level
degreedays = [
    _station_to_county_mean(CDD, STNID_ZIP, ZIP_COUNTY),
    _station_to_county_mean(HDD, STNID_ZIP, ZIP_COUNTY),
]

# Combine into one table; counties lacking either value are dropped
degreedays_county = (
    degreedays[0]
    .merge(degreedays[1], on='FIPS', how='left')
    .rename(columns={'VALUE_x': 'CDD', 'VALUE_y': 'HDD'})
    .dropna()
    .reset_index(drop=True)
)

## 1.4 Calculate fuel mixes of all subsectors

### 1.4.1 Construction

In [None]:
# Construction: load the aggregated per-employee table (employment and other
# data). It is not referenced again within this cell.
cn_agg_totals_peremp_final = pd.read_csv(
    "../../../Data/industrial/Output/cn_agg_totals_peremp_final.csv",
    dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str}
).drop(columns="Unnamed: 0")

# Load the per-scope emissions table and prepare it in one chain:
# drop the saved index and the aggregate columns, shorten the fuel column
# names, tag the sector, and attach each county's NERC region.
county_cn_agg_scopes_CO2e = (
    pd.read_csv(
        "../../../Data/industrial/Temp/county_cn_agg_scopes_CO2e.csv",
        dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str}
    )
    .drop(columns="Unnamed: 0")
    .drop(columns=['lbCO2e', 'MMBTU_TOTAL', 'tonCO2e'])
    .rename(columns=tonCO2e_fuel_rename_dict)
    .assign(sector='cn')
    .merge(fips_nerc_crosswalk, how="left", on="FIPS")
    .rename(columns={"NERC Region": "NERC"})
)

# Melt dataframe to narrow format, indexed by sector, scope and fuel.
# Only the fuel columns actually present in the table are melted.
id_vars = ['FIPS', 'County', 'FIPSTATE', 'STATE', 'NERC', 'NAICS', 'sector', 'scope']
_cn_columns = county_cn_agg_scopes_CO2e.columns
value_vars = _cn_columns[_cn_columns.isin(
    ['coal', 'ng', 'elec', 'residfuel', 'lpg', 'dsl', 'crude', 'heatoil']
)].tolist()
cn_fuel_scope = county_cn_agg_scopes_CO2e.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='fuel',
    value_name='tonCO2e')

In [None]:
# Pivot fuel-scope table to have one column per fuel for emissions.
# NOTE(review): pivot_table defaults to aggfunc='mean', so rows that differ
# only by scope are averaged rather than summed -- confirm this is intended.
fuelmix_cn = cn_fuel_scope.pivot_table(
    columns='fuel',
    values='tonCO2e',
    index=['FIPS', 'NAICS', 'sector']
).reset_index()

# Every pivoted column that is not an identifier is treated as a fuel
fuels = [col for col in fuelmix_cn.columns if col not in [
    'FIPS', 'NAICS', 'sector', 'tonCO2e_eff', 'SUBRGN', 'lbCO2e_perMWh_elec']]

# Total emissions across fuels, with and without electricity
fuelmix_cn['tonCO2e_eff'] = fuelmix_cn[fuels].sum(axis=1)
fuelmix_cn['tonCO2e_eff_noelec'] = fuelmix_cn[[fuel for fuel in fuels if fuel not in ['elec']]
                                              ].sum(axis=1)

# Read in power grid carbon intensity & merge onto dataframe
fuelmix_cn = fuelmix_cn.merge(
    counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perMWh_elec']],
    on='FIPS',
    how='left'
)

# Back-calculate the MMBtu using the lbCO2e/MMBtu factors used in the
# emissions calculations
ef_dict = {'dsl': 163.45,
           'lpg': 138.63,
           'ng': 116.65,
           'residfuel': 165.55,
           'coal': 211.87
           }

# Replace each fuel's emissions column by a `y_<fuel>` energy column.
# tons * 2000 gives lb; dividing by lb/MMBtu gives MMBtu. Electricity goes
# through lb/MWh first and is then divided by 0.29307107 (MWh per MMBtu).
# A fuel without an entry in ef_dict raises KeyError here.
y_cols = []
for fuel in fuels:
    if fuel == 'elec':
        fuelmix_cn[f'y_{fuel}'] = fuelmix_cn[fuel] * 2000 / \
            fuelmix_cn.lbCO2e_perMWh_elec / 0.29307107
    else:
        fuelmix_cn[f'y_{fuel}'] = fuelmix_cn[fuel] * 2000 / ef_dict[fuel]
    fuelmix_cn = fuelmix_cn.drop(columns=fuel)
    y_cols.append(f'y_{fuel}')

# Calculate total MMBtu and share of total attributable to each fuel
fuelmix_cn['total_mmbtu'] = fuelmix_cn[y_cols].sum(axis=1)
fuelmix_cn['total_mmbtu_noelec'] = fuelmix_cn[[
    col for col in y_cols if col not in ['y_elec']]].sum(axis=1)

# Convert every y_<fuel> column to a percentage share of total MMBtu.
# The columns are selected by name (y_cols), not by position, so the result
# does not depend on how many fuels the construction data contains.
for fuelmix in y_cols:
    fuelmix_cn[fuelmix] = fuelmix_cn[fuelmix] / fuelmix_cn['total_mmbtu'] * 100

# Calculate weighted emissions factor using fuel shares, excluding electricity.
# NOTE(review): the numerator excludes electricity but the denominator is
# total_mmbtu (electricity included), not total_mmbtu_noelec -- confirm.
fuelmix_cn['EF_ovr_noelec'] = fuelmix_cn.tonCO2e_eff_noelec / fuelmix_cn.total_mmbtu

# Isolate weighted EF by subsector: one column per NAICS code, one row per
# (FIPS, grid intensity) pair; missing combinations are filled with 0
subsectors = fuelmix_cn.NAICS.unique()
fuelmix_cn_s_efs = pd.pivot_table(
    data=fuelmix_cn,
    columns='NAICS',
    values='EF_ovr_noelec',
    index=['FIPS', 'lbCO2e_perMWh_elec']
).reset_index()
fuelmix_cn_s_efs = fuelmix_cn_s_efs.fillna(0)

# Prefix each NAICS column with `EF_`
fuelmix_cn_ss = fuelmix_cn_s_efs.rename(
    columns={subsector: f'EF_{subsector}' for subsector in subsectors}
)

### 1.4.2 Manufacturing

In [None]:
# Manufacturing: load the aggregated per-employee table (employment and other
# data). It is not referenced again within this cell.
mf_agg_totals_peremp_final = pd.read_csv(
    "../../../Data/industrial/Output/mf_agg_totals_peremp_final.csv",
    dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str}
).drop(columns="Unnamed: 0")

# Load the per-scope emissions table and prepare it in one chain:
# drop the saved index and the aggregate columns, shorten the fuel column
# names, tag the sector, and attach each county's NERC region.
county_mf_agg_scopes_CO2e = (
    pd.read_csv(
        "../../../Data/industrial/Temp/county_mf_agg_scopes_CO2e.csv",
        dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str}
    )
    .drop(columns="Unnamed: 0")
    .drop(columns=['lbCO2e', 'MMBTU_TOTAL', 'tonCO2e'])
    .rename(columns=tonCO2e_fuel_rename_dict)
    .assign(sector='mf')
    .merge(fips_nerc_crosswalk, how="left", on="FIPS")
    .rename(columns={"NERC Region": "NERC"})
)

# Melt dataframe to narrow format, indexed by sector, scope and fuel.
# Only the fuel columns actually present in the table are melted.
id_vars = ['FIPS', 'County', 'FIPSTATE', 'STATE', 'NERC', 'NAICS', 'sector', 'scope']
_mf_columns = county_mf_agg_scopes_CO2e.columns
value_vars = _mf_columns[_mf_columns.isin(
    ['coal', 'ng', 'elec', 'residfuel', 'lpg', 'dsl', 'crude', 'heatoil']
)].tolist()
mf_fuel_scope = county_mf_agg_scopes_CO2e.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='fuel',
    value_name='tonCO2e')

In [None]:
# Reshape to one emissions column per fuel, keyed by county and subsector.
# NOTE(review): pivot_table defaults to aggfunc='mean', so rows that differ
# only by scope are averaged rather than summed -- confirm this is intended.
fuelmix_mf = mf_fuel_scope.pivot_table(
    index=['FIPS', 'NAICS', 'sector'],
    columns='fuel',
    values='tonCO2e',
).reset_index()

# Manufacturing uses a fixed fuel list
fuels = ['coal', 'dsl', 'elec', 'lpg', 'ng', 'residfuel']
_mf_combustion_fuels = [fuel_code for fuel_code in fuels if fuel_code != 'elec']

# Total emissions across fuels, with and without electricity
fuelmix_mf['tonCO2e_eff'] = fuelmix_mf[fuels].sum(axis=1)
fuelmix_mf['tonCO2e_eff_noelec'] = fuelmix_mf[_mf_combustion_fuels].sum(axis=1)

# Attach the county's grid subregion and grid carbon intensity
_grid_intensity = counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perMWh_elec']]
fuelmix_mf = fuelmix_mf.merge(_grid_intensity, on='FIPS', how='left')

# lbCO2e per MMBtu for each combustion fuel, used to back out energy use
ef_dict = {
    'dsl': 163.45,
    'lpg': 138.63,
    'ng': 116.65,
    'residfuel': 165.55,
    'coal': 211.87,
}

# Replace each fuel's emissions column by a `y_<fuel>` energy column.
# tons * 2000 gives lb; electricity is converted via lb/MWh and then divided
# by 0.29307107 (MWh per MMBtu); other fuels are divided by their lb/MMBtu.
y_cols = []
for fuel in fuels:
    energy_col = f'y_{fuel}'
    emitted_lb = fuelmix_mf[fuel] * 2000
    if fuel == 'elec':
        fuelmix_mf[energy_col] = emitted_lb / fuelmix_mf.lbCO2e_perMWh_elec / 0.29307107
    else:
        fuelmix_mf[energy_col] = emitted_lb / ef_dict[fuel]
    fuelmix_mf = fuelmix_mf.drop(columns=fuel)
    y_cols.append(energy_col)

# Total energy, with and without electricity
fuelmix_mf['total_mmbtu'] = fuelmix_mf[y_cols].sum(axis=1)
fuelmix_mf['total_mmbtu_noelec'] = fuelmix_mf[
    [energy_col for energy_col in y_cols if energy_col != 'y_elec']
].sum(axis=1)

# Express each fuel's energy as a percentage share of total MMBtu
for fuelmix in y_cols:
    share = fuelmix_mf[fuelmix] / fuelmix_mf['total_mmbtu']
    fuelmix_mf[fuelmix] = share * 100

# Weighted emissions factor for non-electric fuels.
# NOTE(review): the denominator is total_mmbtu (electricity included), not
# total_mmbtu_noelec -- confirm.
fuelmix_mf['EF_ovr_noelec'] = fuelmix_mf['tonCO2e_eff_noelec'] / fuelmix_mf['total_mmbtu']

# Subsector codes are collected before the zero-emission rows are removed
subsectors = fuelmix_mf.NAICS.unique()
fuelmix_mf = fuelmix_mf[fuelmix_mf.tonCO2e_eff > 0].reset_index(drop=True)

# One EF column per NAICS code; missing combinations are filled with 0
fuelmix_mf_s_efs = fuelmix_mf.pivot_table(
    index=['FIPS', 'lbCO2e_perMWh_elec'],
    columns='NAICS',
    values='EF_ovr_noelec',
).reset_index().fillna(0)

# Prefix each NAICS column with `EF_`
fuelmix_mf_ss = fuelmix_mf_s_efs.rename(
    columns={naics: f'EF_{naics}' for naics in subsectors}
)

### 1.4.3 Mining

In [None]:
# Mining: load the aggregated per-employee table (employment and other data).
# It is not referenced again within this cell.
mn_agg_totals_peremp_final = pd.read_csv(
    "../../../Data/industrial/Output/mn_agg_totals_peremp_final.csv",
    dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str}
).drop(columns="Unnamed: 0")

# Load the per-scope emissions table, dropping the saved index and the
# aggregate columns
county_mn_agg_scopes_CO2e = (
    pd.read_csv(
        "../../../Data/industrial/Temp/county_mn_agg_scopes_CO2e.csv",
        dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str}
    )
    .drop(columns="Unnamed: 0")
    .drop(columns=['lbCO2e', 'MMBTU_TOTAL', 'tonCO2e'])
)

# Retrofit fuel categories: when present, coke & breeze is added to coal and
# 'other' is added to diesel. The source columns are left in place.
# NOTE(review): `+` propagates NaN, so a missing value in the added column
# blanks the target -- confirm the inputs contain no NaN.
for _extra_col, _target_col in (
    ("tonCO2e_Coke_and_breeze", 'tonCO2e_Coal'),
    ("tonCO2e_Other", 'tonCO2e_Diesel'),
):
    if _extra_col in county_mn_agg_scopes_CO2e.columns:
        county_mn_agg_scopes_CO2e[_target_col] = (
            county_mn_agg_scopes_CO2e[_target_col] + county_mn_agg_scopes_CO2e[_extra_col]
        )

# Shorten the fuel column names, tag the sector, attach NERC regions
county_mn_agg_scopes_CO2e = (
    county_mn_agg_scopes_CO2e
    .rename(columns=tonCO2e_fuel_rename_dict)
    .assign(sector='mn')
    .merge(fips_nerc_crosswalk, how="left", on="FIPS")
    .rename(columns={"NERC Region": "NERC"})
)

# Melt dataframe to narrow format, indexed by sector, scope and fuel.
# Only the fuel columns actually present in the table are melted.
id_vars = ['FIPS', 'County', 'FIPSTATE', 'STATE', 'NERC', 'NAICS', 'sector', 'scope']
_mn_columns = county_mn_agg_scopes_CO2e.columns
value_vars = _mn_columns[_mn_columns.isin(
    ['coal', 'ng', 'elec', 'residfuel', 'lpg', 'dsl', 'crude', 'heatoil']
)].tolist()
mn_fuel_scope = county_mn_agg_scopes_CO2e.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='fuel',
    value_name='tonCO2e')

# Reshape to one emissions column per fuel, keyed by county and subsector.
# NOTE(review): pivot_table defaults to aggfunc='mean', so rows that differ
# only by scope are averaged rather than summed -- confirm this is intended.
fuelmix_mn = mn_fuel_scope.pivot_table(
    index=['FIPS', 'NAICS', 'sector'],
    columns='fuel',
    values='tonCO2e',
).reset_index()

# Every pivoted column that is not an identifier is treated as a fuel
_mn_pivot_columns = fuelmix_mn.columns
fuels = _mn_pivot_columns[~_mn_pivot_columns.isin(
    ['FIPS', 'NAICS', 'sector', 'tonCO2e_eff', 'SUBRGN', 'lbCO2e_perMWh_elec']
)].tolist()

# Total emissions with and without electricity; rows with no emissions are
# removed before the grid data is merged
fuelmix_mn['tonCO2e_eff'] = fuelmix_mn[fuels].sum(axis=1)
fuelmix_mn['tonCO2e_eff_noelec'] = fuelmix_mn[
    [fuel_code for fuel_code in fuels if fuel_code != 'elec']
].sum(axis=1)
fuelmix_mn = fuelmix_mn[fuelmix_mn.tonCO2e_eff > 0].reset_index(drop=True)

# Attach the county's grid subregion and grid carbon intensity
fuelmix_mn = fuelmix_mn.merge(
    counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perMWh_elec']],
    on='FIPS',
    how='left'
)

# lbCO2e per MMBtu for each combustion fuel, used to back out energy use
ef_dict = {
    'dsl': 163.45,
    'lpg': 138.63,
    'ng': 116.65,
    'residfuel': 165.55,
    'coal': 211.87,
}

# Replace each fuel's emissions column by a `y_<fuel>` energy column.
# tons * 2000 gives lb; electricity is converted via lb/MWh and then divided
# by 0.29307107 (MWh per MMBtu); other fuels are divided by their lb/MMBtu.
y_cols = []
for fuel in fuels:
    energy_col = f'y_{fuel}'
    emitted_lb = fuelmix_mn[fuel] * 2000
    if fuel == 'elec':
        fuelmix_mn[energy_col] = emitted_lb / fuelmix_mn.lbCO2e_perMWh_elec / 0.29307107
    else:
        fuelmix_mn[energy_col] = emitted_lb / ef_dict[fuel]
    fuelmix_mn = fuelmix_mn.drop(columns=fuel)
    y_cols.append(energy_col)

# Total energy, with and without electricity
fuelmix_mn['total_mmbtu'] = fuelmix_mn[y_cols].sum(axis=1)
fuelmix_mn['total_mmbtu_noelec'] = fuelmix_mn[
    [energy_col for energy_col in y_cols if energy_col != 'y_elec']
].sum(axis=1)

# Express each fuel's energy as a percentage share of total MMBtu
for fuelmix in y_cols:
    share = fuelmix_mn[fuelmix] / fuelmix_mn['total_mmbtu']
    fuelmix_mn[fuelmix] = share * 100

# Weighted emissions factor for non-electric fuels.
# NOTE(review): the denominator is total_mmbtu (electricity included), not
# total_mmbtu_noelec -- confirm.
fuelmix_mn['EF_ovr_noelec'] = fuelmix_mn['tonCO2e_eff_noelec'] / fuelmix_mn['total_mmbtu']

# One EF column per NAICS code; missing combinations are filled with 0
subsectors = fuelmix_mn.NAICS.unique()
fuelmix_mn_s_efs = fuelmix_mn.pivot_table(
    index=['FIPS', 'lbCO2e_perMWh_elec'],
    columns='NAICS',
    values='EF_ovr_noelec',
).reset_index().fillna(0)

# Prefix each NAICS column with `EF_`, then discard subsectors 2111 and 2121
# (presumably the fossil-fuel extraction codes handled in their own sections
# -- TODO confirm)
fuelmix_mn_ss = fuelmix_mn_s_efs.rename(
    columns={naics: f'EF_{naics}' for naics in subsectors}
).drop(columns=['EF_2111', 'EF_2121'])

### 1.4.4 Agriculture

In [None]:
# Agriculture: load the aggregated per-employee table. It is not referenced
# again within this cell.
ag_agg_totals_peremp_final = pd.read_csv(
    "../../../Data/industrial/Output/ag_agg_totals_peremp_final.csv",
    dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str}
).drop(columns="Unnamed: 0")

# Load the per-scope emissions table and prepare it in one chain:
# drop the saved index and the aggregate columns, shorten the fuel column
# names, tag the sector, and attach each county's NERC region.
county_ag_agg_scopes_CO2e = (
    pd.read_csv(
        "../../../Data/industrial/Temp/county_ag_agg_scopes_CO2e.csv",
        dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str, "NAICS": str}
    )
    .drop(columns="Unnamed: 0")
    .drop(columns=['lbCO2e', 'MMBTU_TOTAL', 'tonCO2e'])
    .rename(columns=tonCO2e_fuel_rename_dict)
    .assign(sector='ag')
    .merge(fips_nerc_crosswalk, how="left", on="FIPS")
    .rename(columns={"NERC Region": "NERC"})
)

# Melt dataframe to narrow format, indexed by sector, scope and fuel.
# Only the fuel columns actually present in the table are melted.
id_vars = ['FIPS', 'County', 'FIPSTATE', 'STATE', 'NERC', 'NAICS', 'sector', 'scope']
_ag_columns = county_ag_agg_scopes_CO2e.columns
value_vars = _ag_columns[_ag_columns.isin(
    ['coal', 'ng', 'elec', 'residfuel', 'lpg', 'dsl', 'crude', 'heatoil']
)].tolist()
ag_fuel_scope = county_ag_agg_scopes_CO2e.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='fuel',
    value_name='tonCO2e')

In [None]:
# Reshape to one emissions column per fuel, keyed by county and subsector.
# NOTE(review): pivot_table defaults to aggfunc='mean', so rows that differ
# only by scope are averaged rather than summed -- confirm this is intended.
fuelmix_ag = ag_fuel_scope.pivot_table(
    index=['FIPS', 'NAICS', 'sector'],
    columns='fuel',
    values='tonCO2e',
).reset_index()

# Every pivoted column that is not an identifier is treated as a fuel
_ag_pivot_columns = fuelmix_ag.columns
fuels = _ag_pivot_columns[~_ag_pivot_columns.isin(
    ['FIPS', 'NAICS', 'sector', 'tonCO2e_eff', 'SUBRGN', 'lbCO2e_perMWh_elec']
)].tolist()

# Total emissions with and without electricity; rows with no emissions are
# removed before the grid data is merged
fuelmix_ag['tonCO2e_eff'] = fuelmix_ag[fuels].sum(axis=1)
fuelmix_ag['tonCO2e_eff_noelec'] = fuelmix_ag[
    [fuel_code for fuel_code in fuels if fuel_code != 'elec']
].sum(axis=1)
fuelmix_ag = fuelmix_ag[fuelmix_ag.tonCO2e_eff > 0].reset_index(drop=True)

# Attach the county's grid subregion and grid carbon intensity
fuelmix_ag = fuelmix_ag.merge(
    counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perMWh_elec']],
    on='FIPS',
    how='left'
)

# lbCO2e per MMBtu for each combustion fuel, used to back out energy use
ef_dict = {
    'dsl': 163.45,
    'lpg': 138.63,
    'ng': 116.65,
    'residfuel': 165.55,
    'coal': 211.87,
}

# Replace each fuel's emissions column by a `y_<fuel>` energy column.
# tons * 2000 gives lb; electricity is converted via lb/MWh and then divided
# by 0.29307107 (MWh per MMBtu); other fuels are divided by their lb/MMBtu.
y_cols = []
for fuel in fuels:
    energy_col = f'y_{fuel}'
    emitted_lb = fuelmix_ag[fuel] * 2000
    if fuel == 'elec':
        fuelmix_ag[energy_col] = emitted_lb / fuelmix_ag.lbCO2e_perMWh_elec / 0.29307107
    else:
        fuelmix_ag[energy_col] = emitted_lb / ef_dict[fuel]
    fuelmix_ag = fuelmix_ag.drop(columns=fuel)
    y_cols.append(energy_col)

# Total energy, with and without electricity
fuelmix_ag['total_mmbtu'] = fuelmix_ag[y_cols].sum(axis=1)
fuelmix_ag['total_mmbtu_noelec'] = fuelmix_ag[
    [energy_col for energy_col in y_cols if energy_col != 'y_elec']
].sum(axis=1)

# Express each fuel's energy as a percentage share of total MMBtu
for fuelmix in y_cols:
    share = fuelmix_ag[fuelmix] / fuelmix_ag['total_mmbtu']
    fuelmix_ag[fuelmix] = share * 100

# Weighted emissions factor for non-electric fuels.
# NOTE(review): the denominator is total_mmbtu (electricity included), not
# total_mmbtu_noelec -- confirm.
fuelmix_ag['EF_ovr_noelec'] = fuelmix_ag['tonCO2e_eff_noelec'] / fuelmix_ag['total_mmbtu']

# One EF column per NAICS code; missing combinations are filled with 0
subsectors = fuelmix_ag.NAICS.unique()
fuelmix_ag_s_efs = fuelmix_ag.pivot_table(
    index=['FIPS', 'lbCO2e_perMWh_elec'],
    columns='NAICS',
    values='EF_ovr_noelec',
).reset_index().fillna(0)

# Prefix each NAICS column with `EF_`, then discard subsector 1151
fuelmix_ag_ss = fuelmix_ag_s_efs.rename(
    columns={naics: f'EF_{naics}' for naics in subsectors}
).drop(columns='EF_1151')

### 1.4.5 Coal mining

In [None]:
# Fossil-fuel sector emissions by scope (shared with the oil & gas cell)
ff_scopes = pd.read_csv(
    "../../overallFootprintCalc/Temp/ff_scopes.csv",
    index_col=0,
    dtype={'FIPS': str, 'FIPSTATE': str}
)

# Coal-mining rows for scopes other than scope 3, without the crude column
_is_coal_scope12 = (ff_scopes.scope != 'scope3') & (ff_scopes.sector == 'coal')
fuelmix_coal = ff_scopes[_is_coal_scope12].drop(columns='tonCO2e_crude')

# Non-electric emissions: the row total on scope 1 rows, 0 on all other rows
fuelmix_coal['tonCO2e_noelec'] = fuelmix_coal['tonCO2e'].where(
    fuelmix_coal['scope'] == 'scope1', 0
)

# Collapse to one row per county by summing the remaining columns.
# NOTE(review): no numeric_only filter is passed, so any text columns left in
# the frame are summed as well -- confirm they are not used downstream.
fuelmix_coal = fuelmix_coal.groupby(
    by=['FIPS', 'FIPSTATE', 'County', 'State', 'Emp'],
    as_index=False,
    dropna=False,
).sum()

fuels = ['coal', 'ng', 'elec', 'residfuel', 'lpg', 'dsl']

# Attach the county's grid subregion and grid carbon intensity
fuelmix_coal = fuelmix_coal.merge(
    counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perMWh_elec']],
    on='FIPS',
    how='left'
)

# lbCO2e per MMBtu for each combustion fuel, used to back out energy use
ef_dict = {
    'dsl': 163.45,
    'lpg': 138.63,
    'ng': 116.65,
    'residfuel': 165.55,
    'coal': 211.87,
}

# Replace each `tonCO2e_<fuel>` column by a `y_<fuel>` energy column.
# tons * 2000 gives lb; electricity is converted via lb/MWh and then divided
# by 0.29307107 (MWh per MMBtu); other fuels are divided by their lb/MMBtu.
y_cols = []
for fuel in fuels:
    emissions_col, energy_col = f'tonCO2e_{fuel}', f'y_{fuel}'
    emitted_lb = fuelmix_coal[emissions_col] * 2000
    if fuel == 'elec':
        fuelmix_coal[energy_col] = emitted_lb / fuelmix_coal.lbCO2e_perMWh_elec / 0.29307107
    else:
        fuelmix_coal[energy_col] = emitted_lb / ef_dict[fuel]
    fuelmix_coal = fuelmix_coal.drop(columns=emissions_col)
    y_cols.append(energy_col)

# Total energy, with and without electricity
fuelmix_coal['total_mmbtu'] = fuelmix_coal[y_cols].sum(axis=1)
fuelmix_coal['total_mmbtu_noelec'] = fuelmix_coal[
    [energy_col for energy_col in y_cols if energy_col != 'y_elec']
].sum(axis=1)

# Express each fuel's energy as a percentage share of total MMBtu
for fuelmix in y_cols:
    share = fuelmix_coal[fuelmix] / fuelmix_coal['total_mmbtu']
    fuelmix_coal[fuelmix] = share * 100

# Weighted emissions factor for non-electric emissions.
# NOTE(review): the denominator is total_mmbtu (electricity included), not
# total_mmbtu_noelec -- confirm.
fuelmix_coal['EF_ovr_noelec'] = fuelmix_coal['tonCO2e_noelec'] / fuelmix_coal['total_mmbtu']

### 1.4.6 Oil & gas

In [None]:
# Oil & gas rows for scopes other than scope 3, without the crude column
_is_og_scope12 = (ff_scopes.scope != 'scope3') & (ff_scopes.sector == 'og')
fuelmix_og = ff_scopes[_is_og_scope12].drop(columns='tonCO2e_crude')

# Non-electric emissions: the row total on scope 1 rows, 0 on all other rows
fuelmix_og['tonCO2e_noelec'] = fuelmix_og['tonCO2e'].where(
    fuelmix_og['scope'] == 'scope1', 0
)

# Collapse to one row per county by summing the remaining columns.
# NOTE(review): no numeric_only filter is passed, so any text columns left in
# the frame are summed as well -- confirm they are not used downstream.
fuelmix_og = fuelmix_og.groupby(
    by=['FIPS', 'FIPSTATE', 'County', 'State', 'Emp'],
    as_index=False,
    dropna=False,
).sum()

fuels = ['coal', 'ng', 'elec', 'residfuel', 'lpg', 'dsl']

# Attach the county's grid subregion and grid carbon intensity
fuelmix_og = fuelmix_og.merge(
    counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perMWh_elec']],
    on='FIPS',
    how='left'
)

# lbCO2e per MMBtu for each combustion fuel, used to back out energy use
ef_dict = {
    'dsl': 163.45,
    'lpg': 138.63,
    'ng': 116.65,
    'residfuel': 165.55,
    'coal': 211.87,
}

# Replace each `tonCO2e_<fuel>` column by a `y_<fuel>` energy column.
# tons * 2000 gives lb; electricity is converted via lb/MWh and then divided
# by 0.29307107 (MWh per MMBtu); other fuels are divided by their lb/MMBtu.
y_cols = []
for fuel in fuels:
    emissions_col, energy_col = f'tonCO2e_{fuel}', f'y_{fuel}'
    emitted_lb = fuelmix_og[emissions_col] * 2000
    if fuel == 'elec':
        fuelmix_og[energy_col] = emitted_lb / fuelmix_og.lbCO2e_perMWh_elec / 0.29307107
    else:
        fuelmix_og[energy_col] = emitted_lb / ef_dict[fuel]
    fuelmix_og = fuelmix_og.drop(columns=emissions_col)
    y_cols.append(energy_col)

# Total energy, with and without electricity
fuelmix_og['total_mmbtu'] = fuelmix_og[y_cols].sum(axis=1)
fuelmix_og['total_mmbtu_noelec'] = fuelmix_og[
    [energy_col for energy_col in y_cols if energy_col != 'y_elec']
].sum(axis=1)

# Express each fuel's energy as a percentage share of total MMBtu
for fuelmix in y_cols:
    share = fuelmix_og[fuelmix] / fuelmix_og['total_mmbtu']
    fuelmix_og[fuelmix] = share * 100

# Weighted emissions factor for non-electric emissions.
# NOTE(review): the denominator is total_mmbtu (electricity included), not
# total_mmbtu_noelec -- confirm.
fuelmix_og['EF_ovr_noelec'] = fuelmix_og['tonCO2e_noelec'] / fuelmix_og['total_mmbtu']

### 1.4.7 Fossil fuel power generation

In [None]:
# Read in power plant data
pwr_raw = pd.read_csv(
    "../../../Data/pwr/Output/pwr_totalCO2_final.csv",
    dtype={"FIPS": str, "FIPSTATE": str, "STATEFIPS": str},
    index_col=0
)

# Reformat power plant data (only scope 1, as assumed no scope 2).
# An explicit copy is taken so the column assignments below write to an
# independent frame rather than to a slice of `pwr_raw`.
fuelmix_pwr = pwr_raw[["FIPS", "County", "State", "Tons of CO2 Emissions", "tonCO2e_percapita",
                       "emp_new", "tonCO2e_PET", "tonCO2e_GAS", "tonCO2e_COAL", "tonCO2e_other"]].copy()

# Treat 'other' emissions as natural gas
fuelmix_pwr["tonCO2e_ng"] = fuelmix_pwr["tonCO2e_other"] + fuelmix_pwr["tonCO2e_GAS"]

# State FIPS = first two characters of the county FIPS code
fuelmix_pwr['FIPSTATE'] = fuelmix_pwr['FIPS'].str[:2]

# Align column names with the other sectors.
# Assume plants that use petroleum use residual fuel oil.
fuelmix_pwr = fuelmix_pwr.rename(columns={"Tons of CO2 Emissions": "tonCO2e",
                                          "emp_new": "Emp",
                                          "tonCO2e_PET": "tonCO2e_residfuel",
                                          "tonCO2e_COAL": "tonCO2e_coal",
                                          }
                                 )

# Clean up columns (both are now folded into tonCO2e_ng), add sector column
fuelmix_pwr = fuelmix_pwr.drop(columns=["tonCO2e_other", "tonCO2e_GAS"])
fuelmix_pwr["sector"] = "pwr"

# Read in power grid carbon intensity & merge onto dataframe
fuelmix_pwr = fuelmix_pwr.merge(
    counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perMWh_elec']],
    on='FIPS',
    how='left'
)

# Power generation has no electricity input in this breakdown
fuels = ['coal', 'ng', 'residfuel']

# Back-calculate the MMBtu using the lbCO2e/MMBtu factors used in the
# emissions calculations
ef_dict = {'dsl': 163.45,
           'lpg': 138.63,
           'ng': 116.65,
           'residfuel': 165.55,
           'coal': 211.87
           }

# Replace each `tonCO2e_<fuel>` column by a `y_<fuel>` energy column:
# tons * 2000 gives lb, divided by lb/MMBtu gives MMBtu
y_cols = []
for fuel in fuels:
    fuelmix_pwr[f'y_{fuel}'] = fuelmix_pwr[f'tonCO2e_{fuel}'] * 2000 / ef_dict[fuel]
    fuelmix_pwr = fuelmix_pwr.drop(columns=f'tonCO2e_{fuel}')
    y_cols.append(f'y_{fuel}')

# Calculate total MMBtu and share of total attributable to each fuel
fuelmix_pwr['total_mmbtu'] = fuelmix_pwr[y_cols].sum(axis=1)

for fuelmix in y_cols:
    fuelmix_pwr[fuelmix] = fuelmix_pwr[fuelmix] / fuelmix_pwr['total_mmbtu'] * 100

# Overall weighted emissions factor: total emissions per total MMBtu
fuelmix_pwr['EF_ovr'] = fuelmix_pwr.tonCO2e / fuelmix_pwr.total_mmbtu

### 1.4.8 Commercial

In [None]:
# Load commercial-sector emissions from the ComStock building-type file
fuelmix_comm_ss = pd.read_csv(
    '../../../Data/comm/Temp/comstock_buildtype.csv',
    index_col=0,
    dtype={'FIPS': str, 'STATEFIPS': str},
)

# Keep only the identifier columns and the per-fuel emissions columns
fuelmix_comm_ss = fuelmix_comm_ss[[
    "State", "FIPS", "STATEFIPS", "County", 'ind_name',
    "lbCO2e_elec_total_w", "tonCO2e_total_w",
    "lbCO2e_ng_total_w", "lbCO2e_other_total_w", "lbCO2e_dist_heat_w",
    "lbCO2e_dist_cool_w",
]]

# Convert lb -> ton and fold the five source columns into three fuels, in keeping
# with the other sectors:
#   'other'          -> heating oil
#   district heating -> natural gas
#   district cooling -> electricity
fuelmix_comm_ss = fuelmix_comm_ss.assign(
    tonCO2e_heatoil=lambda df: df["lbCO2e_other_total_w"] / 2000,
    tonCO2e_ng=lambda df: df["lbCO2e_ng_total_w"] / 2000 + df["lbCO2e_dist_heat_w"] / 2000,
    tonCO2e_elec=lambda df: df["lbCO2e_elec_total_w"] / 2000 + df["lbCO2e_dist_cool_w"] / 2000,
)

# The raw columns are no longer needed once the per-fuel tons exist
fuelmix_comm_ss = fuelmix_comm_ss.rename(columns={"STATEFIPS": "FIPSTATE"}).drop(columns=[
    "tonCO2e_total_w",
    "lbCO2e_elec_total_w",
    "lbCO2e_ng_total_w",
    "lbCO2e_other_total_w",
    "lbCO2e_dist_heat_w",
    "lbCO2e_dist_cool_w",
])
fuels = ['heatoil', 'ng', 'elec']

# Attach the grid carbon intensity of each county
fuelmix_comm_ss = pd.merge(
    fuelmix_comm_ss,
    counties_elec_intensity[['FIPS', 'SUBRGN', 'lbCO2e_perMWh_elec']],
    how='left',
    on='FIPS',
)
# Non-electric emissions = natural gas + heating oil
fuelmix_comm_ss['tonCO2e_eff_noelec'] = fuelmix_comm_ss[
    ['tonCO2e_ng', 'tonCO2e_heatoil']
].sum(axis=1)

# lbCO2e/MMBtu factors used to back-calculate MMBtu from emissions
ef_dict = {
    'dsl': 163.45,
    'lpg': 138.63,
    'ng': 116.65,
    'residfuel': 165.55,
    'coal': 211.87,
    'heatoil': 163.45,
}

# Fuels: lb CO2e / (lb CO2e per MMBtu).
# Electricity: lb CO2e / (lb CO2e per MWh) gives MWh, then / 0.29307107 MWh per MMBtu.
y_cols = []
for fuel in fuels:
    lb_co2e = fuelmix_comm_ss[f'tonCO2e_{fuel}'] * 2000
    if fuel != 'elec':
        fuelmix_comm_ss[f'y_{fuel}'] = lb_co2e / ef_dict[fuel]
    else:
        fuelmix_comm_ss[f'y_{fuel}'] = lb_co2e / fuelmix_comm_ss.lbCO2e_perMWh_elec / 0.29307107
    fuelmix_comm_ss = fuelmix_comm_ss.drop(columns=f'tonCO2e_{fuel}')
    y_cols.append(f'y_{fuel}')

# Total MMBtu, with and without electricity
fuelmix_comm_ss['total_mmbtu'] = fuelmix_comm_ss[y_cols].sum(axis=1)
fuelmix_comm_ss['total_mmbtu_noelec'] = fuelmix_comm_ss[
    [col for col in y_cols if col != 'y_elec']
].sum(axis=1)

# Replace each y_* column by its percentage share of total MMBtu
for fuelmix in y_cols:
    fuelmix_comm_ss[fuelmix] = fuelmix_comm_ss[fuelmix] / fuelmix_comm_ss['total_mmbtu'] * 100

# Weighted emissions factor using fuel shares, excluding electricity
fuelmix_comm_ss['EF_ovr_noelec'] = fuelmix_comm_ss.tonCO2e_eff_noelec / fuelmix_comm_ss.total_mmbtu

# One EF column per building type (ind_name), one row per county; combinations
# that do not occur are filled with 0
subsectors = fuelmix_comm_ss.ind_name.unique()
fuelmix_comm_s_efs = fuelmix_comm_ss.pivot_table(
    index=['FIPS', 'lbCO2e_perMWh_elec'],
    columns='ind_name',
    values='EF_ovr_noelec',
).reset_index()
fuelmix_comm_s_efs = fuelmix_comm_s_efs.fillna(0)

# Prefix the building-type columns with 'EF_'
fuelmix_comm_ss = fuelmix_comm_s_efs.rename(
    columns={subsector: f'EF_{subsector}' for subsector in subsectors}
)

## 1.5 Merge data together

In [None]:
# Identify relevant ACS variables for regressions from totalECF_demo
acs_regression_cols = ['FIPS',
                       'INC_IND_TOT',
                       'RACE_PERCENT_MINORITY',
                       'ED_PERCENT_TERTIARY',
                       'UNEMP_RATE',
                       'POV_RATE',
                       'county_percent_D',
                       'county_percent_R',
                       'state_preferred_party',
                       'pop_per_sqmi',
                       ]

# Isolate relevant columns and pivot datatable so that sectoral emp is in columns
from functools import reduce
X_s = ECF_sector[['FIPS', 'sector', 'Emp', 'tonCO2e_eff_peremp_avg']]
X_s = X_s.drop_duplicates(subset=['FIPS', 'sector']).reset_index()
# Tag values that are missing in the source as 'NA' so that, after the pivot, they can
# be told apart from FIPS/sector combinations that simply do not occur (filled with 0)
X_s = X_s.fillna('NA')
X_s = X_s.pivot(
    columns='sector',
    values='Emp',
    index='FIPS'
).reset_index()
X_s = X_s.fillna(0)
X_s = X_s.replace('NA', np.nan)

# Rename every sector column to x_<sector>. The sector list is taken from the pivot
# itself rather than from a fixed positional slice, so it stays correct if the number
# of sectors changes.
sector_cols = [col for col in X_s.columns if col != 'FIPS']
new_emp_colnames = {col: 'x_' + col for col in sector_cols}
X_s = X_s.rename(columns=new_emp_colnames)

# Merge total emp and total ECF onto dataframe
X_s = pd.merge(X_s, ECF_total[['FIPS', 'Emp',
                               'ECF']], how='left', on='FIPS')

# Calculate x_s as an absolute percentage of total county employment
for sector in sector_cols:
    X_s['x_' + sector] = X_s['x_' + sector] / X_s['Emp'] * 100

# Merge demographic vars
X_s_demo = X_s.merge(
    totalECF_demo[acs_regression_cols],
    how='left',
    on='FIPS'
)
# Dummy: 1 if the state's preferred party is 'REPUBLICAN', else 0.
# NOTE(review): a missing state_preferred_party also becomes 0 here and therefore
# survives the dropna() below -- confirm that is intended.
X_s_demo['state_preferred_party'] = (
    X_s_demo['state_preferred_party'] == 'REPUBLICAN').astype(int)
X_s_demo['popdens_inc'] = X_s_demo.pop_per_sqmi * \
    X_s_demo.INC_IND_TOT  # interaction term
# Keep only counties with complete data
X_s_demo = X_s_demo.dropna()

# Merge carbon intensity of grid
X_s_demo_grid = X_s_demo.merge(
    counties_elec_intensity[['FIPS', 'lbCO2e_perMWh_elec']],
    on='FIPS',
    how='left'
)

# Add share-grid interactions: one <x_sector>_grid column per employment-share column.
# The x_* columns are selected by name rather than by position in X_s, so the loop
# does not depend on how many sectors there are or where they sit in the frame.
X_s_demo_grid_int = X_s_demo_grid.copy()
for col in [name for name in X_s.columns if name.startswith('x_')]:
    X_s_demo_grid_int[col + '_grid'] = X_s_demo_grid_int[col] * \
        X_s_demo_grid_int.lbCO2e_perMWh_elec

# Merge heating and cooling days
X_s_demo_grid_weather = X_s_demo_grid_int.merge(
    degreedays_county,
    how='left',
    on='FIPS'
)

# Add interaction between heating/cooling days and carbon intensity of grid
X_s_demo_grid_weather_int = X_s_demo_grid_weather.copy()
X_s_demo_grid_weather_int['HDD_grid'] = X_s_demo_grid_weather_int.HDD * X_s_demo_grid_weather_int.lbCO2e_perMWh_elec
X_s_demo_grid_weather_int['CDD_grid'] = X_s_demo_grid_weather_int.CDD * X_s_demo_grid_weather_int.lbCO2e_perMWh_elec

# Collect the emission-factor table of every sector, keyed by sector code.
# Sectors with subsectors contribute their whole table.
fuelmix_dfs = {
    'cn': fuelmix_cn_ss,
    'ag': fuelmix_ag_ss,
    'mn_rest': fuelmix_mn_ss,
    'mf': fuelmix_mf_ss,
    'comm': fuelmix_comm_ss,
}
# Sectors with a single overall factor contribute one column, suffixed with the
# sector so the names stay unique after merging.
fuelmix_dfs['og'] = fuelmix_og[['FIPS', 'lbCO2e_perMWh_elec', 'EF_ovr_noelec']].rename(
    columns={'EF_ovr_noelec': 'EF_ovr_noelec_og'})
fuelmix_dfs['coal'] = fuelmix_coal[['FIPS', 'lbCO2e_perMWh_elec', 'EF_ovr_noelec']].rename(
    columns={'EF_ovr_noelec': 'EF_ovr_noelec_coal'})
fuelmix_dfs['pwr'] = fuelmix_pwr[['FIPS', 'lbCO2e_perMWh_elec', 'EF_ovr']].rename(
    columns={'EF_ovr': 'EF_ovr_pwr'})

# Grid intensity is already on the covariate frame; remove it from every table so the
# merges below key on FIPS only
for sector in list(fuelmix_dfs):
    fuelmix_dfs[sector] = fuelmix_dfs[sector].drop(columns=['lbCO2e_perMWh_elec'])

# Outer-join all tables on FIPS; a county missing from a sector's table gets 0
fuelmixes = reduce(
    lambda merged, nxt: merged.merge(nxt, how='outer', on='FIPS'),
    fuelmix_dfs.values(),
)
fuelmixes = fuelmixes.fillna(0)

# Attach the emission factors to the covariate frame
X_s_demo_grid_weather_fuelmix = pd.merge(
    X_s_demo_grid_weather_int,
    fuelmixes,
    how='left',
    on='FIPS',
)

# Add interactions between EF_ss and x_ss
# Crosswalk: employment-share column of each overall sector -> the EF_* columns
# that belong to that sector
sector_subsector_crosswalk = {
    'x_cn': [f'EF_{code}' for code in (236, 237, 238)],
    'x_ag': [f'EF_{code}' for code in (
        1111, 1112, 1113, 1114, 1119, 1121, 1122, 1123, 1124, 1125)],
    'x_mn_rest': [f'EF_{code}' for code in (2122, 2123, 2131)],
    'x_mf': [f'EF_{code}' for code in (
        311, 312, 313, 314, 315, 316,
        321, 322, 323, 324, 325, 326, 327,
        331, 332, 333, 334, 335, 336, 337, 339)],
    'x_comm': [f'EF_{building}' for building in (
        'accommodation', 'hospital', 'office', 'outpatient',
        'restaurant', 'retail', 'school', 'warehouse_storage')],
    'x_og': ['EF_ovr_noelec_og'],
    'x_coal': ['EF_ovr_noelec_coal'],
    'x_pwr': ['EF_ovr_pwr'],
}

# Work on a copy and add one product column <x_sector>_<EF column> for every
# (employment share, emission factor) pair listed in the crosswalk
covars_all = X_s_demo_grid_weather_fuelmix.copy()
for x_s, EF_cols in sector_subsector_crosswalk.items():
    for EF_ss in EF_cols:
        covars_all[f'{x_s}_{EF_ss}'] = covars_all[x_s] * covars_all[EF_ss]

# Keep complete rows only and renumber the index
covars_all = covars_all.dropna()
covars_all = covars_all.reset_index(drop=True)
covars_all

# 2 Regressing each set of independent variables separately

In [None]:
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer


# Fit one OLS model of log(ECF) per group of covariates, all on covars_all.
model_empshare = smf.ols(
    formula='np.log(ECF) ~ np.log1p(x_ag) + np.log1p(x_cn) + np.log1p(x_coal) + np.log1p(x_comm) + np.log1p(x_mf) + np.log1p(x_mn_rest) + np.log1p(x_og) + np.log1p(x_pwr)',
    data=covars_all
).fit()
model_demo = smf.ols(
    formula='np.log(ECF) ~ INC_IND_TOT + np.log1p(RACE_PERCENT_MINORITY) + ED_PERCENT_TERTIARY + UNEMP_RATE + POV_RATE + np.log1p(pop_per_sqmi) + np.log1p(popdens_inc)',
    data=covars_all
).fit()
model_political = smf.ols(
    formula='np.log(ECF) ~ county_percent_R + state_preferred_party',
    data=covars_all
).fit()
model_efgrid = smf.ols(
    formula='np.log(ECF) ~ lbCO2e_perMWh_elec',
    data=covars_all
).fit()
model_degreedays = smf.ols(
    formula='np.log(ECF) ~ HDD + CDD',
    data=covars_all
).fit()
# The fuel-mix regressors are exactly the EF_* columns of sector_subsector_crosswalk
# (same terms, same order as the previously hand-typed formula), so build the formula
# from the crosswalk instead of keeping a second copy of the column list.
fuelmix_terms = [EF_col
                 for EF_cols in sector_subsector_crosswalk.values()
                 for EF_col in EF_cols]
model_fuelmix = smf.ols(
    formula='np.log(ECF) ~ ' + ' + '.join(fuelmix_terms),
    data=covars_all
).fit()

# Label -> fitted model, in reporting order
separate_models = {
    'empshare': model_empshare,
    'demo': model_demo,
    'political': model_political,
    'efgrid': model_efgrid,
    'degreedays': model_degreedays,
    'fuelmix': model_fuelmix,
}

# R-squared and residual variance of every model
print('R2 & variance values')
for label, fitted in separate_models.items():
    print(f'{label}:', fitted.rsquared, ', ', np.var(fitted.resid))

# Regression table with one column per model. model_political was fitted and reported
# above but missing from the table; it is included so the table matches the printout.
regtable = Stargazer(list(separate_models.values()))
regtable

# 3 Regressing independent variables together, adding variable groups one by one

In [None]:
# Nested model formulas m1..m7 (each one extends the previous) and their fitted results
formulas = {}
models = {}

# m1: sectoral employment shares
emp_share_cols = ['x_ag', 'x_cn', 'x_coal', 'x_comm', 'x_mf', 'x_mn_rest', 'x_og', 'x_pwr']
formulas['m1'] = 'np.log(ECF) ~ ' + ' + '.join(f'np.log1p({x_s})' for x_s in emp_share_cols)

# m2: + demographics
formulas['m2'] = formulas['m1'] + \
    ' + INC_IND_TOT + np.log1p(RACE_PERCENT_MINORITY) + ED_PERCENT_TERTIARY + UNEMP_RATE + POV_RATE + np.log1p(pop_per_sqmi) + np.log1p(popdens_inc)'

# m3: + political variables
formulas['m3'] = formulas['m2'] + ' + county_percent_R + state_preferred_party '

# m4: + grid carbon intensity and its interaction with every employment share
formulas['m4'] = formulas['m3'] + ' + lbCO2e_perMWh_elec' + ''.join(
    f' + np.log1p({x_s}_grid)' for x_s in emp_share_cols)

# m5: + degree days; in the formula language a*b expands to a + b + a:b
formulas['m5'] = formulas['m4'] + ' + HDD*lbCO2e_perMWh_elec + CDD*lbCO2e_perMWh_elec'

# m6: + every emission-factor column of the crosswalk
formulas['m6'] = formulas['m5'] + ''.join(
    ' + ' + ' + '.join(EF_lst) for EF_lst in sector_subsector_crosswalk.values())

# m7: + employment-share x emission-factor interactions
formulas['m7'] = formulas['m6']
for x_s, EF_lst in sector_subsector_crosswalk.items():
    for EF_ss in EF_lst:
        formulas['m7'] = formulas['m7'] + f' + np.log1p({x_s}_{EF_ss})'

# Fit every model on the same sample and report its R-squared
print('R2 values')
for model, formula in formulas.items():
    models[model] = smf.ols(
        formula=formula,
        data=covars_all
    ).fit()
    print(f'{model}:', models[model].rsquared)

print('Var(residuals):', np.var(models['m7'].resid),'\n')
# Stargazer is handed a real list rather than a dict view
regtable = Stargazer(list(models.values()))
regtable