In [187]:
# import necessary dependencies
import numpy as np 
import pandas as pd 
import altair as alt

# to merge datasets
from functools import reduce 

In [188]:
#
CH4_capita = pd.read_csv("ghg-emissions-ch4-capita.csv")
CH4_gdp = pd.read_csv("ghg-emissions-ch4-gdp.csv")
CH4_total = pd.read_csv("ghg-emissions-ch4-total.csv")

#
CO2_capita = pd.read_csv("ghg-emissions-co2-capita.csv")
CO2_gdp = pd.read_csv("ghg-emissions-co2-gdp.csv")
CO2_total = pd.read_csv("ghg-emissions-co2-total.csv")

#
fgas_capita = pd.read_csv("ghg-emissions-fgas-capita.csv")
fgas_gdp = pd.read_csv("ghg-emissions-fgas-gdp.csv")
fgas_total = pd.read_csv("ghg-emissions-fgas-total.csv")

#
N2O_capita = pd.read_csv("ghg-emissions-n2o-capita.csv")
N2O_gdp = pd.read_csv("ghg-emissions-n2o-gdp.csv")
N2O_total = pd.read_csv("ghg-emissions-n2o-total.csv")

In [189]:
# dropping the unit column and keeping iso, state, and unit. we melt the year variable to turn them from a column per year into rows

CH4_capita_melt = pd.melt(CH4_capita, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "CH4pC").drop(columns = "unit")
CH4_gdp_melt = pd.melt(CH4_gdp, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "CH4pGDP").drop(columns = "unit")
CH4_total_melt = pd.melt(CH4_total, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "CH4TOT").drop(columns = "unit")

#
CO2_capita_melt = pd.melt(CO2_capita, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "CO2pC").drop(columns = "unit")
CO2_gdp_melt = pd.melt(CO2_gdp, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "CO2pGDP").drop(columns = "unit")
CO2_total_melt = pd.melt(CO2_total, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "CO2TOT").drop(columns = "unit")

#
fgas_capita_melt = pd.melt(fgas_capita, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "FGaspC").drop(columns = "unit")
fgas_gdp_melt = pd.melt(fgas_gdp, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "FGaspGDP").drop(columns = "unit")
fgas_total_melt = pd.melt(fgas_total, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "FGasTOT").drop(columns = "unit")

#
N2O_capita_melt = pd.melt(N2O_capita, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "N2OpC").drop(columns = "unit")
N2O_gdp_melt = pd.melt(N2O_gdp, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "N2OpGDP").drop(columns = "unit")
N2O_total_melt = pd.melt(N2O_total, id_vars = ["iso", "Country/Region", "unit"], var_name = "Year", value_name = "N2OTOT").drop(columns = "unit")

In [190]:
#
dfs = [CH4_capita_melt, CH4_gdp_melt, CH4_total_melt, CO2_capita_melt, CO2_gdp_melt, CO2_total_melt, fgas_capita_melt, fgas_gdp_melt, fgas_total_melt, N2O_capita_melt, N2O_gdp_melt, N2O_total_melt]

#
ghg = reduce(lambda left, right: pd.merge(left, right, on = ["iso", "Country/Region", "Year"], how = "left"), dfs)

#
ghg.head()

Unnamed: 0,iso,Country/Region,Year,CH4pC,CH4pGDP,CH4TOT,CO2pC,CO2pGDP,CO2TOT,FGaspC,FGaspGDP,FGasTOT,N2OpC,N2OpGDP,N2OTOT
0,USA.TX,Texas,1990,6.33,246.87,107.96,32.16,1254.4,548.54,0.26,10.31,4.51,3.23,126.05,55.12
1,USA.CA,California,1990,2.15,71.05,64.37,10.42,344.4,312.06,0.1,3.19,2.89,0.62,20.55,18.62
2,USA.PA,Pennsylvania,1990,3.13,128.85,37.23,18.61,766.75,221.56,0.05,2.24,0.65,0.7,29.02,8.39
3,USA.OK,Oklahoma,1990,10.87,510.14,34.22,25.32,1188.46,79.73,0.2,9.62,0.65,3.71,174.05,11.68
4,USA.WV,West Virginia,1990,20.1,1149.17,36.03,45.88,2623.43,82.24,0.63,35.78,1.12,1.18,67.58,2.12


In [191]:
# rename states column
ghg = ghg.rename(columns = {"Country/Region": "State"})

# remove USA. from iso
ghg["iso"] = ghg["iso"].str.extract(r'USA\.(.*)')

# Assign regions
def get_region(state):
    if state in ['AK', 'CA', 'CO', 'HI', 'ID', 'MT', 'NV', 'OR', 'UT', 'WA', 'WY']:
        return 'West'
    elif state in ['AZ', 'NM', 'OK', 'TX']:
        return 'Southwest'
    elif state in ['IA', 'IL', 'IN', 'KS', 'MI', 'MN', 'MO', 'ND', 'NE', 'OH', 'SD', 'WI']:
        return 'Midwest'
    elif state in ['AL', 'AR', 'DC', 'DE', 'FL', 'GA', 'KY', 'LA', 'MD', 'MS', 'NC', 'SC', 'TN', 'VA', 'WV']:
        return 'Southeast'
    elif state in ['CT', 'MA', 'ME', 'NH', 'NJ', 'NY', 'PA', 'RI', 'VT']:
        return 'Northeast'
    else:
        return 'Other'

ghg["Region"] = ghg["iso"].apply(get_region)

#
ghg["TOT"] = ghg["CH4TOT"] + ghg["CO2TOT"] + ghg["FGasTOT"] + ghg["N2OTOT"]

# trillions 
ghg["GDP"] = ((ghg["CH4TOT"] / ghg["CH4pGDP"]) + (ghg["CO2TOT"] / ghg["CO2pGDP"]) + (ghg["FGasTOT"] / ghg["FGaspGDP"]) + (ghg["N2OTOT"] / ghg["N2OpGDP"])) / 3

# millions
ghg["Population"] = ((ghg["CH4TOT"] / ghg["CH4pC"]) + (ghg["CO2TOT"] / ghg["CO2pC"]) + (ghg["FGasTOT"] / ghg["FGaspC"]) + (ghg["N2OTOT"] / ghg["N2OpC"])) / 3

ghg

Unnamed: 0,iso,State,Year,CH4pC,CH4pGDP,CH4TOT,CO2pC,CO2pGDP,CO2TOT,FGaspC,FGaspGDP,FGasTOT,N2OpC,N2OpGDP,N2OTOT,Region,TOT,GDP,Population
0,TX,Texas,1990,6.33,246.87,107.96,32.16,1254.40,548.54,0.26,10.31,4.51,3.23,126.05,55.12,Southwest,716.13,0.583111,22.841018
1,CA,California,1990,2.15,71.05,64.37,10.42,344.40,312.06,0.10,3.19,2.89,0.62,20.55,18.62,West,397.94,1.208039,39.606657
2,PA,Pennsylvania,1990,3.13,128.85,37.23,18.61,766.75,221.56,0.05,2.24,0.65,0.70,29.02,8.39,Northeast,267.83,0.385730,16.261903
3,OK,Oklahoma,1990,10.87,510.14,34.22,25.32,1188.46,79.73,0.20,9.62,0.65,3.71,174.05,11.68,Southwest,126.28,0.089614,4.231752
4,WV,West Virginia,1990,20.10,1149.17,36.03,45.88,2623.43,82.24,0.63,35.78,1.12,1.18,67.58,2.12,Southeast,121.51,0.041791,2.386476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1576,NH,New Hampshire,2020,0.68,12.34,0.94,4.57,83.04,6.31,0.40,7.25,0.55,0.25,4.60,0.35,Northeast,8.15,0.101371,1.846032
1577,DE,Delaware,2020,0.80,12.70,0.79,11.93,190.11,11.84,0.40,6.35,0.40,0.48,7.63,0.48,Southeast,13.51,0.083462,1.326652
1578,HI,Hawaii,2020,0.46,9.55,0.67,9.74,201.53,14.13,0.52,10.75,0.75,0.40,8.25,0.58,West,16.13,0.093447,1.933183
1579,RI,Rhode Island,2020,0.27,5.73,0.30,8.27,175.99,9.07,0.38,8.16,0.42,0.18,3.92,0.20,Northeast,9.99,0.068795,1.474740


In [192]:
ghg.isna().sum()

iso           0
State         0
Year          0
CH4pC         0
CH4pGDP       0
CH4TOT        0
CO2pC         0
CO2pGDP       0
CO2TOT        0
FGaspC        0
FGaspGDP      0
FGasTOT       0
N2OpC         0
N2OpGDP       0
N2OTOT        0
Region        0
TOT           0
GDP           0
Population    0
dtype: int64

In [193]:
ghg_stack = alt.Chart(ghg).mark_bar().encode(
    x = "Year",
    y = "TOT",
    color = alt.Color(
        "Category:N",
    scale = alt.Scale(
            domain = ["CH4TOT", "CO2TOT", "FGasTOT", "N2OTOT"])
    )
).transform_fold(
    ["CH4TOT", "CO2TOT", "FGasTOT", "N2OTOT"],
    as_ = ["Category", "Emission"]
).properties(
    height = 500,
    title = "Stacked Bar Chart of Greenhouse Gasses"
)

ghg_stack

## maybe remove the next two graphs since the following graphs do the same thing

In [194]:
# can we remove this cell

fig1 = alt.Chart(ghg).mark_bar().encode(
    x = alt.X("Year:O"),
    y = alt.Y("CH4TOT:Q"),
    color = alt.Color("Region:N")
        ).properties(height = 500, title = "Regional Yearly Methane Totals")

fig1

In [195]:
# can we remove this cell

fig1 = alt.Chart(ghg).transform_filter(
    alt.FieldEqualPredicate(field='Region', equal='West')
).encode(
    x = alt.X("Year:O"),
    y = alt.Y("CH4TOT:Q"),
    color = alt.Color("Region:N")
).mark_line(point=True)

fig1

In [196]:
CH4_stack = alt.Chart(ghg).mark_bar().encode(
    x = alt.X("Year:O"),
    y = alt.Y("CH4TOT:Q", title = "CH4 in MtCO2e"),
    color = alt.Color("Region:N")
        ).properties(height = 500, title = "Regional Yearly Methane Totals")

CH4_stack


Trying to decide if we should use the mean or median because the data is right skewed.


In [197]:
CH4_mean = alt.Chart(ghg).encode(
    x = alt.X("Year:O"),
    y = alt.Y("mean(CH4TOT):Q", title = "CH4 in MtCO2e"),
    color = alt.Color("Region:N")
).mark_line(point=True).properties(height = 500, title = "Regional Yearly Methane Mean")

CH4_median = alt.Chart(ghg).encode(
    x = alt.X("Year:O"),
    y = alt.Y("median(CH4TOT):Q", title = "CH4 in MtCO2e"),
    color = alt.Color("Region:N")
).mark_line(point=True).properties(height = 500, title = "Regional Yearly Methane Median")

CH4_mean | CH4_median

In [198]:
CO2_stack = alt.Chart(ghg).mark_bar().encode(
    x = alt.X("Year:O"),
    y = alt.Y("CO2TOT:Q", title = "CO2 in MtCO2e"),
    color = alt.Color("Region:N")
        ).properties(height = 500, title = "Regional Yearly Carbon Dioxide Totals")

CO2_stack


In [199]:
CO2_mean = alt.Chart(ghg).encode(
    x = alt.X("Year:O"),
    y = alt.Y("mean(CO2TOT):Q", title = "CO2 in MtCO2e"),
    color = alt.Color("Region:N")
).mark_line(point=True).properties(height = 500, title = "Regional Yearly Carbon Dioxide Mean")

CO2_median = alt.Chart(ghg).encode(
    x = alt.X("Year:O"),
    y = alt.Y("median(CO2TOT):Q", title = "CO2 in MtCO2e"),
    color = alt.Color("Region:N")
).mark_line(point=True).properties(height = 500, title = "Regional Yearly Carbon Dioxide Median")

CO2_mean | CO2_median

In [200]:
FGas_stack = alt.Chart(ghg).mark_bar().encode(
    x = alt.X("Year:O"),
    y = alt.Y("FGasTOT:Q", title = "FGas in MtCO2e"),
    color = alt.Color("Region:N")
        ).properties(height = 500, title = "Regional Yearly Fluorinated Gas Totals")

FGas_stack


In [201]:
FGas_mean = alt.Chart(ghg).encode(
    x = alt.X("Year:O"),
    y = alt.Y("mean(FGasTOT):Q", title = "FGas in MtCO2e"),
    color = alt.Color("Region:N")
).mark_line(point=True).properties(height = 500, title = "Regional Yearly Fluorinated Gas Mean")

FGas_median = alt.Chart(ghg).encode(
    x = alt.X("Year:O"),
    y = alt.Y("median(FGasTOT):Q", title = "FGas in MtCO2e"),
    color = alt.Color("Region:N")
).mark_line(point=True).properties(height = 500, title = "Regional Yearly Fluorinated Gas Median")

FGas_mean | FGas_median

In [202]:
N2O_stack = alt.Chart(ghg).mark_bar().encode(
    x = alt.X("Year:O"),
    y = alt.Y("N2OTOT:Q", title = "N2O in MtCO2e"),
    color = alt.Color("Region:N")
        ).properties(height = 500, title = "Regional Yearly Nitrous Oxide Gas Totals")

N2O_stack


In [203]:
N2O_mean = alt.Chart(ghg).encode(
    x = alt.X("Year:O"),
    y = alt.Y("mean(N2OTOT):Q", title = "N2O in MtCO2e"),
    color = alt.Color("Region:N")
).mark_line(point=True).properties(height = 500, title = "Regional Yearly Nitrous Oxide Gas Mean")

N2O_median = alt.Chart(ghg).encode(
    x = alt.X("Year:O"),
    y = alt.Y("median(N2OTOT):Q", title = "N2O in MtCO2e"),
    color = alt.Color("Region:N")
).mark_line(point=True).properties(height = 500, title = "Regional Yearly Nitrous Oxide Gas Median")

N2O_mean | N2O_median

In [204]:
#
ghg_melted = pd.melt(ghg, id_vars=["iso", "State", "Year", "Region", "GDP", "Population"], value_vars=['CH4TOT', 'CO2TOT', 'FGasTOT', 'N2OTOT'],
                    var_name='Gas Type', value_name='Gas Total')

In [211]:
#
gdp_mw = alt.Chart(ghg_melted[ghg_melted["Region"] == "Midwest"]).encode(
    x = alt.X("GDP:Q", title = "Dollars in Trillions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Midwestern Gas Emission by GDP")

gdp_ne = alt.Chart(ghg_melted[ghg_melted["Region"] == "Northeast"]).encode(
    x = alt.X("GDP:Q", title = "Dollars in Trillions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Northeaster Gas Emission by GDP")

gdp_se = alt.Chart(ghg_melted[ghg_melted["Region"] == "Southeast"]).encode(
    x = alt.X("GDP:Q", title = "Dollars in Trillions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Southeastern Gas Emission by GDP")

gdp_sw = alt.Chart(ghg_melted[ghg_melted["Region"] == "Southwest"]).encode(
    x = alt.X("GDP:Q", title = "Dollars in Trillions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Southwestern Gas Emission by GDP")

gdp_w = alt.Chart(ghg_melted[ghg_melted["Region"] == "West"]).encode(
    x = alt.X("GDP:Q", title = "Dollars in Trillions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Western Gas Emission by GDP")

gdp_mw & gdp_sw | gdp_ne & gdp_w | gdp_se


In [213]:
#
pop_mw = alt.Chart(ghg_melted[ghg_melted["Region"] == "Midwest"]).encode(
    x = alt.X("Population:Q", title = "Population in Millions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Midwestern Gas Emission by Population")

pop_ne = alt.Chart(ghg_melted[ghg_melted["Region"] == "Northeast"]).encode(
    x = alt.X("Population:Q", title = "Population in Millions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Northeaster Gas Emission by Population")

pop_se = alt.Chart(ghg_melted[ghg_melted["Region"] == "Southeast"]).encode(
    x = alt.X("Population:Q", title = "Population in Millions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Southeastern Gas Emission by Population")

pop_sw = alt.Chart(ghg_melted[ghg_melted["Region"] == "Southwest"]).encode(
    x = alt.X("Population:Q", title = "Population in Millions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Southwestern Gas Emission by Population")

pop_w = alt.Chart(ghg_melted[ghg_melted["Region"] == "West"]).encode(
    x = alt.X("Population:Q", title = "Population in Millions"),
    y = alt.Y("mean(Gas Total):Q", title = "Greenhouse Gasses in MtCO2e"),
    color = alt.Color("Gas Type:N")
).mark_point().properties(height = 500, title = "Mean Western Gas Emission by Population")

pop_mw & pop_sw | pop_ne & pop_w | pop_se