In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
from scipy import stats

In [12]:
#Data taken from 2020 census but as a csv
#Read the csv
power_data = "../Output/power_us_comp_df.csv"
power_data_df = pd.read_csv(power_data, encoding="ISO-8859-1")
income_data = "../Resources/2017_median_income_by_county.csv"
income_data_df = pd.read_csv(income_data,encoding="ISO-8859-1")
income_data_df.head()

Unnamed: 0,Geography,county,state,total household,median income
0,0500000US01003,Baldwin County,Alabama,79120,55342
1,0500000US01015,Calhoun County,Alabama,44507,46763
2,0500000US01043,Cullman County,Alabama,30740,45044
3,0500000US01049,DeKalb County,Alabama,25848,39373
4,0500000US01051,Elmore County,Alabama,29176,60558


In [42]:
#Read the CSV for all the state powerplants
power_data_df = power_data_df[['primary_fuel','state','county']]
#power_data_df.head()
income_data_df = income_data_df[['county','state','total household','median income']]
income_data_df.head()

Unnamed: 0,county,state,total household,median income
0,Baldwin County,Alabama,79120,55342
1,Calhoun County,Alabama,44507,46763
2,Cullman County,Alabama,30740,45044
3,DeKalb County,Alabama,25848,39373
4,Elmore County,Alabama,29176,60558


In [40]:
power_data_df.dropna(axis=0,how='any')
power_data_df.head()

Unnamed: 0,primary_fuel,state,county
0,Solar,New York,Rockland County
1,Solar,North Carolina,Lee County
2,Solar,North Carolina,Yadkin County
3,Solar,New Jersey,Mercer County
4,Solar,Massachusetts,Norfolk County


In [41]:
filtered_power_data_df = power_data_df.groupby(['county'])
filtered_power_data_df.head()

Unnamed: 0,primary_fuel,state,county
0,Solar,New York,Rockland County
1,Solar,North Carolina,Lee County
2,Solar,North Carolina,Yadkin County
3,Solar,New Jersey,Mercer County
4,Solar,Massachusetts,Norfolk County
...,...,...,...
9798,Hydro,Montana,Big Horn County
9799,Hydro,Washington,Thurston County
9803,Hydro,Georgia,Stephens County
9812,Hydro,Washington,Snohomish County


In [43]:
power_data_df['median income'] = ''
power_data_df.columns

Index(['primary_fuel', 'state', 'county', 'median income'], dtype='object')

In [46]:
merged_df = power_data_df.merge(income_data_df, on=['county', 'state'])
#filtered_power_data_df.merge(income_data_df.to_frame(), left_on='county', right_index=True)
merge_df.head()

Unnamed: 0,primary_fuel,state,county,median income_x,total household,median income_y
0,Solar,New York,Rockland County,,,
1,Solar,North Carolina,Lee County,,,
2,Solar,North Carolina,Yadkin County,,,
3,Solar,New Jersey,Mercer County,,,
4,Solar,Massachusetts,Norfolk County,,,


In [None]:
#Sort the CO2 data to only include CO2 generated by electrical power generation
state_CO2_data_filtered_df = state_CO2_data_df.loc[state_CO2_data_df["Sector Name"] == "Electric Power carbon dioxide emissions"].reset_index(drop = True)
state_CO2_data_filtered_df = state_CO2_data_filtered_df[["State", "Year", "CO2 Emissions (Million Metric Tons)"]]
state_CO2_data_filtered_df

In [None]:
#Add a column to determine the percentage of electricity generated from high-carbon fossil fuels (Percentage High-Carbon)
state_CO2_data_2020_df = state_CO2_data_filtered_df.loc[state_CO2_data_filtered_df["Year"] == 2020].reset_index(drop=True)
state_CO2_data_2020_df["Percentage High-Carbon"] = ""

In [None]:
#Only select 2020 - the same year as census population data
state_generation_data_2020_df = state_generation_data_df.loc[state_generation_data_df["Year"] == 2020].reset_index(drop=True)

In [None]:
#Generate a list of the states
states = state_generation_data_2020_df["State"].unique()

In [None]:
#Create two empty lists - one to store the total generation capacity of each state
# and the other to share the generation capacity of non-renewable sources only for each state
total_capacity = []
nonrenewable_capacity = []

In [None]:
# Run through each state

for state in states:
    #Reset a counter which will add up the totals by state each time
    nonrenewables = 0
    total = 0
    #For each row in the electricity generation dataframe:
    for index, row in state_generation_data_2020_df.iterrows():
        #Select rows only that match the state we're looking at
        if state_generation_data_2020_df.loc[index, "State"] == state:
            #Select only the fossil fuel sources
            if ((state_generation_data_2020_df.loc[index, "Fuel Type"] == "Coal") | 
                (state_generation_data_2020_df.loc[index, "Fuel Type"] == "Natural Gas") |
                (state_generation_data_2020_df.loc[index, "Fuel Type"] == "Petroleum")):
                    #Add the totals of the non-renewables capacity together for each state 
                    nonrenewables = nonrenewables + state_generation_data_2020_df.loc[index, "Capacity (MWh)"]
            #And calculate the total capacity for each state
            total = total + state_generation_data_2020_df.loc[index, "Capacity (MWh)"]
            
    #Add both of these numbers to the empty list created earlier
    nonrenewable_capacity.append(nonrenewables)
    total_capacity.append(total)




In [None]:
nonrenewables_percent = []
for i in range (len(total_capacity)):
    #Calculate the percentage of electricity generated by non-renewable sources for each state
    percentnonrenewable = round(((nonrenewable_capacity[i])/(total_capacity[i]))*100,2)
    nonrenewables_percent.append(percentnonrenewable)
    
nonrenewables_percent

In [None]:
#Input that data to the state CO2 dataframe
state_CO2_data_2020_df["Percentage High-Carbon"] = nonrenewables_percent
state_CO2_data_2020_df

In [None]:
#Add in a column to determine the CO2 emissions per person
state_CO2_data_2020_df.insert(3, "CO2 Emissions per capita (Tons per person)", "")

In [None]:
# Calculate the CO2 Emissions per capita (in tons per person)
state_CO2_data_2020_df["CO2 Emissions per capita (Tons per person)"] = (state_CO2_data_2020_df["CO2 Emissions (Million Metric Tons)"]/state_pop_data_df["Population"])*(1000000)
state_CO2_data_2020_df

In [None]:
#Plot the percentage of electricity generated by fossil fuels for each state
#Against the CO2 emissions per capita in that state

x_values = state_CO2_data_2020_df["Percentage High-Carbon"]
y_values = state_CO2_data_2020_df["CO2 Emissions per capita (Tons per person)"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {str(round(slope,2))}x + {str(round(intercept,2))}"
rvalue_string = f"r-value = {str(round(rvalue,2))}"

plt.scatter(x_values, y_values)
plt.plot(x_values,regress_values)
plt.annotate(line_eq,(15,55),fontsize=12.5,color="red")
plt.annotate(rvalue_string, (15, 50), fontsize = 12.5, color = "red")
plt.xlabel("Percentage of Electricity generated from Fossil Fuel sources")
plt.ylabel("CO2 Emissions per capita (Tons per person) \n from Electricity generation")
plt.title("State CO2 Emissions per capita vs \n Percentage of electricity generated from fossil fuel sources \n (Including Outliers)")
print(f"The r-value for this is {rvalue}")

plt.savefig("../Output/Images/State_CO2_Emissions_against_%fossilfuels(Inc_Outliers).png")
plt.show


In [None]:
#Determing outliers
quartiles = state_CO2_data_2020_df["CO2 Emissions per capita (Tons per person)"].quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
outliers_df = state_CO2_data_2020_df.loc[(state_CO2_data_2020_df["CO2 Emissions per capita (Tons per person)"] < lower_bound) | (state_CO2_data_2020_df["CO2 Emissions per capita (Tons per person)"] > upper_bound)]
print(outliers_df["State"])
print(f"The upper bound is {upper_bound}")

In [None]:
#Removing the Outliers
state_CO2_data_2020_without_outliers = state_CO2_data_2020_df.loc[(state_CO2_data_2020_df["CO2 Emissions per capita (Tons per person)"] < upper_bound) & (state_CO2_data_2020_df["CO2 Emissions per capita (Tons per person)"] > 0)]
state_CO2_data_2020_without_outliers

In [None]:
x_values_no_outs = state_CO2_data_2020_without_outliers["Percentage High-Carbon"]
y_values_no_outs = state_CO2_data_2020_without_outliers["CO2 Emissions per capita (Tons per person)"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values_no_outs, y_values_no_outs)
regress_values_no_outs = x_values_no_outs * slope + intercept

line_eq = f"y = {str(round(slope,2))}x + {str(round(intercept,2))}"
rvalue_string = f"r-value = {str(round(rvalue,2))}"

plt.scatter(x_values_no_outs, y_values_no_outs)
plt.plot(x_values_no_outs,regress_values_no_outs)
plt.xlabel("Percentage of Electricity generated from High-Carbon sources")
plt.ylabel("CO2 Emissions per capita (Tons per person) \n from Electricity generation")

plt.title("State CO2 Emissions per capita vs \n Percentage of electricity generated from fossil fuel sources \n (Excluding Outliers)")

plt.annotate(line_eq,(15,10.75),fontsize=12.5,color="red")
plt.annotate(rvalue_string, (15, 9.75), fontsize = 12.5, color = "red")

plt.savefig("../Output/Images/State_CO2_Emissions_against_%fossilfuels(Exc_Outliers).png")

plt.show
print(f"The r-value for this is {rvalue}")