In [115]:
# Here we combine all finalized .csv files into single dataframes.
# Specifically, we output two versions: a monthly and a yearly version.
# Each version is saved twice: once with all features and once with only the per-capita features.

import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load every finalized input table. The position of a path in this list decides
# which of df1 ... df9 it is bound to, so the list and the unpacking target
# below must stay in the same order.
_input_paths = [
    '../SharedData/dataset-generation-final/population-1990-2020-final.csv',
    '../SharedData/dataset-generation-final/monthly-emissions-1990-2024-final.csv',
    '../SharedData/dataset-generation-final/monthly-weather-1990-2019-final.csv',
    '../SharedData/dataset-generation-final/gdp-1997-2023-final.csv',
    '../SharedData/dataset-generation-final/energy-use-prod-final.csv',
    '../SharedData/dataset-generation-final/power-plant-count-final.csv',
    '../SharedData/dataset-generation-final/energy-by-source-final.csv',
    '../SharedData/dataset-generation-final/republican-votes-by-state.csv',
    '../SharedData/dataset-generation-final/state-areas-final.csv',
]
df1, df2, df3, df4, df5, df6, df7, df8, df9 = (
    pd.read_csv(csv_path) for csv_path in _input_paths
)


In [122]:
# Merge the feature tables df1-df8 on ('state', 'date').
# df2 is joined with how='right', so every df2 row is kept; the remaining tables
# are left-joined, which leaves NaN wherever a table has no matching row.
merge_keys = ['state', 'date']
total_df = df1.merge(df2, on=merge_keys, how='right')
for feature_df in (df3, df4, df5, df6, df7, df8):
    total_df = total_df.merge(feature_df, on=merge_keys, how='left')

# remove DC, PR (not states) and HI, AK (states with missing CO2 data)
# .copy() makes the filtered frame independent, so the 'date' assignment below
# does not write into a slice of the merged frame.
excluded_regions = ['DC', 'PR', 'HI', 'AK']
total_df = total_df[~total_df['state'].isin(excluded_regions)].copy()

# Exactly 48 distinct states must remain. The comparison has to sit outside
# len(): len(<array> == 48) is the length of a boolean array, which is truthy
# for any non-empty frame and therefore can never fail.
n_states = total_df['state'].nunique()
assert n_states == 48, f"expected 48 states after filtering, found {n_states}"

# ensure the 'date' column is in correct datetime formatting
total_df['date'] = pd.to_datetime(total_df['date'])

In [123]:
# Check dataframe: print each column's dtype and non-null count after the merge,
# which shows how many rows each source table could not fill.
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20160 entries, 0 to 20669
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   state                        20160 non-null  object        
 1   date                         20160 non-null  datetime64[ns]
 2   monthly_population           17040 non-null  float64       
 3   yearly_population            17280 non-null  float64       
 4   monthly_emissions            20160 non-null  float64       
 5   prcp                         17281 non-null  float64       
 6   snow                         17281 non-null  float64       
 7   tavg                         17281 non-null  float64       
 8   monthly_gdp_rel_2017         15552 non-null  float64       
 9   yearly_gdp_rel_2017          15552 non-null  float64       
 10  monthly_energy_prod          19008 non-null  float64       
 11  monthly_energy_use           19008 non-null  f

In [124]:
# Create a dictionary (state -> area) to use to calculate population density
df9_dict = df9.set_index('state')['area'].to_dict()

# Area of each row's state; rows whose state is absent from df9 get NaN.
state_area = total_df['state'].map(df9_dict)

# add columns for monthly/yearly population density
# Density is population divided by area. Dividing area by population (as this
# cell previously did) produces area per person, the reciprocal of density.
total_df['monthly_pop_density'] = total_df['monthly_population'] / state_area
total_df['yearly_pop_density'] = total_df['yearly_population'] / state_area

In [125]:
# Split the dataframe into two: monthly and yearly features.
# Both frames keep the same rows as total_df; they differ only in whether the
# 'monthly_*' or the 'yearly_*' variant of each feature is carried along.
# 'monthly_emissions' is used in both frames.

mo_cols = ['state', 'date',
           'monthly_emissions', 'monthly_population', 'monthly_pop_density',
           'prcp', 'snow', 'tavg',
           'monthly_gdp_rel_2017',
           'monthly_energy_prod', 'monthly_energy_use', 'monthly_energy_flow',
           'monthly_num_plants',
           'total_facility_age', 'total_weighted_facility_age',
           'monthly_renew_pct', 'monthly_fossil_pct', 'monthly_coal_pct',
           'republican_vote_pct']

yr_cols = ['state', 'date',
           'monthly_emissions', 'yearly_population', 'yearly_pop_density',
           'prcp', 'snow', 'tavg',
           'yearly_gdp_rel_2017',
           'yearly_energy_prod', 'yearly_energy_use', 'yearly_energy_flow',
           'yearly_num_plants',
           'total_facility_age', 'total_weighted_facility_age',
           'yearly_renew_pct', 'yearly_fossil_pct', 'yearly_coal_pct',
           'republican_vote_pct']

# Take explicit copies: new columns are assigned to these frames later, and
# assigning into a bare column subset of total_df is the chained-assignment
# pattern pandas warns about (SettingWithCopyWarning).
monthly_total_df = total_df[mo_cols].copy()
yearly_total_df = total_df[yr_cols].copy()

Now we calculate the per capita data for both the monthly and yearly dataframes:

In [126]:
# add all per capita features:

def add_per_capita_features(frame, period):
    """Return a new DataFrame with the per-capita and per-plant columns added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'monthly_emissions', 'total_facility_age',
        'total_weighted_facility_age' and the '<period>_population',
        '<period>_gdp_rel_2017', '<period>_energy_prod', '<period>_energy_flow',
        '<period>_energy_use' and '<period>_num_plants' columns.
    period : str
        Column prefix selecting the feature variant: 'monthly' or 'yearly'.

    Returns
    -------
    pandas.DataFrame
        ``frame`` plus co2/gdp/eprod/eflow/euse '_per_capita' columns and the
        'avg_facility_age' / 'avg_weighted_facility_age' columns. The input
        frame is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    population = frame[f'{period}_population']
    num_plants = frame[f'{period}_num_plants']
    return frame.assign(
        # Emissions always come from 'monthly_emissions', for both periods.
        co2_per_capita=frame['monthly_emissions'] / population,
        gdp_per_capita=frame[f'{period}_gdp_rel_2017'] / population,
        eprod_per_capita=frame[f'{period}_energy_prod'] / population,
        eflow_per_capita=frame[f'{period}_energy_flow'] / population,
        euse_per_capita=frame[f'{period}_energy_use'] / population,
        avg_facility_age=frame['total_facility_age'] / num_plants,
        avg_weighted_facility_age=frame['total_weighted_facility_age'] / num_plants,
    )


monthly_total_df = add_per_capita_features(monthly_total_df, 'monthly')

# NOTE(review): the yearly frame has no 'yearly_emissions' column, so its
# co2_per_capita is monthly emissions divided by yearly population - confirm
# this is the intended definition.
yearly_total_df = add_per_capita_features(yearly_total_df, 'yearly')

In [127]:
# Column layouts for the four output frames.
#
# For each period (monthly / yearly) there are two layouts:
#   *_tot_cols - every feature, ordered so each raw total sits next to the
#                per-capita / average value derived from it;
#   *_cap_cols - the same order with the raw inputs left out, i.e. without the
#                emissions, GDP, energy prod/use/flow, population,
#                total_facility_age and total_weighted_facility_age columns,
#                which were only needed to compute the other features.

mo_tot_cols = [
    'state', 'date',
    'monthly_emissions', 'co2_per_capita',
    'prcp', 'snow', 'tavg',
    'monthly_gdp_rel_2017', 'gdp_per_capita',
    'monthly_energy_prod', 'eprod_per_capita',
    'monthly_energy_use', 'euse_per_capita',
    'monthly_energy_flow', 'eflow_per_capita',
    'monthly_num_plants',
    'monthly_renew_pct', 'monthly_fossil_pct', 'monthly_coal_pct',
    'republican_vote_pct',
    'monthly_population', 'monthly_pop_density',
    'total_facility_age', 'avg_facility_age',
    'total_weighted_facility_age', 'avg_weighted_facility_age',
]

yr_tot_cols = [
    'state', 'date',
    'monthly_emissions', 'co2_per_capita',
    'prcp', 'snow', 'tavg',
    'yearly_gdp_rel_2017', 'gdp_per_capita',
    'yearly_energy_prod', 'eprod_per_capita',
    'yearly_energy_use', 'euse_per_capita',
    'yearly_energy_flow', 'eflow_per_capita',
    'yearly_num_plants',
    'yearly_renew_pct', 'yearly_fossil_pct', 'yearly_coal_pct',
    'republican_vote_pct',
    'yearly_population', 'yearly_pop_density',
    'total_facility_age', 'avg_facility_age',
    'total_weighted_facility_age', 'avg_weighted_facility_age',
]

# Raw inputs that are dropped from the per-capita layouts.
_mo_inputs_only = {
    'monthly_emissions', 'monthly_gdp_rel_2017',
    'monthly_energy_prod', 'monthly_energy_use', 'monthly_energy_flow',
    'monthly_population',
    'total_facility_age', 'total_weighted_facility_age',
}
_yr_inputs_only = {
    'monthly_emissions', 'yearly_gdp_rel_2017',
    'yearly_energy_prod', 'yearly_energy_use', 'yearly_energy_flow',
    'yearly_population',
    'total_facility_age', 'total_weighted_facility_age',
}

# Filtering the full layout keeps the relative column order unchanged.
mo_cap_cols = [name for name in mo_tot_cols if name not in _mo_inputs_only]
yr_cap_cols = [name for name in yr_tot_cols if name not in _yr_inputs_only]


In [128]:
# Apply the column layouts: first reorder each full frame to its '*_tot_cols'
# layout, then select the per-capita subset from the reordered full frame.
# A missing column raises KeyError.

monthly_total_df = monthly_total_df.loc[:, mo_tot_cols]
yearly_total_df = yearly_total_df.loc[:, yr_tot_cols]

monthly_capita_df = monthly_total_df.loc[:, mo_cap_cols]
yearly_capita_df = yearly_total_df.loc[:, yr_cap_cols]

In [129]:
# Report the column count of each output frame (the counts include the
# 'state' and 'date' key columns).
for label, frame in [
    ("Number of total monthly features:", monthly_total_df),
    ("Number of per capita monthly features:", monthly_capita_df),
    ("Number of total yearly features:", yearly_total_df),
    ("Number of per capita yearly features:", yearly_capita_df),
]:
    print(label, frame.shape[1])

Number of total monthly features: 26
Number of per capita monthly features: 18
Number of total yearly features: 26
Number of per capita yearly features: 18


In [132]:
# Write the four final datasets to disk, without the DataFrame index column.
for frame, csv_path in [
    (monthly_total_df, '../SharedData/FinalData/monthly_total_data.csv'),
    (monthly_capita_df, '../SharedData/FinalData/monthly_capita_data.csv'),
    (yearly_total_df, '../SharedData/FinalData/yearly_total_data.csv'),
    (yearly_capita_df, '../SharedData/FinalData/yearly_capita_data.csv'),
]:
    frame.to_csv(csv_path, index=False)