In [11]:
# Here, we convert campd-monthly-emissions-facility-aggregation.csv
# into the proper time series format and output power-plant-count-final.csv

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the facility-level monthly emissions file that the rest of the notebook reshapes.
EMISSIONS_CSV = '../../SharedData/dataset-generation/campd-monthly-emissions-facility-aggregation.csv'
df = pd.read_csv(EMISSIONS_CSV)

In [12]:
# Two-letter state codes used when filling the yearly plant count.
# The list has 48 entries; AK, HI and DC are not included.
abbreviations = (
    "AL AZ AR CA CO CT DE FL GA "
    "ID IL IN IA KS KY LA ME MD "
    "MA MI MN MS MO MT NE NV NH NJ "
    "NM NY NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

In [13]:
# Power plant age in months.
# The age is the 1-based position of a row among all rows of the same facility
# once the frame is ordered by state, facility, year and month.
# NOTE(review): this equals elapsed months only if every facility has exactly
# one row per month with no gaps - confirm against the input file.
df = df.sort_values(by=['State', 'Facility ID', 'Year', 'Month'])
df['facility_age'] = df.groupby('Facility ID').cumcount().add(1)

In [14]:
# Measurement columns: a row is kept only if at least one of them is non-null,
# after which the columns themselves are discarded.
measurement_cols = [
    'Gross Load (MWh)', 'Steam Load (1000 lb)', 'SO2 Mass (short tons)',
    'CO2 Mass (short tons)', 'NOx Mass (short tons)', 'Heat Input (mmBtu)',
]
column_renames = {'State': 'state', 'Year': 'year', 'Month': 'month', 'Facility ID': 'id'}

df = (
    df.dropna(subset=measurement_cols, how='all')
      .drop(columns=measurement_cols)
      .rename(columns=column_renames)
      # Rows before 1997 are removed only after facility_age was computed upstream.
      .loc[lambda frame: frame['year'] >= 1997]
)

In [15]:
# Count power plants each month and add up ages.
#
# Produces one row per (year, month, state) with:
#   monthly_num_plants - number of non-null `id` values in the group
#                        NOTE(review): this is the plant count only if each
#                        facility appears at most once per month - confirm.
#   total_facility_age - sum of `facility_age` over the group's rows

new_df = df.groupby(['year', 'month', 'state'], as_index=False).agg(
    monthly_num_plants=('id', 'count'),
    total_facility_age=('facility_age', 'sum')
)
# The named aggregation above already yields `monthly_num_plants`; the result
# has no `id` column, so no rename step is needed.

# Date formatting
#
# Build a zero-padded 'YYYY-MM' label. The round trip through datetime is kept
# so that an invalid year/month combination still raises, but it is done with a
# single vectorized call instead of applying pd.to_datetime to every element.
year_month = new_df['year'].astype(str) + '-' + new_df['month'].astype(str).str.zfill(2)
new_df['date'] = pd.to_datetime(year_month, format='%Y-%m').dt.strftime('%Y-%m')

In [16]:
# Initialize yearly column
# Every row starts at 0; a later cell overwrites it with the December count.
new_df = new_df.assign(yearly_num_plants=0)

In [17]:
# Fill in yearly column by using data from month 12 of each year.
#
# Every row of a (state, year) pair receives that pair's December
# `monthly_num_plants`. Only states listed in `abbreviations` and years
# 1997-2020 are filled; all other rows keep their existing value.
#
# A (state, year) pair without a December row is set to 0 instead of raising
# IndexError, and the work is done in one grouped pass rather than rescanning
# the whole frame for every state/year combination.

# December counts on December rows, NaN everywhere else.
december_only = new_df['monthly_num_plants'].where(new_df['month'] == 12)

# Broadcast the single non-null December value to every row of its group;
# groups with no December row come out as NaN.
december_per_row = december_only.groupby([new_df['state'], new_df['year']]).transform('max')

in_scope = new_df['state'].isin(abbreviations) & new_df['year'].between(1997, 2020)
new_df.loc[in_scope, 'yearly_num_plants'] = (
    december_per_row[in_scope].fillna(0).astype(int)
)

In [23]:
# Keep only the columns that go into the exported file, in their final order,
# then preview the first rows.
output_columns = ['state', 'date', 'monthly_num_plants', 'yearly_num_plants', 'total_facility_age']
new_df = new_df.loc[:, output_columns]
new_df.head()

Unnamed: 0,state,date,monthly_num_plants,yearly_num_plants,total_facility_age
0,AL,1997-01,11,10,95
1,AR,1997-01,7,6,7
2,AZ,1997-01,7,9,7
3,CA,1997-01,24,24,24
4,CO,1997-01,14,14,14


In [24]:
# Write the final table to disk as CSV, without the DataFrame index column.
OUTPUT_CSV = '../../SharedData/dataset-generation-final/power-plant-count-final.csv'
new_df.to_csv(OUTPUT_CSV, index=False)