# <center>CO2 LSTM Forecasting</center>

> Authors:
> - D11202805 - Ian Joseph Chandra
> - M11002818 - Wilfrid Azariah

# Step 1: Import Libraries and Load Data

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from datetime import timedelta

# Load data
# Read the emissions table from the CSV stored under ./data into `df`.
emissions_csv = './data/emissions.csv'
df = pd.read_csv(emissions_csv)

# Bare last expression so the notebook renders the loaded frame.
df


Unnamed: 0,year,state-name,sector-name,fuel-name,value
0,1970,Alabama,Industrial carbon dioxide emissions,Coal,26.721507
1,1970,Alabama,Industrial carbon dioxide emissions,Petroleum,3.577779
2,1970,Alabama,Industrial carbon dioxide emissions,Natural Gas,8.944097
3,1970,Alabama,Industrial carbon dioxide emissions,All Fuels,39.243383
4,1970,Alabama,Total carbon dioxide emissions from all sectors,All Fuels,102.646851
...,...,...,...,...,...
59896,2021,Wyoming,Commercial carbon dioxide emissions,Coal,0.012374
59897,2021,Wyoming,Residential carbon dioxide emissions,All Fuels,0.937989
59898,2021,Wyoming,Residential carbon dioxide emissions,Natural Gas,0.717777
59899,2021,Wyoming,Residential carbon dioxide emissions,Petroleum,0.220212


# Step 2: Data Preprocessing

In [21]:
# Strip the "-name" suffix from every column label, so that 'state-name',
# 'sector-name' and 'fuel-name' become 'state', 'sector' and 'fuel'.
df = df.rename(columns=lambda label: label.replace('-name', ''))

# Shorten the sector labels by deleting the phrase "carbon dioxide emissions".
# NOTE(review): only the phrase itself is removed, so the space that preceded
# it stays in the label ('Industrial carbon dioxide emissions' -> 'Industrial ').
# Confirm whether later cells rely on that before adding a .str.strip().
df = df.assign(sector=df['sector'].str.replace('carbon dioxide emissions', ''))

# Flag the rows to discard: fuel containing "All Fuels" and sector containing "Total".
is_all_fuels = df['fuel'].str.contains('All Fuels')
is_total_sector = df['sector'].str.contains('Total')

# Keep only the rows that match neither flag (one fuel within one sector).
df = df[~(is_all_fuels | is_total_sector)]

# Bare last expression so the notebook renders the filtered frame.
df

Unnamed: 0,year,state,sector,fuel,value
0,1970,Alabama,Industrial,Coal,26.721507
1,1970,Alabama,Industrial,Petroleum,3.577779
2,1970,Alabama,Industrial,Natural Gas,8.944097
8,1970,Alabama,Residential,Coal,0.163635
9,1970,Alabama,Residential,Petroleum,1.123947
...,...,...,...,...,...
59894,2021,Wyoming,Commercial,Natural Gas,0.686983
59895,2021,Wyoming,Commercial,Petroleum,0.216315
59896,2021,Wyoming,Commercial,Coal,0.012374
59898,2021,Wyoming,Residential,Natural Gas,0.717777


In [22]:
# Exclude the 'state' column, then sum 'value' over all rows sharing the same
# 'year', 'sector' and 'fuel' (i.e. collapse the per-state rows).
#
# `df` is rebound here, so on a second run of this cell 'state' is already
# gone; errors='ignore' turns that re-run into a no-op instead of a KeyError.
df = df.drop(columns='state', errors='ignore')

# as_index=False keeps 'year', 'sector' and 'fuel' as ordinary columns rather
# than moving them into the index of the result.
state_excluded = df.groupby(['year', 'sector', 'fuel'], as_index=False)['value'].sum()

# Bare last expression so the notebook renders the aggregated frame.
state_excluded

Unnamed: 0,year,sector,fuel,value
0,1970,Commercial,Coal,31.306261
1,1970,Commercial,Natural Gas,260.289232
2,1970,Commercial,Petroleum,233.356940
3,1970,Electric Power,Coal,1374.959395
4,1970,Electric Power,Natural Gas,429.208459
...,...,...,...,...
717,2021,Industrial,Petroleum,698.180684
718,2021,Residential,Natural Gas,520.017628
719,2021,Residential,Petroleum,132.467958
720,2021,Transportation,Natural Gas,129.829709


In [23]:
# Total emissions per fuel and year: sum 'value' over every sector that
# shares the same ('year', 'fuel') pair.
fuel_group = (
    state_excluded
    .groupby(['year', 'fuel'], as_index=False)['value']
    .sum()
)

# Bare last expression so the notebook renders the per-fuel totals.
fuel_group

Unnamed: 0,year,fuel,value
0,1970,Coal,2311.126560
1,1970,Natural Gas,2255.982068
2,1970,Petroleum,3939.954943
3,1971,Coal,2187.743724
4,1971,Natural Gas,2325.650502
...,...,...,...
151,2020,Natural Gas,3300.450490
152,2020,Petroleum,4133.134512
153,2021,Coal,2017.214715
154,2021,Natural Gas,3309.710671


In [24]:
# Total emissions per sector and year: sum 'value' over every fuel that
# shares the same ('year', 'sector') pair.
sector_group = (
    state_excluded
    .groupby(by=['year', 'sector'], as_index=False)['value']
    .sum()
)

# Bare last expression so the notebook renders the per-sector totals.
sector_group

Unnamed: 0,year,sector,value
0,1970,Commercial,524.952433
1,1970,Electric Power,2122.880366
2,1970,Industrial,2640.355565
3,1970,Residential,955.538394
4,1970,Transportation,2263.336813
...,...,...,...
255,2021,Commercial,495.812665
256,2021,Electric Power,3084.080959
257,2021,Industrial,1960.369449
258,2021,Residential,652.485586
