In [11]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from pandas.tseries.offsets import DateOffset
%matplotlib inline

# Preprocessing Data

In [12]:
def states(raw_data):
    """Return an independent copy of the state, date and CO2-mass columns.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Table containing at least the columns 'State', 'Date' and
        'CO2 Mass (short tons)'.

    Returns
    -------
    pandas.DataFrame
        A copy restricted to those three columns, in that order. Editing the
        result does not modify ``raw_data``.

    Raises
    ------
    KeyError
        If any of the three columns is missing from ``raw_data``.
    """
    # The selection used to be assigned twice and then dropped, so the
    # function always returned None; hand the frame back to the caller.
    df = raw_data[['State', 'Date', 'CO2 Mass (short tons)']].copy()
    return df
    

In [13]:
def preprocessing_data(raw_data):
    """Build a date-indexed emissions frame from the raw table.

    Keeps only the 'Date' and 'CO2 Mass (short tons)' columns, renames the
    latter to 'CO2_Emission', replaces missing emission values with 0, parses
    'Date' with ``pd.to_datetime`` and uses it as the index.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Table with 'Date' and 'CO2 Mass (short tons)' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the single column 'CO2_Emission', indexed by 'Date'.
        ``raw_data`` is left untouched.
    """
    emissions = (
        raw_data[['Date', 'CO2 Mass (short tons)']]
        .rename(columns={'CO2 Mass (short tons)': 'CO2_Emission'})
        # Only the emission column is filled; missing dates stay missing.
        .fillna({'CO2_Emission': 0})
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .set_index('Date')
    )
    return emissions

In [14]:
def split_data(raw_data):
    """Split a date-indexed frame into a training part and a hold-out part.

    Rows labelled up to and including '2021-12-31' form the training part;
    rows labelled from '2022-1-1' onwards form the hold-out part. Both cuts
    are label-based slices on the index.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame whose index can be sliced with date strings.

    Returns
    -------
    tuple
        ``(train, test)`` in that order.
    """
    training_part = raw_data.loc[:'2021-12-31']
    holdout_part = raw_data.loc['2022-1-1':]
    return training_part, holdout_part

In [15]:
def preprocessing_data_to_monthly(raw_data):
    """Clean the raw table and split it into training and hold-out frames.

    Runs ``preprocessing_data`` on ``raw_data`` and feeds the result to
    ``split_data``. No resampling happens here; the rows keep whatever
    spacing the input already has.

    Returns
    -------
    tuple
        ``(train, test)`` exactly as produced by ``split_data``.
    """
    train_frame, test_frame = split_data(preprocessing_data(raw_data))
    return train_frame, test_frame

In [16]:
def preprocessing_data_to_monthly_notest(raw_data):
    """Clean the raw table without carving out a hold-out set.

    Delegates to ``preprocessing_data`` and returns its result unchanged, so
    the full history is available for fitting.
    """
    return preprocessing_data(raw_data)

In [17]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, skipping points whose actual is zero.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed values.
    y_pred : array-like of numbers
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` taken over the positions
        where ``y_true`` is non-zero. NaN when there is no such position
        (including empty input).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Converting to arrays makes the pairing positional, so a pandas Series
    # with a date index works the same as a plain list.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if actual.shape != forecast.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{actual.shape} and {forecast.shape}"
        )

    # A zero actual has no defined percentage error; leave it out of both the
    # sum and the count so the average is not skewed.
    usable = actual != 0
    if not usable.any():
        return float('nan')

    relative_error = np.abs((actual[usable] - forecast[usable]) / actual[usable])
    return float(relative_error.mean() * 100)


In [27]:
def Sarimax(raw_data, st, horizon=60):
    """Fit a seasonal ARIMA model to one state's emissions and forecast ahead.

    The raw rows are cleaned with ``preprocessing_data_to_monthly_notest``,
    ``pm.auto_arima`` picks the (p, d, q)(P, D, Q, 12) orders with a stepwise
    search, and a statsmodels ``SARIMAX`` model with those orders is fitted on
    the full history. The history is then extended by ``horizon`` monthly
    steps and the out-of-sample predictions are attached.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Rows for a single state with 'Date' and 'CO2 Mass (short tons)'.
    st : str
        State label written into the 'State' column of the result.
    horizon : int, optional
        Number of monthly steps to forecast past the last observation.
        Defaults to 60, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date' with columns 'Historical', 'Predicted' and 'State'.
        Historical rows carry the observed value; the appended future rows
        carry the forecast.

    Raises
    ------
    ValueError
        If ``horizon`` is smaller than 1.

    Notes
    -----
    As a side effect the intermediate table is written to
    'Sarimax_weekly_result.csv' in the working directory and read back.
    """
    if horizon < 1:
        raise ValueError("horizon must be at least 1")

    df = preprocessing_data_to_monthly_notest(raw_data)
    history = df['CO2_Emission']

    # Stepwise order search with a 12-step season and one seasonal difference;
    # the non-seasonal differencing order is chosen via the ADF test.
    sarimax = pm.auto_arima(history,
                            start_p=1, start_q=1,
                            test='adf',
                            max_p=3, max_q=3, m=12,
                            start_P=0, seasonal=True,
                            d=None, D=1, trace=True,
                            error_action='ignore',
                            suppress_warnings=True,
                            stepwise=True)
    model = sm.tsa.statespace.SARIMAX(history,
                                      order=sarimax.order,
                                      seasonal_order=sarimax.seasonal_order)
    results = model.fit()

    # One future label per forecast step, starting the month after the last
    # observation.
    n_obs = len(df.index)
    future_dates = [df.index[-1] + DateOffset(months=step)
                    for step in range(1, horizon + 1)]
    future_datest_df = pd.DataFrame(index=future_dates, columns=df.columns)
    future_df = pd.concat([df, future_datest_df])

    # `end` is inclusive, so n_obs .. n_obs + horizon - 1 is exactly `horizon`
    # steps. The previous bound (n_obs + 61) produced two extra forecasts that
    # had no matching row and were dropped on assignment.
    # NOTE(review): the assignment aligns on index labels, so it relies on
    # predict() returning date-labelled values; presumably that needs a date
    # index with an inferable frequency -- confirm for series with gaps.
    future_df['predicted'] = results.predict(start=n_obs,
                                             end=n_obs + horizon - 1,
                                             dynamic=True)

    # The CSV round-trip is kept as-is: it is an observable side effect and it
    # also turns the index back into a plain 'Date' column before renaming.
    # NOTE(review): the file name says "weekly" while the model uses m=12.
    future_df.to_csv('Sarimax_weekly_result.csv')
    final_df = pd.read_csv('Sarimax_weekly_result.csv')
    final_df.columns = ['Date', 'Historical', 'Predicted']
    final_df['Date'] = pd.to_datetime(final_df['Date'])
    final_df.set_index('Date', inplace=True)
    final_df['State'] = st
    return final_df
    
    


In [28]:
# Driver cell: load the source table, forecast each selected state with
# Sarimax, and write the stacked results to disk.
raw_data = pd.read_csv('monthly.csv')
dfer = raw_data.loc[:, ['Date', 'CO2 Mass (short tons)', 'State']].copy()  # keep only the columns used below

# To run every state instead of a fixed list:
#states = dfer['State'].unique()
# NOTE: this assignment rebinds `states`, which is also the name of a helper
# function defined earlier in the notebook; after this cell it is a list.
states = ['CA']

data = []
for st in states:
    data_state = dfer[dfer['State'] == st]  # rows belonging to this state only
    data.append(Sarimax(data_state, st))

final_data = pd.concat(data)
final_data.to_csv('final_data2027.csv')

Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,1)[12] intercept   : AIC=5926.655, Time=0.21 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=6052.075, Time=0.01 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=5951.481, Time=0.08 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=6001.031, Time=0.14 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=6050.758, Time=0.01 sec
 ARIMA(1,0,1)(0,1,0)[12] intercept   : AIC=5979.401, Time=0.06 sec
 ARIMA(1,0,1)(1,1,1)[12] intercept   : AIC=5927.986, Time=0.30 sec
 ARIMA(1,0,1)(0,1,2)[12] intercept   : AIC=5928.103, Time=0.45 sec
 ARIMA(1,0,1)(1,1,0)[12] intercept   : AIC=5942.512, Time=0.12 sec
 ARIMA(1,0,1)(1,1,2)[12] intercept   : AIC=5929.862, Time=0.82 sec
 ARIMA(1,0,0)(0,1,1)[12] intercept   : AIC=5933.034, Time=0.12 sec
 ARIMA(2,0,1)(0,1,1)[12] intercept   : AIC=5927.688, Time=0.32 sec
 ARIMA(1,0,2)(0,1,1)[12] intercept   : AIC=5923.898, Time=0.32 sec
 ARIMA(1,0,2)(0,1,0)[12] intercept   : AIC=5981.852, Time=0.08 sec
 ARIMA(1,0,2)(1,1,1

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Unnamed: 0,Date,CO2 Mass (short tons),State
0,2005-01-01,7828972.171,AL
1,2005-02-01,6443405.776,AL
2,2005-03-01,7296086.599,AL
3,2005-04-01,6049318.953,AL
4,2005-05-01,6848692.033,AL
...,...,...,...
10579,2022-08-01,4435549.961,WY
10580,2022-09-01,4010366.330,WY
10581,2022-10-01,3944799.551,WY
10582,2022-11-01,3458120.066,WY


In [26]:
# Write the combined forecast table to 'final_data2027.csv'.
# NOTE(review): the driver cell that builds `final_data` already performs this
# same write, and this cell's execution count (26) is lower than that cell's
# (28), so it was run out of order -- presumably redundant; confirm and remove.
final_data.to_csv('final_data2027.csv')