In [38]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.neural_network import MLPRegressor
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from math import sqrt
import pickle
from sklearn.svm import SVR

In [11]:
from sklearn.preprocessing import StandardScaler

In [6]:
df = pd.read_csv("archive.csv")

In [7]:
df.head(10)

Unnamed: 0,Year,Month,Decimal Date,Carbon Dioxide (ppm),Seasonally Adjusted CO2 (ppm),Carbon Dioxide Fit (ppm),Seasonally Adjusted CO2 Fit (ppm)
0,1958,1,1958.0411,,,,
1,1958,2,1958.126,,,,
2,1958,3,1958.2027,315.69,314.42,316.18,314.89
3,1958,4,1958.2877,317.45,315.15,317.3,314.98
4,1958,5,1958.3699,317.5,314.73,317.83,315.06
5,1958,6,1958.4548,,,317.22,315.14
6,1958,7,1958.537,315.86,315.17,315.87,315.21
7,1958,8,1958.6219,314.93,316.17,314.01,315.29
8,1958,9,1958.7068,313.21,316.06,312.48,315.35
9,1958,10,1958.789,,,312.45,315.4


In [8]:
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 Year                                  0
Month                                 0
Decimal Date                          0
Carbon Dioxide (ppm)                 17
Seasonally Adjusted CO2 (ppm)        17
Carbon Dioxide Fit (ppm)             13
Seasonally Adjusted CO2 Fit (ppm)    13
dtype: int64


In [9]:
df.fillna(df.mean(), inplace=True)

In [12]:
#standardization
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.iloc[:, 3:])  # Only scaling numerical columns
df.iloc[:, 3:] = scaled_data

In [13]:
df['Carbon Dioxide (ppm)'] = df['Carbon Dioxide (ppm)'].diff().fillna(0)  # Differencing

In [14]:
df.head(10)

Unnamed: 0,Year,Month,Decimal Date,Carbon Dioxide (ppm),Seasonally Adjusted CO2 (ppm),Carbon Dioxide Fit (ppm),Seasonally Adjusted CO2 Fit (ppm)
0,1958,1,1958.0411,0.0,2.199618e-15,-2.187543e-15,-2.192768e-15
1,1958,2,1958.126,0.0,2.199618e-15,-2.187543e-15,-2.192768e-15
2,1958,3,1958.2027,-1.41579,-1.468718,-1.380566,-1.433462
3,1958,4,1958.2877,0.067923,-1.44047,-1.337464,-1.42999
4,1958,5,1958.3699,0.00193,-1.456722,-1.317068,-1.426904
5,1958,6,1958.4548,1.345937,2.199618e-15,-1.340543,-1.423818
6,1958,7,1958.537,-1.409229,-1.439696,-1.392496,-1.421118
7,1958,8,1958.6219,-0.035891,-1.401,-1.464075,-1.418032
8,1958,9,1958.7068,-0.066379,-1.405256,-1.522955,-1.415717
9,1958,10,1958.789,1.511499,2.199618e-15,-1.52411,-1.413788


In [15]:
# Split data into train and test sets (you can adjust the split ratio as needed)
train_size = int(len(df) * 0.8)
train, test = df.iloc[:train_size], df.iloc[train_size:]

# Define evaluation function
def evaluate_forecast(y_true, y_pred):
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    return rmse

In [34]:
# Define and fit ARIMA model
arima_model = ARIMA(train['Carbon Dioxide (ppm)'], order=(5,1,0)) # Example order, tune as needed
arima_model_fit = arima_model.fit()

# Forecast
arima_forecast = arima_model_fit.forecast(steps=len(test))
arima_rmse = evaluate_forecast(test['Carbon Dioxide (ppm)'], arima_forecast)

In [35]:
# 2. ANN (Multi-layer Perceptron)
# Define and fit MLPRegressor
ann_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000) # Example architecture, tune as needed
ann_model.fit(train[['Year', 'Month', 'Decimal Date', 'Seasonally Adjusted CO2 (ppm)', 'Carbon Dioxide Fit (ppm)', 'Seasonally Adjusted CO2 Fit (ppm)']], train['Carbon Dioxide (ppm)'])

# Forecast
ann_forecast = ann_model.predict(test[['Year', 'Month', 'Decimal Date', 'Seasonally Adjusted CO2 (ppm)', 'Carbon Dioxide Fit (ppm)', 'Seasonally Adjusted CO2 Fit (ppm)']])
ann_rmse = evaluate_forecast(test['Carbon Dioxide (ppm)'], ann_forecast)


In [36]:
# 3. SARIMA
# Define and fit SARIMA model
sarima_model = SARIMAX(train['Carbon Dioxide (ppm)'], order=(5,1,0), seasonal_order=(1, 1, 1, 12)) # Example orders, tune as needed
sarima_model_fit = sarima_model.fit()

# Forecast
sarima_forecast = sarima_model_fit.forecast(steps=len(test))
sarima_rmse = evaluate_forecast(test['Carbon Dioxide (ppm)'], sarima_forecast)



In [39]:
# 4. Support Vector Regression (SVR)
# Define and train SVR model
svr_model = SVR(kernel='rbf', C=100, gamma=0.1)  # Example parameters, tune as needed
svr_model.fit(train[['Year', 'Month', 'Decimal Date', 'Seasonally Adjusted CO2 (ppm)', 'Carbon Dioxide Fit (ppm)', 'Seasonally Adjusted CO2 Fit (ppm)']], train['Carbon Dioxide (ppm)'])

# Forecast
svr_forecast = svr_model.predict(test[['Year', 'Month', 'Decimal Date', 'Seasonally Adjusted CO2 (ppm)', 'Carbon Dioxide Fit (ppm)', 'Seasonally Adjusted CO2 Fit (ppm)']])
svr_rmse = evaluate_forecast(test['Carbon Dioxide (ppm)'], svr_forecast)

In [31]:
# Deploy the best models based on RMSE
best_models = {'ARIMA': arima_rmse, 'ANN': ann_rmse, 'SARIMA': sarima_rmse, 'SVR': svr_rmse}
best_models = dict(sorted(best_models.items(), key=lambda item: item[1]))

print("Best Models (RMSE):", best_models)

Best Models (RMSE): {'ARIMA': 0.18286213361658596, 'SARIMA': 0.18459880190922684, 'SVR': 0.19075694302387633, 'ANN': 0.30396603971782593}


In [42]:
# Pickle the model results
with open('model_results.pkl', 'wb') as f:
    pickle.dump({
        'arima_forecast': arima_forecast,
        'arima_rmse': arima_rmse,
        'ann_forecast': ann_forecast,
        'ann_rmse': ann_rmse,
        'sarima_forecast': sarima_forecast,
        'sarima_rmse': sarima_rmse,
        'svr_forecast': svr_forecast,
        'svr_rmse': svr_rmse,
        'test_data': test['Carbon Dioxide (ppm)']  # Include test data
    }, f)