# Import Required Libraries
Import the necessary libraries, including pandas, numpy, statsmodels, and matplotlib.

In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Load and Preprocess the Ozone Level Detection Dataset
Load the Ozone Level Detection dataset from the UCI Machine Learning Repository and preprocess it for analysis.

In [10]:
# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/onehr.data"
df = pd.read_csv(url, header=None)

# Check the number of columns in the dataframe
num_cols = df.shape[1]
print(f'The dataframe has {num_cols} columns.')

# If the number of columns is 6, rename the columns
if num_cols == 6:
    df.columns = ['Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day']
else:
    print('The number of columns in the dataframe does not match the expected number.')

# Replace missing values '?' with NaN
df = df.replace('?', np.nan)

# Convert columns to appropriate data types
df = df.astype({'Ozone': 'float64', 'Solar.R': 'float64', 'Wind': 'float64', 'Temp': 'float64', 'Month': 'int64', 'Day': 'int64'})

# Fill missing values with column means
df.fillna(df.mean(), inplace=True)

# Display the first few rows of the dataframe
df.head()

The dataframe has 74 columns.
The number of columns in the dataframe does not match the expected number.


KeyError: "Only a column name can be used for the key in a dtype mappings argument. 'Ozone' not found in columns."

In [7]:
# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/onehr.data"
df = pd.read_csv(url, header=None)

# Define column names as per the dataset description
column_names = ['Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day']

# Assign column names to the dataframe
df.columns = column_names

# Replace missing values '?' with NaN
df = df.replace('?', np.nan)

# Convert columns to appropriate data types
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill missing values with column means
df.fillna(df.mean(), inplace=True)

# Display the first few rows of the dataframe
df.head()

ValueError: Length mismatch: Expected axis has 74 elements, new values have 6 elements

In [6]:
# Print column names
print(df.columns)

Index(['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11',
       'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20', 'F21',
       'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28', 'F29', 'F30', 'F31',
       'F32', 'F33', 'F34', 'F35', 'F36', 'F37', 'F38', 'F39', 'F40', 'F41',
       'F42', 'F43', 'F44', 'F45', 'F46', 'F47', 'F48', 'F49', 'F50', 'F51',
       'F52', 'F53', 'F54', 'F55', 'F56', 'F57', 'F58', 'F59', 'F60', 'F61',
       'F62', 'F63', 'F64', 'F65', 'F66', 'F67', 'F68', 'F69', 'F70', 'F71',
       'F72', 'F73', 'Class'],
      dtype='object')


# Implement Multivariate ARIMA Forecasting Technique
Implement the multivariate ARIMA forecasting technique on the preprocessed Ozone Level Detection dataset.

In [5]:
# Define the ARIMA model
model = ARIMA(df['Ozone'], order=(5,1,0))

# Fit the model
model_fit = model.fit(disp=0)

# Summary of the model
print(model_fit.summary())

# Plot residual errors
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()

# Density plot of the residual error values
residuals.plot(kind='kde')
plt.show()

# Print the residuals
print(residuals.describe())

KeyError: 'Ozone'

# Visualize the ARIMA Forecast
Visualize the forecasted values from the ARIMA model using matplotlib.

In [None]:
# Generate ARIMA forecast
forecast = model_fit.forecast(steps=10)

# Plot the forecasted values
plt.figure(figsize=(12,8))
plt.plot(forecast[0])
plt.title('ARIMA Forecast')
plt.xlabel('Time')
plt.ylabel('Ozone Level')
plt.show()

# Load and Preprocess the Air Quality Dataset
Load the Air Quality dataset and preprocess it for state space modelling.

In [None]:
# Load the Air Quality dataset
air_quality_url = "https://www.statapress.com/data/r12/air2.dta"
air_quality_df = pd.read_stata(air_quality_url)

# Preprocess the data for state space modelling
# Convert the 'date' column to datetime format
air_quality_df['date'] = pd.to_datetime(air_quality_df['date'])

# Set the 'date' column as the index
air_quality_df.set_index('date', inplace=True)

# Check for missing values and handle them if any
air_quality_df = air_quality_df.fillna(air_quality_df.mean())

# Display the first few rows of the dataframe
air_quality_df.head()

# Implement State Space Modelling
Implement state space modelling on the preprocessed Air Quality dataset.

In [None]:
# Define the State Space model
model = SARIMAX(air_quality_df, order=(1, 1, 1), seasonal_order=(1, 1, 1, 1))

# Fit the model
model_fit = model.fit(disp=False)

# Summary of the model
print(model_fit.summary())

# Plot the diagnostics
model_fit.plot_diagnostics(figsize=(15, 12))
plt.show()

# Generate State Space model forecast
forecast = model_fit.get_forecast(steps=10)

# Plot the forecasted values
plt.figure(figsize=(12,8))
plt.plot(forecast.predicted_mean)
plt.title('State Space Model Forecast')
plt.xlabel('Time')
plt.ylabel('Air Quality')
plt.show()

# Visualize the State Space Model
Visualize the state space model using matplotlib.

In [None]:
# Visualize the State Space Model
plt.figure(figsize=(15, 7))
plt.plot(air_quality_df, label='Original')
plt.plot(model_fit.fittedvalues, color='red', label='Fitted')
plt.title('State Space Model - Fitted Values vs Original')
plt.legend(loc='best')
plt.show()