In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt
import numpy as np

%matplotlib qt

In [None]:
# Load the dataset and parse the timestamp column.
dataset = pd.read_csv("../data/ready_to_use_datasets/smh.csv")

dataset["date"] = pd.to_datetime(dataset["date"])

# Use only 2023 data.
# reset_index gives the filtered frame its own 0..n-1 row labels and makes it an
# independent frame instead of a slice of the full table. That keeps it aligned with
# the positionally indexed NumPy arrays derived from it, and lets new columns be
# added without a SettingWithCopyWarning.
dataset = dataset[dataset["date"].dt.year == 2023].reset_index(drop=True)

In [None]:
# Preview the first five rows to sanity-check the columns after filtering.
dataset.head(n=5)

Unnamed: 0,date,microsensor_pm25,hourofday,monthofyear,season,reference_pm25,temperature,humidity
0,2023-01-01 00:00:00,23.241923,0,1,Winter,15.2,10.7,99.9
1,2023-01-01 01:00:00,23.285335,1,1,Winter,15.9,10.16,99.9
2,2023-01-01 02:00:00,21.687298,2,1,Winter,15.2,10.14,99.9
3,2023-01-01 03:00:00,19.969197,3,1,Winter,13.6,10.05,99.9
4,2023-01-01 04:00:00,16.398725,4,1,Winter,11.8,9.77,99.9


In [None]:
# Keep the two pm25 series side by side, then expose each one as an (n, 1) column
# vector (double-bracket selection keeps the column axis).
X = dataset[["microsensor_pm25", "reference_pm25"]]
X_ref = X[["reference_pm25"]].to_numpy()
X_micro = X[["microsensor_pm25"]].to_numpy()

In [None]:
# Squared difference between reference and microsensor for every sample,
# drawn over time on the current axes.
X_MSE = np.square(X_ref - X_micro)

ax = plt.gca()
ax.plot(dataset["date"], X_MSE)
ax.set_title("MSE of the dataset")
plt.show()

In [None]:
# Squared error split by season: one line per season on a shared figure, and the
# seasonal mean printed to stdout.
plt.figure()


def _season_error(season):
    """Plot the squared error of one season on the current figure and print its mean.

    Returns the boolean row mask of the season and the matching rows of X_MSE.
    """
    mask = dataset["season"] == season
    errors = X_MSE[mask]
    plt.plot(dataset[mask]["date"], errors, label=season)
    print(f"{season} MSE: ", np.mean(errors))
    return mask, errors


winter_idxs, winter_MSE = _season_error("Winter")
spring_idxs, spring_MSE = _season_error("Spring")
summer_idxs, summer_MSE = _season_error("Summer")
autumn_idxs, autumn_MSE = _season_error("Autumn")

plt.title("MSE of the dataset per season")
plt.xlabel("Date")
plt.ylabel("MSE")
plt.legend()
plt.show()

Winter MSE:  98.0213573205415
Spring MSE:  25.953008010483625
Summer MSE:  29.528723762804642
Autumn MSE:  24.76858801648947


In [None]:
## Analyse the correlation between the squared error and the temperature


def scatter_error_vs_temperature(ax, season):
    """Scatter the squared error of one season against the temperature on `ax`.

    Returns the boolean row mask of the season, the matching rows of X_MSE and
    the matching temperature values.
    """
    season_idxs = dataset["season"] == season
    season_MSE = X_MSE[season_idxs]
    season_temperature = dataset.loc[season_idxs, "temperature"]
    ax.scatter(season_temperature, season_MSE, label=season, alpha=0.5)
    return season_idxs, season_MSE, season_temperature


fig, ax = plt.subplots()

# Only Winter is drawn. Call the helper again with "Spring", "Summer" or "Autumn"
# to overlay the other seasons on the same axes.
winter_idxs, winter_MSE, winter_temperature = scatter_error_vs_temperature(ax, "Winter")

ax.set_title("Squared error vs. temperature")
ax.set_xlabel("Temperature")
ax.set_ylabel("Squared error")
ax.legend()
plt.show()

<matplotlib.collections.PathCollection at 0x2199eb1a790>

In [None]:
## Analyse the correlation between the squared error and the hour of the day

# Add the hour as a column by building a new frame rather than assigning into
# `dataset` in place (assigning into a filtered slice raises SettingWithCopyWarning).
dataset = dataset.assign(hour=dataset["date"].dt.hour)

# X_MSE is a positional NumPy array. Give it the frame's row labels so that groupby,
# which aligns a Series key on the index, pairs each squared error with the hour of
# the same row.
squared_error = pd.Series(X_MSE.ravel(), index=dataset.index)

# Group by hour of the day and calculate the mean squared error for each hour
hourly_MSE = squared_error.groupby(dataset["hour"]).mean()

fig, ax = plt.subplots()
ax.plot(hourly_MSE.index, hourly_MSE)
ax.set_title("MSE per Hour of the Day")
ax.set_xlabel("Hour of the Day")
ax.set_ylabel("Mean Squared Error")
plt.show()

# Check for correlation.
# Series.corr also aligns on the index, so the hour values reuse hourly_MSE's own
# index instead of a fresh 0..k-1 range; that stays correct if an hour is missing.
hours = pd.Series(hourly_MSE.index, index=hourly_MSE.index)
correlation = hourly_MSE.corr(hours)
print("Correlation between hour of the day and MSE:", correlation)

Correlation between hour of the day and MSE: -0.29283176635816116


In [None]:
# Block-wise calibration of the microsensor against the reference.
# The series is cut into consecutive, non-overlapping windows. In each window a Ridge
# model  microsensor = slope * reference + intercept  is fitted and then inverted to
# map the microsensor readings onto the reference scale.

MSE = []      # per-window mean squared error, calibrated microsensor vs. reference
MSE_raw = []  # per-window mean squared error, raw microsensor vs. reference

# One week is 168 hours, so we will use a window of 4 weeks
window_size = 168 * 4

calibrated_windows = []

for i in range(0, len(X_ref), window_size):
    # Slicing stops at the end of the array, so the last window may be shorter.
    week_ref = X_ref[i:i + window_size]
    week_micro = X_micro[i:i + window_size]

    calibration_model = Ridge()
    calibration_model.fit(week_ref, week_micro)

    parameters = calibration_model.coef_
    bias = calibration_model.intercept_
    # One feature and one target: reduce both fitted arrays to plain scalars.
    slope = float(np.ravel(parameters)[0])
    intercept = float(np.ravel(bias)[0])

    # Calibrate the microsensor data by inverting the fitted line.
    if np.isclose(slope, 0.0):
        # A flat fit (e.g. a one-sample window) cannot be inverted without dividing
        # by zero; keep the raw readings for this window.
        week_micro_calibrated = week_micro.copy()
    else:
        week_micro_calibrated = (week_micro - intercept) / slope

    MSE.append(float(np.mean((week_ref - week_micro_calibrated) ** 2)))
    MSE_raw.append(float(np.mean((week_ref - week_micro) ** 2)))

    calibrated_windows.append(week_micro_calibrated)

X_micro_calibrated = np.concatenate(calibrated_windows).reshape(-1, 1)

fig, ax = plt.subplots(1, 1, figsize=(10, 6))
ax.plot(dataset["date"], X_ref, color="blue", label="Reference")
ax.plot(dataset["date"], X_micro, color="red", label="Microsensor")
ax.plot(dataset["date"], X_micro_calibrated, color="green", label="Microsensor Calibrated")
ax.set_title("Block-wise Ridge calibration")
ax.set_xlabel("Date")
ax.set_ylabel("pm25")
ax.legend()
plt.show()

# Unweighted average of the per-window values (a shorter last window counts as much
# as a full one).
print(f"Mean Squared Error: {np.mean(MSE)}")
print(f"Mean Squared Error Raw: {np.mean(MSE_raw)}")

# Plot the MSE
plt.figure(figsize=(10, 6))
plt.plot(range(len(MSE)), MSE, label="Calibrated")
plt.plot(range(len(MSE_raw)), MSE_raw, label="Raw")
plt.title("MSE per calibration window")
plt.xlabel("Window")
plt.ylabel("Mean Squared Error")
plt.legend()
plt.show()

Mean Squared Error: [67.03161039]
Mean Squared Error Raw: [16.34791664]


## Residual analysis using Ridge regression

In [None]:
# Sliding-window calibration of the microsensor against the reference.
# For every sample i a Ridge model  microsensor = slope * reference + intercept  is
# fitted on the trailing window that ends at (and includes) sample i. The fitted line
# is inverted to calibrate sample i, so position i of X_micro_calibrated refers to
# the same timestamp as position i of X_ref and X_micro.

MSE = []      # per-window mean squared error, calibrated microsensor vs. reference
MSE_raw = []  # per-window mean squared error, raw microsensor vs. reference

# One week is 168 hours, so we will use a window of 4 weeks
window_size = 168 * 4
# Heuristic: with fewer samples than this the fitted line is not trusted and the raw
# reading is kept instead. The first samples only have a short history.
min_samples = 24

bias = []        # fitted intercept of every window
parameters = []  # fitted coefficient of every window
residuals = []   # mean squared residual of every fit (observed vs. fitted microsensor)

calibrated_values = []

for i in range(len(X_ref)):
    # Trailing window; it is shorter than window_size for the first samples only.
    start = max(0, i - window_size + 1)
    x_ref = X_ref[start:i + 1]
    x_micro = X_micro[start:i + 1]

    calibration_model = Ridge()
    calibration_model.fit(x_ref, x_micro)
    y_pred = calibration_model.predict(x_ref)

    parameters.append(calibration_model.coef_)
    bias.append(calibration_model.intercept_)

    # The model predicts the microsensor reading, so the residual is measured
    # against the observed microsensor reading.
    residuals.append(float(np.mean((x_micro - y_pred) ** 2)))

    # One feature and one target: reduce both fitted arrays to plain scalars.
    slope = float(np.ravel(parameters[-1])[0])
    intercept = float(np.ravel(bias[-1])[0])

    # Calibrate the microsensor data by inverting the fitted line.
    if len(x_ref) < min_samples or np.isclose(slope, 0.0):
        # Too little data, or a flat fit that cannot be inverted without dividing
        # by zero: keep the raw readings.
        x_micro_calibrated = x_micro.copy()
    else:
        x_micro_calibrated = (x_micro - intercept) / slope

    MSE.append(float(np.mean((x_ref - x_micro_calibrated) ** 2)))
    MSE_raw.append(float(np.mean((x_ref - x_micro) ** 2)))

    # The last row of the window is sample i itself.
    calibrated_values.append(x_micro_calibrated[-1, 0])

X_micro_calibrated = np.array(calibrated_values).reshape(-1, 1)

fig, ax = plt.subplots(1, 1, figsize=(10, 6))
ax.plot(dataset["date"], X_ref, color="blue", label="Reference")
ax.plot(dataset["date"], X_micro, color="red", label="Microsensor")
ax.plot(dataset["date"], X_micro_calibrated, color="green", label="Microsensor Calibrated")
ax.set_title("Sliding-window Ridge calibration")
ax.set_xlabel("Date")
ax.set_ylabel("pm25")
ax.legend()
plt.show()

print(f"Mean Squared Error: {np.mean(MSE)}")
print(f"Mean Squared Error Raw: {np.mean(MSE_raw)}")

# Plot the MSE
plt.figure(figsize=(10, 6))
plt.plot(range(len(MSE)), MSE, label="Calibrated")
plt.plot(range(len(MSE_raw)), MSE_raw, label="Raw")
plt.title("MSE per sliding window")
plt.xlabel("Window end (sample index)")
plt.ylabel("Mean Squared Error")
plt.legend()
plt.show()

Mean Squared Error: [nan]
Mean Squared Error Raw: [54.52165416]


  x_micro_calibrated = (x_micro - bias[-1]) / parameters[-1]


## Forecast and validate with the ARIMA model

In [None]:
# Validate microsensor readings against a one-step-ahead ARIMA forecast of the
# reference. For every window start i the model is fitted on
# X_ref[i : i + window_size] and forecasts the sample right after the window
# (index i + window_size). The reference, the raw microsensor reading and the
# calibrated microsensor reading at that same index are then compared with the
# forecast and its confidence interval.

MSE = []      # squared error of the ARIMA forecast vs. the reference
MSE_raw = []  # squared error of the calibrated microsensor vs. the reference

condition = []             # 1 if the raw reading lies inside the interval, else -1
condition_calibrated = []  # same check for the calibrated reading

X_ARIMA = []
target_idxs = []  # index of the sample each forecast refers to

# ARIMA order (p, d, q)
p = 8
d = 0
q = 0

window_size = 168
step = 1
fit_update = True  # not read anywhere in this cell

# Only the first len(X_ref) // 24 window starts are evaluated, and no window may
# start so late that its forecast target falls past the end of the data. Windows
# that cannot hold more than p samples are skipped altogether.
n_starts = min(len(X_ref) // 24, len(X_ref) - window_size) if window_size > p else 0

for i in range(0, n_starts, step):
    target = i + window_size
    x_ref = X_ref[i:target]

    model = ARIMA(x_ref, order=(p, d, q))
    model_fit = model.fit()

    # Forecast the next value
    results = model_fit.get_forecast(steps=1)
    conf_int = results.conf_int()
    forecast = float(np.ravel(results.predicted_mean)[0])
    lower_bound, upper_bound = np.asarray(conf_int)[0]

    # Readings taken at the forecast target, i.e. the timestamp being predicted
    reference_reading = float(X_ref[target, 0])
    micro_reading = float(X_micro[target, 0])
    micro_reading_calibrated = float(X_micro_calibrated[target, 0])

    # Check if the microsensor reading is within the confidence interval.
    # A NaN reading compares False and is therefore flagged as invalid.
    condition.append(1 if lower_bound <= micro_reading <= upper_bound else -1)
    condition_calibrated.append(
        1 if lower_bound <= micro_reading_calibrated <= upper_bound else -1
    )

    MSE.append((reference_reading - forecast) ** 2)
    MSE_raw.append((reference_reading - micro_reading_calibrated) ** 2)

    X_ARIMA.append(forecast)
    target_idxs.append(target)

X_ARIMA = np.array(X_ARIMA).reshape(-1, 1)
target_idxs = np.array(target_idxs, dtype=int)

plt.figure(figsize=(10, 6))
plt.plot(range(len(MSE)), MSE, label="ARIMA")
plt.plot(range(len(MSE)), MSE_raw, label="Calibrated")
plt.title("Squared error per forecast")
plt.xlabel("Forecast number")
plt.ylabel("Squared error")
plt.legend()
plt.show()

print(f"Mean Squared Error: {np.mean(MSE)}")
print(f"Mean Squared Error Calibrated: {np.mean(MSE_raw)}")

# Every series is drawn at the forecast targets, so a given x position refers to
# the same sample on all curves and markers.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(target_idxs, X_ref[target_idxs], color="blue", label="Reference")
ax.plot(target_idxs, X_micro[target_idxs], color="red", label="Microsensor")
ax.plot(target_idxs, X_micro_calibrated[target_idxs], color="green", label="Microsensor Calibrated")
ax.plot(target_idxs, X_ARIMA, color="purple", label="ARIMA")

# Put a marker (dot) on the invalid measurements
invalid_raw = target_idxs[np.array(condition) == -1]
invalid_calibrated = target_idxs[np.array(condition_calibrated) == -1]
ax.scatter(invalid_raw, X_micro[invalid_raw], marker=".", color="black",
           label="Invalid Measurement", zorder=3)
ax.scatter(invalid_calibrated, X_micro_calibrated[invalid_calibrated], marker=".",
           color="black", zorder=3)

ax.set_title("One-step ARIMA forecast vs. measurements")
ax.set_xlabel("Sample index")
ax.set_ylabel("pm25")
ax.legend()
plt.show()


Mean Squared Error: [0.88573398]
Mean Squared Error Raw: [2.55049177]
