In [240]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

%matplotlib qt

# Create learning dataset

In [241]:
dataset = pd.read_csv("../data/ready_to_use_datasets/smh.csv")

In [242]:
dataset["date"] = pd.to_datetime(dataset["date"])

In [243]:
dataset.head()
# Use only 2023 data
dataset = dataset[dataset["date"].dt.year == 2023]

In [244]:
X = dataset[["microsensor_pm25", "reference_pm25"]]
X_ref = X["reference_pm25"].values.reshape(-1, 1)
X_micro = X["microsensor_pm25"].values.reshape(-1, 1)

# Isolation forest

## Train the model

In [246]:
model = IsolationForest()
model.fit(X_ref)

## Analyse model

In [247]:
y_pred = model.predict(X_micro)
outliers = y_pred == -1
regular = y_pred == 1

In [248]:
print(f"Proportion of outliers: {round(sum(outliers) / (sum(outliers) + sum(regular)), 3)}")

Proportion of outliers: 0.39


In [249]:
# Plot the reference data, the microsensor data and color the microsensor data that are outliers
plt.figure(figsize=(10, 6))
plt.plot(dataset["date"], X_ref, color="blue", label="Reference")
plt.plot(dataset["date"], X_micro, color="green", label="Microsensor")
plt.scatter(dataset["date"][outliers], X_micro[outliers], marker='.', color="red", label="Outliers")
plt.legend()
plt.show()

## Calibrate using Ridge Regression

In [250]:
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

MSE = []
MSE_raw = []

# One week is 168 hours, so we will use a window of 4 weeks
window_size = 168*4

X_micro_calibrated = []

for i in range(0, len(X_ref), window_size):
    calibration_model = Ridge()

    t = range(i, i+window_size)
    
    if i + window_size > len(X_ref):
        t = range(i, len(X_ref))

    week_ref = X_ref[t]
    week_micro = X_micro[t]

    calibration_model.fit(week_ref, week_micro)
    y_pred = calibration_model.predict(week_ref)

    parameters = calibration_model.coef_
    bias = calibration_model.intercept_

    # Calibrate the microsensor data
    week_micro_calibrated = (week_micro - bias) / parameters[0]

    MSE.append(sum((week_ref - week_micro_calibrated) ** 2) / len(week_ref))
    MSE_raw.append(sum((week_ref - week_micro) ** 2) / len(week_ref))

    X_micro_calibrated.extend(week_micro_calibrated)

X_micro_calibrated = np.array(X_micro_calibrated)
X_micro_calibrated = X_micro_calibrated.reshape(-1, 1)

plt.plot(dataset["date"], X_ref, color="blue", label="Reference")
plt.plot(dataset["date"], X_micro, color="red", label="Microsensor")
plt.plot(dataset["date"], X_micro_calibrated, color="green", label="Microsensor Calibrated")
plt.legend()
plt.show()

print(f"Mean Squared Error: {sum(MSE) / len(MSE)}")
print(f"Mean Squared Error Raw: {sum(MSE_raw) / len(MSE_raw)}")

# Plot the MSE
plt.figure(figsize=(10, 6))
plt.plot(range(len(MSE)), MSE)
plt.plot(range(len(MSE_raw)), MSE_raw)
plt.show()


Mean Squared Error: [10.02824697]
Mean Squared Error Raw: [44.56150317]


In [259]:
y_pred = model.predict(X_micro_calibrated)

window_size = 168*4

for i in range(0, len(X_ref), window_size):
    calibration_model = Ridge()

    t = range(i, i+window_size)
    
    if i + window_size > len(X_ref):
        t = range(i, len(X_ref))

    window_X_ref = X_ref[t]
    window_X_micro_calibrated = X_micro_calibrated[t]

    model = IsolationForest()
    model.fit(window_X_ref)

    y_pred = model.predict(window_X_micro_calibrated)

    outliers = y_pred == -1
    regular = y_pred == 1

    print(f"Proportion of outliers: {round(sum(outliers) / (sum(outliers) + sum(regular)), 3)}")

Proportion of outliers: 0.318
Proportion of outliers: 0.406
Proportion of outliers: 0.445
Proportion of outliers: 0.339
Proportion of outliers: 0.369
Proportion of outliers: 0.277
Proportion of outliers: 0.39
Proportion of outliers: 0.5
Proportion of outliers: 0.342
Proportion of outliers: 0.558
Proportion of outliers: 0.464
Proportion of outliers: 0.271
Proportion of outliers: 0.387


In [254]:
print(f"Proportion of outliers: {round(sum(outliers) / (sum(outliers) + sum(regular)), 3)}")

Proportion of outliers: 0.296


In [253]:
# Plot the reference data, the microsensor data and color the microsensor data that are outliers
plt.figure(figsize=(10, 6))
plt.plot(dataset["date"], X_ref, color="blue", label="Reference")
plt.plot(dataset["date"], X_micro_calibrated, color="green", label="Microsensor Calibrated")
plt.scatter(dataset["date"][outliers], X_micro_calibrated[outliers], marker='.', color="red", label="Outliers")
plt.legend()
plt.show()