In [1]:
#imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.weightstats import ttest_ind
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.stats.api as sms
import statsmodels.api as sm

In [13]:
# Load the three input tables used by the rest of the notebook, in the same
# order as the target names on the left-hand side.
pema_beacon_df, low_rsd_df, pema_reference_df = map(
    pd.read_csv,
    (
        "data/pema_beacon.csv",
        "data/lowest_rsd_hours.csv",
        "data/pema_raw_filtered.csv",
    ),
)

In [8]:
# Preview the first five rows of the beacon table.
pema_beacon_df.head(n=5)

Unnamed: 0,local_timestamp,epoch,datetime,node_file_id,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,node_id
0,2024-12-17 00:00:00,1734422000.0,2024-12-17 08:00:00,5298509,-0.00042,0.029,0.0258,81.23332,14.60615,271
1,2024-12-17 00:01:00,1734422000.0,2024-12-17 08:01:00,5298509,-0.00055,0.03017,0.02645,81.06122,14.62692,271
2,2024-12-17 00:02:00,1734423000.0,2024-12-17 08:02:00,5298509,-0.00049,0.02977,0.02618,81.06264,14.62857,271
3,2024-12-17 00:03:00,1734423000.0,2024-12-17 08:03:00,5298509,-9e-05,0.0289,0.02612,81.06138,14.63,271
4,2024-12-17 00:04:00,1734423000.0,2024-12-17 08:04:00,5298509,-0.00044,0.02998,0.02639,81.05814,14.62615,271


In [9]:
# Preview the first five rows of the low-RSD hours table.
low_rsd_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,datetime_utc,o3_dpw,o3_pema,o3_pha,mean_o3,std_o3,rsd_o3
0,253,2024-12-27 13:00:00+00:00,23.992583,24.4885,23.827567,24.102883,0.343995,0.014272
1,330,2024-12-30 18:00:00+00:00,40.206917,40.636783,40.044233,40.295978,0.30615,0.007598
2,331,2024-12-30 19:00:00+00:00,39.086867,39.6603,39.2636,39.336922,0.293664,0.007465
3,332,2024-12-30 20:00:00+00:00,38.9671,38.579417,37.9759,38.507472,0.499501,0.012972
4,351,2024-12-31 15:00:00+00:00,42.828767,41.78015,42.596633,42.40185,0.550776,0.012989


In [10]:
# Trim the beacon table to the columns used below and standardise its
# timestamp column: "datetime" becomes a tz-aware UTC "datetime_utc".
#
# errors="ignore" makes the cell safe to re-run: on a second execution the
# listed columns are already gone, and a plain drop() would raise KeyError.
# rename() already ignores a missing "datetime" key, and re-parsing an
# already-parsed column is a no-op, so the whole cell is idempotent.
pema_beacon_df = pema_beacon_df.drop(
    columns=["local_timestamp", "epoch", "node_file_id", "node_id"],
    errors="ignore",
)
pema_beacon_df = pema_beacon_df.rename(columns={"datetime": "datetime_utc"})
pema_beacon_df["datetime_utc"] = pd.to_datetime(pema_beacon_df["datetime_utc"], utc=True)

pema_beacon_df.head()


Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp
0,2024-12-17 08:00:00+00:00,-0.00042,0.029,0.0258,81.23332,14.60615
1,2024-12-17 08:01:00+00:00,-0.00055,0.03017,0.02645,81.06122,14.62692
2,2024-12-17 08:02:00+00:00,-0.00049,0.02977,0.02618,81.06264,14.62857
3,2024-12-17 08:03:00+00:00,-9e-05,0.0289,0.02612,81.06138,14.63
4,2024-12-17 08:04:00+00:00,-0.00044,0.02998,0.02639,81.05814,14.62615


In [12]:
# Keep only the minute-level beacon rows whose hour appears in low_rsd_df.
pema_beacon_df["datetime_utc"] = pd.to_datetime(pema_beacon_df["datetime_utc"], utc=True)
low_rsd_df["datetime_utc"] = pd.to_datetime(low_rsd_df["datetime_utc"], utc=True)

# Hour bucket for every beacon row, kept as a standalone Series so that no
# helper column is written into the shared pema_beacon_df (it would otherwise
# leak into every later cell and export that uses that frame).
beacon_hour = pema_beacon_df["datetime_utc"].dt.floor("h")

low_rsd_hours = set(low_rsd_df["datetime_utc"])

pema_low_rsd_minutely = (
    pema_beacon_df[beacon_hour.isin(low_rsd_hours)]
    # Defensive: a kernel that ran an older version of this cell may still
    # carry an "hour" column on pema_beacon_df; never let it through.
    .drop(columns="hour", errors="ignore")
    .copy()
)

pema_low_rsd_minutely.head()


Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp
14700,2024-12-27 13:00:00+00:00,0.00727,0.07954,0.01004,58.35494,1.13462
14701,2024-12-27 13:01:00+00:00,0.00705,0.07965,0.01076,56.66053,1.56385
14702,2024-12-27 13:02:00+00:00,0.00622,0.0801,0.01014,53.99303,2.23571
14703,2024-12-27 13:03:00+00:00,0.00649,0.0773,0.00981,53.92195,2.25231
14704,2024-12-27 13:04:00+00:00,0.00609,0.07664,0.00942,53.85045,2.275


In [15]:
# Attach a reference ozone value ("true_o3") to each low-RSD beacon minute.
#
# Parse the timestamps BEFORE sorting: merge_asof requires both frames to be
# sorted chronologically on the key, and sorting the raw CSV strings only
# gives lexicographic order.
pema_low_rsd_minutely["datetime_utc"] = pd.to_datetime(pema_low_rsd_minutely["datetime_utc"], utc=True)
pema_reference_df["datetime_utc"] = pd.to_datetime(pema_reference_df["datetime_utc"], utc=True)

pema_low_rsd_minutely = pema_low_rsd_minutely.sort_values("datetime_utc")
pema_reference_df = pema_reference_df.sort_values("datetime_utc")

# rename() ignores a missing "o3" key, so re-running this cell is harmless.
pema_reference_df = pema_reference_df.rename(columns={"o3": "true_o3"})

# Nearest reference reading in either direction, at most one minute away;
# beacon rows with no reading inside that window get NaN for true_o3.
pema_labeled_df = pd.merge_asof(
    pema_low_rsd_minutely,
    pema_reference_df[["datetime_utc", "true_o3"]],
    on="datetime_utc",
    direction="nearest",
    tolerance=pd.Timedelta("1min")
)

# Discard beacon rows that found no reference match.
pema_labeled_df = pema_labeled_df.dropna(subset=["true_o3"])


In [16]:
# Drop every row that has a missing value in any column, make sure the
# timestamp is tz-aware UTC, and derive the hour of day (0-23, UTC) as an
# extra column appended at the end of the frame.
pema_labeled_df = pema_labeled_df.dropna().assign(
    datetime_utc=lambda frame: pd.to_datetime(frame["datetime_utc"], utc=True),
    hour=lambda frame: frame["datetime_utc"].dt.hour,
)

# Quick look at the timestamp, derived hour and label.
pema_labeled_df.loc[:, ["datetime_utc", "hour", "true_o3"]].head()


Unnamed: 0,datetime_utc,hour,true_o3
0,2024-12-27 13:00:00+00:00,13,22.49
1,2024-12-27 13:01:00+00:00,13,22.923
2,2024-12-27 13:02:00+00:00,13,22.529
3,2024-12-27 13:03:00+00:00,13,23.365
4,2024-12-27 13:04:00+00:00,13,22.543


In [17]:
# Write the labeled low-RSD minutes to disk as the training set.
# NOTE: with the default index=True the row index is written as an unnamed
# leading column, which shows up as "Unnamed: 0" when the file is read back.
pema_labeled_df.to_csv(path_or_buf="data/training_data.csv")

In [18]:
# Preview the first five rows of the beacon table as it currently stands.
pema_beacon_df.head(n=5)

Unnamed: 0,local_timestamp,epoch,datetime,node_file_id,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,node_id
0,2024-12-17 00:00:00,1734422000.0,2024-12-17 08:00:00,5298509,-0.00042,0.029,0.0258,81.23332,14.60615,271
1,2024-12-17 00:01:00,1734422000.0,2024-12-17 08:01:00,5298509,-0.00055,0.03017,0.02645,81.06122,14.62692,271
2,2024-12-17 00:02:00,1734423000.0,2024-12-17 08:02:00,5298509,-0.00049,0.02977,0.02618,81.06264,14.62857,271
3,2024-12-17 00:03:00,1734423000.0,2024-12-17 08:03:00,5298509,-9e-05,0.0289,0.02612,81.06138,14.63,271
4,2024-12-17 00:04:00,1734423000.0,2024-12-17 08:04:00,5298509,-0.00044,0.02998,0.02639,81.05814,14.62615,271


In [19]:
# Preview the first five rows of the reference table.
pema_reference_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,datetime_utc,device_state,true_o3
225593,225643,2024-12-17 00:00:02+00:00,ACTIVE,37.36
225592,225642,2024-12-17 00:01:02+00:00,ACTIVE,37.75
225591,225641,2024-12-17 00:02:02+00:00,ACTIVE,36.945
225590,225640,2024-12-17 00:03:02+00:00,ACTIVE,37.742
225589,225639,2024-12-17 00:04:02+00:00,ACTIVE,36.945


In [23]:
# Build the testing set: every beacon minute that has a reference reading
# within 5 seconds of it.
#
# Re-read the beacon CSV here. An earlier cell renames "datetime" to
# "datetime_utc" and drops several columns from pema_beacon_df, so on a
# top-to-bottom run the "datetime" lookup below would raise KeyError; it only
# worked when the load cell had been re-executed out of order.
pema_beacon_df = pd.read_csv("data/pema_beacon.csv")

pema_beacon_df["datetime_utc"] = pd.to_datetime(pema_beacon_df["datetime"], utc=True)
pema_reference_df["datetime_utc"] = pd.to_datetime(pema_reference_df["datetime_utc"], utc=True)

# merge_asof requires both frames to be sorted on the key.
pema_beacon_df = pema_beacon_df.sort_values("datetime_utc")
pema_reference_df = pema_reference_df.sort_values("datetime_utc")

# Expects pema_reference_df to already carry the "true_o3" column (renamed
# from "o3" in the training-label cell).
pema_test_df = pd.merge_asof(
    pema_beacon_df,
    pema_reference_df[["datetime_utc", "true_o3"]],
    on="datetime_utc",
    direction="nearest",
    tolerance=pd.Timedelta(seconds=5)
)

# Discard beacon rows with no reference reading inside the tolerance window.
pema_test_df = pema_test_df.dropna(subset=["true_o3"])

pema_test_df.to_csv("data/testing_data.csv")


In [24]:
# Preview the first five rows of the merged testing set.
pema_test_df.head(n=5)

Unnamed: 0,local_timestamp,epoch,datetime,node_file_id,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,node_id,datetime_utc,true_o3
0,2024-12-17 00:00:00,1734422000.0,2024-12-17 08:00:00,5298509,-0.00042,0.029,0.0258,81.23332,14.60615,271,2024-12-17 08:00:00+00:00,45.07
1,2024-12-17 00:01:00,1734422000.0,2024-12-17 08:01:00,5298509,-0.00055,0.03017,0.02645,81.06122,14.62692,271,2024-12-17 08:01:00+00:00,44.679
2,2024-12-17 00:02:00,1734423000.0,2024-12-17 08:02:00,5298509,-0.00049,0.02977,0.02618,81.06264,14.62857,271,2024-12-17 08:02:00+00:00,44.679
3,2024-12-17 00:03:00,1734423000.0,2024-12-17 08:03:00,5298509,-9e-05,0.0289,0.02612,81.06138,14.63,271,2024-12-17 08:03:00+00:00,43.515
4,2024-12-17 00:04:00,1734423000.0,2024-12-17 08:04:00,5298509,-0.00044,0.02998,0.02639,81.05814,14.62615,271,2024-12-17 08:04:00+00:00,44.687
