In [50]:
#imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.weightstats import ttest_ind
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.stats.api as sms
import statsmodels.api as sm

## Setting up PEMA testing and training

In [51]:
pema_beacon_df = pd.read_csv("data/pema_beacon.csv")
low_rsd_df = pd.read_csv("data/lowest_rsd_hours.csv")
pema_reference_df = pd.read_csv("data/pema_raw_filtered.csv")

In [52]:
pema_beacon_df.head()

Unnamed: 0,local_timestamp,epoch,datetime,node_file_id,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,node_id
0,2024-12-17 00:00:00,1734422000.0,2024-12-17 08:00:00,5298509,-0.00042,0.029,0.0258,81.23332,14.60615,271
1,2024-12-17 00:01:00,1734422000.0,2024-12-17 08:01:00,5298509,-0.00055,0.03017,0.02645,81.06122,14.62692,271
2,2024-12-17 00:02:00,1734423000.0,2024-12-17 08:02:00,5298509,-0.00049,0.02977,0.02618,81.06264,14.62857,271
3,2024-12-17 00:03:00,1734423000.0,2024-12-17 08:03:00,5298509,-9e-05,0.0289,0.02612,81.06138,14.63,271
4,2024-12-17 00:04:00,1734423000.0,2024-12-17 08:04:00,5298509,-0.00044,0.02998,0.02639,81.05814,14.62615,271


In [53]:
low_rsd_df.head()

Unnamed: 0.1,Unnamed: 0,datetime_utc,o3_dpw,o3_pema,o3_pha,mean_o3,std_o3,rsd_o3
0,253,2024-12-27 13:00:00+00:00,23.992583,24.4885,23.827567,24.102883,0.343995,0.014272
1,330,2024-12-30 18:00:00+00:00,40.206917,40.636783,40.044233,40.295978,0.30615,0.007598
2,331,2024-12-30 19:00:00+00:00,39.086867,39.6603,39.2636,39.336922,0.293664,0.007465
3,332,2024-12-30 20:00:00+00:00,38.9671,38.579417,37.9759,38.507472,0.499501,0.012972
4,351,2024-12-31 15:00:00+00:00,42.828767,41.78015,42.596633,42.40185,0.550776,0.012989


In [54]:
pema_beacon_df = pema_beacon_df.drop(columns=["local_timestamp", "epoch", "node_file_id", "node_id"])
pema_beacon_df = pema_beacon_df.rename(columns={"datetime": "datetime_utc"})
pema_beacon_df["datetime_utc"] = pd.to_datetime(pema_beacon_df["datetime_utc"], utc=True)

pema_beacon_df.head()


Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp
0,2024-12-17 08:00:00+00:00,-0.00042,0.029,0.0258,81.23332,14.60615
1,2024-12-17 08:01:00+00:00,-0.00055,0.03017,0.02645,81.06122,14.62692
2,2024-12-17 08:02:00+00:00,-0.00049,0.02977,0.02618,81.06264,14.62857
3,2024-12-17 08:03:00+00:00,-9e-05,0.0289,0.02612,81.06138,14.63
4,2024-12-17 08:04:00+00:00,-0.00044,0.02998,0.02639,81.05814,14.62615


In [55]:
pema_beacon_df["datetime_utc"] = pd.to_datetime(pema_beacon_df["datetime_utc"], utc=True)
low_rsd_df["datetime_utc"] = pd.to_datetime(low_rsd_df["datetime_utc"], utc=True)

pema_beacon_df["hour"] = pema_beacon_df["datetime_utc"].dt.floor("h")

low_rsd_hours = set(low_rsd_df["datetime_utc"])

pema_low_rsd_minutely = pema_beacon_df[pema_beacon_df["hour"].isin(low_rsd_hours)].copy()

pema_low_rsd_minutely.drop(columns="hour", inplace=True)

pema_low_rsd_minutely.head()


Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp
14700,2024-12-27 13:00:00+00:00,0.00727,0.07954,0.01004,58.35494,1.13462
14701,2024-12-27 13:01:00+00:00,0.00705,0.07965,0.01076,56.66053,1.56385
14702,2024-12-27 13:02:00+00:00,0.00622,0.0801,0.01014,53.99303,2.23571
14703,2024-12-27 13:03:00+00:00,0.00649,0.0773,0.00981,53.92195,2.25231
14704,2024-12-27 13:04:00+00:00,0.00609,0.07664,0.00942,53.85045,2.275


In [56]:
pema_low_rsd_minutely = pema_low_rsd_minutely.sort_values("datetime_utc")
pema_reference_df = pema_reference_df.sort_values("datetime_utc")

pema_low_rsd_minutely["datetime_utc"] = pd.to_datetime(pema_low_rsd_minutely["datetime_utc"], utc=True)
pema_reference_df["datetime_utc"] = pd.to_datetime(pema_reference_df["datetime_utc"], utc=True)

pema_reference_df = pema_reference_df.rename(columns={"o3": "true_o3"})

pema_labeled_df = pd.merge_asof(
    pema_low_rsd_minutely,
    pema_reference_df[["datetime_utc", "true_o3"]],
    on="datetime_utc",
    direction="nearest",
    tolerance=pd.Timedelta("1min")
)

pema_labeled_df = pema_labeled_df.dropna(subset=["true_o3"])


In [57]:
pema_labeled_df = pema_labeled_df.dropna().copy()

pema_labeled_df["datetime_utc"] = pd.to_datetime(pema_labeled_df["datetime_utc"], utc=True)

pema_labeled_df["hour"] = pema_labeled_df["datetime_utc"].dt.hour

pema_labeled_df[["datetime_utc", "hour", "true_o3"]].head()


Unnamed: 0,datetime_utc,hour,true_o3
0,2024-12-27 13:00:00+00:00,13,22.49
1,2024-12-27 13:01:00+00:00,13,22.923
2,2024-12-27 13:02:00+00:00,13,22.529
3,2024-12-27 13:03:00+00:00,13,23.365
4,2024-12-27 13:04:00+00:00,13,22.543


In [58]:
pema_labeled_df.to_csv("data/pema_training_data.csv")

In [59]:
pema_beacon_df.head()

Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,hour
0,2024-12-17 08:00:00+00:00,-0.00042,0.029,0.0258,81.23332,14.60615,2024-12-17 08:00:00+00:00
1,2024-12-17 08:01:00+00:00,-0.00055,0.03017,0.02645,81.06122,14.62692,2024-12-17 08:00:00+00:00
2,2024-12-17 08:02:00+00:00,-0.00049,0.02977,0.02618,81.06264,14.62857,2024-12-17 08:00:00+00:00
3,2024-12-17 08:03:00+00:00,-9e-05,0.0289,0.02612,81.06138,14.63,2024-12-17 08:00:00+00:00
4,2024-12-17 08:04:00+00:00,-0.00044,0.02998,0.02639,81.05814,14.62615,2024-12-17 08:00:00+00:00


In [60]:
pema_reference_df.head()

Unnamed: 0.1,Unnamed: 0,datetime_utc,device_state,true_o3
225593,225643,2024-12-17 00:00:02+00:00,ACTIVE,37.36
225592,225642,2024-12-17 00:01:02+00:00,ACTIVE,37.75
225591,225641,2024-12-17 00:02:02+00:00,ACTIVE,36.945
225590,225640,2024-12-17 00:03:02+00:00,ACTIVE,37.742
225589,225639,2024-12-17 00:04:02+00:00,ACTIVE,36.945


In [61]:
pema_beacon_df["datetime_utc"] = pd.to_datetime(pema_beacon_df["datetime_utc"], utc=True)
pema_reference_df["datetime_utc"] = pd.to_datetime(pema_reference_df["datetime_utc"], utc=True)

pema_beacon_df = pema_beacon_df.sort_values("datetime_utc")
pema_reference_df = pema_reference_df.sort_values("datetime_utc")

pema_test_df = pd.merge_asof(
    pema_beacon_df,
    pema_reference_df[["datetime_utc", "true_o3"]],
    on="datetime_utc",
    direction="nearest",
    tolerance=pd.Timedelta(seconds=30)
)

pema_test_df = pema_test_df.dropna(subset=["true_o3"])

pema_test_df.to_csv("data/pema_testing_data.csv")


In [62]:
pema_test_df.head()

Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,hour,true_o3
0,2024-12-17 08:00:00+00:00,-0.00042,0.029,0.0258,81.23332,14.60615,2024-12-17 08:00:00+00:00,45.07
1,2024-12-17 08:01:00+00:00,-0.00055,0.03017,0.02645,81.06122,14.62692,2024-12-17 08:00:00+00:00,44.679
2,2024-12-17 08:02:00+00:00,-0.00049,0.02977,0.02618,81.06264,14.62857,2024-12-17 08:00:00+00:00,44.679
3,2024-12-17 08:03:00+00:00,-9e-05,0.0289,0.02612,81.06138,14.63,2024-12-17 08:00:00+00:00,43.515
4,2024-12-17 08:04:00+00:00,-0.00044,0.02998,0.02639,81.05814,14.62615,2024-12-17 08:00:00+00:00,44.687


In [63]:
len(pema_beacon_df["datetime_utc"])

223914

In [64]:
len(pema_reference_df["datetime_utc"])

225594

In [65]:
len(pema_test_df["datetime_utc"])

223487

In [66]:
len(pema_labeled_df["datetime_utc"])

22405

## Setting up PHA testing and training

In [71]:
pha_beacon_df = pd.read_csv("data/pha_beacon.csv")
pha_reference_df = pd.read_csv("data/pha_raw_filtered.csv")

In [72]:
pha_beacon_df.head()

Unnamed: 0,local_timestamp,epoch,datetime,node_file_id,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,node_id
0,2024-12-17 00:00:00,1734422000.0,2024-12-17 08:00:00,5298549,0.00635,-0.00411,0.0313,100.0,13.71929,257
1,2024-12-17 00:01:00,1734422000.0,2024-12-17 08:01:00,5298549,0.00649,-0.0006,0.03107,100.0,13.71923,257
2,2024-12-17 00:02:00,1734423000.0,2024-12-17 08:02:00,5298549,0.0073,-0.0021,0.03121,100.0,13.72214,257
3,2024-12-17 00:03:00,1734423000.0,2024-12-17 08:03:00,5298549,0.00676,-0.00403,0.03191,100.0,13.71571,257
4,2024-12-17 00:04:00,1734423000.0,2024-12-17 08:04:00,5298549,0.00616,-0.00583,0.03206,100.0,13.72,257


In [73]:
pha_beacon_df = pha_beacon_df.drop(columns=["local_timestamp", "epoch", "node_file_id", "node_id"])
pha_beacon_df = pha_beacon_df.rename(columns={"datetime": "datetime_utc"})
pha_beacon_df["datetime_utc"] = pd.to_datetime(pha_beacon_df["datetime_utc"], utc=True)

pha_beacon_df.head()

Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp
0,2024-12-17 08:00:00+00:00,0.00635,-0.00411,0.0313,100.0,13.71929
1,2024-12-17 08:01:00+00:00,0.00649,-0.0006,0.03107,100.0,13.71923
2,2024-12-17 08:02:00+00:00,0.0073,-0.0021,0.03121,100.0,13.72214
3,2024-12-17 08:03:00+00:00,0.00676,-0.00403,0.03191,100.0,13.71571
4,2024-12-17 08:04:00+00:00,0.00616,-0.00583,0.03206,100.0,13.72


In [74]:
pha_beacon_df["datetime_utc"] = pd.to_datetime(pha_beacon_df["datetime_utc"], utc=True)

pha_beacon_df["hour"] = pha_beacon_df["datetime_utc"].dt.floor("h")

pha_low_rsd_minutely = pha_beacon_df[pha_beacon_df["hour"].isin(low_rsd_hours)].copy()

pha_low_rsd_minutely.drop(columns="hour", inplace=True)

pha_low_rsd_minutely.head()

Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp
14700,2024-12-27 13:00:00+00:00,0.0067,0.03062,0.01748,58.20675,-0.39357
14701,2024-12-27 13:01:00+00:00,0.00672,0.03256,0.01747,58.13728,-0.38214
14702,2024-12-27 13:02:00+00:00,0.0073,0.03362,0.01744,55.9799,0.1725
14703,2024-12-27 13:03:00+00:00,0.00751,0.0348,0.01742,55.22231,0.36929
14704,2024-12-27 13:04:00+00:00,0.0071,0.03479,0.01738,55.19517,0.38786


In [75]:
pha_low_rsd_minutely = pha_low_rsd_minutely.sort_values("datetime_utc")
pha_reference_df = pha_reference_df.sort_values("datetime_utc")

pha_low_rsd_minutely["datetime_utc"] = pd.to_datetime(pha_low_rsd_minutely["datetime_utc"], utc=True)
pha_reference_df["datetime_utc"] = pd.to_datetime(pha_reference_df["datetime_utc"], utc=True)

pha_reference_df = pha_reference_df.rename(columns={"o3": "true_o3"})

pha_labeled_df = pd.merge_asof(
    pha_low_rsd_minutely,
    pha_reference_df[["datetime_utc", "true_o3"]],
    on="datetime_utc",
    direction="nearest",
    tolerance=pd.Timedelta("1min")
)

pha_labeled_df = pha_labeled_df.dropna(subset=["true_o3"])

In [76]:
pha_labeled_df = pha_labeled_df.dropna().copy()

pha_labeled_df["datetime_utc"] = pd.to_datetime(pha_labeled_df["datetime_utc"], utc=True)

pha_labeled_df["hour"] = pha_labeled_df["datetime_utc"].dt.hour

pha_labeled_df[["datetime_utc", "hour", "true_o3"]].head()

Unnamed: 0,datetime_utc,hour,true_o3
0,2024-12-27 13:00:00+00:00,13,21.318
1,2024-12-27 13:01:00+00:00,13,21.745
2,2024-12-27 13:02:00+00:00,13,20.921
3,2024-12-27 13:03:00+00:00,13,21.8
4,2024-12-27 13:04:00+00:00,13,21.804


In [77]:
pha_labeled_df.to_csv("data/pha_training_data.csv")

In [78]:
pha_beacon_df.head()

Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,hour
0,2024-12-17 08:00:00+00:00,0.00635,-0.00411,0.0313,100.0,13.71929,2024-12-17 08:00:00+00:00
1,2024-12-17 08:01:00+00:00,0.00649,-0.0006,0.03107,100.0,13.71923,2024-12-17 08:00:00+00:00
2,2024-12-17 08:02:00+00:00,0.0073,-0.0021,0.03121,100.0,13.72214,2024-12-17 08:00:00+00:00
3,2024-12-17 08:03:00+00:00,0.00676,-0.00403,0.03191,100.0,13.71571,2024-12-17 08:00:00+00:00
4,2024-12-17 08:04:00+00:00,0.00616,-0.00583,0.03206,100.0,13.72,2024-12-17 08:00:00+00:00


In [79]:
pha_reference_df.head()

Unnamed: 0.1,Unnamed: 0,datetime_utc,device_state,true_o3
225579,226060,2024-12-17 00:00:18+00:00,ACTIVE,16.218
225578,226059,2024-12-17 00:01:18+00:00,ACTIVE,15.05
225577,226058,2024-12-17 00:02:18+00:00,ACTIVE,15.433
225576,226057,2024-12-17 00:03:18+00:00,ACTIVE,16.236
225575,226056,2024-12-17 00:04:18+00:00,ACTIVE,16.236


In [90]:
pha_beacon_df["datetime_utc"] = pd.to_datetime(pha_beacon_df["datetime_utc"], utc=True)
pha_reference_df["datetime_utc"] = pd.to_datetime(pha_reference_df["datetime_utc"], utc=True)

pha_beacon_df = pha_beacon_df.sort_values("datetime_utc")
pha_reference_df = pha_reference_df.sort_values("datetime_utc")

pha_test_df = pd.merge_asof(
    pha_beacon_df,
    pha_reference_df[["datetime_utc", "true_o3"]],
    on="datetime_utc",
    direction="nearest",
    tolerance=pd.Timedelta(seconds=30)
)

pha_test_df = pha_test_df.dropna(subset=["true_o3"])

pha_test_df.to_csv("data/pha_testing_data.csv")

In [91]:
pha_test_df.head()

Unnamed: 0,datetime_utc,no2_wrk_aux,no_wrk_aux,o3_wrk_aux,rh,temp,hour,true_o3
0,2024-12-17 08:00:00+00:00,0.00635,-0.00411,0.0313,100.0,13.71929,2024-12-17 08:00:00+00:00,31.915
1,2024-12-17 08:01:00+00:00,0.00649,-0.0006,0.03107,100.0,13.71923,2024-12-17 08:00:00+00:00,31.915
2,2024-12-17 08:02:00+00:00,0.0073,-0.0021,0.03121,100.0,13.72214,2024-12-17 08:00:00+00:00,31.901
3,2024-12-17 08:03:00+00:00,0.00676,-0.00403,0.03191,100.0,13.71571,2024-12-17 08:00:00+00:00,31.103
4,2024-12-17 08:04:00+00:00,0.00616,-0.00583,0.03206,100.0,13.72,2024-12-17 08:00:00+00:00,31.118


In [92]:
len(pha_reference_df["datetime_utc"])

225580

In [93]:
len(pha_test_df["datetime_utc"])

222715

In [94]:
len(pha_labeled_df["datetime_utc"])

18304