In [81]:
import pandas as pd
import numpy as np

file_path = r"../data/at-dataset/Scats_Data.csv"

df = pd.read_csv(file_path, sep="\t")

df[["Detector_ID", "Lane"]] = df["Detector"].str.split("-", expand=True)
df = df.drop(columns=["Detector"])
df["Detector_ID"] = pd.to_numeric(df["Detector_ID"], errors="coerce")
df["Lane"] = pd.to_numeric(df["Lane"], errors="coerce")
df["DateTime"] = pd.to_datetime(df["Date"] + " " + df["Time"], format="%Y-%m-%d %H:%M")

df = df.drop_duplicates(subset=["Detector_ID", "Lane", "DateTime"])
# df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
# df['Time'] = pd.to_datetime(df['Time'], format='%H:%M').dt.time

In [None]:
## site dictionary

In [82]:
df.describe()

Unnamed: 0,Invalid,Volume,Detector_ID,Lane,DateTime
count,1268714.0,1268714.0,1268714.0,1268714.0,1268714
mean,0.5868234,124.1695,2184.701,7.2052,2024-04-14 22:37:38.667910400
min,0.0,0.0,2013.0,1.0,2023-01-01 00:00:00
25%,0.0,16.0,2061.0,3.0,2023-08-23 20:00:00
50%,0.0,72.0,2257.0,6.0,2024-04-14 06:00:00
75%,0.0,192.0,2257.0,10.0,2024-12-03 03:00:00
max,24.0,1498.0,2402.0,19.0,2025-07-31 23:00:00
std,2.590651,138.9751,139.6438,4.670805,


In [83]:
## check missing entry
full_time_index = pd.date_range(
    start=df["DateTime"].min(),
    end=df["DateTime"].max(),
    freq="h"
)

missing_summary = {}


for (det, lane), group in df.groupby(["Detector_ID", "Lane"]):
    group = group.set_index("DateTime").sort_index()
    # Drop duplicate DateTime entries to avoid reindex error
    group = group[~group.index.duplicated(keep='first')]
    reindexed = group.reindex(full_time_index)
    
    n_missing = reindexed["Volume"].isna().sum()
    if n_missing > 0:
        missing_summary[(det, lane)] = n_missing


missing_df = pd.DataFrame.from_dict(
    missing_summary, orient="index", columns=["MissingEntries"]
).reset_index()
missing_df.rename(columns={"level_0": "DetectorID", "level_1": "Lane"}, inplace=True)

print(missing_df.sort_values("MissingEntries", ascending=False).head(20))

         index  MissingEntries
49   (2322, 6)             459
48   (2322, 5)             459
47   (2322, 4)             459
46   (2322, 3)             459
45   (2322, 2)             459
44   (2322, 1)             459
43  (2257, 19)             459
42  (2257, 18)             459
41  (2257, 17)             459
40  (2257, 16)             459
39  (2257, 15)             459
38  (2257, 14)             459
37  (2257, 13)             459
36  (2257, 12)             459
35  (2257, 11)             459
34  (2257, 10)             459
33   (2257, 9)             459
32   (2257, 8)             459
31   (2257, 7)             459
30   (2257, 6)             459


In [84]:
df.groupby(["Detector_ID", "Lane"]).size()

Detector_ID  Lane
2013         1       22197
             2       22197
             3       22197
             4       22197
             5       22197
             6       22197
             7       22197
             8       22197
             9       22197
             10      22197
             11      22197
             12      22197
             13      22197
2061         1       22412
             2       22412
             3       22412
             4       22412
             5       22412
             6       22412
             7       22412
             8       22412
             9       22412
             10      22412
             11      22412
             12      22412
2257         1       22173
             2       22173
             3       22173
             4       22173
             5       22173
             6       22173
             7       22173
             8       22173
             9       22173
             10      22173
             11      22173
          

In [91]:
site_2013 = df[df['Detector_ID'] == 2013]
site_2061 = df[df['Detector_ID'] == 2061]
site_2257 = df[df['Detector_ID'] == 2257]
site_2322 = df[df['Detector_ID'] == 2322]
site_2402 = df[df['Detector_ID'] == 2402]