In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
# =========================
# df declaration
# =========================
# Load the tab-separated SCATS export, split "Detector" on "-" into numeric
# Detector_ID / Lane columns, index rows by timestamp, and attach the
# Direction of every (site, lane) from the site description file.

file_path = r"../data/at-dataset/Scats-Data.csv"

df = pd.read_csv(file_path, sep="\t")

# Pieces of "Detector" that are not numeric become NaN (errors="coerce").
detector_parts = df.pop("Detector").str.split("-", expand=True)
df[["Detector_ID", "Lane"]] = detector_parts.apply(pd.to_numeric, errors="coerce")

df["DateTime"] = pd.to_datetime(
    df["Date"] + " " + df["Time"],
    format="%Y-%m-%d %H:%M",
)

# NOTE(review): "Invalid" is discarded without being used to filter rows --
# confirm that is intended.
df = (
    df.drop_duplicates(subset=["Detector_ID", "Lane", "DateTime"])
    .set_index("DateTime")
    .drop(columns=["Date", "Time", "Invalid"])
    .loc[:, ["Detector_ID", "Lane", "Volume"]]
    .sort_values(by=["DateTime", "Detector_ID", "Lane"])
)

site_description = pd.read_csv(r"../data/at-dataset/site_description.csv")

# Look up Direction by (Site, Lane); unmatched detector lanes get NaN.
direction_lookup = site_description.set_index(["Site", "Lane"])[["Direction"]]
df = df.join(direction_lookup, on=["Detector_ID", "Lane"], how="left")

# Temporarily drop rows with no Direction, and rows whose Direction is 'N'.
has_direction = df["Direction"].notna()
df = df[has_direction & df["Direction"].ne("N")]

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6386565 entries, 2023-01-01 00:00:00 to 2025-07-31 23:00:00
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   Detector_ID  int64 
 1   Lane         int64 
 2   Volume       int64 
 3   Direction    object
dtypes: int64(3), object(1)
memory usage: 243.6+ MB


In [None]:
# =========================
# code for check missing timestamp
# =========================
# For every (site, lane) pair, print how many hourly timestamps between the
# first and last timestamp of df have no row for that pair.

if not df.empty:
    # The expected hourly grid is identical for every lane, so build it once
    # instead of once per lane.
    full_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='1h')

    # sort=False keeps sites in first-appearance order (same as .unique());
    # the inner groupby iterates lanes in ascending order.
    for s, site_df in df.groupby('Detector_ID', sort=False):
        for l, lane_df in site_df.groupby('Lane'):
            missing_times = full_range.difference(lane_df.index)
            print(f"Site: {s}, Lane: {l}, Missing timestamps: {len(missing_times)}")

In [None]:
# =========================
# plotting the df and save into dir
# =========================
# Writes one PNG per (site, lane): Volume against the DateTime index.
# To plot the cleaned frame instead, take the sites from df1 and write to
# '../results/site_plot_1/site_{s}/'.
site = df['Detector_ID'].unique()

for s in site:
    site_df = df.loc[df['Detector_ID'] == s]
    lane = np.sort(site_df['Lane'].unique())

    for l in lane:
        lane_df = site_df.loc[site_df['Lane'] == l].sort_index()

        fig, ax = plt.subplots(figsize=(15, 5))
        ax.plot(lane_df.index, lane_df['Volume'], '-', label=f'Site {s} Lane {l}')
        ax.set_title(f'Site {s} Lane {l} Volume over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Volume')
        ax.legend()
        ax.grid()

        # =========================
        # choose output dir
        # =========================
        output_dir = f'../results/site_plot/site_{s}/'

        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(f'{output_dir}site_{s}_lane_{l}_volume.png')
        # Close explicitly so figures do not accumulate across iterations.
        plt.close(fig)

In [None]:
# =========================
# removing timestamp that have zero volume for the entire day
# =========================
# For each (site, lane), drop every calendar day that has a complete set of
# readings which are all zero. Days with fewer readings are kept even when
# every reading that is present is zero.

READINGS_PER_FULL_DAY = 24  # a complete day of hourly readings

kept_parts = []
# sort=False keeps sites in first-appearance order (same as .unique());
# the inner groupby iterates lanes in ascending order.
for s, site_df in df.groupby('Detector_ID', sort=False):
    for l, lane_df in site_df.groupby('Lane'):
        day = lane_df.index.normalize()
        zero_by_day = lane_df['Volume'].eq(0).groupby(day)
        is_bad_day = zero_by_day.all() & zero_by_day.size().eq(READINGS_PER_FULL_DAY)
        baddays = is_bad_day[is_bad_day].index
        kept_parts.append(lane_df[~day.isin(baddays)])

# Concatenate once: growing a frame with concat inside the loop re-copies all
# previously collected rows on every iteration. Every piece shares the
# 'DateTime' index name, so the result keeps it.
df1 = pd.concat(kept_parts) if kept_parts else df.iloc[0:0]

In [145]:
# =========================
# save the updated df into csv
# =========================

output_path = r"../data/at-dataset/SCATS-data/Scats-Data-Processed.csv"

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df1.to_csv(output_path, index=True)