In [16]:
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [17]:
# NOTE(review): with no category/module argument this silences *every* warning for
# the rest of the kernel session, including pandas deprecation and
# chained-assignment warnings raised by the cells below.
warnings.filterwarnings("ignore")

### add_d1_features

In [18]:
_DWELL_COL = "dwell_time_in_seconds"

# Observations with a dwell time above this many seconds are discarded.
_MAX_DWELL_SECONDS = 600

# A row only receives week-lag features once this many earlier weeks exist.
_WEEK_HISTORY = 3

# Stop number -> stop category; stops missing from the table fall back to "br".
_STOP_TYPE_BY_STOP = {
    101: "pro",
    102: "mod",
    103: "mod",
    104: "br",
    105: "pro",
    106: "mod",
    107: "br",
    108: "br",
    109: "pro",
    110: "mod",
    111: "br",
    112: "br",
    113: "mod",
    114: "br",
}


def _lagged_cell_mean(frame, keys, lag_key, lag):
    """Return, per row of ``frame``, the mean dwell time of a lagged cell.

    A "cell" is one combination of the ``keys`` columns. For every row the
    result is the mean ``dwell_time_in_seconds`` of the cell that has the same
    keys except that ``lag_key`` is ``lag`` smaller, rounded to one decimal.
    Rows whose lagged cell holds no observations get ``NaN``.

    Args:
        frame: DataFrame containing ``keys`` and ``dwell_time_in_seconds``.
        keys: Column names that identify a cell (must include ``lag_key``).
        lag_key: Integer-valued key column along which to look back.
        lag: How many steps to look back along ``lag_key``.

    Returns:
        A float ``numpy`` array aligned with the rows of ``frame``.
    """
    if frame.empty:
        return np.empty(0, dtype=float)

    cell_means = (
        frame.groupby(keys, as_index=False)[_DWELL_COL]
        .mean()
        .rename(columns={_DWELL_COL: "_lagged_mean"})
    )
    # Shifting the lookup table forward by `lag` lets a plain equality join
    # pick up the cell that sits `lag` steps *behind* each row.
    cell_means[lag_key] = cell_means[lag_key] + lag
    # Left join keeps the row order of `frame`; the lookup keys are unique, so
    # the row count cannot change (enforced by `validate`).
    matched = frame[keys].merge(
        cell_means, on=keys, how="left", validate="many_to_one"
    )
    return matched["_lagged_mean"].round(1).to_numpy()


def add_d1_features(stop_times):
    """Build calendar and lagged dwell-time features for direction-1 stops.

    The input frame is not modified; a new frame is returned.

    Processing steps:
      1. Cast ``trip_id``/``deviceid``/``direction`` to int, drop the
         non-numeric ``"BT01"`` stop and cast ``bus_stop`` to int.
      2. Derive ``time_of_day`` (arrival hour), ``week_no`` (ISO week),
         ``day_of_week``, ``is_weekday``, ``saturday``, ``sunday``, ``day`` and
         ``month`` from ``arrival_time`` / ``date``.
      3. Keep only ``direction == 1`` rows whose dwell time is not above 600 s
         and renumber the index from 0.
      4. Add lagged mean dwell times (whole seconds):
         * ``dt(w-1..3)``: same stop, hour and weekday in the 1st/2nd/3rd
           preceding week present in the data. Weeks are ordered
           chronologically by their Monday, so ISO week numbers that repeat
           across years are not mixed. Only rows with at least three
           preceding weeks receive a value.
         * ``dt(t-1..2)``: same stop and date, one/two hours earlier.
         * ``dt(n-1..3)``: same ``trip_id``, stop number lower by 1/2/3.
           NOTE(review): this assumes ``trip_id`` identifies a single vehicle
           run (unique across dates) -- confirm against the dataset.
         Missing lags are filled with the overall mean dwell time of the
         row's ``(bus_stop, time_of_day)`` pair.
      5. Add ``stop_type`` from the stop table and format ``date`` as
         ``YYYY-MM-DD`` strings.

    Args:
        stop_times: DataFrame with at least the columns ``trip_id``,
            ``deviceid``, ``direction``, ``bus_stop``, ``date``,
            ``arrival_time`` (``HH:MM:SS``) and ``dwell_time_in_seconds``.

    Returns:
        A new DataFrame with a fresh ``RangeIndex`` holding the filtered rows
        plus the feature columns described above.

    Raises:
        ValueError: If a column cannot be cast/parsed (for example a
            non-numeric stop other than ``"BT01"``).
        KeyError: If a required column is missing.
    """
    # `astype` returns a new frame, so the caller's DataFrame is left untouched.
    stop_times = stop_times.astype(
        {"trip_id": int, "deviceid": int, "direction": int, "bus_stop": str}
    )
    # "BT01" is the one non-numeric stop label; it must go before the int cast.
    stop_times = stop_times.loc[stop_times["bus_stop"] != "BT01"].copy()
    stop_times["bus_stop"] = stop_times["bus_stop"].astype(int)

    stop_times["time_of_day"] = pd.to_datetime(
        stop_times["arrival_time"], format="%H:%M:%S"
    ).dt.hour

    stop_times["date"] = pd.to_datetime(stop_times["date"])
    weekday = stop_times["date"].dt.dayofweek
    stop_times["week_no"] = stop_times["date"].dt.isocalendar().week
    stop_times["day_of_week"] = weekday
    stop_times["is_weekday"] = (weekday < 5).astype(int)
    stop_times["saturday"] = (weekday == 5).astype(int)
    stop_times["sunday"] = (weekday == 6).astype(int)
    stop_times["day"] = stop_times["date"].dt.day
    stop_times["month"] = stop_times["date"].dt.month

    df = stop_times.loc[stop_times["direction"] == 1]
    # `~(x > limit)` rather than `x <= limit` so rows with a missing dwell time
    # are kept, exactly as the label-based drop did before.
    df = df.loc[~(df[_DWELL_COL] > _MAX_DWELL_SECONDS)].reset_index(drop=True)

    # --- week lags -------------------------------------------------------
    # Identify each week by its Monday so weeks sort chronologically even when
    # the data spans a year boundary, then number the observed weeks 0, 1, 2...
    week_start = df["date"].dt.normalize() - pd.to_timedelta(
        df["day_of_week"], unit="D"
    )
    week_rank = pd.factorize(week_start, sort=True)[0]
    has_history = week_rank >= _WEEK_HISTORY
    weekly = df.assign(_week_rank=week_rank)
    week_keys = ["_week_rank", "day_of_week", "time_of_day", "bus_stop"]
    for lag in (1, 2, 3):
        lagged = _lagged_cell_mean(weekly, week_keys, "_week_rank", lag)
        df[f"dt(w-{lag})"] = np.where(has_history, lagged, np.nan)

    # --- hour lags (same date, earlier hour) ------------------------------
    for lag in (1, 2):
        df[f"dt(t-{lag})"] = _lagged_cell_mean(
            df, ["date", "time_of_day", "bus_stop"], "time_of_day", lag
        )

    # --- stop lags (same trip, lower stop number) -------------------------
    for lag in (1, 2, 3):
        df[f"dt(n-{lag})"] = _lagged_cell_mean(
            df, ["trip_id", "bus_stop"], "bus_stop", lag
        )

    # Missing lags fall back to the typical dwell time of the stop at that
    # hour; everything is then rounded to whole seconds.
    fallback = df.groupby(["bus_stop", "time_of_day"])[_DWELL_COL].transform("mean")
    lag_columns = [
        "dt(w-1)",
        "dt(w-2)",
        "dt(w-3)",
        "dt(t-1)",
        "dt(t-2)",
        "dt(n-1)",
        "dt(n-2)",
        "dt(n-3)",
    ]
    for column in lag_columns:
        df[column] = df[column].fillna(fallback).round()

    df["stop_type"] = df["bus_stop"].map(
        lambda stop: _STOP_TYPE_BY_STOP.get(stop, "br")
    )

    df["date"] = df["date"].dt.strftime("%Y-%m-%d")

    return df

### Feature engineering

In [19]:
# Load the stop-level records from stops.csv (path is relative to the notebook's
# working directory) and preview the first rows.
# NOTE(review): in the visible cells `stops_df` is only inspected, never passed to
# `add_d1_features` -- confirm whether it is still needed.
stops_df = pd.read_csv("stops.csv")
stops_df.head()

Unnamed: 0,trip_id,deviceid,date,direction,bus_stop,arrival_time,departure_time,dwell_time,dwell_time_in_seconds,day_of_week,hour_of_day,is_weekday
0,1.0,116.0,2022-07-01,1.0,114,06:57:33,06:57:33,0:00:00,0.0,4,6,1
1,1.0,262.0,2022-07-01,1.0,105,07:01:59,07:01:59,0:00:00,0.0,4,7,1
2,1.0,116.0,2022-07-01,1.0,113,07:00:27,07:00:57,0:00:30,30.0,4,7,1
3,1.0,116.0,2022-07-01,1.0,BT01,06:19:20,18:55:00,12:35:40,45340.0,4,6,1


In [20]:
# Span of service dates covered by stops.csv. min()/max() are used instead of the
# first/last row so the range is right even if the file is not sorted by date
# (ISO "YYYY-MM-DD" strings compare in chronological order).
f"{stops_df['date'].min()} - {stops_df['date'].max()}"

'2022-07-01 - 2022-07-01'

In [21]:
# Load the raw dwell-time records that feed `add_d1_features` below and preview
# the first rows.
dt_df = pd.read_csv("./datasets/bus_dwell_times_654.csv")
dt_df.head()

Unnamed: 0,trip_id,deviceid,direction,bus_stop,date,arrival_time,departure_time,dwell_time_in_seconds
0,1,262,1,109,2021-10-01,07:10:18,07:11:02,44.0
1,1,262,1,102,2021-10-01,06:45:42,06:45:42,0.0
2,1,262,1,101,2021-10-01,06:40:58,06:42:12,74.0
3,1,262,1,105,2021-10-01,06:58:56,07:02:27,211.0
4,1,262,1,106,2021-10-01,07:05:57,07:06:32,35.0


In [22]:
# Span of service dates covered by the dwell-time dataset. min()/max() are used
# instead of the first/last row so the range is right even if the file is not
# sorted by date (ISO "YYYY-MM-DD" strings compare in chronological order).
f"{dt_df['date'].min()} - {dt_df['date'].max()}"

'2021-10-01 - 2022-11-01'

In [23]:
# Build the direction-1 feature table (calendar fields, lagged dwell times and
# stop type) from the raw dwell-time records, then preview it.
features_added_dir_1_dt_df = add_d1_features(dt_df)
features_added_dir_1_dt_df.head()

Unnamed: 0,trip_id,deviceid,direction,bus_stop,date,arrival_time,departure_time,dwell_time_in_seconds,time_of_day,week_no,...,month,dt(w-1),dt(w-2),dt(w-3),dt(t-1),dt(t-2),dt(n-1),dt(n-2),dt(n-3),stop_type
0,1,262,1,109,2021-10-01,07:10:18,07:11:02,44.0,7,39,...,10,55.0,55.0,55.0,55.0,55.0,23.0,25.0,35.0,pro
1,1,262,1,102,2021-10-01,06:45:42,06:45:42,0.0,6,39,...,10,35.0,35.0,35.0,35.0,35.0,74.0,35.0,35.0,mod
2,1,262,1,101,2021-10-01,06:40:58,06:42:12,74.0,6,39,...,10,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,pro
3,1,262,1,105,2021-10-01,06:58:56,07:02:27,211.0,6,39,...,10,180.0,180.0,180.0,180.0,180.0,0.0,6.0,0.0,pro
4,1,262,1,106,2021-10-01,07:05:57,07:06:32,35.0,7,39,...,10,23.0,23.0,23.0,23.0,23.0,211.0,0.0,6.0,mod


In [24]:
# Sanity check: column dtypes and non-null counts of the engineered features.
features_added_dir_1_dt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99342 entries, 0 to 99341
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_id                99342 non-null  int64  
 1   deviceid               99342 non-null  int64  
 2   direction              99342 non-null  int64  
 3   bus_stop               99342 non-null  int64  
 4   date                   99342 non-null  object 
 5   arrival_time           99342 non-null  object 
 6   departure_time         99342 non-null  object 
 7   dwell_time_in_seconds  99342 non-null  float64
 8   time_of_day            99342 non-null  int32  
 9   week_no                99342 non-null  UInt32 
 10  day_of_week            99342 non-null  int32  
 11  is_weekday             99342 non-null  int64  
 12  saturday               99342 non-null  int64  
 13  sunday                 99342 non-null  int64  
 14  day                    99342 non-null  int32  
 15  mo

In [25]:
# Integer-encode the categorical stop type on a copy, so the feature-engineered
# frame keeps its readable labels.
stop_type_le = LabelEncoder()
features_encoded_dir_1_dt_df = features_added_dir_1_dt_df.assign(
    stop_type=lambda frame: stop_type_le.fit_transform(frame["stop_type"])
)

In [26]:
# Show which integer code the fitted encoder assigned to each stop-type label.
{
    label: code
    for label, code in zip(
        stop_type_le.classes_, stop_type_le.transform(stop_type_le.classes_)
    )
}

{'br': 0, 'mod': 1, 'pro': 2}

### iBAT

![archi](./images/ibat.png)

In [2]:
pip install ./ibat-0.1.0rc1-py3-none-any.whl

Processing ./ibat-0.1.0rc1-py3-none-any.whl
Collecting river==0.13.0 (from ibat==0.1.0rc1)
  Using cached river-0.13.0.tar.gz (861 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting numpy==1.26.4 (from ibat==0.1.0rc1)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas==2.2.2 (from ibat==0.1.0rc1)
  Using cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting matplotlib==3.8.4 (from ibat==0.1.0rc1)
  Using cached matplotlib-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting scikit-learn==1.4.2 (from ibat==0.1.0rc1)
  Using cached scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting xgboost==2.0.3 (from ibat==0.1.0rc1)
  Using cached xgboos

In [27]:
from datetime import datetime

from ibat.concept_drift_detector.strategies import DDM
from ibat.datasets import BUS_654_FEATURES_ENCODED_DWELL_TIMES
from ibat.pipeline import run_dt_exp


def datetime_from_string(datetime_string: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` date string into a ``datetime`` at midnight.

    Raises:
        ValueError: If ``datetime_string`` does not match ``%Y-%m-%d``.
    """
    date_format = "%Y-%m-%d"
    parsed = datetime.strptime(datetime_string, date_format)
    return parsed

In [28]:
# Dwell-time experiment on the feature table built in this notebook.
if __name__ == "__main__":
    # DDM concept-drift detector from ibat; see the ibat documentation for the
    # meaning of the thresholds below.
    cdd_strategy = DDM(
        warning_level=0.1,
        drift_level=1.5,
        min_num_instances=1,
    )
    run_dt_exp(
        dt_df=features_encoded_dir_1_dt_df,
        # Historical window; the logged output reports it as the half-open
        # range [hist_start, hist_end) used when the model is initiated.
        hist_start=datetime_from_string("2021-10-01"),
        hist_end=datetime_from_string("2022-02-01"),
        # Streaming window; starts exactly where the historical window ends.
        stream_start=datetime_from_string("2022-02-01"),
        stream_end=datetime_from_string("2022-11-01"),
        # 120 minutes: the logged stream windows advance in two-hour steps.
        interval_min=60 * 2,
        # Per the logged output, a window is held back ("COUNT IS NOT ENOUGH")
        # until 100 instances have accumulated.
        chunk_size=100,
        active_strategy=True,
        cdd_strategy=cdd_strategy,
        incremental_learning=True,
        is_buffer_enabled=False,
        output_parent_dir="./experiments",
        label="demo-dt-exp-for-hbp",
    )

BATCH PROCESSING TECHNIQUE: HYBRID | CONCEPT DRIFT HANDLING STRATEGY: ACTIVE | STRATEGY TO UPDATE THE OUTDATED MODEL: INCREMENTAL LEARNING
DATA STREAM: [2021-10-01 00:00:00 - 2022-02-01 00:00:00) | NUMBER OF INSTANCES: 53565 | MODEL INITIATED
DATA STREAM: [2022-02-01 00:00:00 - 2022-02-01 02:00:00) | NUMBER OF INSTANCES: 0000 | COUNT IS NOT ENOUGH. WAITING FOR MORE DATA POINTS.
DATA STREAM: [2022-02-01 00:00:00 - 2022-02-01 08:16:01) | NUMBER OF INSTANCES: 0100 | CDD at xgb_dt_classifier: False | CDD at xgb_dt_regressor: False
DATA STREAM: [2022-02-01 08:16:01 - 2022-02-01 10:16:01) | NUMBER OF INSTANCES: 0084 | COUNT IS NOT ENOUGH. WAITING FOR MORE DATA POINTS.
DATA STREAM: [2022-02-01 08:16:01 - 2022-02-01 10:37:15) | NUMBER OF INSTANCES: 0100 | CDD at xgb_dt_classifier: False | CDD at xgb_dt_regressor: False
DATA STREAM: [2022-02-01 10:37:15 - 2022-02-01 12:37:15) | NUMBER OF INSTANCES: 0095 | COUNT IS NOT ENOUGH. WAITING FOR MORE DATA POINTS.
DATA STREAM: [2022-02-01 10:37:15 - 202

In [2]:
# Same experiment configuration, but fed with the dataset that ships inside the
# ibat package instead of the feature table built in this notebook.
# NOTE(review): this cell reuses the same output_parent_dir and label as the run
# on the notebook-built features -- confirm run_dt_exp does not overwrite that
# run's results.
if __name__ == "__main__":
    # DDM concept-drift detector from ibat; see the ibat documentation for the
    # meaning of the thresholds below.
    cdd_strategy = DDM(
        warning_level=0.1,
        drift_level=1.5,
        min_num_instances=1,
    )
    run_dt_exp(
        dt_df=BUS_654_FEATURES_ENCODED_DWELL_TIMES.dataframe,
        # Historical window; the logged output reports it as the half-open
        # range [hist_start, hist_end) used when the model is initiated.
        hist_start=datetime_from_string("2021-10-01"),
        hist_end=datetime_from_string("2022-02-01"),
        # Streaming window; starts exactly where the historical window ends.
        stream_start=datetime_from_string("2022-02-01"),
        stream_end=datetime_from_string("2022-11-01"),
        # 120 minutes: the logged stream windows advance in two-hour steps.
        interval_min=60 * 2,
        # Per the logged output, a window is held back ("COUNT IS NOT ENOUGH")
        # until 100 instances have accumulated.
        chunk_size=100,
        active_strategy=True,
        cdd_strategy=cdd_strategy,
        incremental_learning=True,
        is_buffer_enabled=False,
        output_parent_dir="./experiments",
        label="demo-dt-exp-for-hbp",
    )

BATCH PROCESSING TECHNIQUE: HYBRID | CONCEPT DRIFT HANDLING STRATEGY: ACTIVE | STRATEGY TO UPDATE THE OUTDATED MODEL: INCREMENTAL LEARNING
DATA STREAM: [2021-10-01 00:00:00 - 2022-02-01 00:00:00) | NUMBER OF INSTANCES: 49021 | MODEL INITIATED
DATA STREAM: [2022-02-01 00:00:00 - 2022-02-01 02:00:00) | NUMBER OF INSTANCES: 0000 | COUNT IS NOT ENOUGH. WAITING FOR MORE DATA POINTS.
DATA STREAM: [2022-02-01 00:00:00 - 2022-02-01 08:16:01) | NUMBER OF INSTANCES: 0100 | CDD at xgb_dt_classifier: False | CDD at xgb_dt_regressor: False
DATA STREAM: [2022-02-01 08:16:01 - 2022-02-01 10:16:01) | NUMBER OF INSTANCES: 0084 | COUNT IS NOT ENOUGH. WAITING FOR MORE DATA POINTS.
DATA STREAM: [2022-02-01 08:16:01 - 2022-02-01 10:37:15) | NUMBER OF INSTANCES: 0100 | CDD at xgb_dt_classifier: False | CDD at xgb_dt_regressor: False
DATA STREAM: [2022-02-01 10:37:15 - 2022-02-01 12:37:15) | NUMBER OF INSTANCES: 0095 | COUNT IS NOT ENOUGH. WAITING FOR MORE DATA POINTS.
DATA STREAM: [2022-02-01 10:37:15 - 202