In [None]:
from pathlib import Path
import pandas as pd
import pyarrow as pa
from pyarrow import csv


## Load and prepare data

In [None]:
# Directory holding the input files, expressed relative to the notebook's working directory.
DATA_PATH = Path("..", "data")

In [None]:
# Load the sample CSV through pyarrow with explicit Arrow types for the columns
# named below, then keep only the five columns used in the rest of the notebook.
text_columns = (
    "gare",
    "trip_short",
    "trip_headsign",
    "destination",
    "date.mode",
    "etat",
    "heure_arrive",
)
arrow_types = {name: pa.string() for name in text_columns}
arrow_types["direction"] = pa.float64()

df = (
    csv.read_csv(
        DATA_PATH / "sample.csv",
        convert_options=csv.ConvertOptions(column_types=arrow_types),
    )
    .to_pandas()
    .loc[:, ["gare", "destination", "trip_short", "heure_arrive", "direction"]]
    # heure_arrive is read as text above; parse it into pandas datetimes here.
    .assign(heure_arrive=lambda frame: pd.to_datetime(frame["heure_arrive"]))
)
df

## Normalization

In [None]:
# Discard rows without an arrival time, then order the remaining rows by
# ascending arrival time.
df = df.dropna(subset=["heure_arrive"]).sort_values("heure_arrive")
df

## Lag Window

In [None]:
# For each row, fetch the values of the following row that shares the same
# trip_short (in the frame's current row order) and attach them as "next_*"
# columns. Rows with no following row in their group get missing values.
following_row = df.groupby("trip_short").shift(periods=-1).add_prefix("next_")
df = pd.concat([df, following_row], axis=1)
df


In [None]:
# Spot-check: show every row belonging to a single trip_short value.
df.loc[df["trip_short"].eq("KLEE84")]

In [None]:
# duree: difference between the next_heure_arrive column and this row's own
# heure_arrive (a timedelta; missing wherever next_heure_arrive is missing).
df = df.assign(duree=df["next_heure_arrive"] - df["heure_arrive"])
df

## Statistical Window Model

In [None]:
# Restrict to rows for one (gare, destination) pair, convert their durations
# from timedeltas to seconds, and plot a kernel density estimate of the result.
is_sample_pair = df["gare"].eq("87271460") & df["destination"].eq("87758847")
sample_df = df.loc[is_sample_pair]
sample_df = sample_df.assign(duree=sample_df["duree"] / pd.Timedelta(seconds=1))
sample_df["duree"].plot.kde()

In [None]:
# Build the model table: the standard deviation of the duration (in seconds)
# for every (gare, next_gare, direction) combination.
model_df = df.assign(duree=lambda frame: frame["duree"] / pd.Timedelta(1, unit="s"))
model_df = (
    model_df.groupby(["gare", "next_gare", "direction"])
    # Aggregate only the duration column. Calling std() on the whole frame also
    # hits the string and datetime columns, which pandas >= 2.0 refuses to
    # aggregate (earlier versions dropped them with a warning).
    [["duree"]]
    # Default sample standard deviation (ddof=1): a group with a single
    # observation yields NaN.
    .std()
    .reset_index()
)
model_df

In [None]:
# Look up the model rows for two (gare, next_gare) pairs where direction == 1.
# NOTE(review): the second pair uses the same station ID for both gare and
# next_gare — confirm this is intended and not a copy-paste slip.
direction_is_1 = model_df["direction"].eq(1)
first_pair = model_df["gare"].eq("87271411") & model_df["next_gare"].eq("87271452")
second_pair = model_df["gare"].eq("87271452") & model_df["next_gare"].eq("87271452")
model_df.loc[direction_is_1 & (first_pair | second_pair)]
