# core

> Core functions

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
from tqdm import tqdm
from typing import List, Optional

In [None]:
#| export
def add_missing_slots(
        df: pd.DataFrame,    # input dataframe with datetime, entity and value columns - time series format
        datetime_col: str,   # name of the datetime column
        entity_col: str,     # name of the entity column. If a time series is associated to a location, this column will be 'location_id'
        value_col: str,      # name of the value column
        freq: str='H',       # frequency of the time series. Default is hourly
        fill_value: int = 0  # value written into `value_col` for every slot that was missing
) -> pd.DataFrame:
    """
    Add missing time slots to a long-format time series dataframe.

    A regular grid of slots is built with `pd.date_range`, from the earliest to the
    latest timestamp found in `datetime_col` (across all entities), using `freq`.
    Every distinct value of `entity_col` is then re-indexed on that grid, so each
    entity ends up with exactly one row per slot. Slots that had no row in `df`
    get `fill_value` in `value_col`.

    Returns a new dataframe with the columns `[datetime_col, entity_col, value_col]`,
    ordered by entity (in order of first appearance in `df`) and then by slot.
    `df` itself is not modified. An empty `df` yields an empty dataframe with the
    same three columns.

    Raises whatever `DataFrame.reindex` raises when one entity has two rows with
    the same timestamp (duplicate index labels cannot be re-indexed).
    """
    output_cols = [datetime_col, entity_col, value_col]

    # nothing to fill: `date_range` cannot be built from the min/max of an empty column
    if df.empty:
        return df[output_cols].copy()

    # parse once, so that min/max compare real timestamps rather than raw strings
    timestamps = pd.to_datetime(df[datetime_col])
    all_slots = pd.date_range(start=timestamps.min(), end=timestamps.max(), freq=freq)

    filled_frames = []
    for entity_id in tqdm(df[entity_col].unique()):

        # keep only the rows of this entity, indexed by their timestamp
        is_entity = df[entity_col] == entity_id
        df_entity = df.loc[is_entity, [value_col]].set_index(
            pd.DatetimeIndex(timestamps[is_entity])
        )

        # quick way to add the missing slots of a series
        # taken from https://stackoverflow.com/a/19324591
        df_entity = df_entity.reindex(all_slots, fill_value=fill_value)

        # add back the entity column
        df_entity[entity_col] = entity_id

        filled_frames.append(df_entity)

    # a single concat at the end avoids re-copying the accumulated frame on every iteration
    output = pd.concat(filled_frames)

    # move the slots from the index to a regular column
    output = output.rename_axis(datetime_col).reset_index()

    return output[output_cols].copy()

In [None]:
# Toy dataset: hourly ride counts for two pickup locations, each with gaps in its series.
# One tuple per row: (pickup_hour, pickup_location_id, rides)
df = pd.DataFrame(
    [
        ('2022-01-01 00:00:00', 1, 2),
        ('2022-01-01 01:00:00', 1, 3),
        ('2022-01-01 03:00:00', 1, 1),
        ('2022-01-01 01:00:00', 2, 1),
        ('2022-01-01 02:00:00', 2, 2),
        ('2022-01-01 05:00:00', 2, 1),
    ],
    columns=['pickup_hour', 'pickup_location_id', 'rides'],
)
df

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-01-01 00:00:00,1,2
1,2022-01-01 01:00:00,1,3
2,2022-01-01 03:00:00,1,1
3,2022-01-01 01:00:00,2,1
4,2022-01-01 02:00:00,2,2
5,2022-01-01 05:00:00,2,1


In [None]:
# Fill the hourly gaps of every pickup location in the toy dataset
add_missing_slots(
    df,
    datetime_col='pickup_hour',
    entity_col='pickup_location_id',
    value_col='rides',
    freq='H',
)

100%|██████████| 2/2 [00:00<00:00, 667.14it/s]


Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-01-01 00:00:00,1,2
1,2022-01-01 01:00:00,1,3
2,2022-01-01 02:00:00,1,0
3,2022-01-01 03:00:00,1,1
4,2022-01-01 04:00:00,1,0
5,2022-01-01 05:00:00,1,0
6,2022-01-01 00:00:00,2,0
7,2022-01-01 01:00:00,2,1
8,2022-01-01 02:00:00,2,2
9,2022-01-01 03:00:00,2,0
10,2022-01-01 04:00:00,2,0
11,2022-01-01 05:00:00,2,1


In [None]:
#| hide
# Export the cells tagged `#| export` in this notebook to the library module
import nbdev
nbdev.nbdev_export()