# ts2ml

> Tools to transform a time series into features and a target, a.k.a. a supervised learning dataset

## Install

```sh
pip install ts2ml
```

## How to use

In [None]:
import pandas as pd
from ts2ml.core import add_missing_slots

In [None]:
df = pd.DataFrame({
    'pickup_hour': ['2022-01-01 00:00:00', '2022-01-01 01:00:00', '2022-01-01 03:00:00', '2022-01-01 01:00:00', '2022-01-01 02:00:00', '2022-01-01 05:00:00'],
    'pickup_location_id': [1, 1, 1, 2, 2, 2],
    'rides': [2, 3, 1, 1, 2, 1]
})
df

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-01-01 00:00:00,1,2
1,2022-01-01 01:00:00,1,3
2,2022-01-01 03:00:00,1,1
3,2022-01-01 01:00:00,2,1
4,2022-01-01 02:00:00,2,2
5,2022-01-01 05:00:00,2,1


In [None]:
add_missing_slots(df, datetime_col='pickup_hour', entity_col='pickup_location_id', value_col='rides', freq='H')

100%|██████████| 2/2 [00:00<00:00, 352.17it/s]


Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-01-01 00:00:00,1,2
1,2022-01-01 01:00:00,1,3
2,2022-01-01 02:00:00,1,0
3,2022-01-01 03:00:00,1,1
4,2022-01-01 04:00:00,1,0
5,2022-01-01 05:00:00,1,0
6,2022-01-01 00:00:00,2,0
7,2022-01-01 01:00:00,2,1
8,2022-01-01 02:00:00,2,2
9,2022-01-01 03:00:00,2,0


## Another Example
Monthly spaced time series

In [None]:
import pandas as pd
import numpy as np

# Generate timestamp index with monthly frequency
date_rng = pd.date_range(start='1/1/2020', end='12/1/2022', freq='MS')

# Create list of city codes
cities = ['FOR', 'SP', 'RJ']

# Create dataframe with random sales data for each city on each month
df = pd.DataFrame({
    'timestamp': date_rng,
    'city': np.repeat(cities, len(date_rng)//len(cities)),
    'sales': np.random.randint(1000, 5000, size=len(date_rng))
})
df

Unnamed: 0,timestamp,city,sales
0,2020-01-01,FOR,4216
1,2020-02-01,FOR,4309
2,2020-03-01,FOR,3639
3,2020-04-01,FOR,3685
4,2020-05-01,FOR,4481
5,2020-06-01,FOR,4133
6,2020-07-01,FOR,3504
7,2020-08-01,FOR,3957
8,2020-09-01,FOR,2781
9,2020-10-01,FOR,2996


In [None]:
df.groupby('city').agg({'timestamp': ['min', 'max']})

Unnamed: 0_level_0,timestamp,timestamp
Unnamed: 0_level_1,min,max
city,Unnamed: 1_level_2,Unnamed: 2_level_2
FOR,2020-01-01,2020-12-01
RJ,2022-01-01,2022-12-01
SP,2021-01-01,2021-12-01


The city FOR only has data for 2020, SP only for 2021, and RJ only for 2022. Let's also simulate additional missing slots by dropping some rows at random.

In [None]:
# Generate random indices to drop
drop_indices = np.random.choice(df.index, size=int(len(df)*0.2), replace=False)

# Drop selected rows from dataframe
df = df.drop(drop_indices)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,timestamp,city,sales
0,2020-01-01,FOR,4216
1,2020-03-01,FOR,3639
2,2020-05-01,FOR,4481
3,2020-06-01,FOR,4133
4,2020-07-01,FOR,3504
5,2020-08-01,FOR,3957
6,2020-09-01,FOR,2781
7,2020-10-01,FOR,2996
8,2020-11-01,FOR,3963
9,2020-12-01,FOR,2381


Now let's fill in the missing slots. The function adds a row with a zero value for every missing slot:

In [None]:
df_full = add_missing_slots(df, datetime_col='timestamp', entity_col='city', value_col='sales', freq='MS')
df_full

100%|██████████| 3/3 [00:00<00:00, 844.15it/s]


Unnamed: 0,timestamp,city,sales
0,2020-01-01,FOR,4216
1,2020-02-01,FOR,0
2,2020-03-01,FOR,3639
3,2020-04-01,FOR,0
4,2020-05-01,FOR,4481
...,...,...,...
103,2022-08-01,RJ,3924
104,2022-09-01,RJ,1577
105,2022-10-01,RJ,0
106,2022-11-01,RJ,1867


In [None]:
df_full.groupby('city').agg({'timestamp': ['min', 'max']})

Unnamed: 0_level_0,timestamp,timestamp
Unnamed: 0_level_1,min,max
city,Unnamed: 1_level_2,Unnamed: 2_level_2
FOR,2020-01-01,2022-12-01
RJ,2020-01-01,2022-12-01
SP,2020-01-01,2022-12-01
