# Preprocessing with Fugue

## Loading in Data

We'll take a quick look at the data given to us to understand the problem more. Most of the code snippets here are taken from [Rob Mulla's Starter Notebook](https://www.kaggle.com/code/robikscube/m5-forecasting-starter-data-exploration). We're not going to go to deep to understand everything. We're only interested in setting up an end-to-end modelling pipeline.

In [1]:
import pandas as pd
import os

download_path = os.path.abspath(os.path.join(".","..","data","m5-forecasting-accuracy.zip"))
unzipped_path = os.path.abspath(os.path.join(".","..","data","m5-forecasting-accuracy-unzipped"))

# Read in the data
INPUT_DIR = unzipped_path
WORKING_DIR = os.path.join(unzipped_path, "..", "working")
training_data = pd.read_csv(f'{INPUT_DIR}/sales_train_evaluation.csv')


## Training Data

In [2]:
training_data.iloc[0:1]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1


In [4]:
def get_calendar_data():
    df = pd.read_csv(f'{INPUT_DIR}/calendar.csv')
    df["date"] = pd.to_datetime(df["date"])
    return df

In [5]:
from typing import Iterable, List, Any, Dict
from fugue import transform
from datetime import timedelta

start = get_calendar_data()['date'].min()

# schema: unique_id:str,item_id:str,store_id:str,ds:date,y:int
def format_sales(df:Iterable[List[Any]], start) -> Iterable[List[Any]]:
    for row in df:
        counter = 0
        for y in row[6:]:
            # help with convergence
            if y == 0:
                y = y + 0.01
            date = start + timedelta(counter-1)
            yield row[:2] + [row[4]] + [date, y]
            counter=counter+1

In [6]:
transform(training_data.iloc[0:1], format_sales, params={"start": start})

Unnamed: 0,unique_id,item_id,store_id,ds,y
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-28,0
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-29,0
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-30,0
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-31,0
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-02-01,0
...,...,...,...,...,...
1936,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2016-05-17,0
1937,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2016-05-18,3
1938,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2016-05-19,3
1939,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2016-05-20,0


In [7]:
ddf = transform(training_data[0:100], 
                format_sales, 
                params={"start": start}, 
                engine="dask")
ddf.compute().head(5)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 58670 instead


Unnamed: 0,unique_id,item_id,store_id,ds,y
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-28,0
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-29,0
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-30,0
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-31,0
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-02-01,0


## Exogenous Regressors

We want to add price in.

In [7]:
sell_prices = pd.read_csv(f'{INPUT_DIR}/sell_prices.csv')
sell_prices.head(2)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58


In [8]:
get_calendar_data().head(2)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0


In [9]:
from fugue import FugueWorkflow

sampled_sales = training_data.iloc[0:2]
calendar = get_calendar_data()
start = calendar['date'].min()

def process_data(sample=True) -> FugueWorkflow:
    dag = FugueWorkflow()
    if sample:
        sales = dag.df(sampled_sales)
    else:
        sales = dag.load(f'{INPUT_DIR}/sales_train_evaluation.csv', header=True)
    prices = dag.load(f'{INPUT_DIR}/sell_prices.csv', header=True)
    calendar = dag.load(f'{INPUT_DIR}/calendar.csv', header=True).rename({"date": "ds"}).alter_columns("ds:date")
    sales = sales.transform(format_sales, params={"start": start})
    combined = sales.join(calendar[["ds","wm_yr_wk"]], how="left_outer")\
                    .join(prices, how="inner")
    combined.show()
    combined.save(f"{WORKING_DIR}/combined.parquet")
    return dag

In [10]:
dag = process_data(sample=True)
dag.run()

Unnamed: 0,unique_id,item_id,store_id,ds,y,wm_yr_wk,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-13,0,11325,9.58
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-14,0,11325,9.58
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-15,0,11325,9.58
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-16,0,11325,9.58
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-17,1,11325,9.58
5,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-18,0,11325,9.58
6,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-19,0,11325,9.58
7,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-20,0,11326,9.58
8,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-21,0,11326,9.58
9,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2013-07-22,0,11326,9.58


DataFrames()

In order to run on the full dataset and get the full combined file, you can execute:

```python
dag = process_data(sample=False)
dag.run("dask")
```

In [11]:
from statsforecast.distributed.utils import forecast
from statsforecast.models import AutoARIMA
from statsforecast import StatsForecast

df = pd.read_parquet(f"{WORKING_DIR}/combined.parquet")
df = df[["unique_id", "ds", "y","sell_price"]]

model = StatsForecast(
    df=df,
    models=[AutoARIMA(season_length=7)], 
    freq='D', 
    n_jobs=-1
)

# We can forecast for one timeseries. 
# We need the future values of exogenous regressors.
sample = pd.DataFrame({
    'unique_id': ['HOBBIES_1_002_CA_1_evaluation']*7,
    'ds': pd.date_range(start='2016-05-22', end='2016-05-28').tolist(),
    'sell_price': [3.97] * 7
})
# Pass a forecast to the forecast
model.forecast(7, sample)

## Next Steps

How can we make this better?
1. Hierarchichal Forecasting
2. Passing Data Is Current Inefficient
3. Scaling to Coiled Cluster

## Hierarchichal Preprocessing

We need to keep the hierchichal columns to preserve the relationships between the timeseries.

In [12]:
start = get_calendar_data()['date'].min()

# schema: unique_id:str,item_id:str,dept_id:str,cat_id:str,store_id:str,state_id:str,ds:date,y:int
def format_sales_hierarchical(df:Iterable[List[Any]], start) -> Iterable[List[Any]]:
    for row in df:
        counter = 0
        for y in row[6:]:
            # help with convergence
            if y == 0:
                y = y + 0.01
            date = start + timedelta(counter-1)
            yield row[:6] + [date, y]
            counter=counter+1

In [13]:
transform(training_data, format_sales_hierarchical, params={"start": start}, engine="dask", save_path=f"{WORKING_DIR}/hierarchical.parquet")



'/Users/kevinkho/Work/pydata-timeseries-forecasting/data/m5-forecasting-accuracy-unzipped/../working/hierarchical.parquet'