# Running on a Dask Cluster (with Coiled)

Previously, we expanded each row to the full timeseries to use with the `forecast` function. In distributed computing, however, we want to minimize the amount of data transferred over the network, so here we keep each timeseries in a compact form and only expand it where it is processed.

## Setup

In [1]:
import pandas as pd
import os

# The downloaded archive and its extracted copy both live under ../data,
# resolved against the current working directory.
data_dir = os.path.join(".", "..", "data")
download_path = os.path.abspath(os.path.join(data_dir, "m5-forecasting-accuracy.zip"))
unzipped_path = os.path.abspath(os.path.join(data_dir, "m5-forecasting-accuracy-unzipped"))

# INPUT_DIR holds the raw CSVs; WORKING_DIR is a sibling "working" folder
# (the ".." segment is left in the path string, it is not normalised here).
INPUT_DIR = unzipped_path
WORKING_DIR = os.path.join(INPUT_DIR, "..", "working")

# Load the three raw tables used throughout the notebook.
calendar = pd.read_csv(f'{INPUT_DIR}/calendar.csv')
sales = pd.read_csv(f'{INPUT_DIR}/sales_train_evaluation.csv')
sell_prices = pd.read_csv(f'{INPUT_DIR}/sell_prices.csv')

## Minimizing Data Footprint

In [2]:
from typing import List, Dict, Any, Iterable
from datetime import date
import pickle

def prices_to_series(df:pd.DataFrame) -> List[Dict[str,Any]]:
    """Collapse the daily price rows of one (store, item) pair into one record.

    ``df`` is expected to hold the rows of a single store/item combination,
    ordered by ``date``, with exactly one row per calendar day between the
    first and the last date.

    Returns a one-element list whose dict carries ``store_id``, ``item_id``,
    ``price_start`` (the date of the first row) and ``prices`` (the
    ``sell_price`` values as a plain list, in row order).

    Raises ``AssertionError`` when the number of rows does not match the
    number of days spanned by the first and last date.
    """
    # One row per day: the row count must equal the inclusive day span.
    assert len(df) == (df["date"].iloc[-1] - df["date"].iloc[0]).days + 1
    head = df.iloc[0]
    record = {
        "store_id": head["store_id"],
        "item_id": head["item_id"],
        "price_start": head["date"],
        "prices": df["sell_price"].tolist(),
    }
    return [record]


# Three consecutive days for one store/item: enough to see the compact record.
sample_rows = [
    ["store1", "item1", date(2020, 1, 2), 2.2],
    ["store1", "item1", date(2020, 1, 3), 3.3],
    ["store1", "item1", date(2020, 1, 4), 4.4],
]
df = pd.DataFrame(sample_rows, columns=["store_id", "item_id", "date", "sell_price"])
print(prices_to_series(df))

[{'store_id': 'store1', 'item_id': 'item1', 'price_start': datetime.date(2020, 1, 2), 'prices': [2.2, 3.3, 4.4]}]


In [3]:
# Prices are stored per week (`wm_yr_wk`); joining on the calendar turns each
# weekly price into one row per calendar date, which is then parsed to datetime.
joined = (
    sell_prices
    .merge(calendar[["date", "wm_yr_wk"]], how="inner", on="wm_yr_wk")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)
joined.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,date
0,CA_1,HOBBIES_1_001,11325,9.58,2013-07-13
1,CA_1,HOBBIES_1_001,11325,9.58,2013-07-14
2,CA_1,HOBBIES_1_001,11325,9.58,2013-07-15
3,CA_1,HOBBIES_1_001,11325,9.58,2013-07-16
4,CA_1,HOBBIES_1_001,11325,9.58,2013-07-17


In [4]:
from fugue import transform

# Collapse the daily rows into one compact record per (store, item); each
# partition is sorted by date before `prices_to_series` sees it.
# NOTE: this rebinds `sell_prices` from the raw table to the compact one, so
# the merge cell that builds `joined` cannot be re-run without reloading the CSV.
price_partition = {"by": ["store_id", "item_id"], "presort": "date asc"}
sell_prices = transform(
    joined,
    prices_to_series,
    schema="store_id:str,item_id:str,price_start:date,prices:[float]",
    partition=price_partition,
)
sell_prices.head()

Unnamed: 0,store_id,item_id,price_start,prices
0,CA_1,FOODS_1_001,2011-01-29,"[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ..."
1,CA_1,FOODS_1_002,2011-01-29,"[7.88, 7.88, 7.88, 7.88, 7.88, 7.88, 7.88, 7.8..."
2,CA_1,FOODS_1_003,2011-01-29,"[2.88, 2.88, 2.88, 2.88, 2.88, 2.88, 2.88, 2.8..."
3,CA_1,FOODS_1_004,2012-03-03,"[1.78, 1.78, 1.78, 1.78, 1.78, 1.78, 1.78, 1.7..."
4,CA_1,FOODS_1_005,2011-01-29,"[2.94, 2.94, 2.94, 2.94, 2.94, 2.94, 2.94, 2.9..."


In [5]:
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [6]:
# schema: unique_id:str,item_id:str,store_id:str,sales_start:date,sales:[float]
def sales_to_series(df:Iterable[List[Any]], start) -> Iterable[List[Any]]:
    """Turn each wide sales row into a compact ``[ids..., start, values]`` row.

    For every incoming row, keeps the first two fields and the fifth field,
    then appends ``start`` and a single list holding everything from the
    seventh field onwards. Rows are produced lazily, one per input row.
    """
    for row in df:
        leading_ids = row[:2]
        store = row[4]
        daily_values = row[6:]
        yield leading_ids + [store, start, daily_values]

# Every series is stamped with the earliest calendar date as its start.
# NOTE: `sales` is rebound from the wide table to the compact one here.
series_start = calendar['date'].min()
sales = transform(sales, sales_to_series, params={"start": series_start})

In [7]:
sales.head()

Unnamed: 0,unique_id,item_id,store_id,sales_start,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# Attach each item's compact price history to its compact sales history
# (inner join, so only item/store pairs present in both tables survive).
combined = sales.merge(sell_prices, how="inner", on=["item_id", "store_id"])

## Defining Logic for Each Timeseries

In [9]:
combined.iloc[0:1]

Unnamed: 0,unique_id,item_id,store_id,sales_start,sales,price_start,prices
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2013-07-13,"[9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.5..."


In [10]:
def format_series(df:List[Dict[str,Any]]) -> pd.DataFrame:
    """Expand one compact (sales, prices) record into a daily long-format frame.

    Parameters
    ----------
    df : list of dict
        Records for a single series; only the first record is read. It must
        provide ``sales_start``, ``sales``, ``price_start``, ``prices`` and
        ``unique_id``.

    Returns
    -------
    pd.DataFrame
        Columns ``ds``, ``quantity``, ``price``, ``unique_id`` with one row
        per day that has both a sales value and a price.

    Raises
    ------
    IndexError
        If ``df`` is empty.
    """
    row = df[0]
    # Uppercase "D" is the documented daily alias (and matches the freq used
    # for the models); the lowercase spelling is deprecated in recent pandas.
    sales_days = pd.date_range(row["sales_start"], periods=len(row["sales"]), freq="D")
    series = pd.DataFrame({"quantity": row["sales"]}, index=sales_days)
    price_days = pd.date_range(row["price_start"], periods=len(row["prices"]), freq="D")
    # Assignment aligns on the date index: days without a price become NaN
    # and are removed by the dropna below.
    series["price"] = pd.Series(row["prices"], index=price_days)
    series = series.dropna().reset_index()
    series.columns = ["ds", "quantity", "price"]
    series["unique_id"] = row["unique_id"]
    return series

In [11]:
# Try the formatter on the first combined record only.
first_record = combined.head(1).to_dict("records")
test = format_series(first_record)
test.head()

Unnamed: 0,ds,quantity,price,unique_id
0,2013-07-13,0,9.58,HOBBIES_1_001_CA_1_evaluation
1,2013-07-14,0,9.58,HOBBIES_1_001_CA_1_evaluation
2,2013-07-15,0,9.58,HOBBIES_1_001_CA_1_evaluation
3,2013-07-16,0,9.58,HOBBIES_1_001_CA_1_evaluation
4,2013-07-17,0,9.58,HOBBIES_1_001_CA_1_evaluation


## Time Series Cross Validation

For time series cross-validation, we fit and evaluate the models over a sliding window of test sets. Each window is forecast using only the data before its cutoff, so we never predict past data points with future information.

![img](https://nixtla.github.io/statsforecast/examples/CrossValidation_files/figure-html/cell-5-output-2.png)

In [12]:
from statsforecast import StatsForecast
from statsforecast.models import Naive, CrostonClassic, IMAPA, ADIDA, AutoARIMA

def run_model_cv(df: pd.DataFrame, h: int = 28, n_windows: int = 2) -> pd.DataFrame:
    """Cross-validate the candidate forecasting models on one formatted series.

    Parameters
    ----------
    df : pd.DataFrame
        Series data handed to ``StatsForecast`` unchanged.
        NOTE(review): the notebook passes ``ds``/``quantity``/``price``/
        ``unique_id`` columns while the displayed result exposes a ``y``
        column, so the library is presumably choosing the target column
        itself -- confirm against the installed statsforecast version.
    h : int, default 28
        Forecast horizon forwarded to ``cross_validation``.
    n_windows : int, default 2
        Number of cross-validation windows forwarded to ``cross_validation``.

    Returns
    -------
    pd.DataFrame
        The result of ``StatsForecast.cross_validation`` (in this notebook:
        one row per forecast date with ``cutoff``, ``y`` and one column per
        model).
    """
    sf = StatsForecast(
        df=df,
        models=[
            Naive(),
            CrostonClassic(),
            IMAPA(),
            ADIDA(),
            AutoARIMA(),
        ],
        freq="D",
        # One job per call; presumably parallelism is left to the engine that
        # distributes the series -- TODO confirm.
        n_jobs=1,
    )
    return sf.cross_validation(h=h, n_windows=n_windows)


In [13]:
# Single-series check of the cross-validation step, using the frame built by
# format_series above, before it is wired into the distributed transform.
test2 = run_model_cv(test)
test2.head()

Unnamed: 0_level_0,ds,cutoff,y,Naive,CrostonClassic,IMAPA,ADIDA,AutoARIMA
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HOBBIES_1_001_CA_1_evaluation,2016-04-24,2016-04-23,1.0,1.0,1.635531,1.058726,1.042473,0.995748
HOBBIES_1_001_CA_1_evaluation,2016-04-25,2016-04-23,0.0,1.0,1.635531,1.058726,1.042473,0.995289
HOBBIES_1_001_CA_1_evaluation,2016-04-26,2016-04-23,0.0,1.0,1.635531,1.058726,1.042473,0.995289
HOBBIES_1_001_CA_1_evaluation,2016-04-27,2016-04-23,0.0,1.0,1.635531,1.058726,1.042473,0.995289
HOBBIES_1_001_CA_1_evaluation,2016-04-28,2016-04-23,2.0,1.0,1.635531,1.058726,1.042473,0.995289


In [14]:
from sklearn.metrics import mean_absolute_error

def calculate_metrics(cv_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise cross-validation output as one mean-absolute-error per model.

    ``cv_df`` must contain the actuals in ``y`` and one prediction column for
    each of Naive, CrostonClassic, IMAPA, ADIDA and AutoARIMA.

    Returns a frame with columns ``models``, ``metric`` and ``unique_id``.
    The ``unique_id`` value is the first index label of ``cv_df``, so the
    input is assumed to be indexed by series id and to hold a single series.
    """
    model_names = ["Naive", "CrostonClassic", "IMAPA", "ADIDA", "AutoARIMA"]
    scores = [mean_absolute_error(cv_df['y'], cv_df[name]) for name in model_names]
    summary = pd.DataFrame({"models": model_names, "metric": scores})
    summary['unique_id'] = cv_df.index[0]
    return summary


In [15]:
calculate_metrics(test2)

Unnamed: 0,models,metric,unique_id
0,Naive,1.107143,HOBBIES_1_001_CA_1_evaluation
1,CrostonClassic,1.279644,HOBBIES_1_001_CA_1_evaluation
2,IMAPA,1.12363,HOBBIES_1_001_CA_1_evaluation
3,ADIDA,1.119766,HOBBIES_1_001_CA_1_evaluation
4,AutoARIMA,1.109033,HOBBIES_1_001_CA_1_evaluation


In [16]:
combined.head()

Unnamed: 0,unique_id,item_id,store_id,sales_start,sales,price_start,prices
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2013-07-13,"[9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.5..."
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2011-06-18,"[3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.9..."
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2014-02-01,"[2.97, 2.97, 2.97, 2.97, 2.97, 2.97, 2.97, 2.9..."
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2011-03-05,"[4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.3..."
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2011-05-21,"[2.98, 2.98, 2.98, 2.98, 2.98, 2.98, 2.98, 2.9..."


In [17]:
def process(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full per-series pipeline: expand, cross-validate, score.

    ``df`` holds the compact record(s) of one series. It is converted to
    records, expanded by ``format_series``, cross-validated by
    ``run_model_cv`` and summarised by ``calculate_metrics``. The returned
    frame has a fresh 0..n-1 index.
    """
    records = df.to_dict("records")
    cv_results = run_model_cv(format_series(records))
    return calculate_metrics(cv_results).reset_index(drop=True)

def dummy(df:List[Dict[str,Any]]) -> List[Dict[str,Any]]:
    """Pass-through placeholder: return the incoming records unchanged.

    The return annotation now matches what is actually returned (the same
    list of dicts that was passed in), instead of claiming a DataFrame.
    NOTE(review): Fugue reads these annotations to decide how to convert
    inputs and outputs, so a mismatched annotation would presumably break
    a ``transform`` call using this function -- confirm if it gets used.
    """
    return df

In [18]:
# Local dry run on the first two series: one partition per unique_id,
# each handled by `process`.
transform(
    combined.head(2),
    process,
    schema="models:str,metric:float,unique_id:str",
    partition={"by": "unique_id"},
)

Unnamed: 0,models,metric,unique_id
0,Naive,1.107143,HOBBIES_1_001_CA_1_evaluation
1,CrostonClassic,1.279644,HOBBIES_1_001_CA_1_evaluation
2,IMAPA,1.12363,HOBBIES_1_001_CA_1_evaluation
3,ADIDA,1.119766,HOBBIES_1_001_CA_1_evaluation
4,AutoARIMA,1.109033,HOBBIES_1_001_CA_1_evaluation
5,Naive,0.25,HOBBIES_1_002_CA_1_evaluation
6,CrostonClassic,0.930472,HOBBIES_1_002_CA_1_evaluation
7,IMAPA,0.330838,HOBBIES_1_002_CA_1_evaluation
8,ADIDA,0.352756,HOBBIES_1_002_CA_1_evaluation
9,AutoARIMA,0.418041,HOBBIES_1_002_CA_1_evaluation


## Running on a Coiled Cluster

You need to be logged in to [Coiled](https://www.coiled.io/) to create a Dask cluster with them and follow these steps. Without a Coiled account, you can still use a local cluster as seen before.

```python
import coiled

coiled.create_software_environment(
    name="pydata",
    conda=["python=3.8.13"],
    pip=["fugue[dask]", "statsforecast", "scikit-learn", "numpy==1.22.4"],
)
```

In [19]:
# from dask.distributed import Client
# from coiled import Cluster

# cluster = Cluster(name="pydata", software="pydata", n_workers=10)
# client = Client(cluster)

Output()

In [20]:
# results = transform(combined.iloc[0:50], 
#                     process, 
#                     schema="models:str,metric:float,unique_id:str", 
#                     engine=client, 
#                     partition={"by": "unique_id"})

In [23]:
# results = results.compute()

In [24]:
# Same per-series pipeline on the first 50 series, executed on the Ray engine;
# `as_local=True` requests the result as a local object rather than a
# distributed one.
results = transform(
    combined.head(50),
    process,
    schema="models:str,metric:float,unique_id:str",
    partition={"by": "unique_id"},
    engine="ray",
    as_local=True,
)

2022-11-11 13:06:16,868	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
Map_Batches: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]
Sort Sample: 100%|██████████| 1/1 [00:00<00:00, 338.41it/s]
Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 70.05it/s]
Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 72.49it/s]
Map_Batches: 100%|██████████| 1/1 [02:17<00:00, 137.91s/it]


In [28]:
# Convert the Ray run's result to pandas via its `.to_pandas()` method so the
# pandas-only steps that follow can be applied.
best_models = results.to_pandas()

In [29]:
# For every series keep the row with the lowest metric: sort ascending by
# metric, then take the first row of each unique_id group.
best_models = (
    best_models
    .sort_values(by='metric')
    .groupby("unique_id")
    .first()
)

In [32]:
best_models['models'].value_counts()

Naive             27
AutoARIMA         12
ADIDA              8
IMAPA              2
CrostonClassic     1
Name: models, dtype: int64