# Running on a Spark (with Databricks)

Previously, we expanded each row to the full timeseries to use with the `forecast` function. In practice, we want to be minimizing network data transfer when it comes to distributed computing.

## Setup

In [1]:
import pandas as pd
import os

# Read in the data
INPUT_DIR = os.path.abspath('data')
WORKING_DIR = os.path.abspath("data/working")
calendar = pd.read_csv(f'{INPUT_DIR}/calendar.csv')
sales = pd.read_csv(f'{INPUT_DIR}/sales_train_evaluation.csv')
sell_prices = pd.read_csv(f'{INPUT_DIR}/sell_prices.csv')

## Minimizing Data Footprint

In [2]:
from typing import List, Dict, Any, Iterable
from datetime import date
import pickle

def prices_to_series(df:pd.DataFrame) -> List[Dict[str,Any]]:
    # Assert each date has a price entry
    assert df.shape[0] == (df.date.iloc[-1]-df.date.iloc[0]).days + 1
    return [dict(store_id=df.iloc[0]["store_id"],
                 item_id=df.iloc[0]['item_id'],
                 price_start=df.iloc[0]['date'], 
                 prices=df["sell_price"].tolist())]


df = pd.DataFrame([["store1","item1",date(2020,1,2),2.2], 
                   ["store1","item1",date(2020,1,3),3.3],
                   ["store1","item1", date(2020,1,4),4.4]], 
                   columns=["store_id", "item_id", "date","sell_price"])
print(prices_to_series(df))

[{'store_id': 'store1', 'item_id': 'item1', 'price_start': datetime.date(2020, 1, 2), 'prices': [2.2, 3.3, 4.4]}]


In [3]:
joined = sell_prices.merge(calendar[["date","wm_yr_wk"]], how="inner", on="wm_yr_wk")
joined['date'] = pd.to_datetime(joined['date'])
joined.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,date
0,CA_1,HOBBIES_1_001,11325,9.58,2013-07-13
1,CA_1,HOBBIES_1_001,11325,9.58,2013-07-14
2,CA_1,HOBBIES_1_001,11325,9.58,2013-07-15
3,CA_1,HOBBIES_1_001,11325,9.58,2013-07-16
4,CA_1,HOBBIES_1_001,11325,9.58,2013-07-17


In [4]:
from fugue import transform

sell_prices = transform(joined, 
                prices_to_series, 
                schema="store_id:str,item_id:str,price_start:date,prices:[float]",
                partition={"by": ["store_id", "item_id"], "presort": "date asc"})
sell_prices.head()

Unnamed: 0,store_id,item_id,price_start,prices
0,CA_1,FOODS_1_001,2011-01-29,"[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ..."
1,CA_1,FOODS_1_002,2011-01-29,"[7.88, 7.88, 7.88, 7.88, 7.88, 7.88, 7.88, 7.8..."
2,CA_1,FOODS_1_003,2011-01-29,"[2.88, 2.88, 2.88, 2.88, 2.88, 2.88, 2.88, 2.8..."
3,CA_1,FOODS_1_004,2012-03-03,"[1.78, 1.78, 1.78, 1.78, 1.78, 1.78, 1.78, 1.7..."
4,CA_1,FOODS_1_005,2011-01-29,"[2.94, 2.94, 2.94, 2.94, 2.94, 2.94, 2.94, 2.9..."


In [5]:
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [5]:
# schema: unique_id:str,item_id:str,store_id:str,sales_start:date,sales:[float]
def sales_to_series(df:Iterable[List[Any]], start) -> Iterable[List[Any]]:
    for row in df:
        yield row[:2] + [row[4]] + [start, row[6:]]

sales = transform(sales, sales_to_series, params={"start": calendar['date'].min()})

In [6]:
sales.head()

Unnamed: 0,unique_id,item_id,store_id,sales_start,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
combined = sales.merge(sell_prices, on=["item_id", "store_id"])

## Defining Logic for Each Timeseries

In [8]:
combined.iloc[0:1]

Unnamed: 0,unique_id,item_id,store_id,sales_start,sales,price_start,prices
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2013-07-13,"[9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.5..."


In [10]:
def format_series(df:List[Dict[str,Any]]) -> pd.DataFrame:
    row = df[0]
    dr1 = pd.date_range(row["sales_start"],periods=len(row["sales"]), freq="d")
    df = pd.DataFrame({"quantity":row["sales"]},index = dr1)
    dr2 = pd.date_range(row["price_start"],periods=len(row["prices"]), freq="d")
    df["price"] = pd.Series(row["prices"],index = dr2)
    df=df.dropna().reset_index()
    df.columns=["ds", "quantity", "price"]
    df['unique_id'] = row['unique_id'] 
    return df

In [11]:
test = format_series(combined.iloc[0:1].to_dict("records"))
test.head()

Unnamed: 0,ds,quantity,price,unique_id
0,2013-07-13,0,9.58,HOBBIES_1_001_CA_1_evaluation
1,2013-07-14,0,9.58,HOBBIES_1_001_CA_1_evaluation
2,2013-07-15,0,9.58,HOBBIES_1_001_CA_1_evaluation
3,2013-07-16,0,9.58,HOBBIES_1_001_CA_1_evaluation
4,2013-07-17,0,9.58,HOBBIES_1_001_CA_1_evaluation


## Time Series Cross Validation

For timeseries cross validations, we perform the modelling with a sliding window of test sets. This is so we don't predict past data points with future information.

![img](https://nixtla.github.io/statsforecast/examples/CrossValidation_files/figure-html/cell-5-output-2.png)

In [11]:
from statsforecast import StatsForecast
from statsforecast.models import Naive, CrostonClassic, IMAPA, ADIDA, AutoARIMA

def run_model_cv(df: pd.DataFrame):
  sf = StatsForecast(df=df, 
      models=[CrostonClassic(),
        IMAPA(),
        AutoARIMA()
    ], 
      freq="D",
      n_jobs=1)

  return sf.cross_validation(h=28, n_windows=2)

  from tqdm.autonotebook import tqdm


In [13]:
test2 = run_model_cv(test)
test2.head()

Unnamed: 0_level_0,ds,cutoff,y,Naive,CrostonClassic,IMAPA,ADIDA,AutoARIMA
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HOBBIES_1_001_CA_1_evaluation,2016-04-24,2016-04-23,1.0,1.0,1.635531,1.058726,1.042473,0.995748
HOBBIES_1_001_CA_1_evaluation,2016-04-25,2016-04-23,0.0,1.0,1.635531,1.058726,1.042473,0.995289
HOBBIES_1_001_CA_1_evaluation,2016-04-26,2016-04-23,0.0,1.0,1.635531,1.058726,1.042473,0.995289
HOBBIES_1_001_CA_1_evaluation,2016-04-27,2016-04-23,0.0,1.0,1.635531,1.058726,1.042473,0.995289
HOBBIES_1_001_CA_1_evaluation,2016-04-28,2016-04-23,2.0,1.0,1.635531,1.058726,1.042473,0.995289


In [12]:
from sklearn.metrics import mean_absolute_error

def calculate_metrics(cv_df: pd.DataFrame) -> pd.DataFrame:
    models = []
    metrics = []
    for model in ["Naive", "CrostonClassic", "IMAPA", "ADIDA", "AutoARIMA"]:
        models.append(model)
        metrics.append(mean_absolute_error(cv_df['y'], cv_df[model]))
    out = pd.DataFrame({"models": models, "metric": metrics})
    out['unique_id'] = cv_df.index[0]
    return out


In [15]:
calculate_metrics(test2)

Unnamed: 0,models,metric,unique_id
0,Naive,1.107143,HOBBIES_1_001_CA_1_evaluation
1,CrostonClassic,1.279644,HOBBIES_1_001_CA_1_evaluation
2,IMAPA,1.12363,HOBBIES_1_001_CA_1_evaluation
3,ADIDA,1.119766,HOBBIES_1_001_CA_1_evaluation
4,AutoARIMA,1.109033,HOBBIES_1_001_CA_1_evaluation


In [16]:
combined.head()

Unnamed: 0,unique_id,item_id,store_id,sales_start,sales,price_start,prices
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2013-07-13,"[9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.5..."
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2011-06-18,"[3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.9..."
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2014-02-01,"[2.97, 2.97, 2.97, 2.97, 2.97, 2.97, 2.97, 2.9..."
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2011-03-05,"[4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.34, 4.3..."
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,CA_1,2011-01-29,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2011-05-21,"[2.98, 2.98, 2.98, 2.98, 2.98, 2.98, 2.98, 2.9..."


In [13]:
def process(df: pd.DataFrame) -> pd.DataFrame:
    timeseries = format_series(df.to_dict("records"))
    model_cv = run_model_cv(timeseries)
    metrics = calculate_metrics(model_cv).reset_index(drop=True)
    return metrics

In [18]:
transform(combined.iloc[0:2], 
          process, 
          schema="models:str,metric:float,unique_id:str", 
          partition={"by": "unique_id"},)

Unnamed: 0,models,metric,unique_id
0,Naive,1.107143,HOBBIES_1_001_CA_1_evaluation
1,CrostonClassic,1.279644,HOBBIES_1_001_CA_1_evaluation
2,IMAPA,1.12363,HOBBIES_1_001_CA_1_evaluation
3,ADIDA,1.119766,HOBBIES_1_001_CA_1_evaluation
4,AutoARIMA,1.109033,HOBBIES_1_001_CA_1_evaluation
5,Naive,0.25,HOBBIES_1_002_CA_1_evaluation
6,CrostonClassic,0.930472,HOBBIES_1_002_CA_1_evaluation
7,IMAPA,0.330838,HOBBIES_1_002_CA_1_evaluation
8,ADIDA,0.352756,HOBBIES_1_002_CA_1_evaluation
9,AutoARIMA,0.418041,HOBBIES_1_002_CA_1_evaluation


## Running on Spark Cluster

You can either use `databricks-connect` to connect to a Spark cluster or you can run this on Databricks.

In [14]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

Output()

In [20]:
# results = transform(combined.iloc[0:50], 
#                     process, 
#                     schema="models:str,metric:float,unique_id:str", 
#                     engine=client, 
#                     partition={"by": "unique_id"})

In [23]:
# results = results.compute()

In [15]:
results = transform(combined.iloc[0:50], 
          process, 
          schema="models:str,metric:float,unique_id:str", 
          partition={"by": "unique_id"},
          engine="spark").toPandas()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/11 14:44:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():
[Stage 2:>                                                          (0 + 1) / 1]

22/11/11 14:44:54 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 8)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3803, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'AutoARIMA'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/opt/a



Py4JJavaError: An error occurred while calling o113.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 8) (10.106.28.12 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3803, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'AutoARIMA'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/fugue_spark/execution_engine.py", line 730, in run
    res = self.map_func(cursor, sub_df)
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/fugue/extensions/_builtins/processors.py", line 335, in run
    return self.transformer.transform(df)
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/fugue/extensions/transformer/convert.py", line 265, in transform
    return self._wrapper.run(
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/fugue/_utils/interfaceless.py", line 224, in run
    rt = self._func(**rargs)
  File "/var/folders/w2/91_v34nx0xs2npnl3zsl9tmm0000gn/T/ipykernel_10849/2047948164.py", line 4, in process
  File "/var/folders/w2/91_v34nx0xs2npnl3zsl9tmm0000gn/T/ipykernel_10849/4050578545.py", line 8, in calculate_metrics
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pandas/core/frame.py", line 3804, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    raise KeyError(key) from err
KeyError: 'AutoARIMA'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:765)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:747)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:424)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3688)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3685)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3803, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'AutoARIMA'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/fugue_spark/execution_engine.py", line 730, in run
    res = self.map_func(cursor, sub_df)
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/fugue/extensions/_builtins/processors.py", line 335, in run
    return self.transformer.transform(df)
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/fugue/extensions/transformer/convert.py", line 265, in transform
    return self._wrapper.run(
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/fugue/_utils/interfaceless.py", line 224, in run
    rt = self._func(**rargs)
  File "/var/folders/w2/91_v34nx0xs2npnl3zsl9tmm0000gn/T/ipykernel_10849/2047948164.py", line 4, in process
  File "/var/folders/w2/91_v34nx0xs2npnl3zsl9tmm0000gn/T/ipykernel_10849/4050578545.py", line 8, in calculate_metrics
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pandas/core/frame.py", line 3804, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/opt/anaconda3/envs/fugue/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    raise KeyError(key) from err
KeyError: 'AutoARIMA'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:765)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:747)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [29]:
best_models = best_models.sort_values('metric', ascending=True).groupby("unique_id").first()

In [32]:
best_models['models'].value_counts()

Naive             27
AutoARIMA         12
ADIDA              8
IMAPA              2
CrostonClassic     1
Name: models, dtype: int64