In [1]:
import warnings
warnings.simplefilter("ignore")

from typing import Iterator, Any, Union, List, Dict
from pyspark.sql import SparkSession

# Session handle obtained via the builder's getOrCreate(); later cells use `spark`
# inside run_predict and as the Fugue engine.
spark = SparkSession.builder.getOrCreate()

## First Look at Dask and Spark

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Toy training set: targets are built as 1*x_1 + 2*x_2 + 3, so the relationship
# the regression is fitted on is known exactly.
X = pd.DataFrame({"x_1": [1, 1, 2, 2], "x_2": [1, 2, 2, 3]})
weights = np.array([1, 2])
y = np.dot(X, weights) + 3

# fit() returns the estimator itself, so constructing and fitting separately
# leaves `reg` bound to the same fitted model.
reg = LinearRegression()
reg.fit(X, y)

In [3]:
def predict(df: pd.DataFrame, model: LinearRegression) -> pd.DataFrame:
    """Return ``df`` with an extra ``predicted`` column holding ``model.predict(df)``.

    Args:
        df: Feature frame; it is passed to the model unchanged.
        model: Fitted estimator exposing ``predict``.

    Returns:
        A new DataFrame (``assign`` does not modify ``df``) with the
        ``predicted`` column added.
    """
    predictions = model.predict(df)
    return df.assign(predicted=predictions)

# Unseen inputs to score; this frame is reused by the Dask, Spark and Fugue cells below.
input_df = pd.DataFrame({"x_1": [3, 4, 6, 6], "x_2":[3, 3, 6, 6]})

# test the function
predict(input_df.copy(), reg)

Unnamed: 0,x_1,x_2,predicted
0,3,3,12.0
1,4,3,13.0
2,6,6,21.0
3,6,6,21.0


**Bring it to Dask (map_partitions)**

In [4]:
import dask.dataframe as dd

# Wrap the pandas frame as a Dask DataFrame, requesting two partitions.
dask_df = dd.from_pandas(input_df, npartitions=2)

In [5]:
# map_partitions calls predict once per partition; `reg` is forwarded as its `model` argument.
ddf = dask_df.map_partitions(predict, reg)
# compute() gathers the partitions into the pandas result displayed below.
ddf.compute()

Unnamed: 0,x_1,x_2,predicted
0,3,3,12.0
1,4,3,13.0
2,6,6,21.0
3,6,6,21.0


**Bring it to Spark (mapInPandas)**

In [6]:
from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark.sql import DataFrame

def predict_wrapper(dfs: Iterator[pd.DataFrame], model):
    """Lazily apply ``predict`` to every pandas batch in ``dfs``.

    This is a generator: each incoming batch is scored with ``model`` and
    yielded in order, one batch at a time.
    """
    yield from (predict(batch, model) for batch in dfs)

def run_predict(input_df: Union[DataFrame, pd.DataFrame], model):
    """Score ``input_df`` on Spark and return a Spark DataFrame with a ``predicted`` column.

    Args:
        input_df: Either a pandas DataFrame (converted to Spark through the
            notebook-level ``spark`` session) or an existing Spark DataFrame.
        model: Fitted estimator forwarded to ``predict_wrapper``.

    Returns:
        The Spark DataFrame produced by ``mapInPandas``; its schema is the
        input schema plus a double-typed ``predicted`` field.
    """
    # conversion
    if isinstance(input_df, pd.DataFrame):
        sdf = spark.createDataFrame(input_df.copy())
    else:
        # Spark DataFrames have no ``copy`` method and are immutable, so the
        # frame is used as-is instead of calling ``.copy()`` on it.
        sdf = input_df

    # Output schema = input fields followed by the new prediction column.
    fields = list(sdf.schema.fields)
    fields.append(StructField("predicted", DoubleType()))
    schema = StructType(fields)

    # mapInPandas passes only the batch iterator, so the model is bound via a closure.
    return sdf.mapInPandas(lambda dfs: predict_wrapper(dfs, model),
                           schema=schema)

# A pandas frame is passed here, so run_predict goes through its pandas -> Spark conversion branch.
result = run_predict(input_df.copy(), reg)
result.show()

+---+---+---------+
|x_1|x_2|predicted|
+---+---+---------+
|  3|  3|     12.0|
|  4|  3|     13.0|
|  6|  6|     21.0|
|  6|  6|     21.0|
+---+---+---------+



**Using Fugue**

In [7]:
from fugue import transform

# Run the plain pandas `predict` through Fugue without naming an engine.
# The schema string keeps every input column ("*") and adds `predicted` as double;
# `params` supplies the extra `model` argument of `predict`.
result = transform(input_df.copy(), predict,
                   schema="*,predicted:double", params={"model": reg})
result.head()

Unnamed: 0,x_1,x_2,predicted
0,3,3,12.0
1,4,3,13.0
2,6,6,21.0
3,6,6,21.0


In [8]:
# Same function, schema and params as the engine-less call; only engine="dask" is added.
result = transform(input_df.copy(), predict,
                   schema="*,predicted:double", params={"model": reg},
                   engine="dask")
print(type(result))
result.compute().head()

<class 'dask.dataframe.core.DataFrame'>


Unnamed: 0,x_1,x_2,predicted
0,3,3,12.0
0,4,3,13.0
0,6,6,21.0
0,6,6,21.0


In [9]:
# Same function, schema and params again; this time the notebook's SparkSession
# object is handed over as the engine.
result = transform(input_df.copy(), predict,
                   schema="*,predicted:double", params={"model": reg},
                   engine=spark)
print(type(result))
result.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+---+---+---------+
|x_1|x_2|predicted|
+---+---+---------+
|  3|  3|     12.0|
|  4|  3|     13.0|
|  6|  6|     21.0|
|  6|  6|     21.0|
+---+---+---------+



**Change in business logic**

In [10]:
from typing import List, Dict, Any

def predict_row(df: List[Dict[str,Any]], model: LinearRegression) -> List[Dict[str,Any]]:
    """Add a ``predicted`` value to every row, with a special case for ``x_2 == 3``.

    Rows whose ``x_2`` equals 3 get ``x_1 + x_2``; all other rows are scored
    by ``model``. The rows are updated in place and the same list is returned.

    Args:
        df: Rows as dictionaries; each must contain ``x_1`` and ``x_2``.
        model: Fitted estimator exposing ``predict``.

    Returns:
        The input list, each row now carrying a ``predicted`` key.
    """
    for row in df:
        if row["x_2"] == 3:
            row["predicted"] = row["x_1"] + row["x_2"]
        else:
            # Leave any existing "predicted" value out of the feature vector so
            # that re-applying the function to already-scored rows does not
            # hand the model an extra feature.
            features = [value for name, value in row.items() if name != "predicted"]
            row["predicted"] = float(model.predict([features])[0])
    return df

In [11]:
# Quick local check of the x_2 == 3 branch: the expected prediction is x_1 + x_2 = 6.
predict_row([{"x_1": 3, "x_2":3}], reg)

[{'x_1': 3, 'x_2': 3, 'predicted': 6}]

In [12]:
# The row-wise function (list of dicts in, list of dicts out) goes through the
# same transform call as before, here on the Dask engine.
result = transform(input_df.copy(), predict_row,
                   schema="*,predicted:double", params={"model": reg},
                   engine="dask")
result.compute().head()

Unnamed: 0,x_1,x_2,predicted
0,3,3,6.0
0,4,3,7.0
0,6,6,21.0
0,6,6,21.0


In [13]:
# Row-wise function on Spark; the engine is selected by the string "spark" here.
result = transform(input_df, predict_row,
                   schema="*,predicted:double", params={"model": reg},
                   engine="spark")
print(type(result))
result.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+---+---+---------+
|x_1|x_2|predicted|
+---+---+---------+
|  3|  3|      6.0|
|  4|  3|      7.0|
|  6|  6|     21.0|
|  6|  6|     21.0|
+---+---+---------+



## Partitioning

In [14]:
# Notebook integration for Fugue; presumably this is what enables the %%fsql
# cell magic used below (TODO confirm against the fugue_notebook docs).
from fugue_notebook import setup
setup()

<IPython.core.display.Javascript object>

In [15]:
# schema: *,predicted:int
def predict(df: pd.DataFrame) -> pd.DataFrame:
    """Append a ``predicted`` column computed with the notebook-level ``reg`` model.

    The ``# schema:`` comment directly above the function is presumably read
    by Fugue as the output schema hint (all input columns plus an integer
    ``predicted``), since the FugueSQL cell that uses it passes no schema.

    NOTE(review): this redefines the earlier two-argument ``predict`` with a
    one-argument signature; cells that pass ``model`` will fail if re-run
    after this cell.
    """
    predictions = reg.predict(df)
    return df.assign(predicted=predictions)

In [16]:
%%fsql 
-- Keep the rows with x_1 >= 4, pass them through predict, and print the result.
SELECT *
  FROM input_df
 WHERE x_1 >= 4
TRANSFORM USING predict
 PRINT

Unnamed: 0,x_1,x_2,predicted
0,4,3,13
1,6,6,21
2,6,6,21
