## Distributed Compute

<img src="https://docs.dask.org/en/latest/_images/dask-dataframe.svg" align="left" width="400"/>

<img src="https://user-images.githubusercontent.com/11656932/62263986-bbba2f00-b3e3-11e9-9b5c-8446ba4efcf9.png" align="left" width="700"/>

In [1]:
import pandas as pd
from typing import Dict

# Letter -> food lookup table for the mapping examples.
map_dict = {"A": "Apple", "B": "Banana", "C": "Carrot"}

# Three-row demo frame; "food" is derived by translating each letter in "value".
input_df = pd.DataFrame(
    {
        "id": [0, 1, 2],
        "value": ["A", "B", "C"],
    }
)
input_df["food"] = input_df["value"].map(map_dict)

In [4]:
# Translate each letter in "value" through map_dict and store the result as "food"
# (same statement as the last line of the first cell, so it re-assigns that column).
input_df["food"] = input_df["value"].map(map_dict)

In [None]:
# PySpark
from itertools import chain

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, create_map, lit

# create_map takes an alternating key, value, key, value, ... list of columns, so
# flatten the (letter, food) pairs of map_dict and wrap every item as a literal.
mapping_expr = create_map([lit(x) for x in chain(*map_dict.items())])

# Build the Spark frame from the notebook's pandas demo data; the earlier draft of
# this cell referred to `mapping` and `df`, neither of which exists in the notebook.
spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame(input_df[["id", "value"]])

# Look up each row's "value" letter in the map column and store it as "food".
spark_df.withColumn("food", mapping_expr.getItem(col("value")))

In [9]:
import dask.dataframe as dd
# Split the pandas demo frame into two Dask partitions.
ddf = dd.from_pandas(input_df, npartitions=2)
# Add (or replace) the "food" column lazily; nothing runs until compute().
ddf = ddf.assign(food=ddf["value"].map(map_dict))
ddf.compute()

Unnamed: 0,id,value,food
0,0,A,Apple
1,1,B,Banana
2,2,C,Carrot


In [11]:
from fugue import transform
from typing import Dict, List, Iterable, Any

def map_letter_to_food(df: pd.DataFrame, map_dict:Dict) -> pd.DataFrame:
    """Return ``df`` with a ``food`` column looked up from its ``value`` column.

    Args:
        df: frame containing a ``value`` column.
        map_dict: lookup from each ``value`` entry to the food name.

    Returns:
        A new DataFrame with the ``food`` column added (or replaced). The input
        frame is left untouched, so re-running a cell that calls this function
        does not depend on whether ``df`` was already modified.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(food=df["value"].map(map_dict))

In [13]:
# Fresh demo frame, then a direct pandas call of the mapping function.
input_df = pd.DataFrame(
    {"id": [0, 1, 2], "value": ["A", "B", "C"]}
)
map_letter_to_food(df=input_df, map_dict=map_dict)

Unnamed: 0,id,value,food
0,0,A,Apple
1,1,B,Banana
2,2,C,Carrot


In [17]:
# Fresh demo frame, then the same function run through Fugue's transform().
input_df = pd.DataFrame(
    {"id": [0, 1, 2], "value": ["A", "B", "C"]}
)
transform(
    input_df,
    map_letter_to_food,
    schema="*, food:str",  # keep every input column and add a string "food" column
    params={"map_dict": map_dict},
)

  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))


Unnamed: 0,id,value,food
0,0,A,Apple
1,1,B,Banana
2,2,C,Carrot


In [None]:
# schema: *, food:str
def map_letter_to_food(df: pd.DataFrame, map_dict:Dict) -> pd.DataFrame:
    """Return ``df`` with a ``food`` column looked up from its ``value`` column.

    The output schema is declared with the ``# schema:`` comment directly above
    the definition, the same form the later transformer cells in this notebook
    use. ``transform`` is called in this notebook as a function taking the
    DataFrame first, so it is not applied as a bare decorator here.

    Args:
        df: frame containing a ``value`` column.
        map_dict: lookup from each ``value`` entry to the food name.

    Returns:
        A new DataFrame with the ``food`` column added; ``df`` is not modified.
    """
    return df.assign(food=df["value"].map(map_dict))

In [None]:
# Sketch of a FugueWorkflow block: two frames are obtained from the workflow object
# and then joined.
# NOTE(review): FugueWorkflow is not imported in any visible cell, and `df` is not
# assigned before this point in the visible cells; this cell has no recorded execution.
# NOTE(review): dag.create / dag.load are both handed `df` here; verify these are the
# intended workflow methods and arguments, and that join() needs no further options.
with FugueWorkflow() as dag:
    df = dag.create(df)
    df2 = dag.load(df)
    df.join(df2)

In [21]:
import fugue_spark

# Same function and schema as the pandas run; only the engine argument changes.
input_df = pd.DataFrame(
    {"id": [0, 1, 2], "value": ["A", "B", "C"]}
)
sdf = transform(
    input_df,
    map_letter_to_food,
    schema="*, food:str",
    params={"map_dict": map_dict},
    engine="spark",
)
sdf.show()

  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))


+---+-----+------+
| id|value|  food|
+---+-----+------+
|  0|    A| Apple|
|  1|    B|Banana|
|  2|    C|Carrot|
+---+-----+------+



In [23]:
import fugue_dask

# Same function and schema again, this time executed with engine="dask".
input_df = pd.DataFrame(
    {"id": [0, 1, 2], "value": ["A", "B", "C"]}
)
ddf = transform(
    input_df,
    map_letter_to_food,
    schema="*, food:str",
    params={"map_dict": map_dict},
    engine="dask",
)
# compute() materialises the result before head() picks the first rows.
ddf.compute().head()

  (pandas.RangeIndex, pandas.Int64Index, pandas.UInt64Index, pd.Index),
  (pandas.RangeIndex, pandas.Int64Index, pandas.UInt64Index, pd.Index),
  (pandas.RangeIndex, pandas.Int64Index, pandas.UInt64Index, pd.Index),
  (pandas.RangeIndex, pandas.Int64Index, pandas.UInt64Index, pd.Index),
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))


Unnamed: 0,id,value,food
0,0,A,Apple
0,1,B,Banana
0,2,C,Carrot


In [25]:
def map_letter_to_food2(df: List[Dict[str,Any]], map_dict:Dict) -> List[Dict[str,Any]]:
    """Add a ``food`` entry to every row dict, looked up from its ``value``.

    Rows are updated in place and the same list object is returned. A ``value``
    that is not a key of ``map_dict`` raises ``KeyError``.
    """
    for record in df:
        record.update(food=map_dict[record["value"]])
    return df

In [35]:
# schema: *, food:str
def map_letter_to_food2(df: List[Dict[str,Any]], map_dict:Dict) -> List[Dict[str,Any]]:
    """Fill in ``food`` on each row from ``map_dict`` and return the same list.

    Each row dict is modified in place; a letter missing from ``map_dict``
    raises ``KeyError``.
    """
    for letter_row in df:
        letter = letter_row["value"]
        letter_row["food"] = map_dict[letter]
    return df

In [37]:
# schema: *, food:str
def map_letter_to_food3(df: List[Dict[str,Any]], map_dict:Dict) -> Iterable[Dict[str,Any]]:
    """Lazily yield each input row with a ``food`` entry added.

    This is a generator: a row is only updated (in place) when it is pulled
    from the returned iterator. A letter missing from ``map_dict`` raises
    ``KeyError`` at that point.
    """
    for entry in df:
        letter = entry["value"]
        entry["food"] = map_dict[letter]
        yield entry

In [None]:
input_df = pd.DataFrame(
    {"id": [0, 1, 2], "value": ["A", "B", "C"]}
)
# No schema= argument is passed here.
# NOTE(review): presumably the schema comes from the "# schema:" comment above
# map_letter_to_food3 — confirm.
df = transform(
    input_df,
    map_letter_to_food3,
    params={"map_dict": map_dict},
)
df.head()

In [39]:
from fugue_notebook import setup
# NOTE(review): presumably registers the %%fsql cell magic used further down — confirm.
setup()

<IPython.core.display.Javascript object>

In [42]:
# schema: *, food:str
def map_letter_to_food3(df: List[Dict[str,Any]]) -> Iterable[Dict[str,Any]]:
    """Lazily yield each input row with a ``food`` entry added.

    This variant takes no ``map_dict`` parameter: the lookup table is read from
    the notebook-level ``map_dict`` variable each time a row is processed, so
    that variable must be defined before the generator is consumed.
    """
    for record in df:
        record.update(food=map_dict[record["value"]])
        yield record

In [41]:
# Rebuild the demo frame so it has only "id" and "value" columns.
input_df = pd.DataFrame(
    {
        "id": [0, 1, 2],
        "value": ["A", "B", "C"],
    }
)

In [56]:
import fugue_duckdb

In [59]:
%%fsql duckdb

-- Select every column of input_df, hand the result to map_letter_to_food3, then print.
-- NOTE(review): the transformer is referenced by name with no parameters, so it
-- presumably resolves to the Python function defined in this notebook — confirm.
SELECT *
  FROM input_df
TRANSFORM USING map_letter_to_food3
PRINT

  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))
  return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))


Unnamed: 0,id,value,food
0,0,A,Apple
1,1,B,Banana
2,2,C,Carrot


In [61]:
import ray
ray.init()

import modin.pandas as mpd
from typing import Dict

# Same three steps as the pandas version of this example, but the frame is built
# with the Modin constructor so the .map call below actually runs through Modin
# (the cell imported mpd but previously constructed a plain pandas frame).
input_df = mpd.DataFrame({"id":[0,1,2], "value": (["A", "B", "C"])})
map_dict = {"A": "Apple", "B": "Banana", "C": "Carrot"}
input_df["food"] = input_df["value"].map(map_dict)



In [1]:
import pandas as pd
import numpy as np

import os
os.environ["MODIN_ENGINE"] = "ray"

import pyspark.pandas as spd
import dask.dataframe as dd
import modin.pandas as mpd

from time import sleep



In [2]:
import ray
# Start Ray explicitly; the imports cell above sets MODIN_ENGINE to "ray".
ray.init()

RayContext(dashboard_url='', python_version='3.8.13', ray_version='1.12.0', ray_commit='f18fc31c7562990955556899090f8e8656b48d2d', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-04-20_15-55-15_740084_36346/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-04-20_15-55-15_740084_36346/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-04-20_15-55-15_740084_36346', 'metrics_export_port': 57648, 'gcs_address': '127.0.0.1:61584', 'address': '127.0.0.1:61584', 'node_id': '876ba53c487fa14ec5cad99efed54e3ad70bb8a6f61d8e52cc286234'})

In [3]:
# Fixed seed so a Restart & Run All regenerates exactly the same rows.
SEED = 42
rng = np.random.default_rng(SEED)

# Synthetic benchmark data: two categorical columns and two numeric columns.
n = 100000
df = pd.DataFrame({"cat": rng.choice(["a","b","c","d"], size=n),
                   "cat2": rng.choice(["z","y","x","w"], size=n),
                   "value1": rng.random(n),
                   "value2": rng.integers(1, 10, n)})  # integers in [1, 10), as randint(1,10,n) gave
# The same pandas frame is handed to each backend so the timing cells compare like with like.
mdf = mpd.DataFrame(df)
kdf = spd.from_pandas(df)
ddf = dd.from_pandas(df, npartitions=4)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/20 15:55:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
%%time
# pandas: sort descending on value1 and keep the first five rows.
df.sort_values(by="value1", ascending=False).head(n=5)

CPU times: user 22.8 ms, sys: 4.13 ms, total: 26.9 ms
Wall time: 25.2 ms


Unnamed: 0,cat,cat2,value1,value2
51671,a,z,0.999998,7
30292,a,y,0.999989,5
83409,b,z,0.999964,8
26765,d,w,0.999964,6
15804,d,x,0.999963,5


In [5]:
%%time
# Modin: same descending sort + first five rows, on the Modin copy of the data.
mdf.sort_values(by="value1", ascending=False).head(n=5)

Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


CPU times: user 76 ms, sys: 18.5 ms, total: 94.5 ms
Wall time: 286 ms


Unnamed: 0,cat,cat2,value1,value2
51671,a,z,0.999998,7
30292,a,y,0.999989,5
83409,b,z,0.999964,8
26765,d,w,0.999964,6
15804,d,x,0.999963,5


In [6]:
%%time
# pyspark.pandas: same descending sort + first five rows, on the Spark-backed copy.
kdf.sort_values(by="value1", ascending=False).head(n=5)

CPU times: user 11.7 ms, sys: 4.06 ms, total: 15.8 ms
Wall time: 326 ms


                                                                                

Unnamed: 0,cat,cat2,value1,value2
51671,a,z,0.999998,7
30292,a,y,0.999989,5
83409,b,z,0.999964,8
26765,d,w,0.999964,6
15804,d,x,0.999963,5


In [8]:
%%time
# pandas: rows ranked 6th-10th by value1 (positional slice of the sorted frame).
df.sort_values(by="value1", ascending=False).iloc[5:10]

CPU times: user 22.8 ms, sys: 3.5 ms, total: 26.3 ms
Wall time: 25.5 ms


Unnamed: 0,cat,cat2,value1,value2
5899,d,y,0.999951,4
28301,a,x,0.999948,9
39368,b,y,0.999939,3
25717,b,x,0.999937,8
55593,a,x,0.999932,6


In [None]:
# NOTE(review): placeholder chain — groupby() and apply() are called with no arguments
# and this cell has no recorded execution; supply the grouping key and function.
df.groupby().apply().reset_index()

In [9]:
%%time
# Modin: rows ranked 6th-10th by value1 (positional slice of the sorted frame).
mdf.sort_values(by="value1", ascending=False).iloc[5:10]

CPU times: user 64.2 ms, sys: 12.5 ms, total: 76.7 ms
Wall time: 150 ms




Unnamed: 0,cat,cat2,value1,value2
5899,d,y,0.999951,4
28301,a,x,0.999948,9
39368,b,y,0.999939,3
25717,b,x,0.999937,8
55593,a,x,0.999932,6


In [10]:
%%time
# pyspark.pandas: rows ranked 6th-10th by value1 (positional slice of the sorted frame).
kdf.sort_values(by="value1", ascending=False).iloc[5:10]

CPU times: user 15.6 ms, sys: 4.7 ms, total: 20.3 ms
Wall time: 568 ms


                                                                                

Unnamed: 0,cat,cat2,value1,value2
5899,d,y,0.999951,4
28301,a,x,0.999948,9
39368,b,y,0.999939,3
25717,b,x,0.999937,8
55593,a,x,0.999932,6


In [11]:
%%time
# The recorded run of this cell raised NotImplementedError: 'DataFrame.iloc' only
# supports selecting columns, so the positional row slice below is rejected for ddf.
# NOTE(review): presumably kept on purpose to contrast with the pandas / Modin /
# pyspark.pandas cells that run the same expression — confirm.
ddf.sort_values("value1", ascending=False).iloc[5:10]

NotImplementedError: 'DataFrame.iloc' only supports selecting columns. It must be used like 'df.iloc[:, column_indexer]'.

In [7]:
%%time
# NOTE(review): `a` is not assigned in any visible cell; the recorded output matches
# the top-five-by-value1 rows shown by the head(5) cells — confirm where it came from.
a

CPU times: user 243 ms, sys: 143 ms, total: 387 ms
Wall time: 300 ms


Unnamed: 0,cat,cat2,value1,value2
51671,a,z,0.999998,7
30292,a,y,0.999989,5
83409,b,z,0.999964,8
26765,d,w,0.999964,6
15804,d,x,0.999963,5


In [74]:
%%time
# pandas: largest value1 per cat, returned as a flat two-column frame.
df.groupby(by="cat")["value1"].max().reset_index()

CPU times: user 14.7 ms, sys: 3.34 ms, total: 18.1 ms
Wall time: 15.8 ms


Unnamed: 0,cat,value1
0,a,0.999829
1,b,0.999987
2,c,0.999993
3,d,0.999986


In [75]:
%%time
# Modin: largest value1 per cat, returned as a flat two-column frame.
mdf.groupby(by="cat")["value1"].max().reset_index()

CPU times: user 68.2 ms, sys: 21.6 ms, total: 89.8 ms
Wall time: 1.64 s


Unnamed: 0,cat,value1
0,a,0.999829
1,b,0.999987
2,c,0.999993
3,d,0.999986


In [76]:
%%time
# pyspark.pandas: largest value1 per cat, returned as a flat two-column frame.
kdf.groupby(by="cat")["value1"].max().reset_index()

CPU times: user 24.8 ms, sys: 6.55 ms, total: 31.4 ms
Wall time: 730 ms


22/04/20 15:51:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/20 15:51:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/20 15:51:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/20 15:51:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

Unnamed: 0,cat,value1
0,d,0.999986
1,c,0.999993
2,b,0.999987
3,a,0.999829


In [77]:
%time
ddf.groupby("cat")["value1"].max().compute()

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 11 µs


cat
a    0.999829
b    0.999987
c    0.999993
d    0.999986
Name: value1, dtype: float64

In [78]:
# pandas reference result (untimed): largest value1 per cat as a flat frame.
df.groupby(by="cat")["value1"].max().reset_index()

Unnamed: 0,cat,value1
0,a,0.999829
1,b,0.999987
2,c,0.999993
3,d,0.999986
