In [1]:
import pandas as pd
import numpy as np

import os
os.environ["MODIN_ENGINE"] = "ray"

import pyspark.pandas as spd
import dask.dataframe as dd
import modin.pandas as mpd

from time import sleep



In [2]:
import ray
# Start the local Ray runtime used by Modin. `ignore_reinit_error=True` makes
# this cell safe to re-run in a live kernel: a second plain `ray.init()` raises
# because Ray is already initialised.
ray.init(ignore_reinit_error=True)

RayContext(dashboard_url='', python_version='3.8.13', ray_version='1.12.0', ray_commit='f18fc31c7562990955556899090f8e8656b48d2d', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-04-20_11-13-03_883263_3365/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-04-20_11-13-03_883263_3365/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-04-20_11-13-03_883263_3365', 'metrics_export_port': 62342, 'gcs_address': '127.0.0.1:63194', 'address': '127.0.0.1:63194', 'node_id': '4ecf78ca128dd49611ddaaa893699e7ad6b52dfa16939bc2ddef8e59'})

## Partitions

### Ideal Partitioning Strategy
![Partitioning](https://blog.scottlogic.com/mdebeneducci/assets/Ideal-Partitioning.png)
### Skewed Partitions
![Skewed Partitions](https://blog.scottlogic.com/mdebeneducci/assets/Skewed-Partitions.png)
### Inefficient Scheduling
![Inefficient Scheduling](https://blog.scottlogic.com/mdebeneducci/assets/Inefficient-Scheduling.png)
### Data Shuffling
![Shuffle](https://blog.scottlogic.com/mdebeneducci/assets/Shuffle-Diagram.png)

## Creation

In [169]:
n = 100_000  # row count passed as the size of every generated column

In [170]:
# Draw the synthetic data from a seeded generator so the frame is identical on
# every run. Without a seed, re-running this cell silently produces a different
# `df` from the one the Modin / Spark / Dask copies were built from, and the
# backends then disagree with each other.
rng = np.random.default_rng(42)
df = pd.DataFrame({"cat": rng.choice(["a", "b", "c", "d"], size=n),    # 4-level category
                   "cat2": rng.choice(["z", "y", "x", "w"], size=n),   # second 4-level category
                   "value1": rng.random(n),                            # uniform floats in [0, 1)
                   "value2": rng.integers(1, 10, n)})                  # integers 1..9 (high is exclusive)

In [171]:
%%time
# Wrap the in-memory pandas frame in a Modin DataFrame (MODIN_ENGINE is set to "ray" in the import cell).
mdf = mpd.DataFrame(df)

CPU times: user 37.2 ms, sys: 6.62 ms, total: 43.9 ms
Wall time: 37.7 ms




In [6]:
%%time
# Convert the pandas frame into a pandas-on-Spark DataFrame (pyspark.pandas).
# NOTE(review): the wall time here presumably includes Spark session start-up — confirm before comparing.
kdf = spd.from_pandas(df)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/20 11:13:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


CPU times: user 784 ms, sys: 117 ms, total: 901 ms
Wall time: 24.7 s


In [31]:
%%time
# Build a Dask DataFrame from the pandas frame, split into 4 partitions.
ddf = dd.from_pandas(df, npartitions=4)

CPU times: user 23.3 ms, sys: 1.85 ms, total: 25.2 ms
Wall time: 23 ms


## Simple Groupby

In [8]:
%%time
# pandas baseline: maximum of value1 per "cat", with the group key turned back into a column.
df.groupby("cat")["value1"].max().reset_index()

CPU times: user 6.88 ms, sys: 2.72 ms, total: 9.6 ms
Wall time: 17.9 ms


Unnamed: 0,cat,value1
0,a,0.999986
1,b,0.999762
2,c,0.999943
3,d,0.999972


In [9]:
%%time
# Modin: same per-"cat" maximum of value1 as the pandas cell.
mdf.groupby("cat")["value1"].max().reset_index()

CPU times: user 62.8 ms, sys: 18.6 ms, total: 81.4 ms
Wall time: 214 ms


Unnamed: 0,cat,value1
0,a,0.999986
1,b,0.999762
2,c,0.999943
3,d,0.999972


In [10]:
%%time
# pandas-on-Spark: same per-"cat" maximum of value1 as the pandas cell.
kdf.groupby("cat")["value1"].max().reset_index()

CPU times: user 21.2 ms, sys: 6.51 ms, total: 27.8 ms
Wall time: 833 ms


22/04/20 11:13:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/20 11:13:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/20 11:13:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/20 11:13:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

Unnamed: 0,cat,value1
0,d,0.999972
1,c,0.999943
2,b,0.999762
3,a,0.999986


In [11]:
%time
ddf.groupby("cat")["value1"].max().compute()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 2.86 µs


cat
a    0.999986
b    0.999762
c    0.999943
d    0.999972
Name: value1, dtype: float64

## Complicated Groupby

In [None]:
def _test1():
    gp = df.groupby(["a","b","d"])["c"]
    print(self.to_local((gp.max()-gp.min()).mean()))

def _test2():
    gp = df.groupby(["a","b","d"]).agg({'c':["min", "max"]})
    print(self.to_local((gp[("c","max")]-gp[("c","min")]).mean()))

## Accessing Records

## Schema

In [133]:
# Print the pandas frame's column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   cat     100000 non-null  object 
 1   cat2    100000 non-null  object 
 2   value1  100000 non-null  float64
 3   value2  100000 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 3.1+ MB


In [134]:
def add_col(df):
    """Blank out ``value2`` for the group whose first ``cat`` is ``"c"``.

    Any other group is handed back untouched (the same object, not a copy).
    For the ``"c"`` group a new frame is returned in which ``value2`` is
    replaced by ``None``, so the column's dtype depends on which group the
    function sees.
    """
    first_category = df["cat"].iloc[0]
    if first_category != "c":
        return df
    return df.assign(value2=None)

In [135]:
# pandas: run add_col once per "cat" group and display the combined result.
df.groupby("cat").apply(add_col)

Unnamed: 0,cat,cat2,value1,value2
0,c,y,0.938560,
1,d,w,0.137667,8
2,d,y,0.146884,8
3,a,w,0.591771,5
4,c,w,0.987334,
...,...,...,...,...
99995,a,x,0.696163,9
99996,d,z,0.008613,8
99997,c,y,0.161956,
99998,c,w,0.906369,


In [136]:
# pandas: column dtypes of the combined result after the per-group apply.
df.groupby("cat").apply(add_col).dtypes

cat        object
cat2       object
value1    float64
value2     object
dtype: object

**Dask**

In [137]:
# Dask: same apply, but no meta= is passed and .compute() is never called —
# only the dtypes of the lazy result are read.
ddf.groupby("cat").apply(add_col).dtypes

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


cat        object
cat2       object
value1    float64
value2      int64
dtype: object

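Without an explicit `meta=`, Dask infers the output schema by running the function on a small dummy frame rather than on the real data, so the `"c"` branch is never exercised and `value2` is reported as `int64`. See https://stackoverflow.com/a/54596197/11163214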

In [138]:
# Rebuild the Dask frame from the current pandas df, again with 4 partitions.
ddf = dd.from_pandas(df, npartitions=4)

In [111]:
def add_col_2(df):
    """Same as blanking ``value2`` for the ``"c"`` group, but noisy.

    Prints the ``cat`` column of whatever frame it receives before deciding,
    which makes it visible what data the caller actually passes in. Groups
    whose first ``cat`` is not ``"c"`` are returned unchanged (same object).
    """
    categories = df["cat"]
    print(categories)
    starts_with_c = df["cat"].iloc[0] == "c"
    return df.assign(value2=None) if starts_with_c else df

In [146]:
# Dask: use the printing variant to reveal what frame the function receives
# when only .dtypes is requested (no .compute() is called here).
ddf.groupby("cat").apply(add_col_2).dtypes

0    foo
1    foo
Name: cat, dtype: object


  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


cat        object
cat2       object
value1    float64
value2      int64
dtype: object

In [142]:
# Rebuild the Dask frame from the pandas df, this time with 2 partitions.
ddf = dd.from_pandas(df, npartitions=2)

In [156]:
def add_col_3(df):
    """Add a ``value3`` column whose type depends on the group.

    Groups whose first ``cat`` is ``"c"`` sleep for 5 seconds and then get the
    string ``"a"``; every other group gets the integer ``2`` immediately. The
    sleep makes it observable (through wall time) whether the function was
    really executed on the ``"c"`` group.
    """
    if df["cat"].iloc[0] != "c":
        return df.assign(value3=2)
    sleep(5)
    return df.assign(value3="a")

In [157]:
%%time
# Dask: time reading .dtypes only — .compute() is not called in this cell.
ddf.groupby("cat").apply(add_col_3).dtypes

CPU times: user 24.6 ms, sys: 1.96 ms, total: 26.5 ms
Wall time: 25.1 ms


  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


cat        object
cat2       object
value1    float64
value2      int64
value3      int64
dtype: object

In [159]:
%%time
ddf.groupby("cat").apply(add_col_3).compute()

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


CPU times: user 164 ms, sys: 23.6 ms, total: 187 ms
Wall time: 5.13 s


cat        object
cat2       object
value1    float64
value2      int64
value3     object
dtype: object

## Sort and Top

**Sort values and head**

In [185]:
%%time
# pandas baseline: full descending sort on value1, then keep the first 5 rows.
df.sort_values("value1", ascending=False).head(5)

CPU times: user 28.6 ms, sys: 3.26 ms, total: 31.8 ms
Wall time: 28.6 ms


Unnamed: 0,cat,cat2,value1,value2
5441,d,w,0.999995,8
37740,c,y,0.999994,2
24114,c,y,0.999992,2
50414,c,x,0.99997,7
65478,c,w,0.999964,3


In [186]:
%%time
# Modin: same descending sort on value1 followed by head(5).
mdf.sort_values("value1", ascending=False).head(5)

CPU times: user 69.3 ms, sys: 11.1 ms, total: 80.3 ms
Wall time: 121 ms




Unnamed: 0,cat,cat2,value1,value2
5441,d,w,0.999995,8
37740,c,y,0.999994,2
24114,c,y,0.999992,2
50414,c,x,0.99997,7
65478,c,w,0.999964,3


In [187]:
%%time
# pandas-on-Spark: same descending sort on value1 followed by head(5).
kdf.sort_values("value1", ascending=False).head(5)

CPU times: user 10.3 ms, sys: 3.79 ms, total: 14.1 ms
Wall time: 381 ms


[Stage 3:>                                                          (0 + 8) / 8]                                                                                

Unnamed: 0,cat,cat2,value1,value2
81855,a,y,0.999986,3
17215,a,z,0.999985,9
98971,d,x,0.999972,6
11381,a,w,0.999967,1
25879,d,w,0.999945,5


In [188]:
%%time
# Dask: same descending sort on value1 followed by head(5); no explicit .compute() is used here.
ddf.sort_values("value1", ascending=False).head(5)

CPU times: user 87.8 ms, sys: 17.8 ms, total: 106 ms
Wall time: 96.3 ms


Unnamed: 0,cat,cat2,value1,value2
9135,b,w,0.999996,7
73093,d,x,0.999992,2
86690,c,x,0.999951,2
3629,d,x,0.999941,7
46830,b,y,0.99992,9


**nlargest**

In [194]:
%%time
# pandas baseline: the 5 rows with the largest value1, via nlargest instead of sort + head.
df.nlargest(5,columns=["value1"])

CPU times: user 9.17 ms, sys: 2.69 ms, total: 11.9 ms
Wall time: 10.6 ms


Unnamed: 0,cat,cat2,value1,value2
5441,d,w,0.999995,8
37740,c,y,0.999994,2
24114,c,y,0.999992,2
50414,c,x,0.99997,7
65478,c,w,0.999964,3


In [195]:
%%time
# Modin: the 5 rows with the largest value1 via nlargest.
mdf.nlargest(5,columns=["value1"])

CPU times: user 26.7 ms, sys: 6.48 ms, total: 33.2 ms
Wall time: 52 ms


Unnamed: 0,cat,cat2,value1,value2
5441,d,w,0.999995,8
37740,c,y,0.999994,2
24114,c,y,0.999992,2
50414,c,x,0.99997,7
65478,c,w,0.999964,3


In [196]:
%%time
# pandas-on-Spark: the 5 rows with the largest value1 via nlargest.
kdf.nlargest(5,columns=["value1"])

CPU times: user 8.84 ms, sys: 2.71 ms, total: 11.5 ms
Wall time: 179 ms


Unnamed: 0,cat,cat2,value1,value2
81855,a,y,0.999986,3
17215,a,z,0.999985,9
98971,d,x,0.999972,6
11381,a,w,0.999967,1
25879,d,w,0.999945,5


In [198]:
%%time
# Dask: the 5 rows with the largest value1 via nlargest; .compute() materialises the result.
ddf.nlargest(5,columns=["value1"]).compute()

CPU times: user 24.3 ms, sys: 3.97 ms, total: 28.2 ms
Wall time: 22.6 ms


Unnamed: 0,cat,cat2,value1,value2
9135,b,w,0.999996,7
73093,d,x,0.999992,2
86690,c,x,0.999951,2
3629,d,x,0.999941,7
46830,b,y,0.99992,9


**sort_values but drop the top**

In [200]:
%%time
# pandas baseline: sort descending on value1 and take positions 5..9, skipping the top five rows.
df.sort_values("value1", ascending=False).iloc[5:10]

CPU times: user 22.1 ms, sys: 2.2 ms, total: 24.3 ms
Wall time: 22.7 ms


Unnamed: 0,cat,cat2,value1,value2
97601,c,x,0.999947,8
75682,d,w,0.999937,2
41421,c,x,0.999928,9
40270,d,z,0.999925,7
80337,a,x,0.999925,9


In [201]:
%%time
# Modin: sort descending on value1 and take positions 5..9.
mdf.sort_values("value1", ascending=False).iloc[5:10]

CPU times: user 68.1 ms, sys: 11.7 ms, total: 79.8 ms
Wall time: 118 ms




Unnamed: 0,cat,cat2,value1,value2
97601,c,x,0.999947,8
75682,d,w,0.999937,2
41421,c,x,0.999928,9
40270,d,z,0.999925,7
80337,a,x,0.999925,9


In [202]:
%%time
# pandas-on-Spark: sort descending on value1 and take positions 5..9.
kdf.sort_values("value1", ascending=False).iloc[5:10]

CPU times: user 14 ms, sys: 4.31 ms, total: 18.3 ms
Wall time: 416 ms


                                                                                

Unnamed: 0,cat,cat2,value1,value2
17264,c,x,0.999943,1
99276,a,y,0.999929,4
7234,d,w,0.999928,4
91397,a,y,0.999926,7
17364,c,y,0.999922,2


In [204]:
%%time
ddf.sort_values("value1", ascending=False).iloc[5:10]

NotImplementedError: 'DataFrame.iloc' only supports selecting columns. It must be used like 'df.iloc[:, column_indexer]'.

## 