In [1]:
import os

import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, monotonically_increasing_id
from sklearn import datasets

# Spark - Debugging and Performance

## Partitioning - In Memory

In [2]:
# Load the iris measurements into pandas and attach a readable class label.
iris = datasets.load_iris()
cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
class_names = {0: "setosa", 1: "versicolor", 2: "virginica"}
iris_df = pd.DataFrame(iris.data, columns=cols).assign(
    **{"class": pd.Series(iris.target).map(class_names)}
)
iris_df.shape

(150, 5)

In [3]:
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Start (or reuse) a Spark session without any explicit configuration.
spark = SparkSession.builder.getOrCreate()
# Turn the pandas frame into a Spark DataFrame and check how many in-memory
# partitions it was split into.
df = spark.createDataFrame(iris_df)
df.rdd.getNumPartitions()

12

In [5]:
# spark.default.parallelism is read when the SparkContext starts, and
# getOrCreate() hands back an already-running session instead of building a new
# one. Stop whatever session is live first so the setting below actually takes
# effect when the notebook is run top to bottom on a fresh kernel.
SparkSession.builder.getOrCreate().stop()

spark = (SparkSession.builder
         .config("spark.default.parallelism", 6)
         .getOrCreate())
# The DataFrame must be re-created: the previous one belonged to the stopped session.
df = spark.createDataFrame(iris_df)
df.rdd.getNumPartitions()

6

### Checking config

In [6]:
spark.sparkContext.getConf().getAll()

[('spark.default.parallelism', '6'),
 ('spark.driver.host', '192.168.1.164'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.id', 'local-1613681321121'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.port', '34273'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'pyspark-shell')]

### Explicitly Repartitioning

In [7]:
# Ask for an explicit number of partitions; the count reported is exactly that number.
df.repartition(20).rdd.getNumPartitions()

20

In [8]:
# Repartition by column only: no count is given here, so the number of
# partitions comes from the shuffle setting (spark.sql.shuffle.partitions,
# 200 unless overridden).
df.repartition("class").rdd.getNumPartitions()

200

In [9]:
# Count and column together: rows are hash-partitioned on "class" into 10 partitions.
df.repartition(10, "class").rdd.getNumPartitions()

10

In [10]:
# Row count of each of the 10 partitions: glom() turns every partition into a
# list, and len() of that list is the partition's size.
df.repartition(10, "class").rdd.glom().map(len).collect()

[0, 50, 0, 0, 50, 0, 0, 0, 50, 0]

In [11]:
# Sorting shuffles the data, so the partition count changes.
# NOTE(review): the count below is presumably lower than
# spark.sql.shuffle.partitions because the partition boundaries are sampled
# from the few distinct petal_length values -- confirm.
df.sort("petal_length").rdd.getNumPartitions()

44

### Input Files

In [12]:
def save_parts(fp_format, df, num_parts):
    """Split a pandas DataFrame into consecutive row chunks and save each as a CSV.

    Parameters
    ----------
    fp_format : str
        Output path template containing one ``{}`` placeholder that is filled
        with the zero-based part number, e.g. ``"./3_parts/iris_part{}.csv"``.
        The parent directory is created when the template has one.
    df : pandas.DataFrame
        Frame to split; row order is preserved across the parts.
    num_parts : int
        Number of files to write. Parts are as equal in size as possible,
        with the earlier parts receiving the extra rows.

    Raises
    ------
    ValueError
        If ``num_parts`` is not positive (raised by ``np.array_split``).
    """
    # Split row *positions* rather than the frame itself: np.array_split on a
    # DataFrame goes through a deprecated pandas code path.
    row_chunks = np.array_split(np.arange(len(df)), num_parts)

    # A bare file name has no directory part, and os.makedirs("") would raise.
    out_dir = os.path.dirname(fp_format)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for part_n, rows in enumerate(row_chunks):
        df.iloc[rows].to_csv(fp_format.format(part_n), index=False)

In [13]:
# Write iris as 3 CSV files, then read them all back through a single glob to
# see how the input files map to in-memory partitions.
save_parts("./3_parts/iris_part{}.csv", iris_df, 3)

session_builder = SparkSession.builder.config("spark.default.parallelism", 6)
spark = session_builder.getOrCreate()

csv_reader = spark.read.option("header", "true")
df = csv_reader.csv("./3_parts/iris_part*.csv")
df.rdd.getNumPartitions()

3

In [14]:
# Same experiment with 20 small CSV files: the number of partitions after the
# read is not simply the number of files.
save_parts("./20_parts/iris_part{}.csv", iris_df, 20)

session_builder = SparkSession.builder.config("spark.default.parallelism", 6)
spark = session_builder.getOrCreate()

csv_reader = spark.read.option("header", "true")
df = csv_reader.csv("./20_parts/iris_part*.csv")
df.rdd.getNumPartitions()

5

## Partitioning - On Disk

- On disk, partitions are represented in the directory structure using the `{partition_name}={value}` format

In [15]:
# Plain write with no partitionBy: all part files land directly in the target
# directory, replacing any previous contents (mode="overwrite").
df.write.parquet("./not_partitioned_df/", mode="overwrite", compression="snappy")

In [16]:
!tree ./not_partitioned_df/

[01;34m./not_partitioned_df/[00m
├── part-00000-ac7d9d1b-176a-4ac8-a62a-12cc29f0f9c1-c000.snappy.parquet
├── part-00001-ac7d9d1b-176a-4ac8-a62a-12cc29f0f9c1-c000.snappy.parquet
├── part-00002-ac7d9d1b-176a-4ac8-a62a-12cc29f0f9c1-c000.snappy.parquet
├── part-00003-ac7d9d1b-176a-4ac8-a62a-12cc29f0f9c1-c000.snappy.parquet
├── part-00004-ac7d9d1b-176a-4ac8-a62a-12cc29f0f9c1-c000.snappy.parquet
└── _SUCCESS

0 directories, 6 files


In [17]:
# partitionBy("class") writes one class=<value> sub-directory per distinct
# value; an in-memory partition writes a file into each sub-directory for
# which it holds rows.
df.write.partitionBy("class").parquet("./partitioned_df/", mode="overwrite", compression="snappy")

In [18]:
!tree ./partitioned_df/

[01;34m./partitioned_df/[00m
├── [01;34mclass=setosa[00m
│   ├── part-00000-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
│   ├── part-00001-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
│   ├── part-00002-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
│   └── part-00003-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
├── [01;34mclass=versicolor[00m
│   ├── part-00000-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
│   └── part-00001-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
├── [01;34mclass=virginica[00m
│   ├── part-00001-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
│   ├── part-00003-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
│   └── part-00004-3a42591a-a849-4b20-af29-ca91397873f1.c000.snappy.parquet
└── _SUCCESS

3 directories, 10 files


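- Why bother with these partitions? A filter on the partition column lets Spark skip whole directories instead of scanning every file (compare `PartitionFilters` in the two plans below).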

In [21]:
# Unpartitioned data: the class predicate shows up under PushedFilters and as a
# Filter step in the plan, while PartitionFilters stays empty.
spark.read.parquet("./not_partitioned_df").filter("class = 'setosa'").explain()

== Physical Plan ==
*(1) Project [sepal_length#110, sepal_width#111, petal_length#112, petal_width#113, class#114]
+- *(1) Filter (isnotnull(class#114) && (class#114 = setosa))
   +- *(1) FileScan parquet [sepal_length#110,sepal_width#111,petal_length#112,petal_width#113,class#114] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/joel/repos/pyspark-pres/not_partitioned_df], PartitionFilters: [], PushedFilters: [IsNotNull(class), EqualTo(class,setosa)], ReadSchema: struct<sepal_length:string,sepal_width:string,petal_length:string,petal_width:string,class:string>


In [22]:
# Partitioned data: the same predicate becomes a PartitionFilter, and the plan
# reports PartitionCount: 1, i.e. a single class directory is scanned.
spark.read.parquet("./partitioned_df").filter("class = 'setosa'").explain()

== Physical Plan ==
*(1) FileScan parquet [sepal_length#120,sepal_width#121,petal_length#122,petal_width#123,class#124] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/joel/repos/pyspark-pres/partitioned_df], PartitionCount: 1, PartitionFilters: [isnotnull(class#124), (class#124 = setosa)], PushedFilters: [], ReadSchema: struct<sepal_length:string,sepal_width:string,petal_length:string,petal_width:string>


- Why does the partitioned output contain 9 data files when the unpartitioned output has only 5?

In [23]:
df.rdd.getNumPartitions()

5

### Partitions in Memory -> Files on Disk

- How do we get the highest number of files? Spread the rows of every class across every in-memory partition.

In [25]:
# Additionally set spark.sql.shuffle.partitions to 5, so shuffle-producing
# operations such as the sort below yield 5 partitions.
spark = (SparkSession.builder
         .config("spark.default.parallelism", 6)
         .config("spark.sql.shuffle.partitions", 5)
         .getOrCreate())
df = spark.createDataFrame(iris_df)
# Sorting on a seeded random column scatters the rows in a repeatable way.
shuffled_df = df.sort(rand(seed=1))
shuffled_df.rdd.getNumPartitions()

5

In [26]:
# Each in-memory partition writes one file into every class directory it has
# rows for; after the random shuffle that is presumably all of them.
shuffled_df.write.partitionBy("class").parquet("./shuffled_df/", mode="overwrite", compression="snappy")

In [28]:
!tree shuffled_df

[01;34mshuffled_df[00m
├── [01;34mclass=setosa[00m
│   ├── part-00000-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   ├── part-00001-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   ├── part-00002-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   ├── part-00003-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   └── part-00004-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
├── [01;34mclass=versicolor[00m
│   ├── part-00000-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   ├── part-00001-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   ├── part-00002-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   ├── part-00003-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   └── part-00004-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
├── [01;34mclass=virginica[00m
│   ├── part-00000-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
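│   ├── part-00001-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   ├── part-00002-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   ├── part-00003-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
│   └── part-00004-aa6a5067-7f36-4e32-a005-4bdba0dce652.c000.snappy.parquet
└── _SUCCESS

3 directories, 16 files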

- How do we get the lowest number of files? Keep all rows of each class inside a single in-memory partition.

In [29]:
# Sort by the column that will be used for partitionBy, so rows of the same
# class end up next to each other.
sorted_df = df.sort("class")
sorted_df.rdd.getNumPartitions()

4

In [30]:
# Write the class-sorted frame partitioned by class; compare the file count
# with the shuffled output.
sorted_df.write.partitionBy("class").parquet("./sorted_df/", mode="overwrite", compression="snappy")

In [31]:
!tree ./sorted_df/

[01;34m./sorted_df/[00m
├── [01;34mclass=setosa[00m
│   └── part-00000-8bcd65af-f831-4997-bffe-3786a4c03160.c000.snappy.parquet
├── [01;34mclass=versicolor[00m
│   └── part-00001-8bcd65af-f831-4997-bffe-3786a4c03160.c000.snappy.parquet
├── [01;34mclass=virginica[00m
│   └── part-00002-8bcd65af-f831-4997-bffe-3786a4c03160.c000.snappy.parquet
└── _SUCCESS

3 directories, 4 files


In [32]:
# Row count per in-memory partition of the sorted frame: glom() collects each
# partition into a list and len() gives its size.
sorted_df.rdd.glom().map(len).collect()

[50, 50, 50, 0]

## Shuffle

- regrouping of data among partitions
- expensive operation

![mapreduce](https://d1jnx9ba8s6j9r.cloudfront.net/blog/wp-content/uploads/2016/11/MapReduce-Way-MapReduce-Tutorial-Edureka-768x339.png)

Image from https://www.edureka.co/blog/mapreduce-tutorial/

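- triggered by operations that need to bring related rows together: joins, repartitioning, sorting, grouping, reducing
- depending on the operation, rows are assigned to partitions either by sorted ranges (range partitioning) or by a hash of the key (hash partitioning)
- use the `explain` method to see the shuffle, which appears as an `Exchange` step in the physical plan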

In [33]:
# Print the physical plan; the Exchange step is the shuffle introduced by sort().
sorted_df.explain()

== Physical Plan ==
*(1) Sort [class#145 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(class#145 ASC NULLS FIRST, 5)
   +- Scan ExistingRDD[sepal_length#141,sepal_width#142,petal_length#143,petal_width#144,class#145]


## Debugging

- it is usually worth experimenting in a notebook to better understand the situation and possible fixes
- start working locally, then move to a small cluster, then a bigger one
- it is cheaper and faster to catch problems at those earlier steps, although not every problem can be caught there

### EMR

- cluster summary
- S3 logs for each node (bootstrap, steps, etc)
- Spark history server
- YARN application history