In [1]:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from random import random
from operator import add

In [2]:
# Deployment switches read by the Spark configuration cell:
#   kubernetes - when true, the master is pointed at the in-cluster Kubernetes API
#                instead of local mode.
#   hdfs       - when true, the HDFS default-filesystem property is added as well.
kubernetes, hdfs = True, False

In [3]:
# Build the Spark configuration used to create the session.
# The baseline targets an in-process local master; the `kubernetes` and `hdfs`
# flags then layer cluster-specific settings on top of it.
sparkConf = SparkConf()
sparkConf.setAppName("spark")
sparkConf.setMaster("local[*]")

# Executor sizing: 7 executors, 2 cores and 512m of memory each; the driver
# is given 512m as well.
sparkConf.set("spark.executor.instances", "7")
sparkConf.set("spark.executor.cores", "2")
sparkConf.set("spark.driver.memory", "512m")
sparkConf.set("spark.executor.memory", "512m")

# NOTE(review): spark_home is assigned here but not read by any cell visible
# in this notebook -- confirm whether it is still needed.
spark_home = os.getenv('SPARK_HOME')
if kubernetes:
    spark_home = ''
    # Replace the local master with the in-cluster Kubernetes API endpoint.
    sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
    sparkConf.set("spark.kubernetes.pyspark.pythonVersion", "3")
    # Service account, container image and namespace used for the Spark pods.
    sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")
    sparkConf.set("spark.kubernetes.container.image", "jgckruger/spark-py:v3.0.1")
    sparkConf.set("spark.kubernetes.namespace", "spark")
    # Fixed driver port and host name.
    # NOTE(review): presumably this host is the Service fronting the notebook
    # pod so executors can connect back to the driver -- confirm against the
    # deployment manifests.
    sparkConf.set("spark.driver.port", "29413")
    sparkConf.set("spark.driver.host", "my-notebook-deployment.spark.svc.cluster.local")
    # Mount the persistent volume claim `pv-claim` at /data for both the
    # driver and the executors.
    sparkConf.set('spark.kubernetes.driver.volumes.persistentVolumeClaim.rwxpvc.options.claimName','pv-claim')
    sparkConf.set('spark.kubernetes.driver.volumes.persistentVolumeClaim.rwxpvc.mount.path','/data')
    sparkConf.set('spark.kubernetes.executor.volumes.persistentVolumeClaim.rwxpvc.options.claimName','pv-claim')
    sparkConf.set('spark.kubernetes.executor.volumes.persistentVolumeClaim.rwxpvc.mount.path','/data')
if hdfs:
    # Hadoop properties must carry the "spark.hadoop." prefix to be copied
    # into the Hadoop Configuration; a bare "fs.defaultFS" key on SparkConf
    # is not propagated, so the default filesystem would never be applied.
    sparkConf.set("spark.hadoop.fs.defaultFS", "hdfs://hadoop-hadoop-hdfs-nn.spark.svc.cluster.local:9000/")

In [4]:
# Create the Spark session from the configuration built above (or reuse an
# already-running one), then keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)
sc = spark.sparkContext

In [5]:
# Size of the Monte Carlo run: `partitions` is the number of RDD slices and
# `n` the total number of samples (100000 per slice).
partitions = 2
n = partitions * 100000

In [6]:
def f(_):
    """Draw one random point in the square [-1, 1) x [-1, 1) and test it.

    The argument is ignored; it only exists so the function can be mapped
    over a sequence of sample indices.

    Returns:
        1 if the point lies inside or on the unit circle, otherwise 0.
    """
    px = random() * 2 - 1
    py = random() * 2 - 1
    if px ** 2 + py ** 2 <= 1:
        return 1
    return 0

In [7]:
%%time
# Distribute the sample indices 1..n over `partitions` slices, map each index
# to a hit (1) or miss (0) with f, and sum the hits on the cluster.
count = (
    spark.sparkContext
    .parallelize(range(1, n + 1), partitions)
    .map(f)
    .reduce(add)
)
# hits / samples approximates pi / 4, hence the factor of 4.
print("Pi is roughly %f" % (4.0 * count / n))

Pi is roughly 3.141900
CPU times: user 5.25 ms, sys: 12.6 ms, total: 17.9 ms
Wall time: 2.21 s


In [8]:
# Stop the Spark session created earlier in this notebook now that the job is done.
spark.stop()