In [1]:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
# Create Spark config for our Kubernetes based cluster manager.
# The master URL and application name use their dedicated setters; every other
# option is passed as a plain key/value pair below.
sparkConf = SparkConf()
sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
sparkConf.setAppName("spark")
sparkConf.setAll([
    # Kubernetes: executor image, namespace, and the service accounts used.
    ("spark.kubernetes.container.image", "jgckruger/spark-py:v3.0.1"),
    ("spark.kubernetes.namespace", "spark"),
    ("spark.kubernetes.pyspark.pythonVersion", "3"),
    ("spark.kubernetes.authenticate.driver.serviceAccountName", "spark"),
    ("spark.kubernetes.authenticate.serviceAccountName", "spark"),
    # Executor / driver sizing.
    ("spark.executor.instances", "7"),
    ("spark.executor.cores", "2"),
    ("spark.driver.memory", "512m"),
    ("spark.executor.memory", "512m"),
    # Fixed driver port and host name that the executors connect back to.
    ("spark.driver.port", "29413"),
    ("spark.driver.host", "my-notebook-deployment.spark.svc.cluster.local"),
    # Hadoop options are only forwarded to Spark's Hadoop Configuration when
    # they carry the "spark.hadoop." prefix; a bare "fs.defaultFS" key is
    # stored on the SparkConf but never reaches Hadoop.
    ("spark.hadoop.fs.defaultFS", "hdfs://hadoop-hadoop-hdfs-nn.spark.svc.cluster.local:9000/"),
])

<pyspark.conf.SparkConf at 0x7fc1b45c7690>

In [4]:
# Start the Spark session from the configuration assembled above (or reuse an
# already running one), then keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)
sc = spark.sparkContext

In [5]:
from pyspark.sql.functions import randn, round as roundNum

NUM_NOISE_COLS = 10  # how many 'colN' columns of Gaussian noise to generate
SEED = 42            # base seed so a re-run can reproduce the same numbers
                     # (Spark's seeded randn still depends on partitioning)

data = [(i, i) for i in range(10)]  # ten placeholder rows; only the 'id' field is kept

columns = ['id', 'txt']    # add your columns label here

# Build all noise columns up front and attach them with a single select().
# Calling withColumn() in a loop adds one projection per call and can produce
# very large query plans. Each column gets its own seed: sharing one seed
# would make every column identical.
noise_cols = [
    roundNum(randn(seed=SEED + i), 3).alias('col' + str(i))
    for i in range(NUM_NOISE_COLS)
]

# Selecting 'id' explicitly discards the unused 'txt' column.
df = spark.createDataFrame(data, columns).select('id', *noise_cols)
df.show()

+---+------+------+------+------+------+------+------+------+------+------+
| id|  col0|  col1|  col2|  col3|  col4|  col5|  col6|  col7|  col8|  col9|
+---+------+------+------+------+------+------+------+------+------+------+
|  0| 0.429| 0.198| 1.331| 0.368|-0.402| 0.107| 0.133|  0.44| 0.587|-0.909|
|  1| 1.033|-0.343|-0.267| -0.37|-1.241|-0.067| 0.313| 0.053| 0.497| -2.13|
|  2|-1.448| 0.845|-0.885| 0.471| 0.731| 0.281|  0.96|-1.771| 0.265|-0.464|
|  3|-1.707|-0.633| 0.727|-0.943| 0.718| 2.427| 1.089|-0.185| 0.449|-0.574|
|  4| 0.331| 1.079| 1.091| 0.339| -0.61| 0.011|-1.149| 1.895|-0.198|-0.242|
|  5|-0.565|-0.538| 0.615| 0.882| 0.216|-0.572| 1.147| 0.725| 0.559|-0.025|
|  6| 0.597|-1.093| 0.907| 1.164| 0.951|-0.076|-0.629| -1.03| 0.691|-0.365|
|  7| 0.336| 0.441| 1.319| 1.318|  1.46| 1.212| 0.248| 0.209|-0.676| 0.505|
|  8| 0.707| 0.435|-1.019| 0.347| 1.556|-0.406| 0.104| 0.483| 1.636| 0.964|
|  9| 0.414| 0.476|-0.409|-0.626| 0.525| 0.567|-0.634|-0.777|-1.363|-0.437|
+---+------+------+------+------+------+------+------+------+------+------+

In [None]:
# Look up the JVM-side classes through the Py4J gateway of the running
# SparkContext.
jvm = sc._gateway.jvm
URI = jvm.java.net.URI
Path = jvm.org.apache.hadoop.fs.Path
FileSystem = jvm.org.apache.hadoop.fs.FileSystem
Configuration = jvm.org.apache.hadoop.conf.Configuration

# Open the HDFS endpoint by explicit URI, using a freshly constructed Hadoop
# Configuration object.
hdfs_uri = URI("hdfs://hadoop-hadoop-hdfs-nn.spark.svc.cluster.local:9000")
fs = FileSystem.get(hdfs_uri, Configuration())

# Print the path of every entry found directly under the filesystem root.
status = fs.listStatus(Path('/'))
for entry in status:
    print(entry.getPath())