In [3]:
## Bindings and imports to get a Spark Session in Jupyter/Python

# Execute the site-specific bindings script in this notebook's namespace.
# NOTE(review): presumably this makes pyspark importable / sets the Spark
# environment for the imports below -- confirm against the script itself.
bindings="/share/hadoop_custom/conf/spark/spark_2.2.0_bindings.py"
with open(bindings) as bindingsFile:  # close the handle instead of leaking it
    exec(bindingsFile.read())

from pyspark.sql import SparkSession
from pyspark import SparkConf

sconf = SparkConf()

# Where the application runs
sconf.set('spark.submit.deployMode','client')
sconf.set('spark.master','local')

# Resource settings
sconf.set('spark.shuffle.service.enabled',True)
sconf.set('spark.dynamicAllocation.enabled',False)
sconf.set('spark.executor.memory','1G')
sconf.set('spark.driver.memory','1G')
sconf.set('spark.executor.instances','4') # number of executors
sconf.set('spark.executor.cores','1')     # number of cores on same worker

app_name="NetCDFSpark"
sconf.set('spark.app.name',app_name)

# Build the session from the configuration above (or reuse an existing one)
spark = ( SparkSession
    .builder
    .config(conf=sconf)
    .getOrCreate()
)

<pyspark.conf.SparkConf at 0x7f7b9c38add0>

In [None]:
## Other imports
from snakebite.client import Client # snakebite is a python api for hdfs 
                                    # needs to be swapped with a Google bucket equivalent
    

In [None]:
# What does the raw data look like?

In [106]:
%%bash
# Print the raw contents of one of the input CSV files.
# NOTE(review): /mnt/hdfs is presumably a local mount of HDFS -- confirm.
cat /mnt/hdfs/tmp/1.csv

1,hei,200.0
2,ho,400.9
3,ha,43.23


In [107]:
# File paths to process, one entry per input file
myNetCdfFilePahts = [
    "/tmp/1.csv",
    "/tmp/2.csv",
    "/tmp/3.csv",
]

# numPartitions is the key knob here: it controls Spark's input parallelism.
# A single partition can process one or more files, provided they fit the
# size of the Spark container. The container size is typically controlled by
# configs such as 'spark.executor.memory' and 'spark.executor.cores'.
numPartitions = 3

# Distribute the paths over numPartitions partitions and peek at the result
inPathRDD = spark.sparkContext.parallelize(myNetCdfFilePahts, numSlices=numPartitions)
inPathRDD.take(3)

['/tmp/1.csv', '/tmp/2.csv', '/tmp/3.csv']

In [108]:
# Apply myDataReader() to every file path held in the RDD.
# In this example each element becomes a tuple:
# (fpath, list[dict/json with the imported data])
inPathDataRDD = inPathRDD.map(myDataReader)
inPathDataRDD.take(3)

[('/tmp/1.csv',
  [{'index': '1', 'name': 'hei', 'value': '200.0'},
   {'index': '2', 'name': 'ho', 'value': '400.9'},
   {'index': '3', 'name': 'ha', 'value': '43.23'}]),
 ('/tmp/2.csv',
  [{'index': '1', 'name': 'hei', 'value': '200.0'},
   {'index': '2', 'name': 'ho', 'value': '400.9'},
   {'index': '3', 'name': 'ha', 'value': '43.23'}]),
 ('/tmp/3.csv',
  [{'index': '1', 'name': 'hei', 'value': '200.0'},
   {'index': '2', 'name': 'ho', 'value': '400.9'},
   {'index': '3', 'name': 'ha', 'value': '43.23'}])]

In [109]:
# The link between data and file path is no longer needed from here on,
# so keep only the second element (the data) of each (fpath, data) tuple
inDataRDD = inPathDataRDD.values()
inDataRDD.take(3)

[[{'index': '1', 'name': 'hei', 'value': '200.0'},
  {'index': '2', 'name': 'ho', 'value': '400.9'},
  {'index': '3', 'name': 'ha', 'value': '43.23'}],
 [{'index': '1', 'name': 'hei', 'value': '200.0'},
  {'index': '2', 'name': 'ho', 'value': '400.9'},
  {'index': '3', 'name': 'ha', 'value': '43.23'}],
 [{'index': '1', 'name': 'hei', 'value': '200.0'},
  {'index': '2', 'name': 'ho', 'value': '400.9'},
  {'index': '3', 'name': 'ha', 'value': '43.23'}]]

In [110]:
# Flatten the per-file lists into a single RDD of records, then convert it
# to a DataFrame for easy writing
outDF = (
    inDataRDD
    .flatMap(lambda records: records)
    .toDF()
)

outDF.show()

+-----+----+-----+
|index|name|value|
+-----+----+-----+
|    1| hei|200.0|
|    2|  ho|400.9|
|    3|  ha|43.23|
|    1| hei|200.0|
|    2|  ho|400.9|
|    3|  ha|43.23|
|    1| hei|200.0|
|    2|  ho|400.9|
|    3|  ha|43.23|
+-----+----+-----+



In [None]:
# Write DF as Parquet (use .csv(...), .json(...), etc. on the writer for other formats).
# DataFrame.write is a property that returns a DataFrameWriter; it is not
# callable, so pick a mode and a format method on the writer instead.
outPath = "/tmp/netcdf_spark_out.parquet"  # TODO: point this at the real output location
outDF.write.mode("overwrite").parquet(outPath)  # "overwrite" keeps this cell re-runnable

In [111]:
# Custom data reader function. Swap this with NetCDF version
def myDataReader(fpath, namenodeHost='1.sherpa.client.sysedata.no', namenodePort=8020):
    """Read one CSV file from HDFS and return it together with its path.

    Each non-blank line of the file is split on ',' and its first three
    fields are stored, as strings, under the keys 'index', 'name' and 'value'.

    Args:
        fpath: HDFS path of the file to read.
        namenodeHost: Host passed to the snakebite Client.
        namenodePort: Port passed to the snakebite Client.

    Returns:
        A tuple (fpath, dataList) where dataList holds one dict per
        non-blank line of the file.

    Raises:
        IndexError: if a non-blank line has fewer than three
            comma-separated fields.
    """
    # I use the snakebite client to import hdfs csv files as text
    HDFSclient=Client(namenodeHost,namenodePort,use_trash=False)
    # Read the file that was asked for; this used to be a hard-coded
    # "/tmp/1.csv", so every path returned the contents of the same file.
    dataGenerator=HDFSclient.text([fpath])

    dataList = []

    # Each item from the generator is a chunk of text that can hold several lines
    for item in dataGenerator:
        for line in item.strip().splitlines():
            if not line.strip():
                continue  # blank line (or empty file): nothing to parse
            fields=line.split(',')
            dataList.append({'index': fields[0], 'name': fields[1], 'value': fields[2]})

    return (fpath,dataList)

In [6]:
# Stop Spark: shut down the session held in `spark` once all work is done
spark.stop()