# PySpark training for data engineers
## 08. Deploying

### Goal
Explain the logical steps to go from notebooks to a script to be deployed on a Spark cluster.

### Highlights

1. Combine all the important parts in one cell
2. Restructure the code: imports first, then the functions, and finally the main function
3. Rename functions and variables where needed
4. Add docstrings and comments to the important steps of the main function
5. Create a script from the combined cell
6. Copy the script to the Spark cluster

### Combine the code
The first step is to combine the code from the previous notebooks into one cell, so that it can be executed in one go.

In [15]:
from pyspark import SparkConf, SparkContext
# Run Spark in local mode; getOrCreate reuses a context that is already active.
config = SparkConf().setMaster('local')
spark = SparkContext.getOrCreate(conf=config)

# Notebook 02
# wholeTextFiles yields one (path, content) pair per matching file.
csvrdd = spark.wholeTextFiles('file://///home/jovyan/*.csv')
# Notebook 03
# Break every file into lines. splitlines() copes with both "\n" and "\r\n"
# line endings, and the filter drops blank lines (such as the one left behind
# by a trailing newline) that would make processCSV fail with an IndexError.
csvrdd = csvrdd.flatMap(lambda x: x[1].splitlines())
csvrdd = csvrdd.filter(lambda line: line.strip())
# Notebook 04
from pyspark.sql import Row

def processCSV(row):
    # Keep the incoming line untouched and store the comma separated parts
    # under their own name
    fields = row.split(',')
    # Build the Row from the four leading fields, picked by position
    return Row(*(fields[position] for position in range(4)))

# Turn every CSV line into a Row. processCSV takes a single argument, so it
# can be handed to map() directly without a lambda around it.
csvrdd = csvrdd.map(processCSV)
# The csvrdd.collect() call from Notebook 04 is left out on purpose: its result
# was discarded here, yet it still pulled the complete dataset onto the driver.
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
# All four columns are declared as nullable strings.
schema = StructType([
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("age", StringType(), True)
])
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)
csvdf = sqlContext.createDataFrame(csvrdd, schema=schema)
# Notebook 05
from pyspark.sql.functions import udf
@udf('integer')
def calc_name_length(name):
    # A null column value reaches the UDF as None and len(None) raises a
    # TypeError, so nulls are passed through as null instead.
    if name is None:
        return None
    return len(name)
# Derive the extra column from the existing last_name column
name_length = calc_name_length('last_name')
csvdf = csvdf.withColumn('last_name_length', name_length)
# Notebook 07
# Store the result as JSON, replacing the output of an earlier run
json_writer = csvdf.write.mode('overwrite').format("json")
json_writer.save("csvdf.json")

In [3]:
ls *.json

part-00000-2247a872-6f05-4381-8253-f34f26be06e3-c000.json  _SUCCESS


### Create the script
Create a script by restructuring the cell: move the parts into a logical order, with the imports first, then the functions, and finally the statements that run the pipeline.

In [16]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import udf

@udf('integer')
def calc_name_length(input_var):
    """ Calculate the length of the input_var

    Returns None for a null (None) input instead of failing on len(None),
    so the UDF can be applied to nullable columns.
    """
    if input_var is None:
        return None
    return len(input_var)

def processCSV(row):
    """ Build a Spark Row from the first four fields of a CSV line """
    fields = row.split(',')
    return Row(*(fields[position] for position in range(4)))

# Init: local Spark context (getOrCreate reuses one that is already active)
config = SparkConf().setMaster('local')
spark = SparkContext.getOrCreate(conf=config)
sqlContext = SQLContext(spark)

# Read every CSV file as one (path, content) pair and break the content into
# lines. splitlines() copes with both "\n" and "\r\n" line endings, and the
# filter drops blank lines (such as the one left behind by a trailing newline)
# that would make processCSV fail with an IndexError.
csvrdd = spark.wholeTextFiles('file://///home/jovyan/*.csv')
csvrdd = csvrdd.flatMap(lambda x: x[1].splitlines())
csvrdd = csvrdd.filter(lambda line: line.strip())

# processCSV takes a single argument, so it is passed to map() directly.
# The former csvrdd.collect() call is left out on purpose: its result was
# discarded, yet it still pulled the complete dataset onto the driver.
csvrdd = csvrdd.map(processCSV)
# All four columns are declared as nullable strings.
schema = StructType([
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("age", StringType(), True)
])
csvdf = sqlContext.createDataFrame(csvrdd, schema=schema)
# Add the derived column and store the result as JSON, replacing old output
csvdf = csvdf.withColumn('last_name_length', calc_name_length('last_name'))
csvdf.write.mode('overwrite').save("csvdf.json", format="json")

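Clean up a little more by merging the imports and wrapping the pipeline in a `main()` function, with a comment on each step: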

In [17]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import udf

@udf('integer')
def calc_name_length(input_var):
    """ Calculate the length of the input_var

    Returns None for a null (None) input instead of failing on len(None),
    so the UDF can be applied to nullable columns.
    """
    if input_var is None:
        return None
    return len(input_var)

def processCSV(row):
    """ Build a Spark Row from the first four fields of a CSV line """
    fields = row.split(',')
    return Row(*(fields[position] for position in range(4)))

def main():
    """ Read the CSV files, add the last name length and write JSON output

    The files matching /home/jovyan/*.csv are parsed into a dataframe with
    the columns first_name, last_name, gender and age. The column
    last_name_length is added and the result is saved to csvdf.json,
    overwriting the output of an earlier run.
    """

    # Init: local Spark context (getOrCreate reuses one that is already active)
    config = SparkConf().setMaster('local')
    spark = SparkContext.getOrCreate(conf=config)
    sqlContext = SQLContext(spark)

    # Process input
    # wholeTextFiles yields one (path, content) pair per file
    csvrdd = spark.wholeTextFiles('file://///home/jovyan/*.csv')
    # splitlines() copes with both "\n" and "\r\n" line endings; the filter
    # drops blank lines (such as the one left behind by a trailing newline)
    # that would make processCSV fail with an IndexError
    csvrdd = csvrdd.flatMap(lambda x: x[1].splitlines())
    csvrdd = csvrdd.filter(lambda line: line.strip())
    # processCSV takes a single argument, so no lambda is needed around it
    csvrdd = csvrdd.map(processCSV)

    # Create dataframe (all four columns are nullable strings)
    schema = StructType([
        StructField("first_name", StringType(), True),
        StructField("last_name", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("age", StringType(), True)
    ])
    csvdf = sqlContext.createDataFrame(csvrdd, schema=schema)

    # Execute the custom functions
    csvdf = csvdf.withColumn('last_name_length', calc_name_length('last_name'))

    # Write the data
    csvdf.write.mode('overwrite').save("csvdf.json", format="json")

# Run the pipeline right away when the cell is executed
main()

### Create the file from the above cell by using the `%%file` cell magic

In [19]:
%%file spark_script.py
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import udf

@udf('integer')
def calc_length(input_var):
    """ Calculate the length of the input_var

    Returns None for a null (None) input instead of failing on len(None),
    so the UDF can be applied to nullable columns.
    """
    if input_var is None:
        return None
    return len(input_var)

def processCSV(row):
    """ Build a Spark Row from the first four fields of a CSV line """
    fields = row.split(',')
    return Row(*(fields[position] for position in range(4)))

def main():
    """ Read the CSV files, add the last name length and write JSON output

    The files matching /home/jovyan/*.csv are parsed into a dataframe with
    the columns first_name, last_name, gender and age. The column
    last_name_length is added and the result is saved to csvdf.json,
    overwriting the output of an earlier run.
    """

    # Init
    # No master is set here on purpose: a value set on the SparkConf takes
    # precedence over the --master flag of spark-submit, so hard-coding
    # 'local' would keep the job off the cluster. The master now comes from
    # spark-submit or spark-defaults.conf.
    config = SparkConf()
    spark = SparkContext.getOrCreate(conf=config)
    sqlContext = SQLContext(spark)

    # Process input
    # NOTE(review): a file:// path is read on the worker nodes, so on a real
    # cluster the CSV files must be available under this path on every
    # worker - confirm before deploying.
    csvrdd = spark.wholeTextFiles('file://///home/jovyan/*.csv')
    # splitlines() copes with both "\n" and "\r\n" line endings; the filter
    # drops blank lines (such as the one left behind by a trailing newline)
    # that would make processCSV fail with an IndexError
    csvrdd = csvrdd.flatMap(lambda x: x[1].splitlines())
    csvrdd = csvrdd.filter(lambda line: line.strip())
    # processCSV takes a single argument, so no lambda is needed around it
    csvrdd = csvrdd.map(processCSV)

    # Create dataframe (all four columns are nullable strings)
    schema = StructType([
        StructField("first_name", StringType(), True),
        StructField("last_name", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("age", StringType(), True)
    ])
    csvdf = sqlContext.createDataFrame(csvrdd, schema=schema)

    # Execute the custom functions
    csvdf = csvdf.withColumn('last_name_length', calc_length('last_name'))

    # Write the data
    csvdf.write.mode('overwrite').save("csvdf.json", format="json")

# Only run the pipeline when the file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()

Overwriting spark_script.py


The script can now be copied to the cluster and executed with:

```
$ spark-submit spark_script.py
```