### A basic Python example of applying a map function to a collection.

In [None]:
# Title-case every word of a local list using plain Python.
x = ['one', 'two', 'three', 'four']
print([word.title() for word in x])

### To do this in Beam, turn the local collection into a PCollection and apply a Map PTransform to it.

In [None]:
import apache_beam as beam

# Build the pipeline one stage at a time; it runs when the `with` block exits.
with beam.Pipeline() as p:
    words = p | beam.Create(['one', 'two', 'three', 'four'])
    titled = words | beam.Map(str.title)
    # The final stage prints each element as a side effect.
    lines = titled | beam.Map(print)

# `lines` is the PCollection produced by the last stage, not a Python list.
print('lines = ', lines)


### Pull the Spark Docker image and start a Spark master container using the following commands

In [None]:
# Pull the Spark image, make sure the network exists, and start the master container.
# Re-running this cell is safe: an existing network is reused, and an existing
# "spark" container is started again instead of being created a second time.
! docker pull bitnami/spark && \
(docker network inspect spark_network >/dev/null 2>&1 || docker network create spark_network) && \
(docker start spark 2>/dev/null || docker run -d --name spark --network=spark_network -e SPARK_MODE=master bitnami/spark)


### Install pyspark.

In [None]:
import pip

def install(package):
    """Install ``package`` into the environment of the running kernel.

    pip is invoked as a subprocess of the current interpreter
    (``sys.executable -m pip install ...``) rather than through ``pip.main`` or
    ``pip._internal.main``, which are internal entry points and not a supported
    programmatic API.

    Args:
        package: Requirement specifier handed to ``pip install``
            (for example ``'pyspark'``).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status, so
            a failed installation is not silently ignored.
    """
    import subprocess
    import sys

    # sys.executable guarantees the package lands in the kernel's own environment.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

# Install PySpark, which the Spark cells below import.
install('pyspark')
        


### Initialize the Spark context variables.

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *

def initspark(appname = "Notebook", servername = "local[*]"):
    """Create (or reuse) the Spark entry points for this notebook.

    Args:
        appname: Application name set on the Spark configuration and session builder.
        servername: Spark master URL; the default ``"local[*]"`` runs Spark locally.

    Returns:
        A ``(sc, spark, conf)`` tuple: the SparkContext, a Hive-enabled
        SparkSession, and the SparkConf that was built from the arguments.

    Note:
        If a SparkContext is already running (for example because this cell was
        executed before), that context is returned and ``conf`` is not applied
        to it.
    """
    print ('initializing pyspark')
    conf = SparkConf().setAppName(appname).setMaster(servername)
    # getOrCreate reuses an active context, so re-running this cell does not
    # fail with "Cannot run multiple SparkContexts at once".
    sc = SparkContext.getOrCreate(conf=conf)
    spark = SparkSession.builder.appName(appname).enableHiveSupport().getOrCreate()
    # Keep notebook output readable by hiding INFO-level Spark logging.
    sc.setLogLevel("WARN")
    print ('pyspark initialized')
    return sc, spark, conf

sc, spark, conf = initspark()
print(sc, spark)

### Load a local Python list into a Spark RDD and apply a simple transformation.

In [None]:
# Turn the local list into an RDD and title-case each element with map().
rdd1 = sc.parallelize(['one', 'two', 'three', 'four']).map(str.title)
# collect() returns the transformed elements as a Python list for display.
rdd1.collect()