### Install a Spark docker using the following commands

In [3]:
! docker pull bitnami/spark && \
docker network create spark_network && \
docker run -d --name spark --network=spark_network -e SPARK_MODE=master bitnami/spark
print('Done')

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Using default tag: latest
latest: Pulling from bitnami/spark

[1Bbf144e3f: Pulling fs layer 
[1Bdfdbf03f: Pulling fs layer 
[1B6a57d646: Pulling fs layer 
[1Ba0a848e1: Pulling fs layer 
[1Bc8759024: Pulling fs layer 
[1B3c864c1b: Pulling fs layer 
[1B9721cb00: Pulling fs layer 
[1Baeb7c3eb: Pulling fs layer 
[1Ba2bd7f89: Pulling fs layer 
[1Bd73ad102: Pull complete 2.6MB/452.6MBB[8A[2K[10A[2K[8A[2K[10A[2K[10A[2K[7A[2K[7A[2K[6A[2K[7A[2K[6A[2K[5A[2K[10A[2K[5A[2K[6A[2K[5A[2K[10A[2K[5A[2K[6A[2K[5A[2K[6A[2K[5A[2K[6A[2K[10A[2K[6A[2K[10A[2K[6A[2K[10A[2K[6A[2K[10A[2K[5A[2K[10A[2K[5A[2K[10A[2K[5A[2K[6A[2K[10A[2K[5A[2K[1A[2K[5A[2K[5A[2K[10A[2K[5A[2K[10A[2K[1A[2K[10A[2K[1A[2K[1A[2K[1A[2K[1A[2K[9A[2K[1A[2K[1A[2K[8A[2K[1A[2K[8A[2K[1A[2K[8A[2K[5A[2K[8A[2K[5A[2K[8A[2K[1A[2

### Install pyspark.

In [5]:
import pip

def install(package):
    if hasattr(pip, 'main'):
        pip.main(['install', package])
    else:
        pip._internal.main(['install', package])

install('pyspark')
        
print('Done')

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Collecting pyspark
  Using cached pyspark-3.2.0.tar.gz (281.3 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting py4j==0.10.9.2
  Using cached py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
Using legacy 'setup.py install' for pyspark, since package 'wheel' is not installed.
Installing collected packages: py4j, pyspark
    Running setup.py install for pyspark: started
    Running setup.py install for pyspark: finished with status 'done'
Successfully installed py4j-0.10.9.2 pyspark-3.2.0
Done


### Initialize the Spark context variables.

In [6]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *

def initspark(appname = "Notebook", servername = "local[*]"):
    print ('initializing pyspark')
    conf = SparkConf().setAppName(appname).setMaster(servername)
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.appName(appname).enableHiveSupport().getOrCreate()
    sc.setLogLevel("WARN")
    print ('pyspark initialized')
    return sc, spark, conf

sc, spark, conf = initspark()
print(sc, spark)

initializing pyspark
pyspark initialized
<SparkContext master=local[*] appName=Notebook> <pyspark.sql.session.SparkSession object at 0x7f1fa2c3dad0>


### Initialize helper functions to run Java inside cells.

In [12]:
# https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/get-started/try-apache-beam-java.ipynb#scrollTo=CgTXBdTsBn1F
# Run and print a shell command.
def run(cmd):
  print('>> {}'.format(cmd))
  !{cmd}  # This is magic to run 'cmd' in the shell.
  print('')


In [13]:
import os

# Download the gradle source.
gradle_version = 'gradle-5.0'
gradle_path = f"/opt/{gradle_version}"
if not os.path.exists(gradle_path):
  run(f"wget -q -nc -O gradle.zip https://services.gradle.org/distributions/{gradle_version}-bin.zip")
  run('unzip -q -d /opt gradle.zip')
  run('rm -f gradle.zip')

# We're choosing to use the absolute path instead of adding it to the $PATH environment variable.
def gradle(args):
  run(f"{gradle_path}/bin/gradle --console=plain {args}")

gradle('-v')
print('Done')


>> /opt/gradle-5.0/bin/gradle --console=plain -v
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[m
------------------------------------------------------------
Gradle 5.0
------------------------------------------------------------

Build time:   2018-11-26 11:48:43 UTC
Revision:     7fc6e5abf2fc5fe0824aec8a0f5462664dbcd987

Kotlin DSL:   1.0.4
Kotlin:       1.3.10
Groovy:       2.5.4
Ant:          Apache Ant(TM) version 1.9.13 compiled on July 10 2018
JVM:          1.8.0_92 (Azul Systems, Inc. 25.92-b15)
OS:           Linux 4.19.0-18-cloud-amd64 amd64

[m
Done


### Definition for %%java Python magic cell function.

In [14]:
from IPython.core.magic import register_line_magic, register_cell_magic, register_line_cell_magic
@register_cell_magic
def java(line, cell):
    """
    Written by Joseph Gagliardo Jr.
    joegagliardo@gmail.com
    2021-12-22
    """
    text = """
plugins {
  // id 'idea'     // Uncomment for IntelliJ IDE
  // id 'eclipse'  // Uncomment for Eclipse IDE

  // Apply java plugin and make it a runnable application.
  id 'java'
  id 'application'

  // 'shadow' allows us to embed all the dependencies into a fat jar.
  id 'com.github.johnrengelman.shadow' version '4.0.3'
}

// This is the path of the main class, stored within ./src/main/java/
mainClassName = 'samples.quickstart.{class_name}'

// Declare the sources from which to fetch dependencies.
repositories {
  mavenCentral()
}

// Java version compatibility.
sourceCompatibility = 1.8
targetCompatibility = 1.8

// Use the latest Apache Beam major version 2.
// You can also lock into a minor version like '2.9.+'.
ext.apacheBeamVersion = '2.+'

// Declare the dependencies of the project.
dependencies {
  shadow "org.apache.beam:beam-sdks-java-core:$apacheBeamVersion"

  runtime "org.apache.beam:beam-runners-direct-java:$apacheBeamVersion"
  runtime "org.slf4j:slf4j-api:1.+"
  runtime "org.slf4j:slf4j-jdk14:1.+"

  testCompile "junit:junit:4.+"
}

// Configure 'shadowJar' instead of 'jar' to set up the fat jar.
shadowJar {
  baseName = '{class_name}' // Name of the fat jar file.
  classifier = null       // Set to null, otherwise 'shadow' appends a '-all' to the jar file name.
  manifest {
    attributes('Main-Class': mainClassName)  // Specify where the main class resides.
  }
}
"""   
    if len(line) == 0:
        start = cell.find('class ')
        end = cell.find(' {')
        class_name = cell[start+6:end]
    else:
        class_name = line
        
    
    with open('build.gradle', 'w') as f:
        f.write(text.replace('{class_name}', class_name))

    with open(f'src/main/java/samples/quickstart/{class_name}.java', 'w') as f:
        f.write(cell)
        
    # Build the project.
    run(f"{gradle_path}/bin/gradle --console=plain build")
    run('ls -lh build/libs/')
    run('rm outputs/*')
    run(f"{gradle_path}/bin/gradle --console=plain runShadow")
    run('head -n 20 outputs/part*')

    print('Done')


### A basic Python example of applying a map function to a collection.

In [1]:
x = ['one', 'two', 'three', 'four']
print(list(map(str.title, x)))

['One', 'Two', 'Three', 'Four']


### To do this in Beam, turn the local collection into a PCollection and apply a Map PTransform on it.

In [16]:
import apache_beam as beam

with beam.Pipeline() as p:
    lines = (
        p | beam.Create(['one', 'two', 'three', 'four'])
          | beam.Map(str.title)
          | beam.Map(print)
    )

# lines is a PCollection object
print('lines = ', lines)


Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.




Default Python SDK image for environment is apache/beam_python3.7_sdk:2.34.0
Creating state cache with size 100
Created Worker handler <apache_beam.runners.portability.fn_api_runner.worker_handlers.EmbeddedWorkerHandler object at 0x7f1fa2a994d0> for environment ref_Environment_default_environment_1 (beam:env:embedded_python:v1, b'')
Running ((((ref_AppliedPTransform_-16-Create-Impulse_3)+(ref_AppliedPTransform_-16-Create-FlatMap-lambda-at-core-py-3222-_4))+(ref_AppliedPTransform_-16-Create-MaybeReshuffle-Reshuffle-AddRandomKeys_7))+(ref_AppliedPTransform_-16-Create-MaybeReshuffle-Reshuffle-ReshufflePerKey-Map-reify_timestamps-_9))+([16]: Create/MaybeReshuffle/Reshuffle/ReshufflePerKey/GroupByKey/Write)
Running ((((([16]: Create/MaybeReshuffle/Reshuffle/ReshufflePerKey/GroupByKey/Read)+(ref_AppliedPTransform_-16-Create-MaybeReshuffle-Reshuffle-ReshufflePerKey-FlatMap-restore_timestamps_11))+(ref_AppliedPTransform_-16-Create-MaybeReshuffle-Reshuffle-RemoveRandomKeys_12))+(ref_AppliedPTra

### The Spark equivalent would be to pload a local Python list into a Spark RDD and do a simple transformation.

In [19]:
rdd1 = ( sc.parallelize(['one', 'two', 'three', 'four'])
        
#           .map(str.title)
       )
rdd1.collect()


[1, 2, 3]

### Simple Java transformation using a lambda.


In [15]:
%%java
package samples.quickstart;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.io.TextIO;
import java.util.*;

public class Create1 {
    public static void main(String[] args) {

        String outputsPrefix = "outputs/part";
        Pipeline p = Pipeline.create();
        
        PCollection<String> lines = p.apply(Create.of("one", "two", "three", "four"));
        lines = lines.apply(MapElements.into(TypeDescriptors.strings()).via((String line) -> line.toUpperCase()));
        lines.apply(TextIO.write().to(outputsPrefix));

        p.run().waitUntilFinish();
    }
}


>> /opt/gradle-5.0/bin/gradle --console=plain build
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[m> Task :compileJava
> Task :processResources NO-SOURCE
> Task :classes
> Task :jar
> Task :startScripts
> Task :distTar
> Task :distZip
> Task :shadowJar
> Task :startShadowScripts
> Task :shadowDistTar
> Task :shadowDistZip
> Task :assemble
> Task :compileTestJava NO-SOURCE
> Task :processTestResources NO-SOURCE
> Task :testClasses UP-TO-DATE
> Task :test NO-SOURCE
> Task :check UP-TO-DATE
> Task :build

BUILD SUCCESSFUL in 17s
9 actionable tasks: 9 executed
[m
>> ls -lh build/libs/
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
total 85M
-rw-r--r-- 1 root root  43M Dec 23 06:38 Create1.jar
-rw-r--r-- 1 root root 4.7K Dec 23 06:38 Dataflowclass1.jar
-rw-r--r-- 1 root root  43M Dec 23 06:30 WordCount.jar

>> rm outputs/*
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information 