# 1. Initialization Code

## Beam

### Initialize helper functions to run Java inside cells.

In [None]:
# https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/get-started/try-apache-beam-java.ipynb#scrollTo=CgTXBdTsBn1F
# Run and print a shell command.
def run(cmd, progress = True, verbose = False):
  """Run a shell command from the notebook using IPython's `!` escape.

  Args:
    cmd: the shell command line to execute.
    progress: when true, echo the command (prefixed with '>> ') before running it.
    verbose: when true, let the command's output appear in the cell and print a
      blank line afterwards; otherwise redirect stdout and stderr to /dev/null.

  Returns:
    None. The command's exit status is not inspected.
  """
  if progress:
      print('>> {}'.format(cmd))
    
  if verbose:
      !{cmd}  # This is magic to run 'cmd' in the shell.
      print('')
  else:
      # Quiet mode: discard everything the command writes.
      ! {cmd} > /dev/null 2>&1

import os

# Download and unpack the Gradle binary distribution (skipped when the
# install directory already exists).
gradle_version = 'gradle-5.0'
# Install location; other cells use it to locate the Gradle executable.
gradle_path = f"/opt/{gradle_version}"
if not os.path.exists(gradle_path):
  # Fetch the zip, extract it under /opt (presumably creating gradle_path),
  # then delete the downloaded archive.
  run(f"wget -q -nc -O gradle.zip https://services.gradle.org/distributions/{gradle_version}-bin.zip")
  run('unzip -q -d /opt gradle.zip')
  run('rm -f gradle.zip')

# Gradle is invoked through its absolute path rather than via $PATH.
def gradle(args):
  """Invoke the installed Gradle binary with `args`, using plain console output.

  Args:
    args: command-line arguments appended verbatim to the Gradle invocation.
  """
  gradle_binary = f"{gradle_path}/bin/gradle"
  run(f"{gradle_binary} --console=plain {args}")

# Smoke-test the installation by asking Gradle for its version.
gradle('-v')

# Create the directory the %%java magic writes its source files into.
# os.makedirs keeps this step in Python (no shell needed) and, like
# `mkdir -p`, succeeds when the directory already exists.
os.makedirs('src/main/java/samples/quickstart', exist_ok=True)
print('Done')
        

### Definition for <font color='blue' face="Fixedsys, monospace" size="+2">%%java</font> Python magic cell function.

In [None]:
from IPython.core.magic import register_line_magic, register_cell_magic, register_line_cell_magic
@register_cell_magic
def java(line, cell):
    """
    Compile and run the Java source held in the cell as a Gradle project.

    Written by Joseph Gagliardo Jr.
    joegagliardo@gmail.com
    2021-12-22

    The cell body is written to src/main/java/samples/quickstart/<ClassName>.java,
    a build.gradle naming that class as the main class is generated, and the
    project is built and executed with the Gradle binary under `gradle_path`
    through the `run` helper.

    Options, given on the `%%java` line (case-insensitive, any order):
      noprogress - do not echo each shell command before running it.
      verbose    - show the output of the shell commands.
      nooutput   - do not print the contents of /tmp/outputs* after the run.

    Args:
      line: the text following `%%java` on the magic line (the options).
      cell: the Java source code.

    Raises:
      ValueError: if no class declaration can be found in the cell.
    """
    import re  # Local import: the magic must not rely on another cell's imports.

    gradle_text = """
plugins {
  // id 'idea'     // Uncomment for IntelliJ IDE
  // id 'eclipse'  // Uncomment for Eclipse IDE

  // Apply java plugin and make it a runnable application.
  id 'java'
  id 'application'

  // 'shadow' allows us to embed all the dependencies into a fat jar.
  id 'com.github.johnrengelman.shadow' version '4.0.3'
}

// This is the path of the main class, stored within ./src/main/java/
mainClassName = 'samples.quickstart.{class_name}'

// Declare the sources from which to fetch dependencies.
repositories {
  mavenCentral()
}

// Java version compatibility.
sourceCompatibility = 1.8
targetCompatibility = 1.8

// Use the latest Apache Beam major version 2.
// You can also lock into a minor version like '2.9.+'.
ext.apacheBeamVersion = '2.+'

// Declare the dependencies of the project.
dependencies {
  shadow "org.apache.beam:beam-sdks-java-core:$apacheBeamVersion"

  runtime "org.apache.beam:beam-runners-direct-java:$apacheBeamVersion"
  runtime "org.apache.beam:beam-sdks-java-extensions-sql:$apacheBeamVersion"
  runtime "com.google.auto.value:auto-value-annotations:1.6"
  runtime "com.google.code.gson:gson:2.8.8"
  compile "org.apache.beam:beam-sdks-java-extensions-join-library:$apacheBeamVersion"
  runtime "org.slf4j:slf4j-api:1.+"
  runtime "org.slf4j:slf4j-jdk14:1.+"

  annotationProcessor "com.google.auto.value:auto-value:1.6"

  testCompile "junit:junit:4.+"
}

// Configure 'shadowJar' instead of 'jar' to set up the fat jar.
shadowJar {
  zip64 true
  baseName = '{class_name}' // Name of the fat jar file.
  classifier = null       // Set to null, otherwise 'shadow' appends a '-all' to the jar file name.
  manifest {
    attributes('Main-Class': mainClassName)  // Specify where the main class resides.
  }
}
"""
    # Find the name of the class declared in the cell. A declaration at the
    # start of a line (optionally preceded by modifiers) is preferred so that
    # the word "class" inside a comment is not mistaken for it; otherwise fall
    # back to the first "class <identifier>" anywhere. Capturing only the
    # identifier keeps `extends`/`implements` clauses and generic parameters
    # out of the file name and the Gradle main class.
    declaration = (
        re.search(r'^[ \t]*(?:(?:public|final|abstract|strictfp)[ \t]+)*class\s+([A-Za-z_$][\w$]*)',
                  cell, re.MULTILINE)
        or re.search(r'\bclass\s+([A-Za-z_$][\w$]*)', cell)
    )
    if declaration is None:
        raise ValueError('%%java: could not find a class declaration in the cell')
    class_name = declaration.group(1)

    options = line.lower()
    progress = 'noprogress' not in options
    verbose = 'verbose' in options
    output = 'nooutput' not in options

    # Remove what a previous %%java cell left behind. All three clean-up
    # commands honour the progress/verbose options, and -f keeps rm quiet
    # when there is nothing to delete.
    run('rm -f src/main/java/samples/quickstart/*.java', progress = progress, verbose = verbose)
    run('rm -f build/libs/*.jar', progress = progress, verbose = verbose)
    run('rm -rf /tmp/outputs*', progress = progress, verbose = verbose)

    # Generate the build script with the detected class as the main class.
    with open('build.gradle', 'w') as f:
        f.write(gradle_text.replace('{class_name}', class_name))

    # Save the cell as the Java source file named after its class.
    with open(f'src/main/java/samples/quickstart/{class_name}.java', 'w') as f:
        f.write(cell)

    # Build the project, list the produced jars, then run the fat jar.
    run(f"{gradle_path}/bin/gradle --console=plain build", progress = progress, verbose = verbose)
    run('ls -lh build/libs/', progress = progress, verbose = verbose)
    run(f"{gradle_path}/bin/gradle --console=plain runShadow", progress = progress, verbose = verbose)
    # Show whatever was written under /tmp/outputs*, regardless of the
    # progress/verbose options.
    if output:
        run('cat /tmp/outputs*', progress = False, verbose = True)

    print('Done')

print('Done')

In [None]:
# Additional Gradle dependencies that are sometimes needed. They are kept in a
# string so that this cell runs without error; when required, paste the lines
# into the `dependencies { ... }` section of the build.gradle template inside
# the %%java magic.
additional_dependencies = """
  compile "org.apache.beam:beam-sdks-java-extensions-google-cloud-platform-core:2.22.0"
  compile "org.apache.beam:beam-runners-google-cloud-dataflow-java:2.22.0"
  compile "org.apache.beam:beam-sdks-java-io-google-cloud-platform:2.22.0"
"""



## Spark

### Install a Spark Docker container using the following commands.

In [None]:
# Pull the bitnami/spark image, create a Docker network named spark_network and
# start a detached container named "spark" with SPARK_MODE=master on it.
# The three steps are chained with &&, so a failing step skips the ones after it.
! docker pull bitnami/spark && \
docker network create spark_network && \
docker run -d --name spark --network=spark_network -e SPARK_MODE=master bitnami/spark
# Create a symlink to conda's libtinfo.so under a second name.
# NOTE(review): the link name "libtinfor.so.6" looks like a typo for
# "libtinfo.so.6" - confirm which name is actually needed.
! ln -s /opt/conda/lib/libtinfo.so /opt/conda/lib/libtinfor.so.6
print('Done')

### Install pyspark.

In [None]:
import pip

def install(package):
    """Install `package` into the environment of the running kernel.

    pip does not support being driven as a library (`pip.main` and
    `pip._internal.main` are not a public API); the supported way is to run
    it as a subprocess. Going through `sys.executable -m pip` ensures the
    package is installed for the interpreter this notebook is running on.

    Args:
      package: requirement specifier passed to `pip install`, e.g. 'pyspark'.

    Raises:
      subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    import subprocess
    import sys

    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

install('pyspark')

print('Done')

### Initialize the Spark context variables.

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *

def initspark(appname = "Notebook", servername = "local[*]"):
    """Create (or reuse) the Spark entry points used by this notebook.

    Args:
      appname: application name set on the Spark configuration and session.
      servername: Spark master URL passed to `setMaster`.

    Returns:
      A tuple `(sc, spark, conf)`: the SparkContext, the SparkSession (built
      with Hive support enabled) and the SparkConf.
    """
    print ('initializing pyspark')
    conf = SparkConf().setAppName(appname).setMaster(servername)
    # getOrCreate reuses a context that is already running, so re-running this
    # cell no longer fails with "Cannot run multiple SparkContexts at once".
    sc = SparkContext.getOrCreate(conf=conf)
    spark = SparkSession.builder.appName(appname).enableHiveSupport().getOrCreate()
    # Only report errors, to keep Spark's logging out of the cell output.
    sc.setLogLevel("ERROR")
    print ('pyspark initialized')
    return sc, spark, conf

# Create the Spark context, session and configuration as notebook-level names.
sc, spark, conf = initspark()
# Print both objects as a quick check that initialization returned them.
print(sc, spark)
print('Done')

# __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ 

# 2. <font color='blue' face="Fixedsys, monospace" size="+2">Create</font> allows you to upload data into a <font color='green' size="+2">PCollection</font>.

## <img src="https://github.com/joegagliardo/dataflowclass1/blob/main/python.png?raw=1" width=40 height=40 /><font color='cadetblue' size="+2">Python</font>

### Non-Beam example of applying a <font color='blue' face="Fixedsys, monospace" size="+2">map</font> function to a collection.

In [None]:
# A plain Python list; no Beam involved.
x = ['one', 'two', 'three', 'four']
# map() is lazy, so wrap it in list() to materialise the title-cased words for printing.
print(list(map(str.title, x)))

### Simple transformation, turn the local collection into a <font color='green' size="+2">PCollection</font> and apply a <font color='blue' face="Fixedsys, monospace" size="+2">Map</font> <font color='green' size="+2">PTransform</font> on it.

In [None]:
import apache_beam as beam

# Build the pipeline inside a context manager; each `|` applies the next
# transform to the result of the previous one.
with beam.Pipeline() as p:
    lines = (
        # Create turns the in-memory list into the pipeline's input.
        p | beam.Create(['one', 'two', 'three', 'four'])
          # Title-case every element with the built-in str.title.
          | beam.Map(str.title)
          # Print every element.
          | beam.Map(print)
    )

# lines is a PCollection object
print('lines = ', lines)


### Simple transformation using a <font color='blue' face="Fixedsys, monospace" size="+2">lambda</font> instead of a built-in function.

In [None]:
import apache_beam as beam

# Same pipeline as the str.title version, but the mapping step is a lambda.
with beam.Pipeline() as p:
    lines = (
        p | beam.Create(['one', 'two', 'three', 'four'])
          # The lambda calls title() on each element.
          | beam.Map(lambda x : x.title())
          | beam.Map(print)
    )


### Simple transformation using a user defined function.

In [None]:
import apache_beam as beam

def title(x):
    """Return the title-cased form of `x`, as produced by its own `title()` method."""
    titled = x.title()
    return titled

# Same pipeline again, with the mapping step delegated to the user defined
# function `title`.
with beam.Pipeline() as p:
    lines = (
        p | beam.Create(['one', 'two', 'three', 'four'])
          # Pass the function object itself; Map calls it once per element.
          | beam.Map(title)
          | beam.Map(print)
    )


### The pipe <font color='blue' face="Fixedsys, monospace" size="+2">|</font> is actually just an operator overload to call the <font color='blue' face="Fixedsys, monospace" size="+2">apply</font> method of the pipeline. You would never do this in Python, but it helps to understand what is going on under the hood.

In [None]:
import apache_beam as beam

# Only the first step uses `|`; the two Map steps are attached by calling
# .apply() explicitly on the result of the previous step.
with beam.Pipeline() as p:
        lines = ((p | beam.Create(['one', 'two', 'three', 'four']))
             .apply(beam.Map(str.title)) 
             .apply(beam.Map(print))
        )

### The Spark equivalent would be to upload a local Python <font color='blue' face="Fixedsys, monospace" size="+2">list</font> into a Spark <font color='green' size="+2">RDD</font> and do a simple transformation.

In [None]:
# Distribute the local list as an RDD and title-case every element, mirroring
# the Beam Create + Map examples.
rdd1 = ( sc.parallelize(['one', 'two', 'three', 'four'])
           .map(str.title)
       )
# collect() brings the results back to the driver; as the last expression of
# the cell, its value is displayed.
rdd1.collect()


## <img src="https://github.com/joegagliardo/dataflowclass1/blob/main/java.png?raw=1" width=40 height=40 /><font color='indigo' size="+2">Java</font>

### Simple transformation using a <font color='green' size="+2">lambda</font>.


In [None]:
%%java verbose
package samples.quickstart;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.io.TextIO;

import java.util.*;

public class Create1 {
    /**
     * Builds and runs a pipeline that upper-cases four in-memory words with a
     * lambda and writes them as text using /tmp/outputs as the output prefix.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {

        String outputsPrefix = "/tmp/outputs";
        Pipeline p = Pipeline.create();
        
        // Turn the literal values into the pipeline's input collection.
        PCollection<String> lines = p.apply(Create.of("one", "two", "three", "four"));
        // into(...) declares the element type produced by the lambda given to via(...).
        lines = lines.apply(MapElements.into(TypeDescriptors.strings()).via((String line) -> line.toUpperCase()));
        // Write the upper-cased elements as text.
        lines.apply(TextIO.write().to(outputsPrefix));

        // Run the pipeline and wait for it to finish.
        p.run().waitUntilFinish();
    }
}


### Simple transformation using <font color='blue' face="Fixedsys, monospace" size="+2">SimpleFunction</font> instead of <font color='green' size="+2">lambda</font>.


In [None]:
%%java verbose
package samples.quickstart;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.io.TextIO;
import java.util.*;

public class Create2 {
    /**
     * Builds and runs a pipeline that upper-cases four in-memory words with an
     * anonymous SimpleFunction and writes them as text using /tmp/outputs as
     * the output prefix.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {

        String outputsPrefix = "/tmp/outputs";
        Pipeline p = Pipeline.create();
        
        // Turn the literal values into the pipeline's input collection.
        PCollection<String> lines = p.apply(Create.of("one", "two", "three", "four"));
        // The mapping logic lives in an anonymous SimpleFunction<input, output>
        // whose apply method is called for each element.
        lines = lines.apply(MapElements.via(
            new SimpleFunction<String, String>() {
              @Override
              public String apply(String line) {
                String ret = line.toUpperCase();
                //System.out.println("** " + ret);
                return ret;
              }
            }));

        // Write the upper-cased elements as text; "Write" is the name given to this step.
        lines.apply("Write", TextIO.write().to(outputsPrefix));

        // Run the pipeline and wait for it to finish.
        p.run().waitUntilFinish();
    }
}


### Java simple transformation using <font color='blue' face="Fixedsys, monospace" size="+2">SimpleFunction</font> to wrap a User Defined Function.


In [None]:
%%java
package samples.quickstart;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.io.TextIO;
import java.util.*;

public class Create3 {
    /**
     * Builds and runs a pipeline that upper-cases four in-memory words by
     * wrapping the static helper {@link #upper(String)} in a SimpleFunction,
     * then writes them as text using /tmp/outputs as the output prefix.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {

        String outputsPrefix = "/tmp/outputs";
        Pipeline p = Pipeline.create();
        
        // Turn the literal values into the pipeline's input collection.
        PCollection<String> lines = p.apply(Create.of("one", "two", "three", "four"));
        // The SimpleFunction only delegates to the user defined function upper().
        lines = lines.apply(MapElements.via(
            new SimpleFunction<String, String>() {
              @Override
              public String apply(String line) {
                return upper(line);
              }
            }));

        // Write the upper-cased elements as text; "Write" is the name given to this step.
        lines.apply("Write", TextIO.write().to(outputsPrefix));

        // Run the pipeline and wait for it to finish.
        p.run().waitUntilFinish();
    }
    
    /**
     * User defined function applied to every element.
     *
     * @param line the input string
     * @return {@code line} converted to upper case
     */
    public static String upper(String line) {
        return line.toUpperCase();
    }
}


# __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ 

# LAB: #
## Put the regions folder found in /class/datasets/northwind/CSV/regions into HDFS. Read it into an RDD and convert each record into a tuple.
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use <font color='blue' face="Fixedsys, monospace" size="+1">hadoop fs -put</font> or <font color='blue' face="Fixedsys, monospace" size="+1">hdfs dfs -put</font>
<br>
    Read the file using <font color='blue' face="Fixedsys, monospace" size="+1">sc.textFile</font>
<br>
    Do a <font color='green' size="+1">map</font> to split and another to convert the datatypes
<br>
<br>
</p>
</details>

<details><summary>Click for <b>code</b></summary>
<p>

```python
! hadoop fs -put /class/datasets/northwind/CSV/regions /regions
regions = sc.textFile('hdfs://localhost:9000/regions')
regions = regions.map(lambda x : x.split(',')).map(lambda x : (int(x[0]), x[1]))
print(regions.collect())
```
</p>
</details>

# __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ __ 

![PowerPoint](PowerPoint.png "PowerPoint")