Try Apache Beam - Java
In this notebook, we set up a Java development environment and work through a simple example using the DirectRunner. You can explore other runners with the Beam Capability Matrix.

To navigate through different sections, use the table of contents. From the View drop-down list, select Table of contents.

To run a code cell, click the Run cell button at the top left of the cell, or select the cell and press Shift+Enter. Try modifying a code cell and re-running it to see what happens.

To learn more about Colab, see Welcome to Colaboratory!.



In [1]:
# https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/get-started/try-apache-beam-java.ipynb#scrollTo=CgTXBdTsBn1F
# Run and print a shell command.
def run(cmd):
  print('>> {}'.format(cmd))
  !{cmd}  # This is magic to run 'cmd' in the shell.
  print('')


In [2]:
# Copy the input file into the local filesystem.
run('mkdir -p data')
run('gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt data/')
# Create the package directory that later cells write the Java sources into.
run('mkdir -p src/main/java/samples/quickstart')
print('Done')

>> mkdir -p data

>> gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt data/
Copying gs://dataflow-samples/shakespeare/kinglear.txt...
/ [1 files][153.6 KiB/153.6 KiB]                                                
Operation completed over 1 objects/153.6 KiB.                                    

>> mkdir -p src/main/java/samples/quickstart

Done


In [None]:
# Update and upgrade the system before installing anything else.
# To silence the verbose apt output, append ' > /dev/null' to a command,
# e.g. run('apt-get update > /dev/null').
run('apt-get update')
run('apt-get upgrade -y')

# Install the Java JDK.
run('apt-get install -y default-jdk')

# Check the Java version to see if everything is working well.
run('javac -version')
print('Done')

>> apt-get update
Get:1 http://packages.cloud.google.com/apt gcsfuse-focal InRelease [5386 B]
Get:2 https://packages.cloud.google.com/apt cloud-sdk InRelease [6739 B]       
Get:3 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]      
Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease                         
Get:5 http://packages.cloud.google.com/apt gcsfuse-focal/main amd64 Packages [816 B]
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]        
Get:7 https://packages.cloud.google.com/apt cloud-sdk/main amd64 Packages [203 kB]
Get:8 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [1335 kB]
Get:9 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Get:10 http://security.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [733 kB]
Get:11 http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [828 kB]
Get:12 http://archive.ubuntu.com/ubuntu focal-updates/restricted amd64 Packages

In [None]:
import os

# Download the Gradle binary distribution ('-bin.zip') and unpack it under /opt.
# The whole step is skipped when the unpacked directory already exists.
gradle_version = 'gradle-5.0'
gradle_path = f"/opt/{gradle_version}"
if not os.path.exists(gradle_path):
  run(f"wget -q -nc -O gradle.zip https://services.gradle.org/distributions/{gradle_version}-bin.zip")
  run('unzip -q -d /opt gradle.zip')
  # The archive is no longer needed once it has been unpacked.
  run('rm -f gradle.zip')

# The absolute path is used on purpose, rather than adding Gradle to the $PATH environment variable.
def gradle(args):
  """Run the downloaded Gradle binary with 'args', using plain console output."""
  gradle_bin = f"{gradle_path}/bin/gradle"
  run(f"{gradle_bin} --console=plain {args}")

# Print the Gradle version to confirm the installation works.
gradle('-v')
print('Done')

In [None]:
%%writefile build.gradle

plugins {
  // id 'idea'     // Uncomment for IntelliJ IDE
  // id 'eclipse'  // Uncomment for Eclipse IDE

  // Apply java plugin and make it a runnable application.
  id 'java'
  id 'application'

  // 'shadow' allows us to embed all the dependencies into a fat jar.
  id 'com.github.johnrengelman.shadow' version '4.0.3'
}

// This is the path of the main class, stored within ./src/main/java/
mainClassName = 'samples.quickstart.WordCount'

// Declare the sources from which to fetch dependencies.
repositories {
  mavenCentral()
}

// Java version compatibility.
sourceCompatibility = 1.8
targetCompatibility = 1.8

// Use the latest Apache Beam major version 2.
// You can also lock into a minor version like '2.9.+'.
// NOTE: '2.+' is a dynamic version, so the release that gets resolved can change
// from one build to the next; pin an exact version when reproducibility matters.
ext.apacheBeamVersion = '2.+'

// Declare the dependencies of the project.
// The slf4j and junit entries below use dynamic versions ('1.+', '4.+') as well.
dependencies {
  shadow "org.apache.beam:beam-sdks-java-core:$apacheBeamVersion"

  runtime "org.apache.beam:beam-runners-direct-java:$apacheBeamVersion"
  runtime "org.slf4j:slf4j-api:1.+"
  runtime "org.slf4j:slf4j-jdk14:1.+"

  testCompile "junit:junit:4.+"
}

// Configure 'shadowJar' instead of 'jar' to set up the fat jar.
shadowJar {
  baseName = 'WordCount' // Name of the fat jar file.
  classifier = null       // Set to null, otherwise 'shadow' appends a '-all' to the jar file name.
  manifest {
    attributes('Main-Class': mainClassName)  // Specify where the main class resides.
  }
}


In [None]:
%%writefile src/main/java/samples/quickstart/WordCount.java

package samples.quickstart;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.TypeDescriptors;

import java.util.Arrays;

/**
 * Counts how often each word occurs in the text files matching {@code data/*} and writes one
 * {@code word: count} line per distinct word to files whose names start with
 * {@code outputs/part}.
 */
public class WordCount {
  /**
   * Builds the word-count pipeline, runs it, and waits for it to finish.
   *
   * @param args command-line arguments, parsed into the pipeline options
   */
  public static void main(String[] args) {
    String inputsDir = "data/*";
    String outputsPrefix = "outputs/part";

    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);
    pipeline
        .apply("Read lines", TextIO.read().from(inputsDir))
        // Split on every run of non-letter characters; this can yield empty strings.
        .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
            .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
        .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
        .apply("Count words", Count.perElement())
        // Format each (word, count) pair as a single "word: count" text line.
        .apply("Write results", MapElements.into(TypeDescriptors.strings())
            .via((KV<String, Long> wordCount) ->
                  wordCount.getKey() + ": " + wordCount.getValue()))
        .apply(TextIO.write().to(outputsPrefix));

    // Wait for completion before returning, so the output files are complete even with a
    // runner whose run() returns while the job is still executing.
    pipeline.run().waitUntilFinish();
  }
}

In [None]:
# Build the project described by the build.gradle in the current directory.
gradle('build')

# Check the generated build files.
run('ls -lh build/libs/')
print('Done')

In [None]:
# Run the shadow (fat jar) build.
# 'runShadow' is presumably the run task contributed by the 'shadow' plugin
# applied in build.gradle - confirm against the plugin documentation.
gradle('runShadow')

# Sample the first 20 results, remember there are no ordering guarantees.
run('head -n 20 outputs/part-00000-of-*')
print('Done')

In [None]:
# You can now distribute and run your Java application as a standalone jar file.
# Copy the jar next to the 'data' directory; the pipeline reads the relative path 'data/*'.
run('cp build/libs/WordCount.jar .')
run('java -jar WordCount.jar')

# Sample the first 20 results, remember there are no ordering guarantees.
run('head -n 20 outputs/part-00000-of-*')
print('Done')

In [None]:
# Remove the previous results so the sample printed below comes from this run.
# '-f' keeps the command quiet when there is nothing to delete.
run('rm -f outputs/*')

# Re-run the standalone jar copied into the working directory by the previous cell.
# (The original call to java_run() referenced a helper that is not defined in this notebook.)
run('java -jar WordCount.jar')
run('head -n 20 outputs/part-00000-of-*')
print('Done')

In [None]:
from IPython.core.magic import register_line_magic, register_cell_magic, register_line_cell_magic

@register_line_magic
def lmagic(line):
    """Example line magic: return the text that follows the magic name, unchanged."""
    return line

@register_cell_magic
def cmagic(line, cell):
    """Example cell magic: return the magic's argument line and the cell body as a pair."""
    return line, cell

@register_line_cell_magic
def lcmagic(line, cell=None):
    """Example magic usable both as %lcmagic (line form) and as %%lcmagic (cell form)."""
    invoked_as_cell = cell is not None
    if invoked_as_cell:
        print("Called as cell magic")
        return line, cell
    # No cell body was supplied, so this is the line form.
    print("Called as line magic")
    return line

# In an interactive session these names have to be removed again, otherwise they
# clash with automagic and the line magics cannot be called without the '%'.
del lmagic, lcmagic


In [None]:
@register_cell_magic
def java(line, cell):
    """Placeholder %%java cell magic: return the argument line and cell body untouched.

    A later cell in this notebook registers another magic under the same name,
    which replaces this one once that cell has run.
    """
    return line, cell


In [None]:
# Build the project.
gradle('build')

# Check the generated build files.
run('ls -lh build/libs/')

# Run the shadow (fat jar) build.
gradle('runShadow')

# Sample the first 20 results, remember there are no ordering guarantees.
# The pipelines in this notebook write under the 'outputs/part' prefix, so that is
# the pattern to sample (nothing is ever written as 'outputs/regions*').
run('head -n 20 outputs/part*')

print('Done')


In [None]:
"""
IPython magics for displaying source code files with syntax highlighting.
This uses the Pygments library: http://pygments.org.
Two magics are available:
%highlight: This uses a terminal formatter and will work in any of IPython's
    front ends.
%highlight_html: This uses an HTML formatter and is best used in the
    IPython Notebook. This gives access to all available Pygments styles.
"""
from __future__ import print_function

import uuid

from pygments import highlight
from pygments.lexers import get_lexer_by_name, get_lexer_for_filename
from pygments.formatters import HtmlFormatter, TerminalFormatter

from IPython.utils.ipstruct import Struct
from IPython.core.magic import Magics, magics_class, line_magic
from IPython.display import display, HTML

# Skeleton of the HTML output: the first placeholder receives the CSS rules and
# the second one the highlighted markup (filled in by highlight_html).
HTML_TEMPLATE = """<style>
{}
</style>
{}
"""


@magics_class
class PygmentsMagic(Magics):
    """Line magics that show a source code file with Pygments syntax highlighting."""

    # No __init__ override is needed: the inherited initializer already takes the shell.

    @staticmethod
    def _require_file_name(magic_name, arg_str):
        """Return the stripped file name, or raise UsageError when none was given."""
        # Imported locally so the cell's import list does not have to change.
        from IPython.core.error import UsageError

        file_name = arg_str.strip()
        if not file_name:
            raise UsageError('%{}: a file name is required'.format(magic_name))
        return file_name

    @staticmethod
    def _pick_lexer(lexer_name, file_name):
        """Return the lexer called ``lexer_name``; guess from ``file_name`` when it is empty."""
        if lexer_name:
            return get_lexer_by_name(lexer_name)
        return get_lexer_for_filename(file_name)

    @staticmethod
    def _read_source(file_name):
        """Return the full text of ``file_name``."""
        with open(file_name) as f:
            return f.read()

    @line_magic
    def highlight(self, parameter_s=''):
        """
        Display the contents of a source code file with syntax highlighting.
        Requires the pygments library.
        Usage:
            %highlight [options] <file name>
        Options:
            -g {'dark', 'light'}: Specify the 'dark' or 'light' color scheme.
                Defaults to 'dark'.
            -l <lexer name>: Manually specify the language of the code using
                any lexer name from http://pygments.org/docs/lexers/.
                By default the source language is guessed from the file name.
        """
        opts_def = Struct(l='', g='dark')
        opts, arg_str = self.parse_options(parameter_s, 'l:g:')
        opts.merge(opts_def)  # Merge in the defaults declared just above.

        file_name = self._require_file_name('highlight', arg_str)
        lexer = self._pick_lexer(opts.l, file_name)
        formatter = TerminalFormatter(bg=opts.g)
        code = self._read_source(file_name)

        print(highlight(code, lexer, formatter))

    @line_magic
    def highlight_html(self, parameter_s=''):
        """
        Display the contents of a source code file with syntax highlighting.
        You must be in an environment that can display HTML output.
        Requires the pygments library.
        Usage:
            %highlight_html [options] <file name>
        Options:
            -n: Show line numbers.
            -s <style name>: An available Pygments style, default is 'default'.
            -l <lexer name>: Manually specify the language of the code using
                any lexer name from http://pygments.org/docs/lexers/.
                By default the source language is guessed from the file name.
        """
        opts_def = Struct(l='', s='default')
        opts, arg_str = self.parse_options(parameter_s, 'l:s:n')
        opts.merge(opts_def)  # Merge in the defaults declared just above.

        file_name = self._require_file_name('highlight_html', arg_str)
        lexer = self._pick_lexer(opts.l, file_name)

        # '-n' is a flag without a value: its presence alone switches line numbers on.
        linenos = 'table' if 'n' in opts else False

        # A fresh CSS class per call, built from a random UUID.
        formatter = HtmlFormatter(style=opts.s,
                                  cssclass='pygments' + str(uuid.uuid4()),
                                  linenos=linenos)

        code = self._read_source(file_name)

        html_code = highlight(code, lexer, formatter)
        css = formatter.get_style_defs()

        display(HTML(HTML_TEMPLATE.format(css, html_code)))


def load_ipython_extension(ipython):
    """Register the PygmentsMagic magics on the given IPython shell.

    Args:
        ipython: the shell object whose ``register_magics`` method is called.
    """
    ipython.register_magics(PygmentsMagic)

In [None]:
# The function needs the shell to register on; get_ipython() returns the running one.
load_ipython_extension(get_ipython())

In [None]:
# Display the running shell object (the bare name 'ipython' is not defined in this notebook).
get_ipython()

In [None]:
from IPython.core.magic import register_line_magic, register_cell_magic
@register_line_magic
def hello(line):
    if line == 'french':
        print("Salut tout le monde!")
    else:
        print("Hello world!")
        
%hello
%hello french


In [None]:
# Call the line magic again from its own cell, without an argument.
%hello

In [None]:
def _java_class_name(line, cell):
    """Return the class name for a %%java cell.

    The magic's argument line wins when it is not blank; otherwise the name is
    taken from the first class declaration that starts a line of the cell.

    Raises:
        ValueError: if no name was given and no class declaration is found.
    """
    import re

    explicit_name = line.strip()
    if explicit_name:
        return explicit_name

    # Anchoring at the start of a line (after optional modifiers) keeps the word
    # "class" inside comments or strings from being mistaken for the declaration,
    # and capturing a single identifier drops any 'extends ...' / 'implements ...'.
    declaration = re.search(
        r'^\s*(?:(?:public|final|abstract|strictfp)\s+)*class\s+([A-Za-z_$][\w$]*)',
        cell,
        re.MULTILINE)
    if declaration is None:
        raise ValueError(
            "%%java: no class declaration found in the cell; "
            "pass the class name on the magic line, e.g. '%%java MyClass'")
    return declaration.group(1)


@register_cell_magic
def java(line, cell):
    """Write the cell as a Java class, then build and run it with Gradle.

    The cell body is saved as src/main/java/samples/quickstart/<class>.java and
    build.gradle is rewritten so that this class is the main class. The project
    is then built, old files under outputs/ are removed, the 'runShadow' task
    is executed and the first lines of outputs/part* are printed.

    Args:
        line: optional class name; when blank it is read from the cell source.
        cell: the Java source code.

    Raises:
        ValueError: if the class name cannot be determined.
    """
    import os

    gradle_template = """
plugins {
  // id 'idea'     // Uncomment for IntelliJ IDE
  // id 'eclipse'  // Uncomment for Eclipse IDE

  // Apply java plugin and make it a runnable application.
  id 'java'
  id 'application'

  // 'shadow' allows us to embed all the dependencies into a fat jar.
  id 'com.github.johnrengelman.shadow' version '4.0.3'
}

// This is the path of the main class, stored within ./src/main/java/
mainClassName = 'samples.quickstart.{class_name}'

// Declare the sources from which to fetch dependencies.
repositories {
  mavenCentral()
}

// Java version compatibility.
sourceCompatibility = 1.8
targetCompatibility = 1.8

// Use the latest Apache Beam major version 2.
// You can also lock into a minor version like '2.9.+'.
ext.apacheBeamVersion = '2.+'

// Declare the dependencies of the project.
dependencies {
  shadow "org.apache.beam:beam-sdks-java-core:$apacheBeamVersion"

  runtime "org.apache.beam:beam-runners-direct-java:$apacheBeamVersion"
  runtime "org.slf4j:slf4j-api:1.+"
  runtime "org.slf4j:slf4j-jdk14:1.+"

  testCompile "junit:junit:4.+"
}

// Configure 'shadowJar' instead of 'jar' to set up the fat jar.
shadowJar {
  baseName = '{class_name}' // Name of the fat jar file.
  classifier = null       // Set to null, otherwise 'shadow' appends a '-all' to the jar file name.
  manifest {
    attributes('Main-Class': mainClassName)  // Specify where the main class resides.
  }
}
"""
    class_name = _java_class_name(line, cell)

    # str.replace is used instead of str.format because the template is full of
    # literal braces that format() would try to interpret.
    with open('build.gradle', 'w') as f:
        f.write(gradle_template.replace('{class_name}', class_name))

    # Make sure the package directory exists even if the setup cell was skipped.
    source_dir = 'src/main/java/samples/quickstart'
    os.makedirs(source_dir, exist_ok=True)
    with open(f'{source_dir}/{class_name}.java', 'w') as f:
        f.write(cell)

    # Build the project.
    gradle('build')
    run('ls -lh build/libs/')
    # '-f' keeps rm quiet when there are no earlier results to delete.
    run('rm -f outputs/*')
    gradle('runShadow')
    run('head -n 20 outputs/part*')

    print('Done')


In [None]:
%%java
package samples.quickstart;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.io.TextIO;


public class Simple1 {
    // Reads text lines, appends "*" to each one, and writes them under the outputs/part prefix.
    public static void main(String[] args) {
        // NOTE(review): no cell visible in this notebook creates regions.csv; it has to
        // be present in the working directory before this runs - confirm.
        final String inputFile = "regions.csv";
        final String outputPrefix = "outputs/part";

        Pipeline pipeline = Pipeline.create();
        pipeline
            .apply("Read", TextIO.read().from(inputFile))
            .apply("Parse", MapElements.into(TypeDescriptors.strings()).via((String row) -> row + "*"))
            .apply("Write", TextIO.write().to(outputPrefix));

        // Run the pipeline and wait until it has finished.
        pipeline.run().waitUntilFinish();
    }
}




In [None]:
# Print the build.gradle currently on disk.
! cat build.gradle
# Uncomment the next line to print the generated Java source as well.
#! cat src/main/java/samples/quickstart/Simple1.java