# Prerequisites

Install Spark and the Apache Kafka libraries in the VM


---



In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark==1.3.0
!pip install py4j==0.10.9

# For plotting
!pip install folium
!pip install plotly

In [None]:
# List /content — presumably to confirm the Spark archive from the previous cell was
# downloaded and extracted there (assumes the notebook's working directory is /content).
!ls /content

In [None]:
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/libs/libs-kafka_301.zip --directory-prefix=/content/spark-3.0.1-bin-hadoop2.7/jars/
!unzip -n /content/spark-3.0.1-bin-hadoop2.7/jars/libs-kafka_301.zip -d /content/spark-3.0.1-bin-hadoop2.7/jars/
!ls /content/spark-3.0.1-bin-hadoop2.7/jars/*kafka*

Define the environment (Java & Spark homes)

---

In [None]:
import os

# Point the process environment at the Java 8 and Spark 3.0.1 installs.
# PYSPARK_SUBMIT_ARGS is initialised to an empty string so that it exists before
# findspark adds --packages/--jars arguments to it in the next code cell.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
        "PYSPARK_SUBMIT_ARGS": "",
    }
)

Start the Spark session and print the version


---


In [None]:
import findspark

# Absolute path of the Spark install (same value as the SPARK_HOME environment variable).
# Using the absolute path means findspark.init() no longer depends on the notebook's
# current working directory being /content.
SPARK_HOME = "/content/spark-3.0.1-bin-hadoop2.7"

# Extra jars (unzipped into $SPARK_HOME/jars earlier) to pass to Spark via --jars.
# NOTE(review): several of these are Scala 2.11 / Spark 2.4.5 builds, while the --packages
# coordinate below is the Scala 2.12 / Spark 3.0.1 connector — confirm they are still needed.
KAFKA_JAR_NAMES = [
    "kafka-clients-2.0.0.jar",
    "lz4-java-1.4.1.jar",  # was misspelled "lz4-java-1.4.1-jar", a path that cannot match a jar file
    "scala-library-2.11.12.jar",
    "slf4j-api-1.7.25.jar",
    "snappy-java-1.1.7.1.jar",
    "spark-sql-kafka-0-10_2.11-2.4.5.jar",
    "spark-tags_2.11-2.4.5.jar",
    "unused-1.0.0.jar",
]

# Register the Kafka connector package and the local jars, then let findspark
# put the pyspark bundled with this Spark install on sys.path.
findspark.add_packages(["org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1"])
findspark.add_jars([f"{SPARK_HOME}/jars/{jar_name}" for jar_name in KAFKA_JAR_NAMES])
findspark.init(SPARK_HOME)

# Must come after findspark.init(): pyspark is only importable once findspark has located it.
from pyspark.sql import SparkSession

# Create (or reuse) a local session using all cores. The UI is moved to port 4050,
# the port that the optional ngrok cell below tunnels.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

spark.version

In [None]:
# Display the SparkSession object as this cell's output.
spark

In [None]:
# Enable Arrow-based columnar transfer to speed up Spark <-> pandas conversions.
# Since Spark 3.0 the key "spark.sql.execution.arrow.enabled" is deprecated;
# "spark.sql.execution.arrow.pyspark.enabled" is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Create an ngrok tunnel to expose the Spark UI (optional)


In [None]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
get_ipython().system_raw('./ngrok http 4050 &')
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

# Download Datasets

In [None]:
!mkdir -p /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/trades.csv -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/trades.json -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/offshore_leaks.edges.csv -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/offshore_leaks.nodes.address.csv -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/offshore_leaks.nodes.intermediary.csv -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/offshore_leaks.nodes.officer.csv -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/offshore_leaks.nodes.entity.csv -P /dataset
!ls /dataset

# Project 1 - Regulatory Banking Project
---

Input files: /dataset/trades.csv & /dataset/trades.json


# Project 2 - Transactions Notifications

*Hint: https://databricks.com/blog/2017/05/08/event-time-aggregation-watermarking-apache-sparks-structured-streaming.html*

In [None]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Streaming DataFrame subscribed to the "transactions" topic on the remote Kafka broker.
# Nothing is consumed until a streaming query is started on it.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "ec2-3-231-22-58.compute-1.amazonaws.com:9092")
    .option("subscribe", "transactions")
    .load()
)

In [None]:
# Schema of the JSON document carried in each Kafka message value: every field is a
# nullable string. The field names (including the padded ' WITHDRAWAL AMT ' and
# ' DEPOSIT AMT ') presumably mirror the producer's column headers exactly — do not
# trim them without confirming against the actual messages.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            'Account No',
            'DATE',
            'TRANSACTION DETAILS',
            'CHQ.NO.',
            'VALUE DATE',
            ' WITHDRAWAL AMT ',
            ' DEPOSIT AMT ',
            'BALANCE AMT',
        )
    ]
)

# Print the schema of the raw Kafka stream for reference.
df.printSchema()

# Cast key/value to strings, parse the JSON value with the schema above, and flatten the
# parsed struct so each transaction field becomes a column next to key and timestamp.
dataset = (
    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "timestamp")
    .withColumn("value", from_json("value", schema))
    .select(col('key'), col("timestamp"), col('value.*'))
)

In [None]:
# NOTE(review): `dataset_count` is not defined in any cell visible in this notebook; it has
# to be created first (presumably as an aggregation over `dataset`), otherwise this cell
# raises NameError on a fresh kernel.
# Start a streaming query in "update" mode that writes to the in-memory sink under the
# query name "transactions", which the next cell reads back with spark.sql.
(
    dataset_count.writeStream
    .outputMode("update")
    .format("memory")
    .option("truncate", "false")
    .queryName("transactions")
    .start()
)

In [None]:
# Print the current contents of the "transactions" table without truncating column values.
(
    spark.sql("select * from transactions")
    .show(truncate=False)
)

# Project 3 - Panama Papers

Trace the "Spring Song International Co., Ltd." entity with Spark SQL using the following datasets:<br>
/dataset/offshore_leaks.nodes.entity.csv<br>
/dataset/offshore_leaks.nodes.intermediary.csv<br>
/dataset/offshore_leaks.edges.csv<br>
/dataset/offshore_leaks.nodes.officer.csv