In [1]:
import os
SCALA_VERSION="2.12"
SPARK_VERSION="3.0.1"
os.environ['PYSPARK_SUBMIT_ARGS'] = f"--packages=org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION} pyspark-shell"

import json

import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

from confluent_kafka import Producer
import uuid
import random

import pandas as pd
import requests
from time import sleep

Python Version expected 3.8.6

In [2]:
# Python 3.8.6
!python --version

Python 3.8.6


confluent-kafka version 1.5.0

In [3]:
# Version: 1.5.0
!pip show confluent-kafka

Name: confluent-kafka
Version: 1.5.0
Summary: Confluent's Python client for Apache Kafka
Home-page: https://github.com/confluentinc/confluent-kafka-python
Author: Confluent Inc
Author-email: support@confluent.io
License: UNKNOWN
Location: /opt/conda/lib/python3.8/site-packages
Requires: 
Required-by: 


## Spark Session

In [4]:
spark = SparkSession \
    .builder \
    .appName('RealtimeKafkaML') \
    .getOrCreate()

In [5]:
# Spark 3.0.1
print(f"Spark {spark.version}")

Spark 3.0.1


In [6]:
simple_messages = [
'I love this pony',
'This restaurant is great',
'The weather is bad today',
'I will go to the beach this weekend',
'She likes to swim',
'Apple is a great company'
]

bootstrap_servers = 'kafka:9092'
topic = 'test'
msg_count = 5

In [7]:
def delivery_report(err, msg):    
    """ Called once for each message produced to indicate delivery result.
        Triggered by poll() or flush(). """
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {}'.format(msg.topic()))

def confluent_kafka_producer():
    
    p = Producer({'bootstrap.servers': bootstrap_servers})
    for data in simple_messages:
        
        record_key = str(uuid.uuid4())
        record_value = json.dumps({'data': data})
        p.produce(topic, key=record_key, value=record_value, on_delivery=delivery_report)
       
        p.poll(0)

    p.flush()
    print('we\'ve sent {count} messages to {brokers}'.format(count=len(simple_messages), brokers=bootstrap_servers))

In [8]:
confluent_kafka_producer()

Message delivered to test
Message delivered to test
Message delivered to test
Message delivered to test
Message delivered to test
Message delivered to test
we've sent 6 messages to kafka:9092


  p.produce(topic, key=record_key, value=record_value, on_delivery=delivery_report)


## Read From Kafka to Spark

In [9]:
df_raw = spark \
  .readStream \
  .format('kafka') \
  .option('kafka.bootstrap.servers', bootstrap_servers) \
  .option("startingOffsets", "earliest") \
  .option('subscribe', topic) \
  .load()

In [10]:
df_json = df_raw.selectExpr('CAST(value AS STRING) as json')

In [11]:
query = df_json \
    .writeStream \
    .format("memory") \
    .queryName("df_json") \
    .start()

In [13]:
display(f"Data Available: {query.status['isDataAvailable']}")
display(spark.sql('SELECT DISTINCT json FROM df_json').show())

'Data Available: False'

+--------------------+
|                json|
+--------------------+
|{"data": "She lik...|
|{"data": "Apple i...|
|{"data": "I will ...|
|{"data": "I love ...|
|{"data": "The wea...|
|{"data": "This re...|
+--------------------+



None

In [14]:
schema = StructType([StructField('data', StringType())])

df_json.select(
    from_json(df_json.json, schema)
    .alias('raw_data')) \
  .select('raw_data.data') \
  .writeStream \
  .trigger(once=True) \
  .format("memory") \
  .queryName('exploded_json') \
  .start()

<pyspark.sql.streaming.StreamingQuery at 0x7f27fbb78640>

In [17]:
display(f"Data Available: {query.status['isDataAvailable']}")
display(spark.sql('SELECT * FROM exploded_json').show())

'Data Available: False'

+--------------------+
|                data|
+--------------------+
|    I love this pony|
|This restaurant i...|
|The weather is ba...|
|I will go to the ...|
|   She likes to swim|
|Apple is a great ...|
|    I love this pony|
|This restaurant i...|
|The weather is ba...|
|I will go to the ...|
|   She likes to swim|
|Apple is a great ...|
|    I love this pony|
|This restaurant i...|
|The weather is ba...|
|I will go to the ...|
|   She likes to swim|
|Apple is a great ...|
|    I love this pony|
|This restaurant i...|
+--------------------+
only showing top 20 rows



None

## REST NLP Sentiment

In [18]:
def apply_sentiment_analysis(data):
    import requests
    import json
    print(f"sending request in format: {json.loads(data)}")
    #result = requests.post('http://localhost:9000/predict', json=json.loads(data))
    result = requests.post('http://sentiment:9000/predict', json=json.loads(data))
    print(json.dumps(result.json()))
    return json.dumps(result.json())

vader_udf = udf(lambda data: apply_sentiment_analysis(data), StringType())

In [19]:
schema_input = StructType([StructField('data', StringType())])

schema_output = StructType(
    [StructField('neg', StringType()),
     StructField('pos', StringType()),
     StructField('neu', StringType()),
     StructField('compound', StringType())
    ])

In [20]:
(df_json.select(
    from_json(df_json.json, schema)
    .alias('raw_data'))
  .select('raw_data.data')
  .writeStream
  .trigger(once=True)
  .format("memory")
  .queryName('exploded_json')
  .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f27fbb23e50>

In [21]:
(
    df_json
    .select(
        from_json(df_json.json, schema_input)
        .alias('sentence'),
        from_json(vader_udf(df_json.json), schema_output)
        .alias('response')
    )
    .select('sentence.data', 'response.*')
    .writeStream
    .trigger(once=True)
    .format("memory")
    .queryName("input_output")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f282150cdf0>

# IAMHERE.
Issues connecting to localhost 9000. Going to serve the app on separate named docker instance for serving flask apps.

In [22]:
from IPython.display import display, clear_output


while True:
    clear_output(wait=True)
    display(f"Data Available: {query.status['isDataAvailable']}")
    display(spark.sql('SELECT * FROM input_output').show())
    sleep(1)

'Data Available: False'

+----+---+---+---+--------+
|data|neg|pos|neu|compound|
+----+---+---+---+--------+
+----+---+---+---+--------+



None

KeyboardInterrupt: 

 StructField('compound', StringType())]

    sentence#36.data AS data#45, 
    response#38.neg AS neg#41, 
    response#38.pos AS pos#42, 
    response#38.neu AS neu#43, 
    response#38.compound AS compound#44

    from_json(
        StructField(data,StringType,true), json#21, 
        Some(Etc/UTC)
        ) AS sentence#36, 
    from_json(
        StructField(neg,StringType,true), 
        StructField(pos,StringType,true), 
        StructField(neu,StringType,true), 
        StructField(compound,StringType,true), 
        <lambda>(json#21), 
        Some(Etc/UTC)) AS response#38]

    cast(value#8 as string) AS json#21

    StreamingDataSourceV2Relation 
        [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]

In [None]:
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2") \
  .option("subscribe", "topic1") \
  .load()

query = (
    df
    .writeStream
    .trigger(once=True)
    .format("memory")
    .queryName('response')
    .start())

In [None]:
display(query.status)
display(spark.sql('SELECT * FROM response').show())

In [None]:
exploded_json