In [1]:
import pyspark
print(pyspark.__version__)

import logging

import pandas as pd
from cassandra.cluster import Cluster
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BinaryType, LongType, IntegerType

3.3.0


## Build spark

In [2]:
s_conn = SparkSession \
    .builder \
    .appName("SparkDataStreaming") \
    .config("spark.streaming.stopGracefullyOnShutdown", True) \
    .config('spark.jars.packages', "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,"
                                "com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.3.0") \
    .config("spark.sql.shuffle.partitions", 4) \
    .config('spark.cassandra.connection.host', 'host.docker.internal') \
    .getOrCreate()

## Connect to kafka

In [3]:
spark_df = s_conn.readStream \
    .format('kafka') \
    .option('kafka.bootstrap.servers', 'kafka:29092') \
    .option('subscribe', 'user_created') \
    .option('startingOffsets', 'earliest') \
    .load()

In [4]:
# Change 'readStream' to 'read' to read kafka
# spark_df = s_conn.read\
#     .format('kafka') \
#     .option('kafka.bootstrap.servers', 'kafka:29092') \
#     .option('subscribe', 'user_created') \
#     .option('startingOffsets', 'earliest') \
#     .load()

# spark_df.printSchema()
# spark_df.show()

## Create spark dataframe from kafka

In [5]:
schema = StructType([
    StructField("id", StringType(), True),
    StructField("first_name", StringType(), False),
    StructField("last_name", StringType(), False),
    StructField("gender", StringType(), False),
    StructField("address", StringType(), False),
    StructField("post_code", StringType(), False),
    StructField("email", StringType(), False),
    StructField("username", StringType(), False),
    StructField("dob", TimestampType(), False),
    StructField("registered_date", TimestampType(), False),
    StructField("phone", StringType(), False),
    StructField("picture", StringType(), False)
])

sel = spark_df.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col('value'), schema).alias('data')).select("data.*")

## Create cassandra connection

In [6]:
cluster = Cluster(['host.docker.internal'])
cas_session = cluster.connect()
cas_session

<cassandra.cluster.Session at 0x7f4684f8dd20>

### Create keyspace

In [7]:
cas_session.execute("""
    CREATE KEYSPACE IF NOT EXISTS spark_streams
    WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
""")

<cassandra.cluster.ResultSet at 0x7f46857c3be0>

### Create table

In [8]:
cas_session.execute("""
    CREATE TABLE IF NOT EXISTS spark_streams.created_users (
        id UUID PRIMARY KEY,
        first_name TEXT,
        last_name TEXT,
        gender TEXT,
        address TEXT,
        post_code TEXT,
        email TEXT,
        username TEXT,
        dob TIMESTAMP,
        registered_date TIMESTAMP,
        phone TEXT,
        picture TEXT);
""")

<cassandra.cluster.ResultSet at 0x7f46857c2e90>

### Insert_data to cassandra

In [9]:
streaming_query = (sel.writeStream.format("org.apache.spark.sql.cassandra")
                   .option('checkpointLocation', '/tmp/checkpoint')
                   .option('keyspace', 'spark_streams')
                   .option('table', 'created_users')
                   .start())

streaming_query.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

## Read cassandra spark dataframe

In [10]:
cassandra_options = {
    "keyspace": "spark_streams",
    "table": "created_users"
}

cass_df = s_conn.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(**cassandra_options) \
    .load()

cass_df.limit(5).toPandas()

Unnamed: 0,id,address,dob,email,first_name,gender,last_name,phone,picture,post_code,registered_date,username
0,7c50630e-82d6-11ee-8fcc-0242ac120009,"8880 Peatonal Arreola, San Pedro Lagunillas, Q...",1979-05-23T00:41:50.154Z,lourdes.olmos@example.com,Lourdes,female,Olmos,(622) 157 8755,https://randomuser.me/api/portraits/med/women/...,83277,2003-04-16T21:29:47.084Z,angryelephant289
1,8e2f90c2-82d6-11ee-8fcc-0242ac120009,"2202 Dane St, Sacramento, Idaho, United States",1995-06-07T00:55:34.002Z,gabriel.elliott@example.com,Gabriel,male,Elliott,(363) 918-2070,https://randomuser.me/api/portraits/med/men/99...,17091,2016-02-14T17:42:51.301Z,bigfish863
2,78ddad1c-82d6-11ee-8fcc-0242ac120009,"2345 Fontainhas, Bidhannagar, Dadra and Nagar ...",1965-02-17T16:38:22.121Z,gunbir.kumar@example.com,Gunbir,male,Kumar,7069860700,https://randomuser.me/api/portraits/med/men/90...,82551,2010-11-28T01:53:14.494Z,whitemeercat278
3,1cb4bf98-82d7-11ee-990e-0242ac120009,"8366 Berislavske shose, Novoukrayinka, Zakarpa...",1962-04-08T11:09:28.170Z,bilovid.logvinenko@example.com,Bilovid,male,Logvinenko,(066) B41-0950,https://randomuser.me/api/portraits/med/men/90...,33400,2011-04-10T05:20:03.503Z,lazybutterfly927
4,13ab9f02-82d7-11ee-990e-0242ac120009,"5996 E Sandy Lake Rd, Sacramento, Kentucky, Un...",1964-09-04T14:36:18.944Z,eric.reyes@example.com,Eric,male,Reyes,(557) 478-5372,https://randomuser.me/api/portraits/med/men/97...,74467,2013-06-13T23:02:18.671Z,purpleelephant689


In [None]:
cass_df_time = cass_df\
.select('id')\
.