In [1]:
import pyspark
print(pyspark.__version__)

import logging

from cassandra.cluster import Cluster
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

3.3.0


In [2]:
def create_spark_connection():
    s_conn = None
    
    try:
        s_conn = SparkSession \
            .builder \
            .appName("SparkDataStreaming") \
            .config("spark.streaming.stopGracefullyOnShutdown", True) \
            .config('spark.jars.packages', "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,"
                                        "com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.3.0") \
            .config("spark.sql.shuffle.partitions", 4) \
            .config('spark.cassandra.connection.host', 'host.docker.internal') \
            .getOrCreate()
                
        s_conn.sparkContext.setLogLevel("ERROR")
        logging.info("Spark connection created successfully!")
    except Exception as e:
        logging.error(f"Couldn't create the spark session due to exception {e}")
    
    return s_conn

In [3]:
spark_conn = create_spark_connection()

In [4]:
def connect_to_kafka(spark_conn):
    spark_df = None
    try:
        spark_df = spark_conn.read \
            .format('kafka') \
            .option('kafka.bootstrap.servers', 'kafka:9092') \
            .option('subscribe', 'user_created') \
            .option('startingOffsets', 'earliest') \
            .load()
        logging.info("kafka dataframe created successfully")
    except Exception as e:
        logging.warning(f"kafka dataframe could not be created because: {e}")

    return spark_df

In [5]:
if spark_conn is not None:
    # connect to kafka with spark connection
    spark_df = connect_to_kafka(spark_conn)

In [None]:
spark_df.show(5)

In [33]:
def create_selection_df_from_kafka(spark_df):
    schema = StructType([
        StructField("id", StringType(), True),
        StructField("first_name", StringType(), False),
        StructField("last_name", StringType(), False),
        StructField("gender", StringType(), False),
        StructField("address", StringType(), False),
        StructField("post_code", StringType(), False),
        StructField("email", StringType(), False),
        StructField("username", StringType(), False),
        StructField("dob", StringType(), False),
        StructField("registered_date", StringType(), False),
        StructField("phone", StringType(), False),
        StructField("picture", StringType(), False)
    ])

    sel = spark_df.selectExpr("CAST(value AS STRING)") \
        .select(from_json(col('value'), schema).alias('data')).select("data.*")
    print(sel)

    return sel

In [34]:
selection_df = create_selection_df_from_kafka(spark_df)

DataFrame[id: string, first_name: string, last_name: string, gender: string, address: string, post_code: string, email: string, username: string, dob: string, registered_date: string, phone: string, picture: string]


In [35]:
selection_df.show()

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka

In [36]:
def create_cassandra_connection():
    try:
        # connecting to the cassandra cluster
        cluster = Cluster(['host.docker.internal'])

        cas_session = cluster.connect()

        return cas_session
    except Exception as e:
        logging.error(f"Could not create cassandra connection due to {e}")
        return None

In [37]:
session = create_cassandra_connection()



In [38]:
def create_table(session):
    session.execute("""
    CREATE TABLE IF NOT EXISTS spark_streams.created_users (
        id UUID PRIMARY KEY,
        first_name TEXT,
        last_name TEXT,
        gender TEXT,
        address TEXT,
        post_code TEXT,
        email TEXT,
        username TEXT,
        dob TEXT,
        registered_date TEXT,
        phone TEXT,
        picture TEXT);
    """)

    print("Table created successfully!")

In [39]:
if session is not None:
    create_keyspace(session)
    create_table(session)

Keyspace created successfully!
Table created successfully!


In [40]:
def insert_data(session, **kwargs):
    print("inserting data...")

    user_id = kwargs.get('id')
    first_name = kwargs.get('first_name')
    last_name = kwargs.get('last_name')
    gender = kwargs.get('gender')
    address = kwargs.get('address')
    postcode = kwargs.get('post_code')
    email = kwargs.get('email')
    username = kwargs.get('username')
    dob = kwargs.get('dob')
    registered_date = kwargs.get('registered_date')
    phone = kwargs.get('phone')
    picture = kwargs.get('picture')

    try:
        session.execute("""
            INSERT INTO spark_streams.created_users(id, first_name, last_name, gender, address, 
                post_code, email, username, dob, registered_date, phone, picture)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, (user_id, first_name, last_name, gender, address,
              postcode, email, username, dob, registered_date, phone, picture))
        logging.info(f"Data inserted for {first_name} {last_name}")

    except Exception as e:
        logging.error(f'could not insert data due to {e}')

In [41]:
insert_data(session)

ERROR:root:could not insert data due to Error from server: code=2200 [Invalid query] message="Invalid null value in condition for column id"


inserting data...


In [43]:
# logging.info("Streaming is being started...")

# streaming_query = selection_df.writeStream.format("org.apache.spark.sql.cassandra")\
#                    .option('checkpointLocation', '/tmp/checkpoint')\
#                    .option('keyspace', 'spark_streams')\
#                    .option('table', 'created_users')\
#                    .start()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 