In [1]:
# !pip install cassandra-driver

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark import SparkConf
from cassandra.cluster import Cluster

In [2]:
import os

# Move the working directory two levels up from where the kernel started and
# remember that location as the project root (BASE_DIR).
# The chdir is relative, so it is guarded: re-running this cell in the same
# kernel would otherwise climb two more directories each time and silently
# change BASE_DIR for every path derived from it.
if "BASE_DIR" not in globals():
    os.chdir(os.pardir)
    os.chdir(os.pardir)
    BASE_DIR = os.getcwd()
BASE_DIR

'/Users/yashiro/Desktop/spark_examples'

In [3]:
# Messaging endpoints: the Kafka topic and broker are consumed by the streaming
# read further down; ZOOKEEPER is not referenced by the cells visible here.
TOPIC = "topic_one"
BOOTSTRAP_SERVER, ZOOKEEPER = "localhost:9092", "localhost:2181"

# Data locations, all resolved relative to the project root in BASE_DIR.
# CHECKPOINT is the streaming checkpoint directory; OUTPUT_DIR is not
# referenced by the cells visible here.
INPUT_DIR, OUTPUT_DIR, CHECKPOINT = (
    os.path.join(BASE_DIR, *parts)
    for parts in (
        ("00_input", "data"),
        ("00_output", "data"),
        ("00_output", "checkpoint"),
    )
)

In [4]:
# os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages com.datastax.spark:spark-cassandra-connector_2.11:2.5.0 --conf spark.cassandra.connection.host=127.0.0.1 pyspark-shell"

In [5]:
# Connect the Python Cassandra driver to the node on localhost, then switch the
# session to the `dev` keyspace. The `session` object is reused later to create
# the target table.
cluster = Cluster(contact_points=['127.0.0.1'])
session = cluster.connect()
session.execute("use dev")

<cassandra.cluster.ResultSet at 0x1149136d0>

In [6]:
# No earlier cell defines `spark`, so it can only exist when the kernel itself
# provides it (e.g. a PySpark-launched kernel). Stop that session so the next
# cell builds one with this notebook's settings; on a kernel without a
# pre-created session, skip instead of failing with NameError.
if "spark" in globals():
    spark.stop()

In [7]:
# Build (or fetch) the local Spark session used by all later cells. The
# connector setting points Spark at the Cassandra node on 127.0.0.1.
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName("Cassandra-spark-example")
    .config("spark.cassandra.connection.host", "127.0.0.1")
    .getOrCreate()
)

In [8]:
# Batch-read the sample tweets file so Spark infers the JSON schema; that
# schema is reused later to parse the Kafka message payloads.
# Note: `header` / `inferSchema` are CSV-reader options that the JSON reader
# does not use, so they are intentionally not set here.
sample_tweets_df = spark.read.json(f"file://{INPUT_DIR}/tweets.json")

tweets_schema = sample_tweets_df.schema
# Peek at the first few top-level field names as a sanity check.
tweets_schema.fieldNames()[:5]

['contributors', 'coordinates', 'created_at', 'display_text_range', 'entities']

In [9]:
# Project the sample tweets down to the columns stored in Cassandra: the nested
# user fields are pulled up to top level and `created_at` is parsed from its
# string form into a timestamp. printSchema() shows the resulting layout.
output_df = sample_tweets_df.select(
    "id",
    "user.name",
    "user.screen_name",
    "text",
    "lang",
    "source",
    to_timestamp("created_at", "EEE MMM dd HH:mm:ss zzzz yyyy").alias("created_at"),
)
output_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- text: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- source: string (nullable = true)
 |-- created_at: timestamp (nullable = true)



In [10]:
# DDL for the sink table: one row per tweet keyed by its id, with the same
# seven columns produced by the projection above. IF NOT EXISTS makes the
# statement safe to run again.
create_tweets_table_cql = """
CREATE TABLE IF NOT EXISTS dev.tweets (
    id BIGINT PRIMARY KEY, 
    name TEXT,
    screen_name TEXT,  
    text TEXT,
    lang TEXT, 
    source TEXT, 
    created_at TIMESTAMP)
"""
session.execute(create_tweets_table_cql)

<cassandra.cluster.ResultSet at 0x11490fed0>

In [11]:
 # Streaming source: subscribe to the configured Kafka topic on the configured
 # broker. Each row carries the raw message; the payload is decoded next.
 stream_df = (
     spark.readStream
     .format("kafka")
     .option("kafka.bootstrap.servers", BOOTSTRAP_SERVER)
     .option("subscribe", TOPIC)
     .load()
 )

In [12]:
# Decode each Kafka `value` as a string, parse it as JSON with the schema
# inferred from the sample file, expand the parsed struct into columns, and
# keep the same seven columns as the Cassandra table.
# The parsed struct is given an explicit alias: selecting it through Spark's
# auto-generated column name ("jsontostructs(CAST(value AS STRING))") depends
# on an internal naming detail that is not stable across Spark versions.
tweets_df = (
    stream_df
    .select(from_json(col("value").cast("string"), tweets_schema).alias("tweet"))
    .select("tweet.*")
    .select(
        "id",
        "user.name",
        "user.screen_name",
        "text",
        "lang",
        "source",
        to_timestamp("created_at", "EEE MMM dd HH:mm:ss zzzz yyyy").alias("created_at"),
    )
)

In [13]:
# Start the streaming query that writes the parsed tweets into Cassandra's
# dev.tweets table through the Spark Cassandra data source, using CHECKPOINT
# as the checkpoint location. `streamer` is the handle to the running query.
streamer = (
    tweets_df.writeStream
    .format("org.apache.spark.sql.cassandra")
    .options(keyspace="dev", table="tweets")
    .option("checkpointLocation", CHECKPOINT)
    .start()
)

In [15]:
# Block until the streaming query ends. In a notebook the wait is normally
# ended by interrupting the kernel, so KeyboardInterrupt is treated as the
# stop signal rather than an error. The query is always stopped afterwards;
# otherwise it could keep running and writing to Cassandra while the cells
# below read the table back.
try:
    streamer.awaitTermination()
except KeyboardInterrupt:
    pass  # expected way to end the wait interactively
finally:
    streamer.stop()

In [16]:
# Batch-read the dev.tweets table back from Cassandra to verify what the
# streaming query wrote.
cass_data = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .option("keyspace", "dev")
    .option("table", "tweets")
    .load()
)

In [17]:
# Show the schema of the DataFrame loaded from the dev.tweets table.
cass_data.printSchema()

root
 |-- id: long (nullable = false)
 |-- created_at: timestamp (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)



In [18]:
# Number of rows read back from the dev.tweets table.
cass_data.count()

589