In [1]:
import os
import findspark
# spark-submit treats everything after the primary resource ("pyspark-shell")
# as application arguments, so every option -- including --master -- has to
# come before it. "pyspark-shell" must therefore be the last token.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--master local[2]',
    '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.3.0',
    '--conf spark.cassandra.connection.host=127.0.0.1',
    '--jars /home/osboxes/spark-streaming-kafka-0-8-assembly_2.11-2.4.3.jar',
    'pyspark-shell',
])

# initialize spark
# findspark.init() has to run before pyspark is imported: it is the call that
# makes the Spark installation's Python package importable in this kernel.
findspark.init()
import pyspark

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark import RDD
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json

from pyspark.sql.functions import split
from pyspark.sql.types import IntegerType, LongType, DecimalType,StructType, StructField, StringType
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql import Window

from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import GBTRegressor

In [2]:
# Needs the DataStax Python driver: pip install cassandra-driver
from cassandra.cluster import Cluster

# Open a session against the Cassandra node on localhost.
cluster = Cluster(['localhost'])
session = cluster.connect()

# Schema bootstrap. Every statement carries IF NOT EXISTS.
create_keyspace_cql = "CREATE KEYSPACE IF NOT EXISTS finalproject WITH REPLICATION = {'class' : 'SimpleStrategy', 'replication_factor' : 1};"
create_news_cql = "CREATE TABLE IF NOT EXISTS finalProject.news (title text PRIMARY KEY, date timestamp, sentiment double, type text)"
create_bitcoin_cql = "CREATE TABLE IF NOT EXISTS finalProject.bitcoin (date timestamp PRIMARY KEY, price double, type text)"

session.execute(create_keyspace_cql)
session.execute(create_news_cql)
# Left as the cell's final expression so the notebook still displays the ResultSet.
session.execute(create_bitcoin_cql)

<cassandra.cluster.ResultSet at 0x7fa8d4e85a50>

In [3]:
# session.execute("DROP TABLE finalProject.news")

In [10]:
# Reuse the active SparkContext when this cell is executed again in the same
# kernel; constructing SparkContext(...) a second time raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setAppName("PythonSparkStreamingKafka"))
# Streaming context with a batch duration of 2 (seconds).
ssc = StreamingContext(sc, 2)

sqlContext = SQLContext(sc)

# One direct stream subscribed to both topics on the local broker.
kafkaStream = KafkaUtils.createDirectStream(ssc, ["news", "bitcoin"], {"metadata.broker.list":"localhost:9092"})
# v[1] is JSON-decoded into a Python object for each record.
# NOTE: despite its name, `rdd` is the mapped DStream, not an RDD; the name is
# kept because the cell that registers the batch handler refers to it.
rdd = kafkaStream.map(lambda v: json.loads(v[1]))
spark = SparkSession(sc)

In [None]:
# Reference https://stackoverflow.com/questions/50274793/spark-streaming-dstream-messages-in-json-format-to-dataframe
debug_flag = False

# Connector settings shared by every Cassandra read and write below.
_CASSANDRA_FORMAT = "org.apache.spark.sql.cassandra"
_CASSANDRA_KEYSPACE = "finalproject"
# Fixed seed so the train/test split is repeatable between batches and runs.
_SPLIT_SEED = 42


def _to_json_text(record):
    """Return ``record`` as JSON text; records that are already text pass through."""
    if isinstance(record, (bytes, type(u""))):
        return record
    return json.dumps(record)


def _select_by_type(df, columns, type_name):
    """Project ``columns`` from ``df`` and keep the rows whose ``type`` equals ``type_name``.

    Returns None when ``df`` lacks any of ``columns``. The batch schema is
    inferred from the records actually present, so a batch holding only one
    kind of message does not contain the other kind's columns.
    """
    if not set(columns).issubset(df.columns):
        return None
    return df.select(*columns).where(col("type") == type_name)


def _save_to_cassandra(df, table):
    """Append ``df`` to ``table`` in the project keyspace."""
    df.write\
        .format(_CASSANDRA_FORMAT)\
        .mode('append')\
        .options(table=table, keyspace=_CASSANDRA_KEYSPACE)\
        .save()


def _load_from_cassandra(table, columns):
    """Read ``table`` from the project keyspace and keep only ``columns``."""
    return sqlContext.read\
        .format(_CASSANDRA_FORMAT)\
        .options(table=table, keyspace=_CASSANDRA_KEYSPACE)\
        .load().select(columns)


def _is_empty(df):
    """Return True when ``df`` has no rows (fetches at most one row)."""
    return len(df.head(1)) == 0


def _make_regressors():
    """Return fresh ``(display name, estimator)`` pairs for the three models compared."""
    return [
        ("Linear Regression",
         LinearRegression(featuresCol='features', labelCol='bitcoin', maxIter=10, regParam=0.3, elasticNetParam=0.8)),
        ("Decision Tree Regression",
         DecisionTreeRegressor(featuresCol='features', labelCol='bitcoin')),
        ("Gradient-Boosted Trees Regression",
         GBTRegressor(featuresCol='features', labelCol='bitcoin', maxIter=10)),
    ]


def _evaluate_models(train_df, test_df):
    """Fit each regressor on ``train_df`` and print its metrics on ``test_df``."""
    rmse_evaluator = RegressionEvaluator(labelCol="bitcoin", predictionCol="prediction", metricName="rmse")
    r2_evaluator = RegressionEvaluator(labelCol="bitcoin", predictionCol="prediction", metricName="r2")
    for name, estimator in _make_regressors():
        predictions = estimator.fit(train_df).transform(test_df)
        print("Root Mean Squared Error (RMSE) on %s = %g" % (name, rmse_evaluator.evaluate(predictions)))
        # R2 is reported for the linear model only.
        if isinstance(estimator, LinearRegression):
            print("R Squared (R2) on %s = %g" % (name, r2_evaluator.evaluate(predictions)))


def _predict_incoming(history_df, incoming_news):
    """Fit each regressor on ``history_df`` and show its predictions for ``incoming_news``."""
    for name, estimator in _make_regressors():
        model = estimator.fit(history_df)
        if isinstance(estimator, LinearRegression):
            print("Coefficients: " + str(model.coefficients))
            print("Intercept: " + str(model.intercept))
        print(("%s prediciton" % name).center(50, '-'))
        model.transform(incoming_news).select(["date", "prediction"]).show(5)


def readMyStream(rdd):
    """Process one micro-batch: persist it, re-evaluate the models, predict for new news.

    Steps, in order:
      1. Parse the batch into a DataFrame and split it into ``news`` and
         ``bitcoin`` rows by the ``type`` column.
      2. Append both to their Cassandra tables.
      3. Read the full ``news`` and ``bitcoin`` tables back, join them on
         ``date`` and build a single-feature (``sentiment``) training frame
         with ``bitcoin`` as the label.
      4. Print RMSE (and R2 for linear regression) on a 70/30 split.
      5. Refit on all joined rows and show predictions for the news rows of
         this batch.

    Args:
        rdd: the RDD of one batch. Elements are JSON-decoded objects or
            JSON text.

    Returns:
        None. All results are printed or written to Cassandra.

    An empty batch is a no-op. A batch that contains only one kind of message,
    or a history too small to fit or split, skips the steps that cannot run
    instead of raising.
    """
    if rdd.isEmpty():
        return

    # spark.read.json expects JSON text. Handing it dicts makes PySpark
    # stringify them with str(), which yields Python repr (single quotes,
    # None/True/False) rather than JSON, so serialize explicitly.
    df = spark.read.json(rdd.map(_to_json_text))
    news = _select_by_type(df, ["title", "date", "sentiment", "type"], "news")
    bitcoin = _select_by_type(df, ["date", "price", "type"], "bitcoin")

    print("INCOMING NEW DATA".center(50, '-'))
    if news is not None:
        news.show()
    if bitcoin is not None:
        bitcoin.show()

    print("Save new data to Cassandra".center(50, '-'))
    # save new data to Cassandra
    if news is not None:
        _save_to_cassandra(news, "news")
        print("Saved news")
    if bitcoin is not None:
        _save_to_cassandra(bitcoin, "bitcoin")
        print("Saved bitcoin")

    print("Read new data from Cassandra".center(50, '-'))
    # read from Cassandra
    spark_sent = _load_from_cassandra("news", ["date", "sentiment"])
    if debug_flag:
        spark_sent.printSchema()
    print("Read news")

    spark_bitcoin = _load_from_cassandra("bitcoin", ["date", "price"]).withColumnRenamed("price", "bitcoin")
    if debug_flag:
        spark_bitcoin.printSchema()
    print("Read bitcoin")

    # Join both on date column. Joining by column name keeps a single `date`
    # column in the result; rows with a missing feature or label are dropped
    # because they cannot be assembled or used for fitting.
    sparkDF = spark_sent.join(spark_bitcoin, on="date", how="inner")\
        .dropna(subset=["sentiment", "bitcoin"])

    # Define features
    assembler = VectorAssembler().setInputCols(['sentiment']).setOutputCol('features')
    # Cached because every fit below would otherwise re-read both tables.
    DF = assembler.transform(sparkDF).cache()
    try:
        if _is_empty(DF):
            print("No joined news/bitcoin rows available yet; skipping models")
        else:
            # Split data into train/test
            train_df, test_df = DF.randomSplit([0.7, 0.3], seed=_SPLIT_SEED)

            print("Evaluating Models".center(50, '-'))
            if _is_empty(train_df) or _is_empty(test_df):
                print("Not enough rows for a train/test split; skipping evaluation")
            else:
                _evaluate_models(train_df, test_df)

            print("Output prediction of incoming news".center(50, '-'))
            ## output prediction of the day
            if news is None:
                print("No news in this batch; nothing to predict")
            else:
                if debug_flag:
                    news.printSchema()
                    sparkDF.printSchema()

                incoming_news = assembler.transform(
                    news.select(["date", "sentiment"])
                        .withColumn("sentiment", col("sentiment").cast("double"))
                        .dropna(subset=["sentiment"])
                )

                if debug_flag:
                    incoming_news.show()
                    incoming_news.printSchema()

                # Final models are fitted on every joined row, not just the training split.
                _predict_incoming(DF, incoming_news)
    finally:
        DF.unpersist()

    print("END".center(50, '-'))
# Hand the RDD of every micro-batch to readMyStream.
rdd.foreachRDD(lambda batch: readMyStream(batch))


# Start the streaming context, then block this cell until it terminates.
ssc.start()
ssc.awaitTermination()

----------------INCOMING NEW DATA-----------------
+--------------------+----------+---------+----+
|               title|      date|sentiment|type|
+--------------------+----------+---------+----+
|GOP congressman: ...|2021-10-21|  -0.9001|news|
+--------------------+----------+---------+----+

+----------+----------+-------+
|      date|     price|   type|
+----------+----------+-------+
|2021-10-21|62201.9633|bitcoin|
+----------+----------+-------+

------------Save new data to Cassandra------------
Saved news
Saved bitcoin
-----------Read new data from Cassandra-----------
Read news
Read bitcoin
----------------Evaluating Models-----------------
Root Mean Squared Error (RMSE) on Linear Regression = 2013.21
R Squared (R2) on Linear Regression = -0.0423031
Root Mean Squared Error (RMSE) on Decision Tree Regression = 2502.02
Root Mean Squared Error (RMSE) on Gradient-Boosted Trees Regression = 2440.01
--------Output prediction of incoming news--------
Coefficients: [-545.462900372963

In [9]:
# Stop the StreamingContext created earlier in this notebook.
ssc.stop()