In [132]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col,  explode, split, concat, col, lit, from_json, max
from pyspark.sql.types import StructType, StructField, LongType, StringType, IntegerType, ArrayType, DoubleType
from time import sleep

### First we define the schema for our data
dataSchemaString = StructType([
    StructField("game_id", StringType(), True),
    StructField("player_id", StringType(), True),
    StructField("team_id", StringType(), True),
    StructField("player_name", StringType(), True),
    StructField("team_abbreviation", StringType(), True),
    StructField("min", StringType(), True),
    StructField("ast", IntegerType(), True),
    StructField("stl", IntegerType(), True),
    StructField("pf", IntegerType(), True)
])

### create a function that add the NBA results
def write_nba_results(df, batch_id):
    print('posted')
    print(batch_id)
    df.show()
    df \
      .write.format('bigquery') \
      .option('table', 'dataengineeringcourse2023.Output_processing_pipeline.new_nba_results') \
      .mode("overwrite") \
      .save()


In [133]:
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Streaming_pipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "temp_de2023_2124849"  
spark.conf.set('temporaryGcsBucket', bucket)

## so here is the first step of the stream preprocessing pipeline, we read the data which is produced by producer (in our case our laptop )
# Read the whole dataset as a batch
kafkaStream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("failOnDataLoss", "false") \
    .option("subscribe", "games_details") \
    .option("startingOffsets", "latest") \
    .load()

df = kafkaStream.selectExpr("CAST(value AS STRING)")

df1 = df.select(from_json(df.value, dataSchemaString.simpleString()))
print('Df1 schema')
df1.printSchema()

sdf = df1.select(col("from_json(value).*"))

print("Sdf schema")

sdf.printSchema()

# Group by player_id and find the maximum points scored
result = sdf.groupBy("player_id").agg(max("pf").alias("max_points"))

# Order the result in descending order based on max_points
result_ordered = result.orderBy(col("max_points").desc())

query = result_ordered \
    .writeStream.outputMode("complete").trigger(processingTime = '5 seconds') \
    .foreachBatch(write_nba_results) \
    .start()

activityCounts.printSchema()

try:
    query.awaitTermination
except KeyboardInterrupt:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")
except:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Unexpected error")
    print("Stoped the streaming query and the spark context")



Df1 schema
root
 |-- from_json(value): struct (nullable = true)
 |    |-- game_id: string (nullable = true)
 |    |-- player_id: string (nullable = true)
 |    |-- team_id: string (nullable = true)
 |    |-- player_name: string (nullable = true)
 |    |-- team_abbreviation: string (nullable = true)
 |    |-- min: string (nullable = true)
 |    |-- ast: integer (nullable = true)
 |    |-- stl: integer (nullable = true)
 |    |-- pf: integer (nullable = true)

Sdf schema
root
 |-- game_id: string (nullable = true)
 |-- player_id: string (nullable = true)
 |-- team_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- team_abbreviation: string (nullable = true)
 |-- min: string (nullable = true)
 |-- ast: integer (nullable = true)
 |-- stl: integer (nullable = true)
 |-- pf: integer (nullable = true)

root
 |-- game_id: string (nullable = true)
 |-- player_id: string (nullable = true)
 |-- team_id: string (nullable = true)
 |-- player_name: string (nullable = true)


In [115]:
spark.stop()