In [234]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col,  explode, split, concat, col, lit, from_json, max
from pyspark.sql.types import StructType, StructField, LongType, StringType, IntegerType, ArrayType, DoubleType
from time import sleep

### Schema for one player's record in one game.
# All identifier/name fields and "min" are declared as strings; the three
# counting stats (ast, stl, pf) are integers. Every field is nullable.
# NOTE(review): this schema is not referenced by any other cell visible here.
dataSchemaString = (
    StructType()
    .add("game_id", StringType(), True)
    .add("player_id", StringType(), True)
    .add("team_id", StringType(), True)
    .add("player_name", StringType(), True)
    .add("team_abbreviation", StringType(), True)
    .add("min", StringType(), True)
    .add("ast", IntegerType(), True)
    .add("stl", IntegerType(), True)
    .add("pf", IntegerType(), True)
)

In [235]:
# Spark application settings: standalone master URL, application name and
# the memory/core limits for the driver and executors.
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("batch_pipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# Create the Spark session, which is the entry point to the Spark SQL engine.
# This is done exactly once: a second getOrCreate() with the same conf would
# only hand back this same session.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes on the Hadoop
# configuration so that gs:// paths can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "data_2023de_2"
spark.conf.set('temporaryGcsBucket', bucket)

In [236]:
# Read a BigQuery table into a DataFrame through the "bigquery" data source.
# The table reference has the form project_id.dataset.table_name; replace the
# project id with your own.
df = (
    spark.read
    .format("bigquery")
    .load("de-2023-399810.labdataset.bestTeamsByConference")
)

# Inspect the column types and preview the first four rows.
df.printSchema()
df.show(4)

root
 |-- TEAM_ID: string (nullable = true)
 |-- NICKNAME: string (nullable = true)
 |-- CONFERENCE: string (nullable = true)
 |-- total_points_by_team: double (nullable = true)
 |-- denseRankTotalPoints: long (nullable = false)
 |-- LEAGUE_ID: string (nullable = true)
 |-- MIN_YEAR: string (nullable = true)
 |-- MAX_YEAR: string (nullable = true)
 |-- ABBREVIATION: string (nullable = true)
 |-- HEADCOACH: string (nullable = true)

+----------+---------+----------+--------------------+--------------------+---------+--------+--------+------------+--------------+
|   TEAM_ID| NICKNAME|CONFERENCE|total_points_by_team|denseRankTotalPoints|LEAGUE_ID|MIN_YEAR|MAX_YEAR|ABBREVIATION|     HEADCOACH|
+----------+---------+----------+--------------------+--------------------+---------+--------+--------+------------+--------------+
|1610612738|  Celtics|      East|            191453.0|                   1|       00|    1946|    2019|         BOS|  Brad Stevens|
|1610612748|     Heat|      East|   

In [237]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import concat, lit

# Spark application settings: standalone master URL, application name and
# the memory/core limits for the driver and executors.
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Constructors_pipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
# Create the Spark session, which is the entry point to the Spark SQL engine.
# NOTE(review): getOrCreate() returns an already-active session when one
# exists, so these settings may not take effect if an earlier cell already
# started a session -- confirm against the intended run order.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Set up the Hadoop filesystem configuration for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")


def _load_csv(file_name):
    """Read one CSV file from the data bucket into a DataFrame.

    The first row of the file is used as the header. No schema is inferred or
    supplied, so every column is loaded as a string.

    Args:
        file_name: Name of the object inside gs://data_2023de_2/.

    Returns:
        The DataFrame produced by the Spark CSV reader.
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .load(f"gs://data_2023de_2/{file_name}")
    )


# Load the games, game details, players, ranking and teams tables into DataFrames.
gamesDF = _load_csv("games.csv")
gamesDetailsDF = _load_csv("games_details.csv")
playersDF = _load_csv("players.csv")
rankingDF = _load_csv("ranking.csv")
teamsDF = _load_csv("teams.csv")

# Preview the first five rows of each table, in the same order as loaded.
for frame in (gamesDF, gamesDetailsDF, playersDF, rankingDF, teamsDF):
    frame.show(5)

+-------------+--------+----------------+------------+---------------+------+------------+--------+-----------+-----------+------------+--------+--------+------------+--------+-----------+-----------+------------+--------+--------+--------------+
|GAME_DATE_EST| GAME_ID|GAME_STATUS_TEXT|HOME_TEAM_ID|VISITOR_TEAM_ID|SEASON|TEAM_ID_home|PTS_home|FG_PCT_home|FT_PCT_home|FG3_PCT_home|AST_home|REB_home|TEAM_ID_away|PTS_away|FG_PCT_away|FT_PCT_away|FG3_PCT_away|AST_away|REB_away|HOME_TEAM_WINS|
+-------------+--------+----------------+------------+---------------+------+------------+--------+-----------+-----------+------------+--------+--------+------------+--------+-----------+-----------+------------+--------+--------+--------------+
|   2022-12-22|22200477|           Final|  1610612740|     1610612759|  2022|  1610612740|     126|      0.484|      0.926|       0.382|      25|      46|  1610612759|     117|      0.478|      0.815|       0.321|      23|      44|             1|
|   2022-12-

In [238]:
# Print the schema of every loaded DataFrame under its own heading.
# Headings after the first start with a newline so that the schemas are
# separated by a blank line in the output.
labelled_frames = [
    ("Schema of gamesDF:", gamesDF),
    ("\nSchema of gamesDetailsDF:", gamesDetailsDF),
    ("\nSchema of playersDF:", playersDF),
    ("\nSchema of rankingDF:", rankingDF),
    ("\nSchema of teamsDF:", teamsDF),
]

for heading, frame in labelled_frames:
    print(heading)
    frame.printSchema()


Schema of gamesDF:
root
 |-- GAME_DATE_EST: string (nullable = true)
 |-- GAME_ID: string (nullable = true)
 |-- GAME_STATUS_TEXT: string (nullable = true)
 |-- HOME_TEAM_ID: string (nullable = true)
 |-- VISITOR_TEAM_ID: string (nullable = true)
 |-- SEASON: string (nullable = true)
 |-- TEAM_ID_home: string (nullable = true)
 |-- PTS_home: string (nullable = true)
 |-- FG_PCT_home: string (nullable = true)
 |-- FT_PCT_home: string (nullable = true)
 |-- FG3_PCT_home: string (nullable = true)
 |-- AST_home: string (nullable = true)
 |-- REB_home: string (nullable = true)
 |-- TEAM_ID_away: string (nullable = true)
 |-- PTS_away: string (nullable = true)
 |-- FG_PCT_away: string (nullable = true)
 |-- FT_PCT_away: string (nullable = true)
 |-- FG3_PCT_away: string (nullable = true)
 |-- AST_away: string (nullable = true)
 |-- REB_away: string (nullable = true)
 |-- HOME_TEAM_WINS: string (nullable = true)


Schema of gamesDetailsDF:
root
 |-- GAME_ID: string (nullable = true)
 |-- TEAM

In [248]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the Spark session for this pipeline.
spark = SparkSession.builder.appName("NBAPipeline").getOrCreate()

# Read the raw games and teams CSV files; the first row of each is the header.
gamesDF = (
    spark.read.format("csv")
    .option("header", "true")
    .load("gs://data_2023de_2/games.csv")
)
teamsDF = (
    spark.read.format("csv")
    .option("header", "true")
    .load("gs://data_2023de_2/teams.csv")
)

# Keep only the columns the ranking needs. PTS_home arrives as a string from
# the CSV reader, so it is cast to float in the same projection.
gamesDF_filtered = gamesDF.select(
    "GAME_ID",
    "TEAM_ID_home",
    F.col("PTS_home").cast("float").alias("PTS_home"),
    "SEASON",
)
teamsDF_filtered = teamsDF.select("TEAM_ID", "ABBREVIATION")

# Sum the points each team scored in its home games, per season.
total_points = (
    gamesDF_filtered
    .groupBy("TEAM_ID_home", "SEASON")
    .agg(F.sum("PTS_home").alias("total_points"))
)

# Within each season, order teams from most to fewest home points and rank
# them. rank() gives tied teams the same rank.
window_spec = Window.partitionBy("SEASON").orderBy(F.col("total_points").desc())
ranked_teams = total_points.withColumn("rank", F.rank().over(window_spec))

# Keep the teams ranked 1-3 in every season.
top_3_teams = ranked_teams.where(F.col("rank") <= 3)

# Attach each team's abbreviation by matching the home team id to TEAM_ID.
result_df = top_3_teams.join(
    teamsDF_filtered,
    top_3_teams["TEAM_ID_home"] == teamsDF_filtered["TEAM_ID"],
)

# Show the result.
print("Top 3 performing teams each season based on total points")
result_df.show()

# The session is deliberately left running: the following cell still uses
# `spark` and `result_df` to write the result to BigQuery.


Top 3 performing teams each season based on total points
+------------+------+------------+----+----------+------------+
|TEAM_ID_home|SEASON|total_points|rank|   TEAM_ID|ABBREVIATION|
+------------+------+------------+----+----------+------------+
|  1610612747|  2003|      5086.0|   1|1610612747|         LAL|
|  1610612758|  2003|      5057.0|   2|1610612758|         SAC|
|  1610612765|  2003|      5016.0|   3|1610612765|         DET|
|  1610612756|  2004|      5554.0|   1|1610612756|         PHX|
|  1610612759|  2004|      5327.0|   2|1610612759|         SAS|
|  1610612742|  2004|      5251.0|   3|1610612742|         DAL|
|  1610612756|  2005|      6101.0|   1|1610612756|         PHX|
|  1610612748|  2005|      5845.0|   2|1610612748|         MIA|
|  1610612742|  2005|      5642.0|   3|1610612742|         DAL|
|  1610612759|  2006|      5504.0|   1|1610612759|         SAS|
|  1610612756|  2006|      5434.0|   2|1610612756|         PHX|
|  1610612747|  2006|      5337.0|   3|16106127

In [249]:
# The BigQuery connector uses this Cloud Storage bucket for temporary data.
bucket = "data_2023de_2"
spark.conf.set('temporaryGcsBucket', bucket)

# Write the result to the BigQuery table labdataset.teamstatistics.
# "overwrite" mode replaces any existing contents of that table.
(
    result_df.write
    .format('bigquery')
    .option('table', 'labdataset.teamstatistics')
    .mode("overwrite")
    .save()
)