In [1]:
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder, VectorAssembler}
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.sql.expressions.Window

// Create (or reuse) the SparkSession for this notebook, then import its implicits
// so `toDF`, `$"col"` and the tuple encoders used in later cells are in scope.
val spark = SparkSession
  .builder()
  .appName("SPL Analysis")
  .getOrCreate()
import spark.implicits._

lastException = null
spark = org.apache.spark.sql.SparkSession@26770d9b


org.apache.spark.sql.SparkSession@26770d9b

In [2]:
// Load the transformed match data: CSV with a header row, column types inferred.
val df = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("data/data_for_model.csv")

df = [season_id: int, season_label: string ... 8 more fields]


[season_id: int, season_label: string ... 8 more fields]

In [3]:
// Write a parquet copy of the data so it can be edited later.
df.write
  .format("parquet")
  .mode("overwrite")
  .save("data/model_master")

In [4]:
// Current league standings, one row per team:
// (team, points, goals for, goals against).
val currentTable = List(
  ("Lion City Sailors",  15, 22, 2),
  ("Geylang Int.",       10, 7,  4),
  ("Tampines Rovers",    8,  8,  4),
  ("Balestier Khalsa",   8,  9,  12),
  ("Albirex Niigata",    7,  8,  5),
  ("Hougang Utd",        3,  4,  10),
  ("Tanjong Pagar Utd.", 3,  7,  15),
  ("Young Lions",        0,  4,  17)
).toDF("team", "pts", "gf", "ga")
currentTable.show(false)

currentTable = [team: string, pts: int ... 2 more fields]


+------------------+---+---+---+
|team              |pts|gf |ga |
+------------------+---+---+---+
|Lion City Sailors |15 |22 |2  |
|Geylang Int.      |10 |7  |4  |
|Tampines Rovers   |8  |8  |4  |
|Balestier Khalsa  |8  |9  |12 |
|Albirex Niigata   |7  |8  |5  |
|Hougang Utd       |3  |4  |10 |
|Tanjong Pagar Utd.|3  |7  |15 |
|Young Lions       |0  |4  |17 |
+------------------+---+---+---+



[team: string, pts: int ... 2 more fields]

In [5]:
//stream to update current table
// NOTE: Avoid case class encoders in notebooks/REPL (can cause "Unable to generate an encoder for inner class...").
// Use an explicit StructType schema instead.
import org.apache.spark.sql.types._

// Explicit schema for incoming match-event JSON; every field is nullable.
val eventSchema = new StructType()
  .add("event_id", LongType, nullable = true)
  .add("season_year", StringType, nullable = true)
  .add("home_team", StringType, nullable = true)
  .add("away_team", StringType, nullable = true)
  .add("home_score", LongType, nullable = true)
  .add("away_score", LongType, nullable = true)
  .add("status_code", LongType, nullable = true)
  .add("start_timestamp", LongType, nullable = true)

eventSchema = StructType(StructField(event_id,LongType,true),StructField(season_year,StringType,true),StructField(home_team,StringType,true),StructField(away_team,StringType,true),StructField(home_score,LongType,true),StructField(away_score,LongType,true),StructField(status_code,LongType,true),StructField(start_timestamp,LongType,true))


StructType(StructField(event_id,LongType,true),StructField(season_year,StringType,true),StructField(home_team,StringType,true),StructField(away_team,StringType,true),StructField(home_score,LongType,true),StructField(away_score,LongType,true),StructField(status_code,LongType,true),StructField(start_timestamp,LongType,true))

In [6]:
// Streaming source: multi-line JSON event files dropped into data/stream_in/event.
val eventStream = spark.readStream
  .format("json")
  .schema(eventSchema)
  .option("multiLine", "true")
  .load("data/stream_in/event")

eventStream = [event_id: bigint, season_year: string ... 6 more fields]


[event_id: bigint, season_year: string ... 6 more fields]

In [7]:
// One match affects two teams on the table, so every event is expanded into two
// rows (home side, away side) carrying that team's points / goals-for / goals-against deltas.
val perMatchTeamDelta = {
  val homeGoals = col("home_score")
  val awayGoals = col("away_score")

  // Points for the side that scored `own` against `opp`: 3 for a win, 1 for a draw, else 0.
  def pointsFor(
    own: org.apache.spark.sql.Column,
    opp: org.apache.spark.sql.Column
  ): org.apache.spark.sql.Column =
    when(own > opp, lit(3))
      .when(own === opp, lit(1))
      .otherwise(lit(0))

  // Home team's view of the match.
  val homeRow = struct(
    col("home_team").alias("team"),
    pointsFor(homeGoals, awayGoals).alias("pts_delta"),
    homeGoals.alias("gf_delta"),
    awayGoals.alias("ga_delta")
  )

  // Away team's view of the same match (goals for/against swapped).
  val awayRow = struct(
    col("away_team").alias("team"),
    pointsFor(awayGoals, homeGoals).alias("pts_delta"),
    awayGoals.alias("gf_delta"),
    homeGoals.alias("ga_delta")
  )

  eventStream
    .select(col("event_id"), explode(array(homeRow, awayRow)).alias("row"))
    .select(col("event_id"), col("row.*"))
}

perMatchTeamDelta = [event_id: bigint, team: string ... 3 more fields]


[event_id: bigint, team: string ... 3 more fields]

In [8]:
// Batch-level aggregation of per-match deltas.
// NOTE: this runs inside foreachBatch, so no streaming state/checkpoint is required.
/** Sums the points / goals-for / goals-against deltas of one micro-batch per team. */
def aggregateDelta(perMatchTeamDeltaBatch: DataFrame): DataFrame = {
  val totals = Seq(
    sum("pts_delta").as("pts_add"),
    sum("gf_delta").as("gf_add"),
    sum("ga_delta").as("ga_add")
  )
  perMatchTeamDeltaBatch
    .groupBy("team")
    .agg(totals.head, totals.tail: _*)
}

aggregateDelta: (perMatchTeamDeltaBatch: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [9]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

// NO CHECKPOINTING (demo mode): do all aggregation/update per micro-batch.
//
// For every micro-batch: aggregate the per-team deltas, add them to the latest league
// table (the `current_table_live` view if it exists, otherwise the seed `currentTable`),
// and publish the result back as `current_table_live`.
val q = perMatchTeamDelta.writeStream
  .outputMode("append")
  .foreachBatch { (batchRows: DataFrame, batchId: Long) =>

    val batchDelta = aggregateDelta(batchRows)

    val baseTable =
      if (spark.catalog.tableExists("current_table_live")) spark.table("current_table_live")
      else currentTable

    val updated =
      baseTable.as("c")
        .join(batchDelta.as("d"), Seq("team"), "left")
        .select(
          col("team"),
          (col("c.pts") + coalesce(col("d.pts_add"), lit(0))).alias("pts"),
          (col("c.gf")  + coalesce(col("d.gf_add"),  lit(0))).alias("gf"),
          (col("c.ga")  + coalesce(col("d.ga_add"),  lit(0))).alias("ga")
        )
        .orderBy(desc("pts"), (col("gf") - col("ga")).desc, desc("gf"))

    // Materialise the (tiny) table before publishing it. Registering the lazy `updated`
    // plan would chain the view onto this micro-batch and onto every previous view, so
    // each later read would recompute all earlier batches and the plan would grow forever.
    val snapshot = spark.createDataFrame(
      java.util.Arrays.asList(updated.collect(): _*),
      updated.schema
    )

    snapshot.createOrReplaceTempView("current_table_live")
    println(s"=== batchId=$batchId ===")
    snapshot.show(50, truncate = false)
  }
  .start()

q = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@2cc335aa


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@2cc335aa

=== batchId=0 ===
+------------------+---+---+---+
|team              |pts|gf |ga |
+------------------+---+---+---+
|Lion City Sailors |27 |30 |2  |
|Tampines Rovers   |16 |20 |16 |
|Hougang Utd       |15 |8  |10 |
|Tanjong Pagar Utd.|15 |19 |23 |
|Geylang Int.      |14 |15 |20 |
|Albirex Niigata   |11 |12 |9  |
|Balestier Khalsa  |8  |9  |16 |
|Young Lions       |0  |12 |29 |
+------------------+---+---+---+

=== batchId=1 ===
+------------------+---+---+---+
|team              |pts|gf |ga |
+------------------+---+---+---+
|Lion City Sailors |30 |32 |2  |
|Tampines Rovers   |16 |20 |16 |
|Hougang Utd       |15 |8  |10 |
|Tanjong Pagar Utd.|15 |19 |23 |
|Geylang Int.      |14 |15 |22 |
|Albirex Niigata   |11 |12 |9  |
|Balestier Khalsa  |8  |9  |16 |
|Young Lions       |0  |12 |29 |
+------------------+---+---+---+

=== batchId=2 ===
+------------------+---+---+---+
|team              |pts|gf |ga |
+------------------+---+---+---+
|Lion City Sailors |30 |32 |2  |
|Tampines Rovers   |

In [10]:
// Stats (xG) streaming source.
// NOTE: explicit schema instead of case-class encoders, which are fragile in notebooks/REPL.
val statsSchema = new StructType()
  .add("event_id", LongType, nullable = true)
  .add("home_xg", DoubleType, nullable = true)
  .add("away_xg", DoubleType, nullable = true)

val statsStream = spark.readStream
  .format("json")
  .schema(statsSchema)
  .option("multiLine", "true")
  .load("data/stream_in/stats")

statsSchema = StructType(StructField(event_id,LongType,true),StructField(home_xg,DoubleType,true),StructField(away_xg,DoubleType,true))
statsStream = [event_id: bigint, home_xg: double ... 1 more field]


[event_id: bigint, home_xg: double ... 1 more field]

In [11]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.hadoop.fs.{FileSystem, Path}

/**
 * Merges one micro-batch of xG stats into the master parquet at `data/model_master`.
 *
 * - Seeds the master from `df` when the folder is missing or empty.
 * - Fills only missing xG values in the master (existing values are never overwritten).
 * - Sets `is_future_fixture` to 0 for every event that has a row in `statsBatch`.
 *
 * The merged result is written to a staging folder and then swapped into place:
 * overwriting a parquet path while the same job is still reading from it is unsafe.
 *
 * @param statsBatch rows with `event_id`, `home_xg` and `away_xg`
 * @throws java.io.IOException if the staging folder cannot be renamed onto the master path
 */
def upsertStatsIntoMaster(statsBatch: DataFrame): Unit = {

  val masterPath = "data/model_master"
  val stagingPath = masterPath + "__staging"

  // Initialize master parquet if it doesn't exist yet (seed from the base CSV df).
  // In this project we use a parquet folder at `data/model_master` as the streaming-updated master.
  val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
  val masterDir = new Path(masterPath)
  val stagingDir = new Path(stagingPath)

  val exists = fs.exists(masterDir)
  // listStatus may return null; treat that like an empty folder.
  val empty = exists && Option(fs.listStatus(masterDir)).forall(_.isEmpty)

  if (!exists || empty) {
    if (empty) {
      fs.delete(masterDir, true)
      println(s"[INIT] $masterPath existed but was empty -> recreating from df")
    } else {
      println(s"[INIT] $masterPath not found -> creating from df (data_for_model.csv)")
    }
    df.write.mode("overwrite").parquet(masterPath)
  }

  // Rename incoming columns to avoid duplicate column names after the join, and carry an
  // explicit marker so "this event had stats in the batch" does not depend on resolving
  // the right-hand join key after a USING join.
  val updates = statsBatch
    .select(
      col("event_id"),
      col("home_xg").cast("double").alias("u_home_expected_goals"),
      col("away_xg").cast("double").alias("u_away_expected_goals"),
      lit(true).alias("u_has_stats")
    )
    .dropDuplicates("event_id")

  val master = spark.read.parquet(masterPath)

  val updated =
    master
      .join(updates, Seq("event_id"), "left")
      // Only fill nulls in the master; never overwrite existing xG.
      .withColumn(
        "home_expected_goals",
        coalesce(col("home_expected_goals"), col("u_home_expected_goals"))
      )
      .withColumn(
        "away_expected_goals",
        coalesce(col("away_expected_goals"), col("u_away_expected_goals"))
      )
      .withColumn(
        "is_future_fixture",
        when(col("u_has_stats").isNotNull, lit(0)) // stats arrived → no longer future
          .otherwise(col("is_future_fixture"))
      )
      .drop("u_home_expected_goals", "u_away_expected_goals", "u_has_stats")

  // Write to staging first (the master files are still being read while this job runs),
  // then swap. There is a short window between delete and rename where the master path
  // does not exist.
  updated.write.mode("overwrite").parquet(stagingPath)
  fs.delete(masterDir, true)
  if (!fs.rename(stagingDir, masterDir)) {
    throw new java.io.IOException(s"Could not move $stagingPath to $masterPath")
  }
}


upsertStatsIntoMaster: (statsBatch: org.apache.spark.sql.DataFrame)Unit


In [12]:
/**
 * Fits two pipelines on the same training rows: one predicting home xG and one
 * predicting away xG. Both share the feature stages (team indexers, one-hot encoder,
 * assembler) and differ only in the final gaussian/identity GLM's label column.
 *
 * @param trainRaw rows with `home_team`, `away_team`, `home_expected_goals`,
 *                 `away_expected_goals`
 * @return (home pipeline model, away pipeline model)
 */
def refitHomeAwayPipelines(trainRaw: DataFrame): (PipelineModel, PipelineModel) = {
  // The GLM labels are home_xg / away_xg, whereas the master parquet names them
  // home_expected_goals / away_expected_goals, so rename while selecting.
  val train = trainRaw.select(
    col("home_team"),
    col("away_team"),
    col("home_expected_goals").cast("double").alias("home_xg"),
    col("away_expected_goals").cast("double").alias("away_xg"),
    lit(1.0).alias("home_adv")
  )

  // Team-name indexer; unseen labels are kept as an extra index instead of failing.
  def teamIndexer(in: String, out: String): StringIndexer =
    new StringIndexer()
      .setInputCol(in)
      .setOutputCol(out)
      .setHandleInvalid("keep")

  val encoder = new OneHotEncoder()
    .setInputCols(Array("home_team_idx", "away_team_idx"))
    .setOutputCols(Array("home_team_ohe", "away_team_ohe"))

  val featureAssembler = new VectorAssembler()
    .setInputCols(Array("home_team_ohe", "away_team_ohe", "home_adv"))
    .setOutputCol("features")

  // Feature stages shared (as the same estimator instances) by both pipelines.
  val featureStages: Array[org.apache.spark.ml.PipelineStage] = Array(
    teamIndexer("home_team", "home_team_idx"),
    teamIndexer("away_team", "away_team_idx"),
    encoder,
    featureAssembler
  )

  // Gaussian / identity-link GLM without regularisation for the given label column.
  def gaussianGlm(label: String): GeneralizedLinearRegression =
    new GeneralizedLinearRegression()
      .setFeaturesCol("features")
      .setLabelCol(label)
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(50)
      .setRegParam(0.0)

  def fitFor(label: String): PipelineModel =
    new Pipeline().setStages(featureStages :+ gaussianGlm(label)).fit(train)

  val homeModel = fitFor("home_xg")
  val awayModel = fitFor("away_xg")
  (homeModel, awayModel)
}

// -----------------------------
// Pattern B helpers: load-or-train + refresh exports
// -----------------------------
// (PipelineModel is imported in the first cell, In [1])
import org.apache.spark.ml.feature.{StringIndexerModel, OneHotEncoderModel}
import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel

// Parquet folder holding the master match data (seeded from `df` when missing).
val MASTER_PATH = "data/model_master"
// Save/load locations of the fitted home and away pipeline models.
val MODEL_HOME_PATH = "data/models/glm_home_pipeline_latest"
val MODEL_AWAY_PATH = "data/models/glm_away_pipeline_latest"
// CSV output folder for the dashboard table written by refreshDashboard().
val DASH_OUT = "data/dashboard_table_csv"
// CSV output folder for the team-level fixture rows written by refreshDashboard().
val FIXT_OUT = "data/match_level_fixtures"

// Keep these small-ish for "live" refresh. You can bump them if runtime is OK.
// Default number of Monte Carlo season simulations used by refreshDashboard().
val DEFAULT_MC_SIMS = 20000

// Set to true at the end of a successful refreshDashboard() run.
@volatile var dashboardBootstrapped: Boolean = false

import org.apache.hadoop.fs.{FileSystem, Path}

/**
 * Makes sure a readable master parquet exists at MASTER_PATH.
 *
 * Seeds it from `df` when the folder is missing, and also when it exists but is empty
 * (the same rule `upsertStatsIntoMaster` applies), since an empty folder cannot be
 * read back as parquet.
 */
def ensureMasterParquet(): Unit = {
  val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
  val p = new Path(MASTER_PATH)

  val exists = fs.exists(p)
  // listStatus may return null; treat that like an empty folder.
  val empty = exists && Option(fs.listStatus(p)).forall(_.isEmpty)

  if (!exists || empty) {
    if (empty) {
      fs.delete(p, true)
      println(s"[INIT] $MASTER_PATH existed but was empty -> recreating from df")
    } else {
      println(s"[INIT] $MASTER_PATH not found -> creating from df (data_for_model.csv)")
    }
    df.write.mode("overwrite").parquet(MASTER_PATH)
  }
}

/**
 * Attempts to load a saved PipelineModel.
 *
 * @param path location the model was saved to
 * @return Some(model) on success, None when loading fails with a non-fatal error.
 *         Fatal errors (OutOfMemoryError, InterruptedException, ...) are not swallowed.
 */
def tryLoadPipeline(path: String): Option[PipelineModel] =
  scala.util.Try(PipelineModel.load(path)).toOption

/**
 * Returns the (home, away) pipeline models.
 *
 * If both saved models load, they are returned as-is. Otherwise both are re-trained on
 * the master rows that have home and away xG (one row per event_id), saved to
 * MODEL_HOME_PATH / MODEL_AWAY_PATH, and returned.
 */
def loadOrTrainPipelines(): (PipelineModel, PipelineModel) =
  (tryLoadPipeline(MODEL_HOME_PATH), tryLoadPipeline(MODEL_AWAY_PATH)) match {
    case (Some(savedHome), Some(savedAway)) =>
      (savedHome, savedAway)

    case _ =>
      // At least one model is missing or unreadable: retrain both so they stay in sync.
      ensureMasterParquet()
      val modelDF = spark.read.parquet(MASTER_PATH)
        .filter(col("home_expected_goals").isNotNull && col("away_expected_goals").isNotNull)
        .dropDuplicates("event_id")

      val (h, a) = refitHomeAwayPipelines(modelDF)
      h.write.overwrite().save(MODEL_HOME_PATH)
      a.write.overwrite().save(MODEL_AWAY_PATH)
      (h, a)
  }

/** Latest league table: the `current_table_live` view when it exists, else the seed table. */
def getLiveTableSafe(): DataFrame = {
  val liveView = "current_table_live"
  val viewRegistered = spark.catalog.tableExists(liveView)
  if (!viewRegistered) currentTable
  else spark.table(liveView)
}

/**
 * Derives a per-team power ranking from the fitted home-xG pipeline.
 *
 * The GLM coefficient vector is split into the home-team one-hot block followed by the
 * away-team one-hot block. Each team gets `home_attack` (its home-block coefficient) and
 * `away_def_weak` (its away-block coefficient); `power_score` is their difference.
 * When the encoder drops the last category, that category gets coefficient 0.0.
 *
 * @param modelHome pipeline with stages
 *                  [StringIndexer, StringIndexer, OneHotEncoder, <any>, GLM]
 * @return rows (team, home_attack, away_def_weak, power_score), highest power_score first
 * @throws IllegalArgumentException if the pipeline stages or coefficient count do not
 *                                  match the expected layout
 */
def powerRankFromHomePipeline(modelHome: PipelineModel): DataFrame = {
  // Match on the stage layout instead of casting by position, so an unexpected
  // pipeline fails with a descriptive message.
  val (homeIdx, awayIdx, ohe, glm) = modelHome.stages match {
    case Array(
          h: StringIndexerModel,
          a: StringIndexerModel,
          o: OneHotEncoderModel,
          _,
          g: GeneralizedLinearRegressionModel
        ) =>
      (h, a, o, g)
    case other =>
      throw new IllegalArgumentException(
        "Unexpected pipeline stages: " + other.map(_.getClass.getSimpleName).mkString(", ")
      )
  }

  val coeff = glm.coefficients.toArray
  val homeCats = ohe.categorySizes(0)
  val awayCats = ohe.categorySizes(1)
  val dropLast = ohe.getDropLast
  val homeDim = if (dropLast) homeCats - 1 else homeCats
  val awayDim = if (dropLast) awayCats - 1 else awayCats

  if (coeff.length < homeDim + awayDim) {
    throw new IllegalArgumentException(
      s"GLM has ${coeff.length} coefficients, expected at least ${homeDim + awayDim}"
    )
  }

  val homeAttackCoef = coeff.slice(0, homeDim)
  val awayDefenseCoef = coeff.slice(homeDim, homeDim + awayDim)

  // With handleInvalid = "keep" the indexer reserves one extra index for unseen labels.
  def teamLabels(indexer: StringIndexerModel): Array[String] =
    if (indexer.getHandleInvalid == "keep") indexer.labels :+ "UNKNOWN" else indexer.labels

  // Pair team names with coefficients; the dropped (baseline) category gets 0.0.
  def attachBaseline(teams: Array[String], coef: Array[Double]): Seq[(String, Double)] =
    teams.zip(if (dropLast) coef :+ 0.0 else coef).toSeq

  val homeAttack =
    attachBaseline(teamLabels(homeIdx), homeAttackCoef).toDF("team", "home_attack")
  val awayDefWeak =
    attachBaseline(teamLabels(awayIdx), awayDefenseCoef).toDF("team", "away_def_weak")

  homeAttack
    .join(awayDefWeak, "team")
    .withColumn("power_score", col("home_attack") - col("away_def_weak"))
    .orderBy(desc("power_score"))
}

// Full refresh: score fixtures -> compute odds/MC/SPI -> export CSVs for Streamlit
// This is what makes Pattern B non-redundant: predictions always come from the latest saved pipeline models.
/**
 * Re-scores the remaining fixtures of a season and re-exports the dashboard outputs.
 *
 * Steps:
 *  1. predict home/away goal rates for future fixtures with the latest pipelines,
 *  2. turn them into win/draw/loss probabilities (independent Poisson scores, 0..10 goals),
 *  3. Monte Carlo the rest of the season from the live table,
 *  4. compute SPI from the home pipeline's power ranking,
 *  5. write the dashboard table CSV to DASH_OUT,
 *  6. write team-level fixture rows CSV to FIXT_OUT.
 *
 * Also registers the temp views `power_rank_live`, `spi_live`, `dashboard_live` and
 * `fixtures_live`, and sets `dashboardBootstrapped` to true on success.
 *
 * @param seasonLabel value of `season_label` whose future fixtures are scored
 * @param mcSims      number of Monte Carlo season simulations (values below 1 are treated as 1)
 */
def refreshDashboard(seasonLabel: String = "25/26", mcSims: Int = DEFAULT_MC_SIMS): Unit = {
  val liveTable = getLiveTableSafe().cache()

  // try/finally so the cached live table is released even when a step below fails.
  try {
    val (modelHome, modelAway) = loadOrTrainPipelines()
    ensureMasterParquet()
    val master = spark.read.parquet(MASTER_PATH)

    // 1) Score future fixtures
    val fixtures = master
      .filter(col("is_future_fixture") === 1 && col("season_label") === seasonLabel)
      .select(
        col("event_id"),
        col("match_date"),
        trim(col("home_team")).alias("home_team"),
        trim(col("away_team")).alias("away_team"),
        lit(1.0).alias("home_adv")
      )
      .dropDuplicates("event_id")

    // IMPORTANT: modelHome and modelAway are both Pipelines that create the same intermediate
    // columns (home_team_idx, away_team_idx, *ohe, features). Feeding the full output of
    // modelHome into modelAway fails with "Output column home_team_idx already exists",
    // so keep only the columns modelAway needs plus the home prediction.
    val predHome = modelHome.transform(fixtures)
      .withColumnRenamed("prediction", "lambda_home")
      .select("event_id", "match_date", "home_team", "away_team", "home_adv", "lambda_home")

    val predBoth = modelAway.transform(predHome)
      .withColumnRenamed("prediction", "lambda_away")
      .select("event_id", "match_date", "home_team", "away_team", "lambda_home", "lambda_away")

    // 2) W/D/L probabilities (local UDF to avoid global symbol clashes).
    // `math.exp` / `math.pow` are qualified because Spark's functions._ also defines exp/pow.
    def poissonPmf(k: Int, lambda: Double): Double = {
      if (k < 0) 0.0
      else {
        var fact = 1.0
        var i = 2
        while (i <= k) { fact *= i; i += 1 }
        math.exp(-lambda) * math.pow(lambda, k) / fact
      }
    }

    // (home win, draw, away win) from two independent Poisson scores truncated at maxGoals.
    // Negative rates (possible with an identity-link GLM) are clamped to 0.
    def wdlFromLambdas(lhRaw: Double, laRaw: Double, maxGoals: Int = 10): (Double, Double, Double) = {
      val lh = math.max(lhRaw, 0.0)
      val la = math.max(laRaw, 0.0)

      val ph = Array.tabulate(maxGoals + 1)(k => poissonPmf(k, lh))
      val pa = Array.tabulate(maxGoals + 1)(k => poissonPmf(k, la))

      var mass = 0.0
      var win = 0.0
      var draw = 0.0
      var i = 0
      while (i <= maxGoals) {
        var j = 0
        while (j <= maxGoals) {
          val p = ph(i) * pa(j)
          mass += p
          if (i > j) win += p
          else if (i == j) draw += p
          j += 1
        }
        i += 1
      }

      // Renormalise by the truncated mass so the three outcomes sum to 1.
      val denom = if (mass > 0) mass else 1.0
      val pWin = win / denom
      val pDraw = draw / denom
      // Guard against a tiny negative value caused by floating-point rounding.
      (pWin, pDraw, math.max(0.0, 1.0 - pWin - pDraw))
    }

    val wdlUdfLocal = udf((lh: Double, la: Double) => wdlFromLambdas(lh, la, maxGoals = 10))

    val matchProbs =
      predBoth
        .withColumn("wdl", wdlUdfLocal(col("lambda_home"), col("lambda_away")))
        .withColumn("p_home_win", col("wdl").getField("_1"))
        .withColumn("p_draw",     col("wdl").getField("_2"))
        .withColumn("p_away_win", col("wdl").getField("_3"))
        .drop("wdl")
        .withColumn("exp_pts_home", lit(3.0) * col("p_home_win") + lit(1.0) * col("p_draw"))
        .withColumn("exp_pts_away", lit(3.0) * col("p_away_win") + lit(1.0) * col("p_draw"))
        .withColumnRenamed("lambda_home", "xg_home")
        .withColumnRenamed("lambda_away", "xg_away")

    // 3) Monte Carlo season sims (using predicted lambdas)
    import scala.util.Random

    case class TeamState(var pts: Int, var gf: Int, var ga: Int)

    // Poisson sample by multiplying uniforms until the product drops below exp(-lambda);
    // capped at 25 iterations, so a single sample is at most 24.
    def poisson(lambdaRaw: Double, rng: Random): Int = {
      val lambda = math.max(lambdaRaw, 0.0)
      val L = math.exp(-lambda)
      var k = 0
      var p = 1.0
      do {
        k += 1
        p *= rng.nextDouble()
      } while (p > L && k < 25)
      k - 1
    }

    def goalDiff(s: TeamState): Int = s.gf - s.ga

    // Plays every remaining match once, starting from a private copy of the base table.
    def simulateSeasonOnce(
      teams: Array[String],
      base: Map[String, TeamState],
      matches: Array[(String, String, Double, Double)],
      rng: Random
    ): Map[String, TeamState] = {
      val st: Map[String, TeamState] = teams.map(t => t -> base(t).copy()).toMap

      matches.foreach { case (h, a, lamH, lamA) =>
        val gh = poisson(lamH, rng)
        val ga = poisson(lamA, rng)

        st(h).gf += gh; st(h).ga += ga
        st(a).gf += ga; st(a).ga += gh

        if (gh > ga) st(h).pts += 3
        else if (gh < ga) st(a).pts += 3
        else { st(h).pts += 1; st(a).pts += 1 }
      }

      st
    }

    val teams: Array[String] = liveTable.select($"team").as[String].collect()

    val baseTable: Map[String, TeamState] =
      liveTable.select($"team", $"pts", $"gf", $"ga").as[(String, Long, Long, Long)].collect()
        .map { case (t, p, gf, ga) => t -> TeamState(p.toInt, gf.toInt, ga.toInt) }
        .toMap

    val matchesArr: Array[(String, String, Double, Double)] =
      predBoth.select($"home_team", $"away_team", $"lambda_home", $"lambda_away")
        .as[(String, String, Double, Double)]
        .collect()

    // A fixture involving a team that is not on the live table cannot be simulated;
    // skip it (with a warning) rather than failing the whole refresh.
    val (playable, skipped) = matchesArr.partition { case (h, a, _, _) =>
      baseTable.contains(h) && baseTable.contains(a)
    }
    if (skipped.nonEmpty) {
      println(s"[MC] skipping ${skipped.length} fixture(s) whose teams are missing from the live table")
    }

    val winLeague = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)
    val makeACL   = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)
    val totalFinalPts = scala.collection.mutable.Map[String, Double]().withDefaultValue(0.0)

    val N = math.max(mcSims, 1)

    // One generator with a fixed seed for the whole run (still reproducible). Creating a
    // fresh Random(1234 + i) per simulation makes the first draws of neighbouring
    // simulations strongly correlated, which biases the first fixture's outcome.
    val rng = new Random(1234L)

    for (_ <- 0 until N) {
      val end = simulateSeasonOnce(teams, baseTable, playable, rng)

      // Rank by points, then goal difference, then goals for, then name.
      val ranked = teams.sortBy { t =>
        val s = end(t)
        (-s.pts, -goalDiff(s), -s.gf, t)
      }

      ranked.headOption.foreach(t => winLeague(t) += 1)
      ranked.take(2).foreach(t => makeACL(t) += 1)
      teams.foreach { t => totalFinalPts(t) += end(t).pts }
    }

    val mcPtsDF =
      teams.toSeq.map { t =>
        val expFinal = totalFinalPts(t) / N
        (t, expFinal)
      }.toDF("team", "exp_pts_final_mc")

    val mcRemainingDF =
      mcPtsDF
        .join(liveTable.select($"team", $"pts"), Seq("team"))
        .withColumn("exp_pts_remaining_mc", col("exp_pts_final_mc") - col("pts"))

    val probsDF =
      teams.toSeq.map { t =>
        val pWin = winLeague(t).toDouble / N
        val pACL = makeACL(t).toDouble / N
        (t, pWin * 100.0, pACL * 100.0)
      }.toDF("team", "win_league_pct", "make_acl_pct")

    // 4) SPI from power rank: z-score of power_score rescaled to mean 75, sd 10.
    val powerRank = powerRankFromHomePipeline(modelHome)
    val w = Window.partitionBy()
    val spiDF =
      powerRank
        .withColumn("mu", avg($"power_score").over(w))
        .withColumn("sd", stddev($"power_score").over(w))
        .withColumn("spi", lit(75.0) + lit(10.0) * (($"power_score" - $"mu") / $"sd"))
        .select("team", "spi")

    // Debug views (so you can show() them in separate cells)
    powerRank.createOrReplaceTempView("power_rank_live")
    spiDF.createOrReplaceTempView("spi_live")

    // 5) Dashboard table export
    val core =
      liveTable
        .select(
          col("team"),
          col("pts").cast("double").alias("pts"),
          col("gf").cast("int").alias("gf"),
          col("ga").cast("int").alias("ga")
        )

    val finalTable =
      core
        .join(mcRemainingDF.select("team", "exp_pts_remaining_mc"), Seq("team"), "left")
        .join(mcPtsDF.select("team", "exp_pts_final_mc"), Seq("team"), "left")
        .join(probsDF.select("team", "win_league_pct", "make_acl_pct"), Seq("team"), "left")
        .join(spiDF.select("team", "spi"), Seq("team"), "left")
        .na.fill(0.0, Seq("exp_pts_remaining_mc", "exp_pts_final_mc", "win_league_pct", "make_acl_pct", "spi"))
        .orderBy(desc("exp_pts_final_mc"))

    // Debug view
    finalTable.createOrReplaceTempView("dashboard_live")

    finalTable
      .coalesce(1)
      .write
      .mode("overwrite")
      .option("header", "true")
      .csv(DASH_OUT)

    // 6) Match-level fixtures export (team/opponent rows)
    val homeView =
      matchProbs.select(
        col("event_id"),
        col("match_date"),
        col("home_team").alias("team"),
        col("away_team").alias("opponent"),
        lit("H").alias("venue"),
        col("xg_home").alias("xg_for"),
        col("xg_away").alias("xg_against"),
        col("p_home_win").alias("p_win"),
        col("p_draw").alias("p_draw"),
        col("p_away_win").alias("p_loss"),
        col("exp_pts_home").alias("exp_pts")
      )

    val awayView =
      matchProbs.select(
        col("event_id"),
        col("match_date"),
        col("away_team").alias("team"),
        col("home_team").alias("opponent"),
        lit("A").alias("venue"),
        col("xg_away").alias("xg_for"),
        col("xg_home").alias("xg_against"),
        col("p_away_win").alias("p_win"),
        col("p_draw").alias("p_draw"),
        col("p_home_win").alias("p_loss"),
        col("exp_pts_away").alias("exp_pts")
      )

    val teamFixtures =
      homeView
        .unionByName(awayView)
        .withColumn("p_win", round(col("p_win") * 100, 2))
        .withColumn("p_draw", round(col("p_draw") * 100, 2))
        .withColumn("p_loss", round(col("p_loss") * 100, 2))
        .withColumn("xg_for", round(col("xg_for"), 2))
        .withColumn("xg_against", round(col("xg_against"), 2))
        .withColumn("exp_pts", round(col("exp_pts"), 2))

    // Debug view
    teamFixtures.createOrReplaceTempView("fixtures_live")

    teamFixtures
      .coalesce(1)
      .write
      .mode("overwrite")
      .option("header", "true")
      .csv(FIXT_OUT)

    dashboardBootstrapped = true
  } finally {
    liveTable.unpersist()
  }
}

// -----------------------------
// Offline bootstrap: always produce outputs at least once from df/master.
// -----------------------------
// This ensures you get dashboard + fixtures CSVs even if:
// - no streaming files arrive yet, or
// - only the event stream is running.
// Switch for the one-off export below; set to false to skip it when this cell runs.
val AUTO_BOOTSTRAP_DASHBOARD = true

// Run a full refresh once, unless a previous refreshDashboard() already completed
// (dashboardBootstrapped is set to true at the end of refreshDashboard()).
if (AUTO_BOOTSTRAP_DASHBOARD && !dashboardBootstrapped) {
  println("[AUTO] bootstrapping dashboard from df/master -> refreshDashboard()")
  // Make sure the master parquet exists before scoring.
  ensureMasterParquet()
  refreshDashboard(seasonLabel = "25/26", mcSims = DEFAULT_MC_SIMS)
  println("[AUTO] bootstrap done")
}


[AUTO] bootstrapping dashboard from df/master -> refreshDashboard()
[AUTO] bootstrap done


MASTER_PATH = data/model_master
MODEL_HOME_PATH = data/models/glm_home_pipeline_latest
MODEL_AWAY_PATH = data/models/glm_away_pipeline_latest
DASH_OUT = data/dashboard_table_csv
FIXT_OUT = data/match_level_fixtures
DEFAULT_MC_SIMS = 20000
dashboardBootstrapped = true


refitHomeAwayPipelines: (trainRaw: org.apache.spark.sql.DataFrame)(org.apache.spark.ml.PipelineModel, org.apache.spark.ml.PipelineModel)
ensureMasterParquet: ()Unit
tryLoadPipeline: (path: String)Option[org.apache.spark.ml.PipelineModel]
loadOrTrainPipelines: ()(org.apache.spark.m...


true

In [None]:
// Running count of newly seen stats events since the last refit (remainder is carried over).
@volatile var statsAccum: Long = 0L

// Stats stream sink. For every non-empty micro-batch:
//   1) upsert the stats into the master parquet,
//   2) once at least 4 new events have accumulated, refit and save both pipelines,
//   3) re-score and re-export the dashboard.
val mlQ = statsStream.writeStream
  .outputMode("append")
  .foreachBatch { (statsBatch: DataFrame, batchId: Long) =>

    val newUnique = statsBatch.select("event_id").distinct().count()
    println(s"=== stats batchId=$batchId | newUnique=$newUnique ===")

    // Empty batches are a no-op.
    if (newUnique != 0) {
      // 1) upsert stats into the master parquet
      upsertStatsIntoMaster(statsBatch)
      println(s"[DATA] upserted stats into $MASTER_PATH")

      // 2) Always (re)export after new stats arrive so changes are visible immediately.
      // Fewer MC sims than the default keep interactive streaming updates fast.
      val LIVE_MC_SIMS = 2000

      if (dashboardBootstrapped) {
        // 3) update cumulative counter for periodic refit
        statsAccum += newUnique
        println(s"[TRIGGER] accum=$statsAccum (refit when >= 4)")

        if (statsAccum < 4) {
          println("[ML] skip refit")
        } else {
          // consume 4 and keep remainder
          statsAccum -= 4
          println(s"[ML] TRIGGER refit | carry remainder=$statsAccum")

          // 4) refit on the UPDATED master DF (full history with xG)
          ensureMasterParquet()
          val hasBothXg =
            col("home_expected_goals").isNotNull && col("away_expected_goals").isNotNull
          val trainingRows = spark.read.parquet(MASTER_PATH)
            .filter(hasBothXg)
            .dropDuplicates("event_id")

          println(s"[ML] training rows = ${trainingRows.count()}")

          val (refitHome, refitAway) = refitHomeAwayPipelines(trainingRows)
          refitHome.write.overwrite().save(MODEL_HOME_PATH)
          refitAway.write.overwrite().save(MODEL_AWAY_PATH)

          println("[ML] refit complete + models saved")
        }

        // 5) Always re-score + export so the dashboard reacts to new stats
        refreshDashboard(seasonLabel = "25/26", mcSims = LIVE_MC_SIMS)

        // 6) Print key views for demo
        val catalog = spark.catalog
        if (catalog.tableExists("dashboard_live")) {
          println("[DEMO] dashboard_live (top 10)")
          spark.table("dashboard_live").show(10, truncate = false)
        }
        if (catalog.tableExists("spi_live")) {
          println("[DEMO] spi_live (top 10)")
          spark.table("spi_live").orderBy(desc("spi")).show(10, truncate = false)
        }
      } else {
        println("[BOOT] no dashboard yet -> export dashboard/fixtures")
        refreshDashboard(seasonLabel = "25/26", mcSims = LIVE_MC_SIMS)
      }
    }
  }
  // NO CHECKPOINTING (demo mode)
  .start()


statsAccum = 0
mlQ = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@12654214


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@12654214

=== stats batchId=0 | newUnique=5 ===


In [None]:
// -----------------------------
// Debug helpers (streams) — SAFE for Toree/REPL
// -----------------------------
import java.nio.file.{Files, Paths}
import scala.collection.JavaConverters._

/**
 * Prints the entries of a local directory, sorted by name.
 *
 * Prints a "[FS] missing" line when the path does not exist and a "[FS] not a directory"
 * line when it is not a directory, instead of throwing.
 *
 * @param path local filesystem path to list
 */
def listFiles(path: String): Unit = {
  val dir = Paths.get(path)
  if (!Files.exists(dir)) {
    println(s"[FS] missing: $path")
  } else if (!Files.isDirectory(dir)) {
    println(s"[FS] not a directory: $path")
  } else {
    // Files.list returns a stream backed by an open directory handle; it must be closed.
    val entries = Files.list(dir)
    val files =
      try entries.iterator().asScala.map(_.getFileName.toString).toVector.sorted
      finally entries.close()

    println(s"[FS] $path (${files.size} files)")
    files.foreach(f => println(s"  - $f"))
  }
}

// Input folders watched by the two streaming sources.
println("\n[DEBUG] stream input folders")
Seq("data/stream_in/event", "data/stream_in/stats").foreach(listFiles)

// Status of every streaming query currently running in this session.
println("\n[DEBUG] active streaming queries")
val act = spark.streams.active
println(s"active=${act.length}")
for (query <- act) {
  println(s"- name=${query.name} | id=${query.id} | isActive=${query.isActive}")
  println(s"  status=${query.status}")
  println(s"  lastProgress=${query.lastProgress}")
}

// Live league table, if the event stream has published it yet.
println("\n[DEBUG] current_table_live")
if (!spark.catalog.tableExists("current_table_live")) {
  println("current_table_live not created yet")
} else {
  spark.table("current_table_live").show(20, truncate = false)
}

// Dashboard and SPI debug views, shown only when they exist.
println("\n[DEBUG] dashboard_live / spi_live")
if (spark.catalog.tableExists("dashboard_live")) {
  spark.table("dashboard_live").show(10, truncate = false)
}
if (spark.catalog.tableExists("spi_live")) {
  spark.table("spi_live").orderBy(desc("spi")).show(10, truncate = false)
}



In [None]:
// (Removed) Legacy batch training path.
// Pattern B uses `refreshDashboard()` (called from the stats stream) which loads/trains models and exports outputs.


modelDF = [season_id: int, season_label: string ... 8 more fields]


[season_id: int, season_label: string ... 8 more fields]

In [None]:
// (Removed) Legacy batch feature engineering + model fitting setup.
// Pattern B handles training and scoring inside `refreshDashboard()`.


Matches used for ML: 253
+------------+--------+-----------------+------------------+-------+-------+--------+
|season_label|event_id|home_team        |away_team         |home_xg|away_xg|home_adv|
+------------+--------+-----------------+------------------+-------+-------+--------+
|2023        |11041385|Albirex Niigata  |Tampines Rovers   |0.59   |0.94   |1.0     |
|2023        |11041331|Lion City Sailors|Hougang Utd       |3.28   |0.0    |1.0     |
|2023        |11041349|Hougang Utd      |Tanjong Pagar Utd.|2.73   |1.12   |1.0     |
|2024        |12246956|Hougang Utd      |Young Lions       |2.6    |2.32   |1.0     |
|2023        |11041318|Geylang Int.     |Balestier Khalsa  |0.88   |3.69   |1.0     |
+------------+--------+-----------------+------------------+-------+-------+--------+
only showing top 5 rows



m = [season_label: string, event_id: int ... 5 more fields]
home_idx = strIdx_a0e2e18b767b
away_idx = strIdx_67ebaae09169
enc = oneHotEncoder_57eb833f8dbd
assembler = VectorAssembler: uid=vecAssembler_b6a76e6e1533, handleInvalid=error, numInputCols=3
homeIdxModel = StringIndexerModel: uid=strIdx_a0e2e18b767b, handleInvalid=keep
m1 = [season_label: string, event_id: int ... 6 more fields]
awayIdxModel = StringIndexerModel: uid=strIdx_67ebaae09169, handleInvali...


StringIndexerModel: uid=strIdx_67ebaae09169, handleInvali...

In [None]:
// (Removed) Legacy batch model fit.
// Pattern B trains (or loads) pipelines via `loadOrTrainPipelines()`.


glm_home = glm_b0d96b0ba71a
glm_away = glm_3ed6c7a335de
model_home = GeneralizedLinearRegressionModel: uid=glm_b0d96b0ba71a, family=gaussian, link=identity, numFeatures=19
model_away = GeneralizedLinearRegressionModel: uid=glm_3ed6c7a335de, family=gaussian, link=identity, numFeatures=19


GeneralizedLinearRegressionModel: uid=glm_3ed6c7a335de, family=gaussian, link=identity, numFeatures=19

In [None]:
// (Removed) Legacy batch coefficient slicing.
// Pattern B computes power rank from the saved pipeline in `powerRankFromHomePipeline()`.


coeff = Array(-0.3462127058516612, -0.19809177233738684, 0.5997640074519133, 0.4127271732976447, 0.29010120572657216, -0.4256348720014847, 0.2259891343524253, -0.2273549425913689, -0.35784129032393935, -0.40425119602856974, -0.1504959112002591, 0.18859830123442675, 0.05004954670406097, -0.5807843475125979, -0.7106415271659244, 0.583263704999385, 0.16928048497301232, 0.9544458971491501, 0.0)
homeCats = 10
awayCats = 10
dropLast = true
homeDim = 9
awayDim = 9


9

In [None]:
// (Removed) Legacy batch coefficient slicing (home/away blocks).


homeAttackCoef = Array(-0.3462127058516612, -0.19809177233738684, 0.5997640074519133, 0.4127271732976447, 0.29010120572657216, -0.4256348720014847, 0.2259891343524253, -0.2273549425913689, -0.35784129032393935)
awayDefenseCoef = Array(-0.40425119602856974, -0.1504959112002591, 0.18859830123442675, 0.05004954670406097, -0.5807843475125979, -0.7106415271659244, 0.583263704999385, 0.16928048497301232, 0.9544458971491501)


Array(-0.40425119602856974, -0.1504959112002591, 0.18859830123442675, 0.05004954670406097, -0.5807843475125979, -0.7106415271659244, 0.583263704999385, 0.16928048497301232, 0.9544458971491501)

In [None]:
// (Removed) Legacy batch team-name recovery.


homeTeams = Array(Tanjong Pagar Utd., Hougang Utd, Lion City Sailors, Tampines Rovers, Balestier Khalsa, Young Lions, Albirex Niigata, Geylang Int., DPMM, UNKNOWN)
awayTeams = Array(Albirex Niigata, Geylang Int., Balestier Khalsa, Hougang Utd, Lion City Sailors, Tampines Rovers, Young Lions, DPMM, Tanjong Pagar Utd., UNKNOWN)




Array(Albirex Niigata, Geylang Int., Balestier Khalsa, Hougang Utd, Lion City Sailors, Tampines Rovers, Young Lions, DPMM, Tanjong Pagar Utd., UNKNOWN)

In [None]:
// (Removed) Legacy batch baseline attachment.


homeAttack = ArraySeq((Tanjong Pagar Utd.,-0.3462127058516612), (Hougang Utd,-0.19809177233738684), (Lion City Sailors,0.5997640074519133), (Tampines Rovers,0.4127271732976447), (Balestier Khalsa,0.29010120572657216), (Young Lions,-0.4256348720014847), (Albirex Niigata,0.2259891343524253), (Geylang Int.,-0.2273549425913689), (DPMM,-0.35784129032393935), (UNKNOWN,0.0))
awayDefenseWeak = ArraySeq((Albirex Niigata,-0.40425119602856974), (Geylang Int.,-0.1504959112002591), (Balestier Khalsa,0.18859830123442675), (Hougang Utd,0.05004954670406097), (Lion City Sailors,-0.5807843475125979), (Tampines Rovers,-0.7106415271659244), (Young Lions,0.5832637...


attachBaseline: (teams: Array[String], coef: Array[Double], dropLast: Boolean)Seq[(String, Double)]


ArraySeq((Albirex Niigata,-0.40425119602856974), (Geylang Int.,-0.1504959112002591), (Balestier Khalsa,0.18859830123442675), (Hougang Utd,0.05004954670406097), (Lion City Sailors,-0.5807843475125979), (Tampines Rovers,-0.7106415271659244), (Young Lions,0.5832637...

In [None]:
// (Removed) Legacy batch power ranking.
// Pattern B computes and uses power ranking inside `refreshDashboard()`.


powerRank = [team: string, home_attack: double ... 2 more fields]


+------------------+--------------------+--------------------+--------------------+
|team              |home_attack         |away_def_weak       |power_score         |
+------------------+--------------------+--------------------+--------------------+
|Lion City Sailors |0.5997640074519133  |-0.5807843475125979 |1.180548354964511   |
|Tampines Rovers   |0.4127271732976447  |-0.7106415271659244 |1.123368700463569   |
|Albirex Niigata   |0.2259891343524253  |-0.40425119602856974|0.630240330380995   |
|Balestier Khalsa  |0.29010120572657216 |0.18859830123442675 |0.10150290449214541 |
|UNKNOWN           |0.0                 |0.0                 |0.0                 |
|Geylang Int.      |-0.2273549425913689 |-0.1504959112002591 |-0.0768590313911098 |
|Hougang Utd       |-0.19809177233738684|0.05004954670406097 |-0.24814131904144782|
|DPMM              |-0.35784129032393935|0.16928048497301232 |-0.5271217752969517 |
|Young Lions       |-0.4256348720014847 |0.583263704999385   |-1.00889857700

[team: string, home_attack: double ... 2 more fields]

In [None]:
// (Removed) Legacy batch future fixture selection.
// Pattern B scores fixtures inside `refreshDashboard()`.


futureFixtures = [event_id: int, match_date: date ... 4 more fields]


[event_id: int, match_date: date ... 4 more fields]

In [None]:
// (Removed) Legacy batch feature assembly for future fixtures.


futureFixtures2 = [event_id: int, match_date: date ... 3 more fields]
futureAssembled = [event_id: int, match_date: date ... 8 more fields]


[event_id: int, match_date: date ... 8 more fields]

In [None]:
// (Removed) Legacy batch scoring of future fixtures.
// Pattern B scores and exports inside `refreshDashboard()`.


predHome = [event_id: int, match_date: date ... 9 more fields]
predBoth = [event_id: int, match_date: date ... 4 more fields]


+--------+----------+------------------+------------------+------------------+------------------+
|event_id|match_date|home_team         |away_team         |lambda_home       |lambda_away       |
+--------+----------+------------------+------------------+------------------+------------------+
|14195506|2026-01-09|Geylang Int.      |Balestier Khalsa  |1.8227367236523886|2.276027950511678 |
|14195507|2026-01-10|Albirex Niigata   |Young Lions       |2.670746204361141 |0.7557585298403504|
|14195508|2026-01-11|Tampines Rovers   |Lion City Sailors |1.6934361907943776|1.8037359017997083|
|14195509|2026-01-12|Tanjong Pagar Utd.|Hougang Utd       |1.5653302058617307|1.710929538424973 |
|14195510|2026-01-16|Geylang Int.      |Hougang Utd       |1.684187969122023 |1.3975322414158229|
|14195512|2026-01-18|Tampines Rovers   |Albirex Niigata   |1.8699693422784058|0.9954425233618583|
|14195513|2026-01-19|Lion City Sailors |Tanjong Pagar Utd.|3.415703269610394 |0.8999471804676848|
|14195514|2026-01-23

[event_id: int, match_date: date ... 4 more fields]

In [None]:
// (Removed) Legacy batch W/D/L UDF.
// Pattern B computes W/D/L and expected points inside `refreshDashboard()`.



defined class WDL
wdlUdf = SparkUserDefinedFunction($Lambda$7766/0x00000001029eb040@602ffc9a,StructType(StructField(_1,DoubleType,false),StructField(_2,DoubleType,false),StructField(_3,DoubleType,false)),List(Some(class[value[0]: double]), Some(class[value[0]: double])),Some(class[_1[0]: double, _2[0]: double, _3[0]: double]),None,true,true)


poissonPmf: (k: Int, lambda: Double)Double
wdlFromLambdas: (lhRaw: Double, laRaw: Double, maxGoals: Int)WDL


SparkUserDefinedFunction($Lambda$7766/0x00000001029eb040@602ffc9a,StructType(StructField(_1,DoubleType,false),StructField(_2,DoubleType,false),StructField(_3,DoubleType,false)),List(Some(class[value[0]: double]), Some(class[value[0]: double])),Some(class[_1[0]: double, _2[0]: double, _3[0]: double]),None,true,true)

In [None]:
// (Removed) Legacy batch `matchProbs` derivation.
// Pattern B exports match-level fixtures inside `refreshDashboard()`.


matchProbs = [event_id: int, match_date: date ... 9 more fields]


[event_id: int, match_date: date ... 9 more fields]

In [None]:
// (Removed) Legacy batch Monte Carlo simulation.
// Pattern B runs MC and exports probabilities inside `refreshDashboard()`.


+------------------+--------------+------------+
|team              |win_league_pct|make_acl_pct|
+------------------+--------------+------------+
|Lion City Sailors |96.64         |99.89       |
|Tampines Rovers   |3.145         |76.33       |
|Albirex Niigata   |0.2           |20.21       |
|Balestier Khalsa  |0.01          |1.525       |
|Geylang Int.      |0.005         |2.045       |
|Hougang Utd       |0.0           |0.0         |
|Tanjong Pagar Utd.|0.0           |0.0         |
|Young Lions       |0.0           |0.0         |
+------------------+--------------+------------+



defined class TeamState
teams = Array(Lion City Sailors, Geylang Int., Tampines Rovers, Balestier Khalsa, Albirex Niigata, Hougang Utd, Tanjong Pagar Utd., Young Lions)
baseTable = Map(Lion City Sailors -> TeamState(15,22,2), Balestier Khalsa -> TeamState(8,9,12), Geylang Int. -> TeamState(10,7,4), Hougang Utd -> TeamState(3,4,10), Tanjong Pagar Utd. -> TeamState(3,7,15), Young Lions -> TeamState(0,4,17), Tampines Rovers -> TeamState(8,8,4), Albirex Niigata -> TeamState(7,8,5))


poisson: (lambdaRaw: Double, rng: scala.util.Random)Int
goalDiff: (s: TeamState)Int
simulateSeasonOnce: (teams: Array[String], base: Map[String,TeamState], matches: Array[(String, String, Double, Double)], seed: Int)Map[String,TeamState]
matchesArr: A...


Map(Lion City Sailors -> TeamState(15,22,2), Balestier Khalsa -> TeamState(8,9,12), Geylang Int. -> TeamState(10,7,4), Hougang Utd -> TeamState(3,4,10), Tanjong Pagar Utd. -> TeamState(3,7,15), Young Lions -> TeamState(0,4,17), Tampines Rovers -> TeamState(8,8,4), Albirex Niigata -> TeamState(7,8,5))

In [None]:
// (Removed) Legacy batch MC aggregation to probsDF.


In [None]:
// (Removed) Legacy batch SPI computation.
// Pattern B computes SPI inside `refreshDashboard()`.


w = org.apache.spark.sql.expressions.WindowSpec@1de69208
spiDF = [team: string, spi: double]


[team: string, spi: double]

In [None]:
// (Removed) Legacy batch finalTable join/show.


+------------------+----+---+---+--------------------+----------------+--------------+------------+------------------+
|team              |pts |gf |ga |exp_pts_remaining_mc|exp_pts_final_mc|win_league_pct|make_acl_pct|spi               |
+------------------+----+---+---+--------------------+----------------+--------------+------------+------------------+
|Lion City Sailors |15.0|22 |2  |37.7883             |52.7883         |96.64         |99.89       |89.47321462247646 |
|Tampines Rovers   |8.0 |8  |4  |31.90965            |39.90965        |3.145         |76.33       |88.77961094002528 |
|Albirex Niigata   |7.0 |8  |5  |27.139200000000002  |34.1392         |0.2           |20.21       |82.79783898992036 |
|Geylang Int.      |10.0|7  |4  |17.8014             |27.8014         |0.005         |2.045       |74.22054475013205 |
|Balestier Khalsa  |8.0 |9  |12 |18.26305            |26.26305        |0.01          |1.525       |76.38412017949582 |
|Hougang Utd       |3.0 |4  |10 |13.7406        

core = [team: string, pts: double ... 2 more fields]
mcPts = [team: string, exp_pts_final_mc: double]
mcRem = [team: string, exp_pts_remaining_mc: double]
odds = [team: string, win_league_pct: double ... 1 more field]
spi = [team: string, spi: double]
finalTable = [team: string, pts: double ... 7 more fields]


[team: string, pts: double ... 7 more fields]

In [None]:
// (Removed) Legacy batch dashboard CSV export.
// Pattern B writes dashboard CSVs inside `refreshDashboard()`.


dash = [team: string, pts: double ... 4 more fields]


[team: string, pts: double ... 4 more fields]

In [None]:
// (Removed) Legacy batch match_level_fixtures export.
// Pattern B writes fixtures CSVs inside `refreshDashboard()`.


homeView = [event_id: int, match_date: date ... 9 more fields]
awayView = [event_id: int, match_date: date ... 9 more fields]
teamFixtures = [event_id: int, match_date: date ... 9 more fields]


[event_id: int, match_date: date ... 9 more fields]

In [None]:
// -----------------------------
// DEMO RESET (recommended instead of disabling checkpoints)
// -----------------------------
// Use this before a demo so the professor can drop files and see changes immediately.
// It stops the streams and deletes the checkpoint folders (so Spark will re-read files).
// Optional: also delete test input files.

import org.apache.hadoop.fs.{FileSystem, Path}

/**
 * Recursively deletes `path` if it exists and prints the outcome.
 *
 * The filesystem is resolved from the path itself using the Spark session's
 * Hadoop configuration, so a fully qualified URI is deleted on its own
 * filesystem while a scheme-less path still resolves to the default one.
 *
 * @param path file or directory to remove; a directory is removed with its contents
 */
def rm(path: String): Unit = {
  val p = new Path(path)
  val fs = p.getFileSystem(spark.sparkContext.hadoopConfiguration)
  if (!fs.exists(p)) {
    println(s"[RM] missing: $path")
  } else if (fs.delete(p, true)) {
    println(s"[RM] deleted: $path")
  } else {
    // delete() can report failure by returning false rather than throwing,
    // so the result must be checked before claiming success.
    println(s"[RM] FAILED to delete: $path")
  }
}

/**
 * Best-effort stop of a streaming query, printing what happened.
 *
 * @param name label used in the printed status line
 * @param qOpt by-name reference to the query; it is evaluated inside the `try`
 *             so that an exception thrown while evaluating it is reported
 *             instead of propagating
 *
 * Outcomes:
 *  - an active `StreamingQuery` is stopped and reported as "stopped";
 *  - `null` or an inactive `StreamingQuery` is reported as "not active";
 *  - any other value is reported as "not found";
 *  - a non-fatal exception is reported as "not found" together with its cause.
 *
 * Fatal errors (e.g. `OutOfMemoryError`, `InterruptedException`) are not caught.
 */
def stopQuery(name: String, qOpt: => Any): Unit = {
  import scala.util.control.NonFatal
  import org.apache.spark.sql.streaming.StreamingQuery

  try {
    qOpt match {
      case q: StreamingQuery if q.isActive =>
        q.stop()
        println(s"[STOP] $name stopped")
      case null | _: StreamingQuery =>
        println(s"[STOP] $name not active")
      case _ =>
        // The value is not a streaming query at all.
        println(s"[STOP] $name not found")
    }
  } catch {
    case NonFatal(e) =>
      // Keep the best-effort behaviour, but surface the cause so that a failing
      // stop() is not mistaken for a missing query.
      println(s"[STOP] $name not found (${e.getClass.getSimpleName}: ${e.getMessage})")
  }
}

// Step 1: halt both streaming queries; a query that is not running is only reported.
stopQuery("event stream (q)", q)
stopQuery("stats stream (mlQ)", mlQ)

// Step 2: remove the checkpoint folders of both streams.
Seq(
  "data/_chk/current_table_live",
  "data/_chk/rerun_model_on_stats"
).foreach(checkpointDir => rm(checkpointDir))

// Step 3 (optional): also wipe the input folders so only fresh demo files are processed.
// Uncomment if desired.
// rm("data/stream_in/event")
// rm("data/stream_in/stats")

println("[DEMO RESET] done. Now restart Cell 8 and Cell 12, then drop new JSON files.")



In [None]:
// -----------------------------
// SPI viewer: prints `spi_live` ordered by descending spi (up to 50 rows),
// or a hint to run refreshDashboard when the table does not exist yet.
// -----------------------------
if (!spark.catalog.tableExists("spi_live")) {
  println("spi_live not found. Run refreshDashboard(...) first.")
} else {
  val spiRanked = spark.table("spi_live").orderBy(desc("spi"))
  spiRanked.show(50, truncate = false)
}



In [None]:
// -----------------------------
// Dashboard + fixtures viewer: prints `dashboard_live` (up to 50 rows) and
// `fixtures_live` ordered by match_date (up to 30 rows). A missing table
// produces a hint to run refreshDashboard instead.
// -----------------------------
if (!spark.catalog.tableExists("dashboard_live")) {
  println("dashboard_live not found. Run refreshDashboard(...) first.")
} else {
  spark.table("dashboard_live").show(50, truncate = false)
}

if (!spark.catalog.tableExists("fixtures_live")) {
  println("fixtures_live not found. Run refreshDashboard(...) first.")
} else {
  val fixturesByDate = spark.table("fixtures_live").orderBy("match_date")
  fixturesByDate.show(30, truncate = false)
}

