In [16]:
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder, VectorAssembler}
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import scala.util.Random
import scala.math.{exp, pow}
//create spark session
val spark = SparkSession.builder.appName("SPL Analysis").getOrCreate()
import spark.implicits._

spark = org.apache.spark.sql.SparkSession@254693ac


org.apache.spark.sql.SparkSession@254693ac

In [17]:
//import spl match links with enriched
val df = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("data/data_for_model.csv")

df = [season_id: int, season_label: string ... 8 more fields]


[season_id: int, season_label: string ... 8 more fields]

In [18]:
//current Table
// =======================
// CURRENT LEAGUE TABLE
// =======================
val currentTable = Seq(
  ("Lion City Sailors", 15, 22, 2),
  ("Geylang Int.", 10, 7, 4),
  ("Tampines Rovers", 8, 8, 4),
  ("Balestier Khalsa", 8, 9, 12),
  ("Albirex Niigata", 7, 8, 5),
  ("Hougang Utd", 3, 4, 10),
  ("Tanjong Pagar Utd.", 3, 7, 15),
  ("Young Lions", 0, 4, 17)
).toDF("team", "pts", "gf", "ga")

currentTable.show(truncate = false)

currentTable = [team: string, pts: int ... 2 more fields]


+------------------+---+---+---+
|team              |pts|gf |ga |
+------------------+---+---+---+
|Lion City Sailors |15 |22 |2  |
|Geylang Int.      |10 |7  |4  |
|Tampines Rovers   |8  |8  |4  |
|Balestier Khalsa  |8  |9  |12 |
|Albirex Niigata   |7  |8  |5  |
|Hougang Utd       |3  |4  |10 |
|Tanjong Pagar Utd.|3  |7  |15 |
|Young Lions       |0  |4  |17 |
+------------------+---+---+---+



[team: string, pts: int ... 2 more fields]

In [19]:
val modelDF = df
  // keep rows if:
  // 1) future fixture (is_future_fixture = 1)
  // OR
  // 2) past match with both xG present
  .filter(
    (col("home_expected_goals").isNotNull && col("away_expected_goals").isNotNull)
  )
  .dropDuplicates(Seq("event_id"))

modelDF = [season_id: int, season_label: string ... 8 more fields]


[season_id: int, season_label: string ... 8 more fields]

In [20]:
//predict xg_home and xg_away to build the strength of each team home and away
// Select and transform data
val m = modelDF.select(
  col("season_label"),
  col("event_id"),
  col("home_team").alias("home_team"),
  col("away_team").alias("away_team"),
  col("home_expected_goals").cast("double").alias("home_xg"),
  col("away_expected_goals").cast("double").alias("away_xg"),
  lit(1.0).alias("home_adv")
)
m.cache()
println(s"Matches used for ML: ${m.count()}")
m.show(5, truncate = false)

// StringIndexers
val home_idx = new StringIndexer()
  .setInputCol("home_team")
  .setOutputCol("home_team_idx")
  .setHandleInvalid("keep")

val away_idx = new StringIndexer()
  .setInputCol("away_team")
  .setOutputCol("away_team_idx")
  .setHandleInvalid("keep")

// OneHotEncoder
val enc = new OneHotEncoder()
  .setInputCols(Array("home_team_idx", "away_team_idx"))
  .setOutputCols(Array("home_team_ohe", "away_team_ohe"))

// VectorAssembler
val assembler = new VectorAssembler()
  .setInputCols(Array("home_team_ohe", "away_team_ohe", "home_adv"))
  .setOutputCol("features")

val homeIdxModel = home_idx.fit(m)
val m1 = homeIdxModel.transform(m)

val awayIdxModel = away_idx.fit(m1)
val m2 = awayIdxModel.transform(m1)

val encModel = enc.fit(m2)
val m3 = encModel.transform(m2)
val assembled = assembler.transform(m3)

Matches used for ML: 253
+------------+--------+-----------------+------------------+-------+-------+--------+
|season_label|event_id|home_team        |away_team         |home_xg|away_xg|home_adv|
+------------+--------+-----------------+------------------+-------+-------+--------+
|2023        |11041385|Albirex Niigata  |Tampines Rovers   |0.59   |0.94   |1.0     |
|2023        |11041331|Lion City Sailors|Hougang Utd       |3.28   |0.0    |1.0     |
|2023        |11041349|Hougang Utd      |Tanjong Pagar Utd.|2.73   |1.12   |1.0     |
|2024        |12246956|Hougang Utd      |Young Lions       |2.6    |2.32   |1.0     |
|2023        |11041318|Geylang Int.     |Balestier Khalsa  |0.88   |3.69   |1.0     |
+------------+--------+-----------------+------------------+-------+-------+--------+
only showing top 5 rows



m = [season_label: string, event_id: int ... 5 more fields]
home_idx = strIdx_8e94c494ef0a
away_idx = strIdx_e5473ba0529c
enc = oneHotEncoder_5f6950c09d44
assembler = VectorAssembler: uid=vecAssembler_b8a31d8e791e, handleInvalid=error, numInputCols=3
homeIdxModel = StringIndexerModel: uid=strIdx_8e94c494ef0a, handleInvalid=keep
m1 = [season_label: string, event_id: int ... 6 more fields]
awayIdxModel = StringIndexerModel: uid=strIdx_e5473ba0529c, handleInvali...


StringIndexerModel: uid=strIdx_e5473ba0529c, handleInvali...

In [21]:
// GeneralizedLinearRegression models
//fit GeneralizedLinearRegression model onto home_xg
//figures out the home and away strength index of each team
val glm_home = new GeneralizedLinearRegression()
  .setFeaturesCol("features")
  .setLabelCol("home_xg")
  .setFamily("gaussian")
  .setLink("identity")
  .setMaxIter(50)
  .setRegParam(0.0)

val glm_away = new GeneralizedLinearRegression()
  .setFeaturesCol("features")
  .setLabelCol("away_xg")
  .setFamily("gaussian")
  .setLink("identity")
  .setMaxIter(50)
  .setRegParam(0.0)

// Fit the model directly (without using the pipeline)
val model_home = glm_home.fit(assembled)
val model_away = glm_away.fit(assembled)

// Access the summary directly
//println(s"HOME model summary: ${model_home.summary}")
//println(s"AWAY model summary: ${model_away.summary}")
println(s"Intercept = ${model_home.intercept}")
println(s"Coefficients = ${model_home.coefficients}")

glm_home = glm_f83d1b5a3d2f
glm_away = glm_0cc0a44274c1
model_home = GeneralizedLinearRegressionModel: uid=glm_f83d1b5a3d2f, family=gaussian, link=identity, numFeatures=19
model_away = GeneralizedLinearRegressionModel: uid=glm_0cc0a44274c1, family=gaussian, link=identity, numFeatures=19


Intercept = 1.8614933650093324
Coefficients = [-0.3462127058516619,-0.19809177233738726,0.5997640074519127,0.41272717329764536,0.29010120572657133,-0.4256348720014848,0.22598913435242418,-0.22735494259136999,-0.35784129032393946,-0.4042511960285711,-0.1504959112002604,0.1885983012344262,0.05004954670406063,-0.5807843475125989,-0.7106415271659258,0.5832637049993841,0.16928048497301182,0.9544458971491494,0.0]


GeneralizedLinearRegressionModel: uid=glm_0cc0a44274c1, family=gaussian, link=identity, numFeatures=19

In [22]:
//get block sizes
val coeff = model_home.coefficients.toArray

val homeCats = encModel.categorySizes(0)
val awayCats = encModel.categorySizes(1)
val dropLast = enc.getDropLast

val homeDim = if (dropLast) homeCats - 1 else homeCats
val awayDim = if (dropLast) awayCats - 1 else awayCats

coeff = Array(-0.3462127058516619, -0.19809177233738726, 0.5997640074519127, 0.41272717329764536, 0.29010120572657133, -0.4256348720014848, 0.22598913435242418, -0.22735494259136999, -0.35784129032393946, -0.4042511960285711, -0.1504959112002604, 0.1885983012344262, 0.05004954670406063, -0.5807843475125989, -0.7106415271659258, 0.5832637049993841, 0.16928048497301182, 0.9544458971491494, 0.0)
homeCats = 10
awayCats = 10
dropLast = true
homeDim = 9
awayDim = 9


9

In [23]:
//slice coefficients
val homeAttackCoef = coeff.slice(0, homeDim)
val awayDefenseCoef = coeff.slice(homeDim, homeDim + awayDim)

homeAttackCoef = Array(-0.3462127058516619, -0.19809177233738726, 0.5997640074519127, 0.41272717329764536, 0.29010120572657133, -0.4256348720014848, 0.22598913435242418, -0.22735494259136999, -0.35784129032393946)
awayDefenseCoef = Array(-0.4042511960285711, -0.1504959112002604, 0.1885983012344262, 0.05004954670406063, -0.5807843475125989, -0.7106415271659258, 0.5832637049993841, 0.16928048497301182, 0.9544458971491494)


Array(-0.4042511960285711, -0.1504959112002604, 0.1885983012344262, 0.05004954670406063, -0.5807843475125989, -0.7106415271659258, 0.5832637049993841, 0.16928048497301182, 0.9544458971491494)

In [24]:
//recover team names
val homeTeams =
  if (homeIdxModel.getHandleInvalid == "keep")
    homeIdxModel.labels :+ "UNKNOWN"
  else
    homeIdxModel.labels

val awayTeams =
  if (awayIdxModel.getHandleInvalid == "keep")
    awayIdxModel.labels :+ "UNKNOWN"
  else
    awayIdxModel.labels

homeTeams = Array(Tanjong Pagar Utd., Hougang Utd, Lion City Sailors, Tampines Rovers, Balestier Khalsa, Young Lions, Albirex Niigata, Geylang Int., DPMM, UNKNOWN)
awayTeams = Array(Albirex Niigata, Geylang Int., Balestier Khalsa, Hougang Utd, Lion City Sailors, Tampines Rovers, Young Lions, DPMM, Tanjong Pagar Utd., UNKNOWN)




Array(Albirex Niigata, Geylang Int., Balestier Khalsa, Hougang Utd, Lion City Sailors, Tampines Rovers, Young Lions, DPMM, Tanjong Pagar Utd., UNKNOWN)

In [25]:
//attach baseline
def attachBaseline(
  teams: Array[String],
  coef: Array[Double],
  dropLast: Boolean
): Seq[(String, Double)] =
  if (dropLast) teams.zip(coef :+ 0.0) else teams.zip(coef)

val homeAttack = attachBaseline(homeTeams, homeAttackCoef, dropLast)
val awayDefenseWeak = attachBaseline(awayTeams, awayDefenseCoef, dropLast)

homeAttack = ArraySeq((Tanjong Pagar Utd.,-0.3462127058516619), (Hougang Utd,-0.19809177233738726), (Lion City Sailors,0.5997640074519127), (Tampines Rovers,0.41272717329764536), (Balestier Khalsa,0.29010120572657133), (Young Lions,-0.4256348720014848), (Albirex Niigata,0.22598913435242418), (Geylang Int.,-0.22735494259136999), (DPMM,-0.35784129032393946), (UNKNOWN,0.0))
awayDefenseWeak = ArraySeq((Albirex Niigata,-0.4042511960285711), (Geylang Int.,-0.1504959112002604), (Balestier Khalsa,0.1885983012344262), (Hougang Utd,0.05004954670406063), (Lion City Sailors,-0.5807843475125989), (Tampines Rovers,-0.7106415271659258), (Young Lions,0.583263...


attachBaseline: (teams: Array[String], coef: Array[Double], dropLast: Boolean)Seq[(String, Double)]


ArraySeq((Albirex Niigata,-0.4042511960285711), (Geylang Int.,-0.1504959112002604), (Balestier Khalsa,0.1885983012344262), (Hougang Utd,0.05004954670406063), (Lion City Sailors,-0.5807843475125989), (Tampines Rovers,-0.7106415271659258), (Young Lions,0.583263...

In [26]:
//power ranking table
val powerRank =
  homeAttack.toDF("team", "home_attack")
    .join(
      awayDefenseWeak.toDF("team", "away_def_weak"),
      "team"
    )
    .withColumn("power_score", col("home_attack") - col("away_def_weak"))
    .orderBy(desc("power_score"))

powerRank.show(20, truncate=false)

powerRank = [team: string, home_attack: double ... 2 more fields]


+------------------+--------------------+-------------------+--------------------+
|team              |home_attack         |away_def_weak      |power_score         |
+------------------+--------------------+-------------------+--------------------+
|Lion City Sailors |0.5997640074519127  |-0.5807843475125989|1.1805483549645115  |
|Tampines Rovers   |0.41272717329764536 |-0.7106415271659258|1.123368700463571   |
|Albirex Niigata   |0.22598913435242418 |-0.4042511960285711|0.6302403303809954  |
|Balestier Khalsa  |0.29010120572657133 |0.1885983012344262 |0.10150290449214514 |
|UNKNOWN           |0.0                 |0.0                |0.0                 |
|Geylang Int.      |-0.22735494259136999|-0.1504959112002604|-0.07685903139110958|
|Hougang Utd       |-0.19809177233738726|0.05004954670406063|-0.24814131904144787|
|DPMM              |-0.35784129032393946|0.16928048497301182|-0.5271217752969513 |
|Young Lions       |-0.4256348720014848 |0.5832637049993841 |-1.008898577000869  |
|Tan

[team: string, home_attack: double ... 2 more fields]

In [27]:
val futureFixtures = df
  .filter(col("is_future_fixture") === 1 && col("season_label") === "25/26")
  .select(
    col("event_id"),
    col("match_date"),
    trim(col("home_team")).alias("home_team"),
    trim(col("away_team")).alias("away_team"),
    lit(1.0).alias("home_adv"),
    col("season_label")
  )
  .dropDuplicates(Seq("event_id"))


futureFixtures = [event_id: int, match_date: date ... 4 more fields]


[event_id: int, match_date: date ... 4 more fields]

In [28]:
//get future fixtures
val futureFixtures2 = futureFixtures
  .select(
    col("event_id"),
    col("match_date"),
    trim(col("home_team")).alias("home_team"),
    trim(col("away_team")).alias("away_team"),
    lit(1.0).alias("home_adv")
  )

val futureAssembled =
  assembler.transform(
    encModel.transform(
      awayIdxModel.transform(
        homeIdxModel.transform(futureFixtures2)
      )
    )
  )

futureFixtures2 = [event_id: int, match_date: date ... 3 more fields]
futureAssembled = [event_id: int, match_date: date ... 8 more fields]


[event_id: int, match_date: date ... 8 more fields]

In [29]:
//predict expected goals
val predHome = model_home
  .transform(futureAssembled)
  .withColumnRenamed("prediction", "lambda_home")

val predBoth = model_away
  .transform(predHome)
  .withColumnRenamed("prediction", "lambda_away")
  .select("event_id", "match_date", "home_team", "away_team", "lambda_home", "lambda_away")

predBoth.show(10, truncate=false)

+--------+----------+------------------+------------------+------------------+------------------+
|event_id|match_date|home_team         |away_team         |lambda_home       |lambda_away       |
+--------+----------+------------------+------------------+------------------+------------------+
|14195506|2026-01-09|Geylang Int.      |Balestier Khalsa  |1.8227367236523886|2.2760279505116774|
|14195507|2026-01-10|Albirex Niigata   |Young Lions       |2.6707462043611407|0.7557585298403495|
|14195508|2026-01-11|Tampines Rovers   |Lion City Sailors |1.693436190794379 |1.8037359017997083|
|14195509|2026-01-12|Tanjong Pagar Utd.|Hougang Utd       |1.565330205861731 |1.710929538424972 |
|14195510|2026-01-16|Geylang Int.      |Hougang Utd       |1.684187969122023 |1.3975322414158222|
|14195512|2026-01-18|Tampines Rovers   |Albirex Niigata   |1.8699693422784067|0.9954425233618585|
|14195513|2026-01-19|Lion City Sailors |Tanjong Pagar Utd.|3.4157032696103946|0.8999471804676848|
|14195514|2026-01-23

predHome = [event_id: int, match_date: date ... 9 more fields]
predBoth = [event_id: int, match_date: date ... 4 more fields]


[event_id: int, match_date: date ... 4 more fields]

In [30]:
import scala.math.{exp, pow}
def poissonPmf(k: Int, lambda: Double): Double = {
  if (k < 0) 0.0
  else {
    var fact = 1.0
    var i = 2
    while (i <= k) { fact *= i; i += 1 }
    exp(-lambda) * pow(lambda, k) / fact
  }
}

case class WDL(win: Double, draw: Double, loss: Double)

def wdlFromLambdas(lhRaw: Double, laRaw: Double, maxGoals: Int = 10): WDL = {
  val lh = math.max(lhRaw, 0.0)
  val la = math.max(laRaw, 0.0)

  val ph = Array.tabulate(maxGoals + 1)(k => poissonPmf(k, lh))
  val pa = Array.tabulate(maxGoals + 1)(k => poissonPmf(k, la))

  // truncated mass (0..maxGoals). we renormalize so probs sum to 1.
  var mass = 0.0
  var i = 0
  while (i <= maxGoals) {
    var j = 0
    while (j <= maxGoals) { mass += ph(i) * pa(j); j += 1 }
    i += 1
  }
  val denom = if (mass > 0) mass else 1.0

  var win = 0.0
  var draw = 0.0
  i = 0
  while (i <= maxGoals) {
    var j = 0
    while (j <= maxGoals) {
      val p = (ph(i) * pa(j)) / denom
      if (i > j) win += p
      else if (i == j) draw += p
      j += 1
    }
    i += 1
  }
  WDL(win, draw, 1.0 - win - draw)
}

val wdlUdf = udf((lh: Double, la: Double) => {
  val r = wdlFromLambdas(lh, la, maxGoals = 10)
  (r.win, r.draw, r.loss)
})


defined class WDL
wdlUdf = SparkUserDefinedFunction($Lambda$7723/0x0000000101f9e040@73dfe548,StructType(StructField(_1,DoubleType,false),StructField(_2,DoubleType,false),StructField(_3,DoubleType,false)),List(Some(class[value[0]: double]), Some(class[value[0]: double])),Some(class[_1[0]: double, _2[0]: double, _3[0]: double]),None,true,true)


poissonPmf: (k: Int, lambda: Double)Double
wdlFromLambdas: (lhRaw: Double, laRaw: Double, maxGoals: Int)WDL


SparkUserDefinedFunction($Lambda$7723/0x0000000101f9e040@73dfe548,StructType(StructField(_1,DoubleType,false),StructField(_2,DoubleType,false),StructField(_3,DoubleType,false)),List(Some(class[value[0]: double]), Some(class[value[0]: double])),Some(class[_1[0]: double, _2[0]: double, _3[0]: double]),None,true,true)

In [31]:
val matchProbs =
  predBoth
    .withColumn("wdl", wdlUdf(col("lambda_home"), col("lambda_away")))
    .withColumn("p_home_win", col("wdl").getField("_1"))
    .withColumn("p_draw",     col("wdl").getField("_2"))
    .withColumn("p_away_win", col("wdl").getField("_3"))
    .drop("wdl")
    // ðŸ”¥ FIX IS HERE
    .withColumn(
      "exp_pts_home",
      lit(3.0) * col("p_home_win") + lit(1.0) * col("p_draw")
    )
    .withColumn(
      "exp_pts_away",
      lit(3.0) * col("p_away_win") + lit(1.0) * col("p_draw")
    )
    .withColumnRenamed("lambda_home", "xg_home")
    .withColumnRenamed("lambda_away", "xg_away")


matchProbs = [event_id: int, match_date: date ... 9 more fields]


[event_id: int, match_date: date ... 9 more fields]

In [32]:
println("matchProbs rows = " + matchProbs.count())
println("matchProbs distinct event_id = " + matchProbs.select("event_id").distinct().count())

println("futureFixtures distinct event_id = " + futureFixtures.select("event_id").distinct().count())


matchProbs rows = 59
matchProbs distinct event_id = 59
futureFixtures distinct event_id = 59


In [33]:
val expPtsByTeam =
  matchProbs
    .select(col("home_team").alias("team"), col("exp_pts_home").alias("exp_pts"))
    .unionByName(matchProbs.select(col("away_team").alias("team"), col("exp_pts_away").alias("exp_pts")))
    .groupBy("team")
    .agg(sum("exp_pts").alias("exp_pts_remaining"))
    .orderBy(desc("exp_pts_remaining"))

expPtsByTeam = [team: string, exp_pts_remaining: double]


[team: string, exp_pts_remaining: double]

In [34]:
val expectedTable =
  currentTable
    .join(expPtsByTeam, Seq("team"), "left")
    .na.fill(0.0, Seq("exp_pts_remaining"))
    .withColumn(
      "exp_pts_final",
      col("pts").cast("double") + col("exp_pts_remaining")
    )
    .orderBy(desc("exp_pts_final"))


expectedTable = [team: string, pts: int ... 4 more fields]


[team: string, pts: int ... 4 more fields]

In [35]:
//simulate future fixtures to produce final ranking and table
def poisson(lambda: Double, rng: scala.util.Random): Int = {
  val L = math.exp(-math.max(lambda, 0.0))
  var k = 0
  var p = 1.0
  do {
    k += 1
    p *= rng.nextDouble()
  } while (p > L && k < 20)
  k - 1
}

poisson: (lambda: Double, rng: scala.util.Random)Int


In [36]:
import scala.util.Random

// ---------- 0) Inputs you must have ----------
/*
currentTable: columns = team (String), pts (Int), gf (Int), ga (Int)
predBoth: columns = match_id, home_team, away_team, lambda_home, lambda_away
*/

// ---------- 1) Helpers ----------
case class TeamState(var pts:Int, var gf:Int, var ga:Int)

def poisson(lambdaRaw: Double, rng: Random): Int = {
  val lambda = math.max(lambdaRaw, 0.0)
  val L = math.exp(-lambda)
  var k = 0
  var p = 1.0
  do {
    k += 1
    p *= rng.nextDouble()
  } while (p > L && k < 25)
  k - 1
}

def goalDiff(s: TeamState): Int = s.gf - s.ga

def simulateSeasonOnce(
  teams: Array[String],
  base: Map[String, TeamState],
  matches: Array[(String,String,Double,Double)], // (home, away, lamH, lamA)
  seed: Int
): Map[String, TeamState] = {

  val rng = new Random(seed)

  // deep copy base table
  val st = scala.collection.mutable.Map[String, TeamState]()
  teams.foreach { t =>
    val b = base(t)
    st(t) = TeamState(b.pts, b.gf, b.ga)
  }

  // simulate remaining fixtures
  matches.foreach { case (h,a,lamH,lamA) =>
    val gh = poisson(lamH, rng)
    val ga = poisson(lamA, rng)

    st(h).gf += gh; st(h).ga += ga
    st(a).gf += ga; st(a).ga += gh

    if (gh > ga) st(h).pts += 3
    else if (gh < ga) st(a).pts += 3
    else { st(h).pts += 1; st(a).pts += 1 }
  }

  st.toMap
}

// ---------- 2) Build base table + match list ----------
val teams: Array[String] = currentTable.select($"team").as[String].collect()

val baseTable: Map[String, TeamState] =
  currentTable.select($"team",$"pts",$"gf",$"ga").as[(String,Int,Int,Int)].collect()
    .map { case (t,p,gf,ga) => t -> TeamState(p,gf,ga) }
    .toMap

val matchesArr: Array[(String,String,Double,Double)] =
  predBoth.select($"home_team",$"away_team",$"lambda_home",$"lambda_away")
    .as[(String,String,Double,Double)]
    .collect()

// ---------- 3) Outcome counters (THIS FIXES your "not found" errors) ----------
val winLeague = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)
val makeACL   = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)
val totalFinalPts = scala.collection.mutable.Map[String, Double]().withDefaultValue(0.0)


// ---------- 4) Run simulations ----------
val N = 20000

for (i <- 0 until N) {
  val end = simulateSeasonOnce(teams, baseTable, matchesArr, seed = 1234 + i)

  val ranked = teams.sortBy { t =>
    val s = end(t)
    (-s.pts, -goalDiff(s), -s.gf, t)   // tie-break: points, GD, GF
  }

    winLeague(ranked(0)) += 1
    ranked.take(2).foreach(t => makeACL(t) += 1)   // âœ… TOP 2 for ACL
    teams.foreach { t =>
  totalFinalPts(t) += end(t).pts
}

}

val mcPtsDF =
  teams.toSeq.map { t =>
    val expFinal = totalFinalPts(t) / N
    (t, expFinal)
  }.toDF("team", "exp_pts_final_mc")

val mcRemainingDF =
  mcPtsDF
    .join(currentTable.select($"team", $"pts"), Seq("team"))
    .withColumn("exp_pts_remaining_mc", col("exp_pts_final_mc") - col("pts"))

// ---------- 5) Convert to probabilities table ----------
val probsDF =
  teams.toSeq.map { t =>
    val pWin = winLeague(t).toDouble / N
    val pACL = makeACL(t).toDouble / N
    (t, pWin * 100.0, pACL * 100.0)
  }.toDF("team","win_league_pct","make_acl_pct")


probsDF.orderBy(desc("win_league_pct")).show(20, truncate=false)

defined class TeamState
teams = Array(Lion City Sailors, Geylang Int., Tampines Rovers, Balestier Khalsa, Albirex Niigata, Hougang Utd, Tanjong Pagar Utd., Young Lions)
baseTable = Map(Lion City Sailors -> TeamState(15,22,2), Balestier Khalsa -> TeamState(8,9,12), Geylang Int. -> TeamState(10,7,4), Hougang Utd -> TeamState(3,4,10), Tanjong Pagar Utd. -> TeamState(3,7,15), Young Lions -> TeamState(0,4,17), Tampines Rovers -> TeamState(8,8,4), Albirex Niigata -> TeamState(7,8,5))


poisson: (lambdaRaw: Double, rng: scala.util.Random)Int
goalDiff: (s: TeamState)Int
simulateSeasonOnce: (teams: Array[String], base: Map[String,TeamState], matches: Array[(String, String, Double, Double)], seed: Int)Map[String,TeamState]
matchesArr: A...


+------------------+--------------+------------+
|team              |win_league_pct|make_acl_pct|
+------------------+--------------+------------+
|Lion City Sailors |96.64         |99.89       |
|Tampines Rovers   |3.145         |76.33       |
|Albirex Niigata   |0.2           |20.21       |
|Balestier Khalsa  |0.01          |1.525       |
|Geylang Int.      |0.005         |2.045       |
|Hougang Utd       |0.0           |0.0         |
|Tanjong Pagar Utd.|0.0           |0.0         |
|Young Lions       |0.0           |0.0         |
+------------------+--------------+------------+



Map(Lion City Sailors -> TeamState(15,22,2), Balestier Khalsa -> TeamState(8,9,12), Geylang Int. -> TeamState(10,7,4), Hougang Utd -> TeamState(3,4,10), Tanjong Pagar Utd. -> TeamState(3,7,15), Young Lions -> TeamState(0,4,17), Tampines Rovers -> TeamState(8,8,4), Albirex Niigata -> TeamState(7,8,5))

In [37]:
//power ranking table
import org.apache.spark.sql.expressions.Window
val w = Window.partitionBy()

val spiDF =
  powerRank
    .withColumn("mu", avg($"power_score").over(w))
    .withColumn("sd", stddev($"power_score").over(w))
    .withColumn("spi", lit(75.0) + lit(10.0) * (($"power_score" - $"mu") / $"sd"))
    .select("team","spi")

w = org.apache.spark.sql.expressions.WindowSpec@2a08e1a9
spiDF = [team: string, spi: double]


[team: string, spi: double]

In [38]:
val finalTable =
  spiDF.join(probsDF, "team")
    .orderBy(desc("spi"))

finalTable.show(20, truncate=false)

+------------------+-----------------+--------------+------------+
|team              |spi              |win_league_pct|make_acl_pct|
+------------------+-----------------+--------------+------------+
|Lion City Sailors |89.47321462247646|96.64         |99.89       |
|Tampines Rovers   |88.7796109400253 |3.145         |76.33       |
|Albirex Niigata   |82.79783898992036|0.2           |20.21       |
|Balestier Khalsa  |76.38412017949582|0.01          |1.525       |
|Geylang Int.      |74.22054475013204|0.005         |2.045       |
|Hougang Utd       |72.14284724583914|0.0           |0.0         |
|Young Lions       |62.91466914337387|0.0           |0.0         |
|Tanjong Pagar Utd.|59.37554617748956|0.0           |0.0         |
+------------------+-----------------+--------------+------------+



finalTable = [team: string, spi: double ... 2 more fields]


[team: string, spi: double ... 2 more fields]

In [39]:
// 0) Start from currentTable (the one with pts/gf/ga). This guarantees all teams included.
val core =
  currentTable
    .select(
      col("team"),
      col("pts").cast("double").alias("pts"),
      col("gf").cast("int").alias("gf"),
      col("ga").cast("int").alias("ga")
    )

// 1) Analytic expected remaining points (if you have it)
val analytic =
  expectedTable
    .select("team", "exp_pts_remaining", "exp_pts_final")

// 2) MC points (keep only new columns to avoid duplicates)
val mcPts =
  mcPtsDF.select("team", "exp_pts_final_mc")

val mcRem =
  mcRemainingDF.select("team", "exp_pts_remaining_mc")

// 3) Odds
val odds =
  probsDF.select("team", "win_league_pct", "make_acl_pct")

// 4) Power rating / SPI
val spi =
  spiDF.select("team", "spi")

// 5) Join them all (LEFT joins)
val finalTable =
  core
    .join(analytic, Seq("team"), "left")
    .join(mcRem,   Seq("team"), "left")
    .join(mcPts,   Seq("team"), "left")
    .join(odds,    Seq("team"), "left")
    .join(spi,     Seq("team"), "left")
    .na.fill(0.0, Seq(
      "exp_pts_remaining",
      "exp_pts_final",
      "exp_pts_remaining_mc",
      "exp_pts_final_mc",
      "win_league_pct",
      "make_acl_pct",
      "spi"
    ))
    .orderBy(desc("exp_pts_final_mc"))

finalTable.show(50, truncate=false)


+------------------+----+---+---+------------------+------------------+--------------------+----------------+--------------+------------+-----------------+
|team              |pts |gf |ga |exp_pts_remaining |exp_pts_final     |exp_pts_remaining_mc|exp_pts_final_mc|win_league_pct|make_acl_pct|spi              |
+------------------+----+---+---+------------------+------------------+--------------------+----------------+--------------+------------+-----------------+
|Lion City Sailors |15.0|22 |2  |37.772384894902856|52.772384894902856|37.7883             |52.7883         |96.64         |99.89       |89.47321462247646|
|Tampines Rovers   |8.0 |8  |4  |31.951665917508034|39.95166591750804 |31.90965            |39.90965        |3.145         |76.33       |88.7796109400253 |
|Albirex Niigata   |7.0 |8  |5  |27.109003165923657|34.10900316592365 |27.139200000000002  |34.1392         |0.2           |20.21       |82.79783898992036|
|Geylang Int.      |10.0|7  |4  |17.977429773716363|27.977429773

core = [team: string, pts: double ... 2 more fields]
analytic = [team: string, exp_pts_remaining: double ... 1 more field]
mcPts = [team: string, exp_pts_final_mc: double]
mcRem = [team: string, exp_pts_remaining_mc: double]
odds = [team: string, win_league_pct: double ... 1 more field]
spi = [team: string, spi: double]
finalTable = [team: string, pts: double ... 9 more fields]


[team: string, pts: double ... 9 more fields]

In [40]:
// For home team view: W=home_win, D=draw, L=away_win
val homeView =
  matchProbs.select(
    col("event_id"),
    col("match_date"), 
    col("home_team").alias("team"),
    col("away_team").alias("opponent"),
    lit("H").alias("venue"),
    col("xg_home").alias("xg_for"),
    col("xg_away").alias("xg_against"),
    col("p_home_win").alias("p_win"),
    col("p_draw").alias("p_draw"),
    col("p_away_win").alias("p_loss"),
    col("exp_pts_home").alias("exp_pts")
  )

// For away team view: W=away_win, D=draw, L=home_win
val awayView =
  matchProbs.select(
    col("event_id"),
    col("match_date"), 
    col("away_team").alias("team"),
    col("home_team").alias("opponent"),
    lit("A").alias("venue"),
    col("xg_away").alias("xg_for"),
    col("xg_home").alias("xg_against"),
    col("p_away_win").alias("p_win"),
    col("p_draw").alias("p_draw"),
    col("p_home_win").alias("p_loss"),
    col("exp_pts_away").alias("exp_pts")
  )

val teamFixtures = homeView.unionByName(awayView)

homeView = [event_id: int, match_date: date ... 9 more fields]
awayView = [event_id: int, match_date: date ... 9 more fields]
teamFixtures = [event_id: int, match_date: date ... 9 more fields]


[event_id: int, match_date: date ... 9 more fields]

In [41]:
teamFixtures
  .filter(col("team") === "Lion City Sailors")
  .select("event_id","venue","opponent","xg_for","xg_against","p_win","p_draw","p_loss","exp_pts")
  .orderBy(desc("p_loss"))
  .show(50, truncate=false)

+--------+-----+------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+
|event_id|venue|opponent          |xg_for            |xg_against        |p_win             |p_draw             |p_loss             |exp_pts           |
+--------+-----+------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+
|14195508|A    |Tampines Rovers   |1.8037359017997083|1.693436190794379 |0.4113317450284929|0.22257784275256534|0.3660904122189418 |1.4565730778380441|
|14195559|A    |Tampines Rovers   |1.8037359017997083|1.693436190794379 |0.4113317450284929|0.22257784275256534|0.3660904122189418 |1.4565730778380441|
|14255563|H    |Balestier Khalsa  |2.6498556736956713|1.9090274214088077|0.5408671589552966|0.18283726151467317|0.27629557953003026|1.805438738380563 |
|14195566|H    |Albirex Niigata   |2.057006176432674 |1.2616266649403545|0.5574046558245

In [42]:
val teamWdlSummary =
  teamFixtures
    .groupBy("team")
    .agg(
      sum("p_win").alias("exp_wins"),
      sum("p_draw").alias("exp_draws"),
      sum("p_loss").alias("exp_losses"),
      sum("exp_pts").alias("exp_pts_remaining"),
      count(lit(1)).alias("fixtures_remaining")
    )
    .orderBy(desc("exp_pts_remaining"))

teamWdlSummary.show(50, truncate=false)


+------------------+------------------+------------------+-----------------+------------------+------------------+
|team              |exp_wins          |exp_draws         |exp_losses       |exp_pts_remaining |fixtures_remaining|
+------------------+------------------+------------------+-----------------+------------------+------------------+
|Lion City Sailors |11.70099914537817 |2.669387458768351 |2.629613395853482|37.772384894902856|17                |
|Tampines Rovers   |9.639145095113292 |3.034230632168156 |3.326624272718551|31.951665917508034|16                |
|Albirex Niigata   |8.088213713852964 |2.8443620243647594|4.067424261782276|27.109003165923657|15                |
|Balestier Khalsa  |5.201773712434751 |2.5164085925115653|5.281817695053684|18.121729729815815|13                |
|Geylang Int.      |4.970380142474488 |3.0662893462928986|6.963330511232613|17.977429773716363|15                |
|Hougang Utd       |3.6049681327417775|2.909849553855638 |7.485182313402584|13.7

teamWdlSummary = [team: string, exp_wins: double ... 4 more fields]


[team: string, exp_wins: double ... 4 more fields]

In [43]:
val pretty =
  teamFixtures
    .withColumn("p_win", round(col("p_win"), 3))
    .withColumn("p_draw", round(col("p_draw"), 3))
    .withColumn("p_loss", round(col("p_loss"), 3))
    .withColumn("exp_pts", round(col("exp_pts"), 3))
    .withColumn("xg_for", round(col("xg_for"), 2))
    .withColumn("xg_against", round(col("xg_against"), 2))


pretty = [event_id: int, match_date: date ... 9 more fields]


[event_id: int, match_date: date ... 9 more fields]

In [44]:
// pick only what you need from each df to avoid duplicate columns
val dash =
  currentTable
    .select(col("team"), col("pts").cast("double").alias("pts"))
    .join(spiDF.select(col("team"), col("spi").cast("double").alias("spi")), Seq("team"), "left")
    .join(mcPtsDF.select(col("team"), col("exp_pts_final_mc").cast("double").alias("exp_pts_mc")), Seq("team"), "left")
    .join(probsDF.select(col("team"), col("win_league_pct").cast("double"), col("make_acl_pct").cast("double")), Seq("team"), "left")
    .na.fill(0.0, Seq("spi","exp_pts_mc","win_league_pct","make_acl_pct"))
    .orderBy(desc("win_league_pct"))

// write as a SINGLE csv file (easy for Streamlit)
dash
  .coalesce(1)
  .write
  .mode("overwrite")
  .option("header", "true")
  .csv("data/dashboard_table_csv")


dash = [team: string, pts: double ... 4 more fields]


[team: string, pts: double ... 4 more fields]

In [45]:
// Home team view
val homeView =
  matchProbs.select(
    col("event_id"),
    col("match_date"), 
    col("home_team").alias("team"),
    col("away_team").alias("opponent"),
    lit("H").alias("venue"),
    col("xg_home").alias("xg_for"),
    col("xg_away").alias("xg_against"),
    col("p_home_win").alias("p_win"),
    col("p_draw").alias("p_draw"),
    col("p_away_win").alias("p_loss"),
    col("exp_pts_home").alias("exp_pts")
  )

// Away team view
val awayView =
  matchProbs.select(
    col("event_id"),
    col("match_date"), 
    col("away_team").alias("team"),
    col("home_team").alias("opponent"),
    lit("A").alias("venue"),
    col("xg_away").alias("xg_for"),
    col("xg_home").alias("xg_against"),
    col("p_away_win").alias("p_win"),
    col("p_draw").alias("p_draw"),
    col("p_home_win").alias("p_loss"),
    col("exp_pts_away").alias("exp_pts")
  )

val teamFixtures =
  homeView
    .unionByName(awayView)
    .withColumn("p_win", round(col("p_win") * 100, 2))
    .withColumn("p_draw", round(col("p_draw") * 100, 2))
    .withColumn("p_loss", round(col("p_loss") * 100, 2))
    .withColumn("xg_for", round(col("xg_for"), 2))
    .withColumn("xg_against", round(col("xg_against"), 2))
    .withColumn("exp_pts", round(col("exp_pts"), 2))
teamFixtures
  .coalesce(1)
  .write
  .mode("overwrite")
  .option("header", "true")
  .csv("data/match_level_fixtures")

homeView = [event_id: int, match_date: date ... 9 more fields]
awayView = [event_id: int, match_date: date ... 9 more fields]
teamFixtures = [event_id: int, match_date: date ... 9 more fields]


[event_id: int, match_date: date ... 9 more fields]