In [None]:
using JSON
using DataFrames
using StatsPlots
using Turing
using LinearAlgebra
using Random
using StatsBase
# Select the GR plotting backend for the plots drawn in the cells below
gr()

### Multi-level model using football match simulation as an example

In [None]:
## First, import the data and do some data wrangling

# Parsed JSON with one entry per match; the "label" field of each entry
# carries the fixture and the final score.
england_league = JSON.parsefile("./data/matches_England.json")

# Empty results table. The columns get concrete element types (instead of
# untyped `[]`, which yields `Vector{Any}`) so that team names and scores
# taken from the table are `Vector{String}` / `Vector{Int}`.
matches_df = DataFrame(
    home = String[],
    away = String[],
    score_home = Int[],
    score_away = Int[],
)

In [None]:
# example entry for each game in england_league:  "label" => "Burnley - AFC Bournemouth, 1 - 2"
# Split at the LAST comma only: everything before it is the fixture, everything
# after it is the score.
matches = [rsplit(match["label"], ","; limit = 2) for match in england_league]

for (fixture, result) in matches
    home, away = split(fixture, " - ")  # "Burnley" # "AFC Bournemouth"
    # strip the blank that follows the comma before splitting the score
    score_home, score_away = parse.(Int, split(strip(result), " - ")) # 1 # 2
    # store plain Strings rather than SubString views into the label
    push!(matches_df, [String(home), String(away), score_home, score_away])
end

# Distinct home-team names, in order of first appearance; a team's position in
# this vector is used as its index from here on.
teams = unique(matches_df[:, 1])

In [None]:
## Now, our model

# Hierarchical Poisson model for match scores.
#
# Arguments
#   home_teams, away_teams : per-match team identifiers; each must occur in `teams`
#   score_home, score_away : per-match observed goal counts
#   teams                  : the distinct teams; position i corresponds to att[i] / def[i]
#
# Every team gets an attack effect att[i] and a defense effect def[i], drawn from
# shared Normal distributions whose mean and spread are themselves given priors.
# `home` is an additive term that enters only the home side's log scoring rate.
@model function football_matches(home_teams, away_teams, score_home, score_away, teams)

    # Hyper priors
    μatt ~ Normal(0, 0.1)
    μdef ~ Normal(0, 0.1)
    σatt ~ Exponential(1)
    σdef ~ Exponential(1)
    home ~ Normal(0, 1)

    # Team-specific effects. `filldist` draws the whole vector in one statement,
    # so no Float64 array has to be preallocated and then filled with sampled
    # (possibly non-Float64) values. The entries are still named att[i] / def[i].
    n_teams = length(teams)
    att ~ filldist(Normal(μatt, σatt), n_teams)
    def ~ filldist(Normal(μdef, σdef), n_teams)

    # Subtracted from every log-rate below
    offset = mean(att) + mean(def)

    # Modeling score-rate and scores for each match
    for i in eachindex(home_teams)
        home_team_idx = findfirst(isequal(home_teams[i]), teams)
        away_team_idx = findfirst(isequal(away_teams[i]), teams)

        # Log scoring rates. These are used only within this iteration, so plain
        # scalars replace the former Vector{Real} buffers.
        θ_home = home + att[home_team_idx] + def[away_team_idx] - offset
        θ_away = att[away_team_idx] + def[home_team_idx] - offset

        # scores; exp keeps the Poisson rate positive
        score_home[i] ~ Poisson(exp(θ_home))
        score_away[i] ~ Poisson(exp(θ_away))
    end
end

In [None]:
# Number of posterior draws requested from the sampler
num_samples = 3000

# Condition the model on the four columns of the results table and the team list
model = football_matches(
    matches_df[:, :home],
    matches_df[:, :away],
    matches_df[:, :score_home],
    matches_df[:, :score_away],
    teams,
);

# Draw the posterior with NUTS using its default settings
posterior = sample(model, NUTS(), num_samples);

In [None]:
# Tabular view of the chain; parameters are addressed by column name below
posterior_df = DataFrame(posterior)

In [None]:
# Rebuild, for every posterior draw, the offset used inside the model:
# mean of the attack effects plus mean of the defense effects.
# The columns are selected by explicit name for every team instead of a
# hard-coded `Between("att[1]", "att[20]")`, so the league size is not baked in
# and the result does not depend on how the columns happen to be ordered.
att_cols = ["att[$i]" for i in eachindex(teams)]
def_cols = ["def[$i]" for i in eachindex(teams)]

DataFrames.transform!(posterior_df, AsTable(att_cols) => ByRow(mean) => :att_mean)
DataFrames.transform!(posterior_df, AsTable(def_cols) => ByRow(mean) => :def_mean)
DataFrames.transform!(posterior_df, AsTable([:att_mean, :def_mean]) => ByRow(sum) => :offset)

In [None]:
# Only one pairing is examined here, so the teams are looked up directly
# (no need to map over all teams)

teamA, teamB = "Manchester City", "Manchester United"

# Position of each team in `teams`; the same number appears in its
# "att[...]" / "def[...]" column names
teamA_id = findfirst(==(teamA), teams)
teamB_id = findfirst(==(teamB), teams)

# Posterior draws of the attack and defense effects of both teams
teamA_att_post = posterior_df[:, string("att[", teamA_id, "]")]
teamA_def_post = posterior_df[:, string("def[", teamA_id, "]")]

teamB_att_post = posterior_df[:, string("att[", teamB_id, "]")]
teamB_def_post = posterior_df[:, string("def[", teamB_id, "]")]

In [None]:
# Side-by-side normalized histograms of the two teams' posterior attack effects
attack_hist_opts = (titlefontsize = 12, legend = false, normalized = true)
ha1 = histogram(teamA_att_post; title = "$teamA attack", attack_hist_opts...);
ha2 = histogram(teamB_att_post; title = "$teamB attack", attack_hist_opts...);
plot(ha1, ha2; layout = (1, 2));
xlabel!("Attack power");
ylabel!("Probability density")

In [None]:
# Side-by-side normalized histograms of the two teams' posterior defense effects
defense_hist_opts = (titlefontsize = 12, legend = false, normalized = true)
hd1 = histogram(teamA_def_post; title = "$teamA defense", defense_hist_opts...);
hd2 = histogram(teamB_def_post; title = "$teamB defense", defense_hist_opts...);
plot(hd1, hd2; layout = (1, 2));
xlabel!("Defense power");
ylabel!("Probability density")

### Mini Project

Consult the lecture notes.

In [None]:
using Plots

Random.seed!(1234)

# Simulate 500 hypothetical two-legged finals for each posterior draw
sim_matches = 500

# Posterior draws entering the log scoring rates
home_post = posterior_df[:, :home]
offset_post = posterior_df[:, :offset]
attA_post = posterior_df[:, "att[$teamA_id]"]
defA_post = posterior_df[:, "def[$teamA_id]"]
attB_post = posterior_df[:, "att[$teamB_id]"]
defB_post = posterior_df[:, "def[$teamB_id]"]

# First leg: teamA is home, teamB is away
θ_home = @. home_post + attA_post + defB_post - offset_post
θ_away = @. attB_post + defA_post - offset_post

# One vector of `sim_matches` simulated scores per posterior draw
teamA_score = rand.(Poisson.(exp.(θ_home)), sim_matches)
println(length(teamA_score))  # number of posterior draws used (was `len`, which Julia does not define)
teamB_score = rand.(Poisson.(exp.(θ_away)), sim_matches)

# Second leg: teamA is away, teamB is home
θ_home = @. home_post + attB_post + defA_post - offset_post
θ_away = @. attA_post + defB_post - offset_post

# Aggregate score over both legs
teamA_score += rand.(Poisson.(exp.(θ_away)), sim_matches)
teamB_score += rand.(Poisson.(exp.(θ_home)), sim_matches)

# Flatten into long column vectors; `reduce(vcat, ...)` avoids splatting
# thousands of vectors into a single call
teamA_score = reduce(vcat, teamA_score)
teamB_score = reduce(vcat, teamB_score)

# Share of simulated finals won by each side or ending level on aggregate
n_sim = length(teamA_score)
winA = sum(teamA_score .> teamB_score) / n_sim
winB = sum(teamB_score .> teamA_score) / n_sim
draw = sum(teamA_score .== teamB_score) / n_sim

println("Winning probability of $teamA: $winA")
println("Winning probability of $teamB: $winB")
println("Draw probability: $draw")

# One bin per integer score: edges sit on the half-integers around every
# observed value, so the highest score also gets a bin of its own
# (edges min:max would leave it without one).
x_min, x_max = extrema(teamA_score)
y_min, y_max = extrema(teamB_score)

x_edges = (x_min - 0.5):(x_max + 0.5)
y_edges = (y_min - 0.5):(y_max + 0.5)

# Two-line title with singular/plural handling
title_str = "Simulated goals with posterior rates \nacross $sim_matches game$(sim_matches > 1 ? "s" : "") with $num_samples sample$(num_samples > 1 ? "s" : "") each"

# 2D histogram of aggregate scores, with a tick at every integer score and
# enlarged margins so title and axis labels are not clipped
hm = histogram2d(teamA_score, teamB_score, bins=(x_edges, y_edges),
    xlabel="$teamA", ylabel="$teamB",
    xticks=x_min:x_max, yticks=y_min:y_max,
    title=title_str, colorbar=true,
    margin=10Plots.mm,  # Adds margin around the whole plot
    left_margin=10Plots.mm, right_margin=15Plots.mm,  # Extra space for axis labels
    top_margin=15Plots.mm, bottom_margin=10Plots.mm,  # More space for the title and x-axis label
    size=(800, 600))  # Increase plot size

display(hm)  # Render the plot

# Write the figure to disk so that the message below is true
savefig(hm, "football_match.png")
println("Plot saved as football_match.png with increased margins.")
