# Convert raw data into JLD2 format

## Input data (i.e. the data uploaded by Chacoma)

The data provided by Chacoma consists of one file per metric per league.
The format of these files is

    n1,n2,fij
    1,2,-2.1274e+00
    1,3,-2.9301e+00
    1,4,-1.7331e+00
    1,5,-1.7331e+00
    1,6,-3.9430e-01
    ...

where `n1`$=i$, `n2`$=j$ and `fij`$=f_{ij}=-f_{ji}$ are the antisymmetric components of the 1-cochain $f$.

### Tournament points

Here $f_{ij}=\pi_j-\pi_i$, where $\pi_i$ and $\pi_j$ are the points scored in the tournament by teams $i$ and $j$, respectively.
Hence, $f_{ij}<0$ if team $i$ scored more points than team $j$.

        ~/redes/all_1/[league]/red_puntos.dat  

### Ranks

Here $f_{ij}=r_i-r_j<0$ if the final position $r_i$ of team $i$ in the points table is higher than the position $r_j$ of team $j$.
This can be obtained from the file since $r_i=i$ by convention.

    ~/redes/all_1/[league]/red_puntos.dat      

### Metrics

Here $f$ is determined from the empirical metrics.
For example, if $m_{ij}$ represents the *build up time* team $i$ achieved in its matches against team $j$, and $m_{ji}$ is the *build up time* team $j$ achieved in its matches against team $i$, then $f_{ij} = m_{ji}-m_{ij}<0$ if team $i$ has more *build up time* than team $j$.
    
    ~/redes/all_1/[league]/
        red_T_build_up.dat
        red_T_maintenance.dat
        red_T_zona_media.dat
        red_counterattack.dat
        red_crossing.dat
        red_direct_play.dat
        red_flow_rate.dat
        red_pressure_loss.dat
        red_pressure_point.dat
        red_shots.dat  

### Styles

Here $f_{ij}=...$.

    ~/redes/all_1/[league]/
        red_S1.dat
        red_S2.dat
        red_S3.dat
        red_S4.dat
               
## Output format

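A `JLD2` file containing a dictionary that maps each `(league, metric)` pair to a `SimpleWeightedGraph` from the `SimpleWeightedGraphs.jl` library, whose weights are the 1-cochain components $f_{ij}$ for $i<j$. Note that `SimpleWeightedGraphs.jl` discards zero-weight edges, so pairs with $f_{ij}=0$ are not stored in the graph.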

### Sparsification

In some cases, we may consider the generation of sparsified versions of $f_{ij}$ where not all $N(N-1)/2$ pairs $ij$ for $i<j$ are provided.

In [None]:
# Run this cell in a folder contained by the current project
using Pkg
# Alternative ways of selecting the environment, kept for reference:
#Pkg.activate()                 # NOTE(review): original note said this walks up the path tree until it finds Project.toml — confirm against the Pkg docs
#Pkg.activate(".")              # the current working directory
#Pkg.activate("@__DIR__")       # NOTE(review): this is a quoted string, not the `@__DIR__` macro
Pkg.activate("../../../")       # environment three directory levels up (presumably the project root)
Pkg.instantiate()               # install the dependencies recorded for the activated environment

In [None]:
using Plots
using LaTeXStrings
using Graphs
#using MetaGraphs
#using MetaGraphsNext
using GraphPlot
using SimpleWeightedGraphs
using Karnak
using NetworkLayout
using Colors
using Random
using LinearAlgebra
using OffsetArrays
using SparseArrays
using Serialization
using IterativeSolvers
using Krylov
using BenchmarkTools
using Profile
using ProfileView
using Statistics
using DifferentialEquations
using LoopVectorization
using Folds
using Transducers
using Distributed
using JLD2
using OnlineStats
using ImageFiltering
using SavitzkyGolay
using Interpolations
using DataInterpolations
using Polynomials
using CurveFit
using DataFrames
using LsqFit
using ForwardDiff
using JSON
using Dates

## Utils

### Enumerator

In [None]:
"""
    enum!(d, k)

Return the label stored in `d` for key `k`. When `k` is not yet present it is
inserted with the label `length(d) + 1`, so new keys are numbered consecutively
in order of first appearance.
"""
function enum!(d, k)
    next_label = length(d) + 1
    return get!(d, k, next_label)
end

### Running statistics

In [None]:
"""
    RunningStat

Accumulator for the running mean and sample variance of a stream of real
values, updated one value at a time with Welford's recurrence.

Fields are concretely typed so that the accumulator is type-stable; pushed
values are converted to `Float64`.
"""
mutable struct RunningStat
    n::Int      # number of values pushed so far
    m::Float64  # running mean
    s::Float64  # running sum of squared deviations from the mean (not the std)
end

"""
    RunningStat()

Create an empty accumulator.
"""
function RunningStat()
    return RunningStat(0, 0.0, 0.0)
end

"""
    push!(rs::RunningStat, v::Real)

Add the value `v` to the accumulator and return the updated running mean.
"""
function Base.push!(rs::RunningStat, v::Real)
    rs.n += 1
    if rs.n == 1
        # First value: the mean is the value itself and there is no spread yet.
        rs.m = v
        rs.s = 0
    else
        new_m = rs.m + (v - rs.m) / rs.n
        # Uses both the old and the new mean, as required by the recurrence.
        rs.s += (v - rs.m) * (v - new_m)
        rs.m = new_m
    end
    return rs.m
end

"""
    mean(rs::RunningStat)

Return the mean of the values pushed so far (`0.0` for an empty accumulator).
"""
function Statistics.mean(rs::RunningStat)
    return rs.m
end

"""
    var(rs::RunningStat)

Return the sample variance (denominator `n - 1`) of the values pushed so far.
Returns `NaN` when fewer than two values have been pushed, where the sample
variance is undefined.
"""
function Statistics.var(rs::RunningStat)
    rs.n < 2 && return NaN
    return rs.s / (rs.n - 1)
end

"""
    std(rs::RunningStat)

Return the sample standard deviation, i.e. the square root of `var(rs)`.
"""
function Statistics.std(rs::RunningStat)
    return sqrt(var(rs))
end

### Tests

In [None]:
# Synthetic sample: 1000 normal draws shifted by 10 and scaled by 5.
# The trailing semicolon suppresses the cell output.
list_v = 10 .+ 5.0 .* randn(1000);

In [None]:
# Fresh, empty accumulator for the check below.
rs = RunningStat()

In [None]:
# Feed every sample value into the accumulator, one at a time.
foreach(v -> push!(rs, v), list_v)

In [None]:
# Mean and standard deviation reported by the running accumulator.
mean(rs),std(rs)

In [None]:
# Reference values computed directly on the full sample, for comparison.
mean(list_v),std(list_v)

In [None]:
"""
    R_squared(y_data, y_fit)

Return the coefficient of determination `1 - SS_res / SS_tot`, where `SS_res`
is the sum of squared differences between `y_data` and `y_fit`, and `SS_tot`
is the sum of squared differences between `y_data` and its mean.
"""
function R_squared(y_data, y_fit)
    residuals = y_data .- y_fit
    deviations = y_data .- mean(y_data)
    return 1 - sum(residuals .^ 2) / sum(deviations .^ 2)
end

### Tests

In [None]:
# Linear model y = p[1]*x + p[2], broadcast over the elements of `x`.
model(x, p) = @. p[1] * x + p[2]

# Example data set.
x_data = [1.0, 2.0, 3.0, 4.0, 5.0]
y_data = [2.2, 2.8, 3.6, 4.5, 5.1]

# Starting point handed to the fitting routine: [slope, intercept].
p0 = [1.0, 1.0]

# Least-squares fit of the model to the data.
fit = LsqFit.curve_fit(model, x_data, y_data, p0)

# Model evaluated at the fitted parameters.
y_fit = model(x_data, coef(fit))

# Coefficient of determination of the fit.
R_sqr = R_squared(y_data, y_fit)

In [None]:
# Compare the sample data with the fitted line. The R² value in the legend is
# taken from `R_sqr` (rounded to three digits) rather than typed in by hand, so
# it stays correct when the data or the model change.
plot()
plot!(xlabel=L"x")
plot!(ylabel=L"y")
scatter!(x_data, y_data, label="data")
plot!(x_data, y_fit, label="fit, \$R^2=$(round(R_sqr, digits=3))\$")

In [None]:
# Reference parameter vector: p[1] = x0 (location), p[2] = A (amplitude),
# p[3] = B (rate), p[4] = C (additive constant of the primitive).
x0, A, B, C = 1.0, 1.0, 3.0, 0.0
p0 = [x0, A, B, C]

# Sigmoid A / (1 + exp(-B (x - x0))), broadcast over `x`.
function sigmoid(x, p)
    return @. p[2] / (1 + exp(-p[3] * (x - p[1])))
end

# Derivative of the sigmoid with respect to `x`, broadcast over `x`.
function dsigmoid(x, p)
    return @. p[2] * p[3] * exp(-p[3] * (x - p[1])) / (1 + exp(-p[3] * (x - p[1])))^2
end

# Primitive (antiderivative) of the sigmoid, with additive constant p[4].
function psigmoid(x, p)
    return @. p[2] / p[3] * log(1 + exp(p[3] * (x - p[1]))) + p[4]
end

### Test

In [None]:
# Sigmoid evaluated with the reference parameters `p0`.
plot(
    -2:0.01:4.0,
    x -> sigmoid(x, p0);
    label="",
    xlabel=L"x",
    ylabel=L"f(x)",
    ylims=(-0.1, 1.1),
)

In [None]:
# Derivative of the sigmoid evaluated with the reference parameters `p0`.
plot(
    -2:0.01:4.0,
    x -> dsigmoid(x, p0);
    label="",
    xlabel=L"x",
    ylabel=L"df/dx",
    ylims=(-0.1, 1.1),
)
# Marker at (x0, A*B/4), the value of the derivative at x = x0.
scatter!([x0], [A * B / 4], label="")

In [None]:
# Primitive of the sigmoid evaluated with the reference parameters `p0`.
# With p0 = [1, 1, 3, 0] the primitive grows to about 3 at x = 4, so the
# y-limits are widened; the (-0.1, 1.1) window used for the sigmoid itself
# would clip most of the curve.
plot(
    ylims=(-0.1, 3.1),
    xlabel=L"x",
    ylabel=L"F(x)",
)
plot!(-2:0.01:4.0, x -> psigmoid(x, p0), label="")

### Date tag

In [None]:
"""
    get_date_tag(date=today())

Return `date` formatted as a `yy-mm-dd` string (two-digit year, month and day).

With no argument the current date is used. Passing an explicit date makes the
tag reproducible, e.g. to regenerate a file name from a previous run.
"""
function get_date_tag(date::TimeType=today())
    return Dates.format(date, dateformat"yy-mm-dd")
end

#### Test

In [None]:
# Show the tag generated for the current date.
get_date_tag()

## Create `~/redes/all_1/[league]/red_ranks.dat`

In [None]:
# Relative path of the folder that holds one sub-folder per league.
prefix = "../../../redes/all_1/"

## Convert `~/redes/all_1/[league]/red_[metric].dat` to JLD2 format

In [None]:
# Space-separated metric names; each one matches a `red_[metric].dat` file.
metrics = join(
    [
        "crossing",
        "counterattack",
        "pressure_loss",
        "T_build_up",
        "direct_play",
        "pressure_point",
        "shots",
        "flow_rate",
        "T_maintenance",
        "T_zona_media",
    ],
    " ",
)
# Style networks.
metrics_extra = join(["S1", "S2", "S3", "S4"], " ")
# Tournament points and ranks.
metrics_truth = join(["puntos", "ranks"], " ")

In [None]:
# All metric names in a single space-separated string.
metrics_all = join((metrics, metrics_extra, metrics_truth), " ")

In [None]:
# Space-separated league names; each one matches a sub-folder of `prefix`.
leagues = join(["England", "France", "Germany", "Italy", "Spain"], " ")

In [None]:
# Paths of all input files, one per (league, metric) pair, ordered league by
# league. The vector is typed as `String[]` instead of the untyped `[]`
# (a `Vector{Any}`), so later code working on the paths is type-stable.
archivos = String[]
for league in split(leagues), metric in split(metrics_all)
    push!(archivos, prefix * league * "/red_" * metric * ".dat")
end

In [None]:
# Display the generated list of input paths.
archivos

In [None]:
"""
    read_cochain_file(archivo)

Read a `n1,n2,fij` text file and return its columns as the tuple
`(vec_i, vec_j, vec_f_ij)`.

The first line is treated as a header and skipped, and blank lines are ignored.
Columns may be separated by commas and/or whitespace. An error is raised when a
row has fewer than three columns or does not satisfy `i < j`.
"""
function read_cochain_file(archivo)
    vec_i = Int64[]
    vec_j = Int64[]
    vec_f_ij = Float64[]
    open(archivo) do fh
        # `drop(…, 1)` skips the header, so data row `k` is file line `k + 1`.
        for (k, line) in enumerate(Iterators.drop(eachline(fh), 1))
            cols = split(replace(line, "," => " "))
            isempty(cols) && continue
            length(cols) >= 3 || error(
                "$archivo, line $(k + 1): expected 3 columns, got $(length(cols))"
            )
            i = parse(Int64, cols[1])
            j = parse(Int64, cols[2])
            # Explicit check instead of `@assert`, which is meant for internal
            # invariants and is not a reliable way to validate input data.
            i < j || error("$archivo, line $(k + 1): expected i < j, got i=$i, j=$j")
            push!(vec_i, i)
            push!(vec_j, j)
            push!(vec_f_ij, parse(Float64, cols[3]))
        end
    end
    return vec_i, vec_j, vec_f_ij
end

"""
    graph_tag(archivo)

Return the `(league, metric)` pair encoded in a path of the form
`…/[league]/red_[metric].dat`, as a tuple of `String`s.

The league is the name of the containing folder and the metric is the file name
without its extension and without the leading `red_`.
"""
function graph_tag(archivo)
    league = basename(dirname(archivo))
    stem = first(splitext(basename(archivo)))
    # Only the leading `red_` is stripped, so metric names are left intact.
    metric = replace(stem, r"^red_" => "")
    return (String(league), String(metric))
end

"""
    load_weighted_graph(archivo)

Build the `SimpleWeightedGraph` whose edge weights are the `f_ij` values listed
in `archivo`. The number of vertices is the largest index found in the file.
"""
function load_weighted_graph(archivo)
    vec_i, vec_j, vec_f_ij = read_cochain_file(archivo)
    isempty(vec_i) && error("no data rows found in $archivo")
    n = max(maximum(vec_i), maximum(vec_j))
    g = SimpleWeightedGraph{Int64,Float64}(n)
    # SimpleWeightedGraphs does not store zero-weight edges: report them so
    # that missing pairs in the output are not a surprise.
    n_zero = count(iszero, vec_f_ij)
    if n_zero > 0
        @warn "zero-weight entries are not stored as edges" archivo n_zero
    end
    for (i, j, f_ij) in zip(vec_i, vec_j, vec_f_ij)
        add_edge!(g, i, j, f_ij)
    end
    return g
end

# Dictionary (league, metric) => weighted graph, later written to the JLD2 file.
dict_weighted_g_jld2 = Dict{Tuple{String,String},SimpleWeightedGraph{Int64,Float64}}()
for archivo in archivos
    println(archivo)
    g = load_weighted_graph(archivo)
    tag = graph_tag(archivo)
    println("tag =",tag)
    dict_weighted_g_jld2[tag] = g
end

### Tests

In [None]:
# Pick one of the converted graphs for inspection.
g = dict_weighted_g_jld2[("Italy","shots")]

In [None]:
# Print every edge of `g` together with the weight returned by `get_weight`.
for e in edges(g)
    @show e,get_weight(g,e)
end

In [None]:
# Write the dictionary of graphs to a date-tagged JLD2 file.
date_tag = get_date_tag()
jld2_file = "jld2/"*date_tag*"-dict-weighted-g.jld2"
# Create the output folder if needed; saving fails when it does not exist.
mkpath(dirname(jld2_file))
@show jld2_file
@save jld2_file dict_weighted_g_jld2

### Tests

In [None]:
# Read the dictionary back from the file written above.
@load jld2_file dict_weighted_g_jld2

In [None]:
# Display the reloaded dictionary.
dict_weighted_g_jld2

# References

1. [A. Chacoma and O. V. Billoni, Data-driven approach to defining football styles in major leagues, Chaos, Solitons & Fractals 200, 116926 (2025)](https://doi.org/10.1016/j.chaos.2025.116926)
2. [J. I. Perotti, Analysis of the inference of ratings and rankings in complex networks using discrete exterior calculus on higher-order networks, Phys. Rev. E 111, 034306 (2025)](https://journals.aps.org/pre/abstract/10.1103/PhysRevE.111.034306)