# Process ODH Data
This notebook extracts the necessary numbers for PDE DSA from the output of my parseODH.py script.

In [None]:
using CSV,DataFrames,Dates,Plots,Measures,ColorSchemes
# Select the GR backend and make the reversed default palette the default for all plots.
gr()
clr = palette(:default, rev=true)
default(palette=clr)

# Local B-spline code. Knots, Bspl1d, lsqspl and ∂ are used further down and are not
# defined anywhere else in this file, so presumably they come from here (confirm).
include("bspl1d.jl")

# NOTE(review): presumably widens the assumed display width so wide tables are not
# truncated when shown — confirm.
ENV["COLUMNS"] = 10000;

## Inputs

In [None]:
# Inputs
# First and last calendar day of the window kept from the ODH/CDC series.
day0 = Date("2020-10-01");
dayf = Date("2021-02-15");

# From ERoot
# Age brackets; this ordering is reused by every per-age loop below.
agekeys = ["0-9","10-19","20-29","30-39","40-49","50-59","60-69","70-79","80+"]
# Population per bracket. Where a source count straddles two brackets it is split by the
# fixed fractions written out in the expressions (1/3 – 2/3 and 1/2 – 1/2).
ohiopop = Dict{String,Float64}(
    "0-9"   => 695933. + 1/3*2233431,
    "10-19" => 2. /3*2233431,
    "20-29" => 1554324.,
    "30-39" => 1428941.,
    "40-49" => 691199. + 1/2*1544429,
    "50-59" => 1/2*1544429. + 834920,
    "60-69" => 765841. + 631247,
    "70-79" => 449394. + 331259,
    "80+"   => 228889. + 252072);
totpop = sum(values(ohiopop));
println("Total Ohio Population: $totpop")

# Normalizations from case counts to probability densities for comparing to pde prediction
#  Assumes that at t=0 only yˢ is nonzero.
# Dates.value is the public accessor for the number of days held by the Day period.
tsim = Dates.value(dayf - day0);
ηyˢ = 1/totpop/(100*365+tsim);
ηyᵛ = 1/totpop/tsim;
ηyⁱ = 1/totpop/tsim;

## Function routines for processing

In [None]:
"""
    mymvavg(v; halfwidth=3)

Return the centered moving average of `v`.

Entry `i` of the result is the mean of `v[i-halfwidth:i+halfwidth]`, with the window
clipped to the ends of `v`, so the first and last `halfwidth` entries are averaged over
fewer samples. The default `halfwidth=3` gives the 7-sample window used in this notebook.

# Arguments
- `v`: real-valued samples; integer input is accepted and produces floating-point output.

# Keywords
- `halfwidth`: number of neighbours taken on each side; must be non-negative.

# Returns
A new vector with the same indices as `v`; `v` itself is not modified.

# Throws
- `ArgumentError` if `halfwidth` is negative.
"""
function mymvavg(v::AbstractVector{<:Real}; halfwidth::Integer=3)
    halfwidth >= 0 ||
        throw(ArgumentError("halfwidth must be non-negative, got $halfwidth"))
    w = similar(v, float(eltype(v)))
    lo, hi = firstindex(v), lastindex(v)
    for i in eachindex(v)
        a = max(lo, i - halfwidth)
        b = min(hi, i + halfwidth)
        # A view avoids copying the window just to sum it.
        w[i] = sum(@view v[a:b]) / (b - a + 1)
    end
    return w
end;

## Data
#### ODH cases

In [None]:
# Load the ODH table. joinpath inserts the platform's own separator, so the path also
# resolves on systems where "\\" is not a directory separator.
dfodh = CSV.read(joinpath("ODH_Data", "ODH_0119.csv"), DataFrame);
println("Raw ODH:");
first(dfodh, 3)

In [None]:
# Row mask for dates inside the closed window [day0, dayf].
flag = day0 .<= dfodh[!, :time] .<= dayf;
dftemp = dfodh[flag, :];
# dfwave starts as a copy of the windowed rows; columns 2 through 12 are then replaced
# by their moving average, while dftemp keeps the unsmoothed values.
dfwave = deepcopy(dftemp);
for col in 2:12
    dfwave[!, col] = mymvavg(dftemp[!, col]);
end
println("7-day averaged ODH:")
first(dfwave, 3)

In [None]:
# Left panel: the full reported series (dashed) with the selected window drawn on top.
p1 = plot(dfodh[!, :time], dfodh[!, :daily_confirm];
          linewidth=1, linestyle=:dash, labels="", xlabel="date", ylabel="reported",
          title="ODH daily cases");
plot!(p1, dftemp[!, :time], dftemp[!, :daily_confirm]; linewidth=2, labels="");

# Right panel: windowed raw counts (dashed) against their moving average.
p2 = plot(dftemp[!, :time], dftemp[!, :daily_confirm];
          linewidth=1, linestyle=:dash, labels="reported", xlabel="date", ylabel="")
plot!(p2, dftemp[!, :time], dfwave[!, :daily_confirm];
      linewidth=2, labels="7-day avg", title="First wave");

# Show both panels side by side.
lay = @layout [a b]
plot(p1, p2, layout=lay, size=(950,200), margin=5mm)

In [None]:
# Smoothed daily cases, one line per column 2–10 of dfwave.
# NOTE(review): the legend assumes those columns are the age brackets in this order — confirm.
p3 = plot(dfwave[!, "time"], [dfwave[:, col] for col in 2:10];
          legend=:topleft, size=(950,300), linewidth=2,
          labels=["0-9" "10-19" "20-29" "30-39" "40-49" "50-59" "60-69" "70-79" "80+"],
          title="ODH 7-day average", xlabel="date", ylabel="daily cases", margin=5mm,
          palette=clr)
println("How to compute data we are fitting from this?")
# Tick every 14 days across the window.
plot!(p3, xticks=day0:Day(14):dayf)

In [None]:
# Running totals of the smoothed daily cases, one column per age bracket.
# The frame holds only the columns that are actually filled in; a `similar(dfwave)`
# template would leave every remaining column holding uninitialized values.
dfcumwave = DataFrame("time" => dfwave[:, :time]);
for key in agekeys
    dfcumwave[!, key] = cumsum(dfwave[!, key]);
end
# Series are selected by bracket name so that each line matches its legend entry.
p4 = plot(dfcumwave[!, "time"], [dfcumwave[:, key] for key in agekeys];
          legend=:topleft, size=(950,300), linewidth=2,
          labels=permutedims(agekeys),
          title="ODH 7-day average", xlabel="date", ylabel="cumulative cases", margin=5mm,
          palette=clr)
plot!(p4, xticks=day0:Day(14):dayf)

#### CDC vaccination

In [None]:
# Load the CDC first-dose table. joinpath inserts the platform's own separator, so the
# path also resolves on systems where "\\" is not a directory separator.
dfcdc = CSV.read(joinpath("ODH_Data", "VaxCdcJan03_1D.csv"), DataFrame);
# NOTE(review): the "19-Oct" header looks like "10-19" after a spreadsheet turned it
# into a date — confirm against the file.
rename!(dfcdc, "Date" => "time", "19-Oct" => "10-19");
dfcdc[!, "0-9"] = convert(Vector{Float64}, dfcdc[!, "0-9"]);
# The dateformat"..." literal is built once instead of re-parsing the format string.
dfcdc[!, :time] = Date.(dfcdc[!, :time], dateformat"m/d/yyyy");
dfcdc[!, :time] .= dfcdc[!, :time] .- Day(14); # my parse vax routine added a 14 day delay
# Keep only rows whose (shifted) date lies inside [day0, dayf].
flag = (dfcdc[!, :time] .>= day0) .& (dfcdc[!, :time] .<= dayf);
dfvax = dfcdc[flag, :];
println("'Raw' CDC Vax_1D data:")
last(dfvax, 3)

In [None]:
# Replace columns 2–10 of dfvax by their moving average.
# NOTE: dfvax is overwritten, so running this cell a second time smooths the already
# smoothed values again.
for name in names(dfvax)[2:10]
    dfvax[!, name] = mymvavg(dfvax[!, name]);
end
println("7-day smooth CDC Vax_1D data:")
first(dfvax, 3)

In [None]:
# Smoothed daily doses, one line per column 2–10 of dfvax.
p5 = plot(dfvax[!, "time"], [dfvax[:, col] for col in 2:10];
          legend=:topleft, size=(950,300), linewidth=2,
          labels=["0-9" "10-19" "20-29" "30-39" "40-49" "50-59" "60-69" "70-79" "80+"],
          title="CDC 1D 7-day average", xlabel="date", ylabel="daily doses", margin=5mm,
          palette=clr);
println("Does this suggest a λ shape?")
# Zero-width vertical line at day0.
# NOTE(review): presumably only there to stretch the x-axis back to day0 — confirm.
vline!(p5, [day0]; linewidth=0, labels="")
plot!(p5, xticks=day0:Day(14):dayf)

In [None]:
# Running totals of the smoothed daily doses, one column per age bracket.
# The frame holds only the columns that are actually filled in, rather than starting
# from an uninitialized `similar(dfvax)` template.
dfcumvax = DataFrame("time" => dfvax[:, :time]);
for key in agekeys
    dfcumvax[!, key] = cumsum(dfvax[!, key]);
end
# Series are selected by bracket name so that each line matches its legend entry.
# The title names CDC, the source of the vaccination data.
p6 = plot(dfcumvax[!, "time"], [dfcumvax[:, key] for key in agekeys];
          legend=:topleft, size=(950,300), linewidth=2,
          labels=permutedims(agekeys),
          title="CDC 1D 7-day average", xlabel="date", ylabel="cumulative vaccinations",
          margin=5mm, palette=clr)
# Zero-width vertical line at day0.
# NOTE(review): presumably only there to stretch the x-axis back to day0 — confirm.
vline!(p6, [day0]; linewidth=0, labels="")
plot!(p6, xticks=day0:Day(14):dayf)

### Vaccination hazard

We may estimate the hazard through the relation $\lambda(t)=\lim_{\Delta t\rightarrow 0}\frac{P\left(T\leq t+\Delta t \mid T>t\right)}{\Delta t} = -\frac{d}{dt}\left[\ln G(t)\right]$, where $G(t) = P(T\geq t)$ is the survival function. We estimate $\ln G(t)$ from the cumulative vaccination counts, fit $-\ln G(t)$ with a smooth spline, and take the derivative of the fit as the hazard estimate.

In [None]:
# Parameters for spline knots
# Four evenly spaced points over the day index 0 … (rows − 1), with 5.0 added and the
# result sorted.
Δ = sort!(vcat(collect(LinRange(0.0, length(dfcumvax[!, "time"]) - 1, 4)), 5.0))
# r and d are passed to Knots(Δ, r, d) below; their meaning is set by bspl1d.jl.
r = 2
d = 3;

In [None]:
# Compute the ln survival function in each age group and extract empirical hazard
# ℓnG[key] is the log of the fraction of the bracket's population not yet counted in
# the cumulative vaccination column.
ℓnG = Dict{String,Vector{Float64}}(
    key => log.((ohiopop[key] .- dfcumvax[!, key]) / ohiopop[key]) for key in agekeys
);

# Day index 0, 1, 2, … with one entry per row of dfcumvax.
xval = collect(Float64, 0:(length(dfcumvax[!, "time"]) - 1));
knots = Knots(Δ, r, d);

# Spline fit of −ln G for every bracket.
bspl = Dict{String,Bspl1d}(
    key => lsqspl(xval, -ℓnG[key], knots) for key in agekeys
);

vbspl = [bspl[key] for key in agekeys];
# Tip: p1.<tab> in an interactive session lists the fields of the plot object.
p1 = plot(vbspl);
# Label the nine fitted curves before the scatter series are added.
for (k, lbl) in enumerate(("0-9", "10-19", "20-29", "30-39", "40-49",
                           "50-59", "60-69", "70-79", "80+"))
    p1[1][k][:label] = lbl
end
# Overlay the empirical −ln G points, faint and without legend entries.
for key in agekeys
    plot!(p1, xval, -ℓnG[key]; seriestype=:scatter, markersize=3, labels="", alpha=0.2)
end
plot!(p1, legend=:topleft, xlabel="days after Dec 15", ylabel="-ℓn P(T>=t)")

# Derivative of each fitted spline, i.e. the hazard estimate.
∂vbspl = [∂(bspl[key]) for key in agekeys];
p2 = plot(∂vbspl);
for (k, lbl) in enumerate(("0-9", "10-19", "20-29", "30-39", "40-49",
                           "50-59", "60-69", "70-79", "80+"))
    p2[1][k][:label] = lbl
end
plot!(p2, xlabel="days after Dec 15th", ylabel="hazard rate λ", legend=:topleft,
      ylim=(0.0,0.014));

lay = @layout [a b];
plot(p1, p2, layout=lay, size=(800,300))

In [None]:
# No plot handle is passed, so this saves the current Plots figure to VaxHaz.pdf.
savefig("VaxHaz.pdf");

In [None]:
println("Hazard rate bsplines by age group:")
# One hazard panel per age bracket: the derivative of that bracket's fitted spline,
# labelled with the bracket name. All panels share the same y-range.
Pλ = map(agekeys) do key
    panel = plot(∂(bspl[key]))
    plot!(panel, xlabel="days after Dec 15th", ylabel="λ", legend=:topleft,
          ylim=(0.0,0.014))
    panel[1][1][:label] = key
    panel
end;

# 3×3 grid holding the nine panels in bracket order.
lay = @layout [a b c;d e f;g h i]
plot(Pλ..., layout=lay, size=(900,600))

In [None]:
# No plot handle is passed, so this saves the current Plots figure to VaxHaz_ages.pdf.
savefig("VaxHaz_ages.pdf");

## Equation terms estimated from ODH and CDC data
#### $f^s$

In [None]:
"""
    fˢ(s::Float64)

Piecewise-linear density over age `s`.

The value is zero outside `[0, 100]`, constant for `s < 70`, and falls linearly from
that constant at `s = 70` to zero at `s = 100`. The divisor `1.0625` equals
`70*0.0125 + 30*0.0125/2`, so the area under the curve is one.
"""
function fˢ(s::Float64)
    # Outside the age span there is no mass.
    (s < 0.0 || s > 100.0) && return 0.0
    # Flat part.
    s < 70.0 && return 0.0125/1.0625
    # Linear ramp down to zero at s = 100.
    return (s-100)*(-0.0125/30)/1.0625
end;

In [None]:
# Share of the total population held by each bracket.
agsz = map(key -> ohiopop[key]/totpop,
           ["0-9","10-19","20-29","30-39","40-49","50-59","60-69","70-79","80+"]);
# Bars sit at 5, 15, …, 75 and at 90. Heights are the shares divided by 10 for the
# first eight brackets and by 20 for the last one.
p7 = plot(vcat(collect(5:10:75), 90), vcat(agsz[1:end-1] ./ 10, agsz[end] / 20);
          seriestype=:bar, ylabel="pdf", xlabel="age group",
          xticks=0:10:100, size=(475,200), margin=5mm, labels="",
          title="Ohio age distribution",
          bins=[0:10:80,100]);
# Evaluate fˢ on a fine age grid to overlay on the bars.
s = 0:0.01:100
y = map(fˢ, s);
println("Except numerically we also multiply by switch bumps at s=0,s=100:")
# Dashed marker at s = 70, where fˢ switches from its flat part to its linear part.
vline!(p7, [70.0]; c=:red, linestyle=:dash, linewidth=1, labels="")
plot!(p7, s, y; linewidth=2, c=:red, labels="fˢ")

#### $y^s$
Each age bracket below describes how $y^s$ evolves, within that bracket, as a function of time. One subtlety is that the measured $y^s$ depends on the reporting factor $\eta$ as well as on $(s,t)$.
1. The first plot (note the $\eta$ dependence) gives time-series points that the simulated $y^s$ density could be matched against (e.g. at the middle age of each bracket).
2. The second plot shows $y^s$ as a heatmap (note the $\eta$ dependence).

In [None]:
η = 1.0; # reporting factor for infection

println("yˢ normalized to have unit mass at initial time over [0,100] years:")
# Per bracket: population minus η × cumulative cases minus cumulative vaccinations,
# scaled by ηyˢ. The vaccination series is padded with leading zeros for the days
# between the first case row and the first vaccination row.
# Dates.value is the public accessor for the day count of that difference.
p8 = let nlag = Dates.value(dfvax[1, :time] - dfwave[1, :time])
    plot(dfwave[!, "time"],
         [(ohiopop[key] .- η*dfcumwave[:, key] - [fill(0.0, nlag); dfcumvax[:, key]])*ηyˢ for key in agekeys];
         legend=:topleft, size=(950,300), linewidth=2,
         labels=["0-9" "10-19" "20-29" "30-39" "40-49" "50-59" "60-69" "70-79" "80+"],
         title="yˢ by age group: η=$η", xlabel="date", ylabel="yˢ", margin=5mm,
         palette=clr)
end

In [None]:
# Time axis in days since the first case row; age axis in steps of 0.1 year.
taxis = 0:1:Dates.value(dfwave[end, :time] - dfwave[1, :time]);
saxis = 0:0.1:100;
yˢ = Matrix{Float64}(undef, length(taxis), length(saxis));
let nlag = Dates.value(dfvax[1, :time] - dfwave[1, :time])
    # The value at (t, s) depends on s only through its bracket, so each bracket's time
    # series is built once: population minus η × cumulative cases minus cumulative
    # vaccinations (zero-padded for the days before the first vaccination row), times ηyˢ.
    series = Dict{String,Vector{Float64}}(
        key => (ohiopop[key] .- η*dfcumwave[:, key] - [fill(0.0, nlag); dfcumvax[:, key]])*ηyˢ
        for key in agekeys
    )
    for (j, age) in enumerate(saxis)
        # Bracket index = number of lower bounds 0, 10, …, 80 that are ≤ age.
        # The clamp sends ages below 0 to the first bracket; 80 and above use the last.
        bracket = clamp(count(<=(age), 0:10:80), 1, length(agekeys))
        yˢ[:, j] .= view(series[agekeys[bracket]], eachindex(taxis))
    end
end;
println("ODH/CDC derived susceptibility evolution from $day0 to $dayf:")
p9 = heatmap(saxis, taxis, yˢ*1e6;
        title="yˢ by age group: η=$η \n (scaled by 1e6)", size=(400,250),
        xlabel="ages (years)", ylabel="days after Oct 1", xguidefontsize=10,
        yguidefontsize=10, titlefontsize=12,
        margin=5mm)

In [None]:
# Export yˢ
# One row per case-data day: the calendar date, the day offset from day0, and one yˢ
# column per age bracket.
dftemp = DataFrame("date" => dfwave[!, "time"])
# The offset is measured from day0, so it follows the window start set in the inputs.
dftemp[!, "time"] = Dates.value.(dftemp[!, "date"] .- day0);
let nlag = Dates.value(dfvax[1, :time] - dfwave[1, :time])
    for key in agekeys
        # Population minus η × cumulative cases minus cumulative vaccinations (padded
        # with zeros for the days before the first vaccination row), scaled by ηyˢ.
        dftemp[!, key] = (ohiopop[key] .- η*dfcumwave[:, key] - [fill(0.0, nlag); dfcumvax[:, key]])*ηyˢ;
    end
end
CSV.write("ODH_ys.csv", dftemp);

#### $y^v$ and $y^i$
ODH and CDC report daily incidence. These are the values that the boundary data on the implicit axis should take for infection and vaccination, respectively, once normalized by the Ohio state population.

In [None]:
# Smoothed daily confirmed cases scaled by η and ηyⁱ.
# Note that p9 also names the yˢ heatmap earlier in this file; this assignment replaces it.
p9 = plot(dfwave[:, :time], η*dfwave[:, :daily_confirm]*ηyⁱ;
          linewidth=2, xlabel="date", ylabel="yⁱ(0,t)",
          title="ODH incidence (η=$η)", labels="", xticks=day0:Day(45):dayf, margin=5mm);
# Add a "daily_confirm" column to dfvax holding the sum of the nine age columns.
transform!(dfvax,
           ["0-9","10-19","20-29","30-39","40-49","50-59","60-69","70-79","80+"] =>
               (+) => "daily_confirm");
p10 = plot(dfvax[:, :time], dfvax[:, :daily_confirm]*ηyᵛ;
           linewidth=2, xlabel="date", ylabel="yᵛ(0,t)",
           title="CDC Vax", labels="", xticks=day0:Day(21):dayf, margin=5mm);

lay = @layout [a b];
println("ODH and CDC measurements for yⁱ and yᵛ:")
plot(p9, p10, layout=lay, size=(800,200))

In [None]:
# Export yᵛ,yⁱ
# Each file holds the calendar date, the day offset from day0 and the normalized series.
# The offset is measured from day0, so it follows the window start set in the inputs.
dftemp = DataFrame("date" => dfwave[!, "time"])
dftemp[!, "time"] = Dates.value.(dftemp[!, "date"] .- day0);
dftemp[!, "yi"] = η*dfwave[:, :daily_confirm]*ηyⁱ;
CSV.write("ODH_yi.csv", dftemp);

# The "daily_confirm" column of dfvax is the all-ages dose total added by the
# transform! call earlier in this file.
dftemp = DataFrame("date" => dfvax[!, "time"])
dftemp[!, "time"] = Dates.value.(dftemp[!, "date"] .- day0);
dftemp[!, "yv"] = dfvax[:, :daily_confirm]*ηyᵛ;
CSV.write("ODH_yv.csv", dftemp);