# Processing of dust data with individual isolation movement
This notebook processes the data streams needed to calibrate the dust-shedding hyperparameters on Spring 2022 data.

In [1]:
using CSV,DataFrames,Plots,Measures
using Dates
gr();

ENV["COLUMNS"]=10000;
ENV["LINES"] = 500;

## Inputs

In [None]:
# Paths to the CSV files holding Karen's dust collection data.
#  File names should follow the "MmmDd.csv" convention expected by myparsedustfname below.
fKaren = [""];

# Path to the Goldstar test file
fgdstr = "";

# Paths to Chance's spreadsheets: per-student isolation move-in/move-out, and building test compliance
fiso = "";
fcmpl = "";

# First and last day of the dust collection interval
day0 = Date(2022, 1, 10);
dayf = Date(2022, 1, 17);

# Window past dayf within which a later positive test still counts toward this interval's shedding
daywin = Day(5);

# Buildings calibrated against their own dust measurements
blds = [""];

## Ancillary functions

If dust data spreadsheets are added whose file names use a month or year that `myparsedustfname` (below) does not handle, that function must be updated accordingly.

In [None]:
"""
    dtrg(v)

Return the tuple `(earliest, latest)` of the dates found in `v`.

Every non-missing entry of `v` is parsed as an `m/d/yyyy` date string; `missing`
entries are skipped. When `v` has no non-missing entry, `(missing, missing)` is returned.
"""
function dtrg(v)
    # Build the format once instead of re-parsing the format string for every entry.
    fmt = DateFormat("m/d/yyyy")
    dates = [Date(s, fmt) for s in skipmissing(v)]
    # No sentinel dates: an empty selection is reported directly as missing.
    isempty(dates) && return missing, missing
    return extrema(dates)
end;

# Earliest date among the entries of `v`, i.e. the first element of `dtrg(v)`.
myparsebeg(v) = first(dtrg(v))

# Latest date among the entries of `v`, i.e. the second element of `dtrg(v)`.
myparseend(v) = last(dtrg(v));

# Uppercase `v`; a `missing` value is handed back unchanged.
myuppercase(v) = ismissing(v) ? v : uppercase(v);

"""
    mysplit(s::AbstractString)

Return the `Date` parsed from the text of `s` that precedes the first space,
using the default `Date` string format.
"""
function mysplit(s::AbstractString)
    # AbstractString (not String) so that the non-String string types a CSV reader may
    # produce for text columns (inline strings, substrings) dispatch here as well.
    return Date(first(split(s, " ")))
end;

"""
    myparsedustfname(s::AbstractString; year::Integer=2022)

Return the date encoded in a dust-sheet file name of the form `"MmmDd.csv"`
(three-letter English month abbreviation followed by a one- or two-digit day).

Any leading directory components of `s` are ignored. The year is not part of the
file name and is taken from the `year` keyword.

Throws an `ArgumentError` when the file name does not start with a recognised
month abbreviation followed by a day number.
"""
function myparsedustfname(s::AbstractString; year::Integer=2022)
    months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    # Callers pass whatever path was used to read the file, so strip directories first.
    name = basename(s)
    m = match(r"^([A-Za-z]{3})(\d{1,2})", name)
    m === nothing &&
        throw(ArgumentError("dust file name \"$name\" does not follow the \"MmmDd.csv\" convention"))
    mth = findfirst(==(m.captures[1]), months)
    mth === nothing &&
        throw(ArgumentError("unrecognised month abbreviation in dust file name \"$name\""))
    return Date(year, mth, parse(Int, m.captures[2]))
end

"""
    ressplit(s::Union{AbstractString,Missing})

Used to extract residential buildings from dorm entries of goldstar.

Splits `s` on `" - "` and returns the second field as a `String`; a `missing`
input is returned unchanged. Throws a `BoundsError` when `s` does not contain
the `" - "` separator.
"""
function ressplit(s::Union{AbstractString,Missing})
    ismissing(s) && return s
    # AbstractString in the signature lets non-String string column types dispatch here;
    # the result is still materialised as a plain String.
    return String(split(s, " - ")[2])
end;
    
# True exactly when `x` lies in the closed interval [d1, d2].
btwn(x::Date,d1::Date,d2::Date) = d1 <= x <= d2;

"""
Used to map an iso start date into the range of Chance's dust compliance.

Scans consecutive pairs of `dtrg` in order and returns `dtrg[i+1]` for the first
pair with `dtrg[i] < dt <= dtrg[i+1]`. Returns `missing` when no pair brackets
`dt` (including when `dtrg` has fewer than two elements).
"""
function isocmplymap(dt::Date,dtrg::Vector{Date})
    hit = findfirst(k -> dtrg[k] < dt <= dtrg[k+1], 1:length(dtrg)-1)
    return hit === nothing ? missing : dtrg[hit+1]
end

## Process dust sheets

In [None]:
# Build master dataframe for Karen's dust sheets.
# Each sheet listed in fKaren is read and tagged with the path it came from, so the
# collection date can later be recovered from the file name.
dfdustorg = similar(CSV.read(fKaren[1],DataFrame),0); dfdustorg[!,"fname"] = String[]
for fname in fKaren
    dftemp = CSV.read(fname,DataFrame); dftemp[!,"fname"] = fill(fname,nrow(dftemp));
    dfdustorg = vcat(dfdustorg,dftemp);
end
# Normalise building names so the same building groups together regardless of case
dfdustorg[!,:Building]=myuppercase.(dfdustorg[!,:Building]);
dfdust = dropmissing(dfdustorg,["Building","Result"]); gdf = groupby(dfdust,"Building");
# NOTE(review): the "File Date" transformation returns one value per input row while the
# other three return one value per building, so the per-building values are repeated on
# every row of that building. Confirm this is intended (vs. grouping by building and file).
dfdust = combine(gdf,"Start Date"=>myparsebeg=>"Start Date",
                 "Result"=>sum=>"Result (cp/mg)",
                 "End Date"=>myparseend=>"End Date",
                 "fname"=>(x->myparsedustfname.(x))=>"File Date");
select!(dfdust,["Building","Start Date","End Date","File Date","Result (cp/mg)"]); sort!(dfdust,["Building","File Date"])

In [None]:
# Display the per-building dust table (dfdust) as the cell output
println("Building dust signal:")
dfdust

## Process Goldstar and Iso/Bld compliance

In [None]:
# Load the Goldstar test records from the path configured in the Inputs section
dfmst = CSV.read(fgdstr,DataFrame);
# Drop the name column, then discard rows lacking any field used below.
# The id column is "emplid", the same name used by the select! and the later join.
select!(dfmst,Not("name_n")); dropmissing!(dfmst,["emplid","test_dt","result_dt","campus","general_affil"]);
# Keep only the date part of the test / result timestamps
dfmst[!,"test_dt"] = mysplit.(dfmst[!,"test_dt"]);
dfmst[!,"result_dt"] = mysplit.(dfmst[!,"result_dt"]);
# Restrict to COL-campus students tested and resulted on or after 2022-01-10
flag = (  (dfmst[!,"test_dt"].>=Date("2022-01-10")).&(dfmst[!,"result_dt"].>=Date("2022-01-10"))
           .&(dfmst[!,"campus"].=="COL").&(dfmst[!,"general_affil"].=="STUDENT")  );
dfmst=dfmst[flag,:];
select!(dfmst,["emplid","test_dt","result_dt","result","addr_typ","addr_1"]);
first(dfmst)

In [None]:
# Preview the first seven filtered Goldstar rows
first(dfmst,7)

#### Filter to just on campus population

In [None]:
# Keep only records whose address type is "DORM" (the on-campus population)
gdf = groupby(dfmst,"addr_typ");
dfoncmp = gdf[("DORM",)]|>DataFrame;
# Reduce each dorm address to its building name; the source column is read from
# dfoncmp, the frame defined on the line above.
dfoncmp[!,"addr_1"] = ressplit.(dfoncmp[!,"addr_1"]);

#### Print the names of campus buildings

In [None]:
# Distinct building names appearing in the on-campus records
unique(dfoncmp[!,"addr_1"])

### Extract isolation movement and building compliance from Chance's output

#### Individual traffic for the relevant date ranges

In [None]:
# Load Chance's isolation movement sheet and drop the columns not used here
dfiso = CSV.read(fiso,DataFrame); select!(dfiso,Not(["Column1","order_location","dorm_building"]));
dropmissing!(dfiso,"start_date");
rename!(dfiso,["start_date"=>"isostart_date","release_date"=>"isorelease_date"])
# Isolation start expressed as an offset from 2022-01-10
transform!(dfiso,"isostart_date"=>(x->(x.-Date("2022-01-10")))=>"isostart_day_relJan10");
# Attach isolation records to the on-campus test records (dfoncmp)
dfram = leftjoin(dfoncmp,dfiso,on=:emplid);

# Filter to those with positive test results in the relevant date ranges
gdf = groupby(dfram,["result"]);
dfindiv = DataFrame(gdf[("DETECTED",)]);
# The left join leaves isostart_date missing for positives without an isolation record;
# treat those as outside the window so the mask is purely Bool and can index rows.
flag=coalesce.((dfindiv[!,"isostart_date"].>=day0).&(dfindiv[!,"isostart_date"].<=dayf+daywin),false);
println("Test positive individuals in all buildings moving into isolation during from $day0 to $daywin days after $dayf");
dfindiv=dfindiv[flag,:];sort!(dfindiv,["addr_1","isostart_date"])

#### Extract building compliance from Chance
Confirm with Chance that we should use an end date over a start date

In [None]:
# Examine Chance's compliance spreadsheet
dfcmpl = CSV.read(fcmpl,DataFrame); select!(dfcmpl,Not("Column1")); dropmissing!(dfcmpl,"dorm");
# Each compliance row is keyed by the end of its period, taken as start_date + 7 days
transform!(dfcmpl,"start_date"=>(x->x.+Day(7))=>"end_date");
# Compliance = fraction of those required to test who were compliant
transform!(dfcmpl,["n_compliant","n_to_test"]=>((x,y)->x./y)=>"compliance");
# n_to_test is retained: the renormalization step reads it as the building population
select!(dfcmpl,["dorm","end_date","compliance","n_to_test"]);

# Filter to relevant date ranges
# (rows whose end_date is missing are treated as outside the range)
flag = coalesce.((dfcmpl[!,"end_date"].>day0).&(dfcmpl[!,"end_date"].<=dayf),false);
println("Campus compliance across all buildings from ($day0,$dayf]")
dfcmpl = dfcmpl[flag,:]; sort!(dfcmpl,["dorm","end_date"])

In [None]:
# Average the per-building compliance within each end_date to get a campus-wide figure
println("Campus compliance summary statistics:")
gdf = groupby(dfcmpl,"end_date");
# Unweighted mean over the rows sharing an end_date (sum divided by row count)
dfcmpl_sum = combine(gdf,"compliance"=>(x->sum(x)/length(x))=>"mean campus compliance")

#### Restrict to the relevant buildings for this analysis

In [None]:
# Process individuals
# Rows are collected building by building in the order given by blds. Filtering (rather
# than indexing a GroupedDataFrame by key) means a building with no rows simply
# contributes nothing instead of raising a KeyError.
dftemp = similar(dfindiv,0);
for bld in blds
    dftemp = vcat(dftemp,filter("addr_1"=>(a->isequal(a,bld)),dfindiv));
end
dfindiv = dftemp;

# Process compliance
dftemp = similar(dfcmpl,0);
for bld in blds
    dftemp = vcat(dftemp,filter("dorm"=>(d->isequal(d,bld)),dfcmpl));
end
dfcmpl = dftemp;

In [None]:
# Display the positive individuals remaining after the building restriction
println("Test positive individuals in specified buildings moving into isolation from [$day0,$dayf+$daywin] days:")
dfindiv

In [None]:
# Display the compliance rows remaining after the building restriction
println("Campus compliance in specified buildings from ($day0,$dayf]:")
dfcmpl

#### Renormalize by compliance to estimate the additional number of infected in the building

In [None]:
# Join individual information with compliance data
# Map each test date onto the compliance period end date that brackets it. The end dates
# are de-duplicated and sorted because isocmplymap scans consecutive pairs, and wrapped
# in Ref so the broadcast runs over the test dates only.
enddates = sort(unique(skipmissing(dfcmpl[!,"end_date"])));
transform!(dfindiv,"test_dt"=>(x->isocmplymap.(x,Ref(enddates)))=>"end_date");
# isocmplymap yields missing for dates outside every period; matchmissing=:equal lets the
# join proceed, leaving those rows without compliance data.
# NOTE(review): confirm such rows should be kept rather than dropped.
dftemp = leftjoin(dfindiv,dfcmpl,on=["addr_1"=>"dorm","end_date"=>"end_date"],matchmissing=:equal);

# Aggregate number of observed infected and extrapolated infected during these time periods
# NOTE(review): relies on dfcmpl carrying an "n_to_test" column through the join — confirm
# the compliance cell retains it.
gdf = groupby(dftemp,["addr_1","end_date"]);
dfagg = combine(gdf,nrow=>"obs_inf","compliance"=>first=>"compl","n_to_test"=>first=>"pop");
# Observed positives plus the observed positivity rate among the tested (pop*cmp) applied
# to the untested remainder ((1-cmp)*pop)
projinf = (pop,cmp,obsinf)->( (obsinf./(pop.*cmp)).*(1 .- cmp).*pop.+obsinf );
transform!(dfagg,["pop","compl","obs_inf"]=>projinf=>"proj_inf");
println("Aggregate counts of infected in the buildings during specified date ranges:")
sort!(dfagg,["addr_1","end_date"])