In [10]:
import os.path
import os
import re
import pandas
import numpy as np

First, we need to locate all relevant files: every file in the ensemble output directory whose name starts with `performance`.

In [2]:
input_path = '../ensemble/output'

# Only files whose names start with "performance" are relevant.
pattern = re.compile(r'^performance')

# os.listdir returns entries in arbitrary order, so the matches are sorted to
# keep the file list (and the row order of everything built from it)
# reproducible from one run or machine to the next.
matched_files = sorted(
    filename
    for filename in os.listdir(input_path)
    if pattern.search(filename)
)

In [3]:
# Sanity check: how many files matched, followed by the first five names.
print(len(matched_files), matched_files[:5], sep="\n")

760
['performance-forecasting-Ensemble-onsetfixed-0-flag1-1-method-0-dist-0-horizon-1-weight_type--1-weekly-mpox-cases-China-area-1-05-18-2023.csv', 'performance-forecasting-Ensemble-onsetfixed-0-flag1-1-method-0-dist-0-horizon-1-weight_type--1-weekly-mpox-cases-China-area-1-05-25-2023.csv', 'performance-forecasting-Ensemble-onsetfixed-0-flag1-1-method-0-dist-0-horizon-1-weight_type--1-weekly-mpox-cases-China-area-1-06-01-2023.csv', 'performance-forecasting-Ensemble-onsetfixed-0-flag1-1-method-0-dist-0-horizon-1-weight_type--1-weekly-mpox-cases-China-area-1-06-08-2023.csv', 'performance-forecasting-Ensemble-onsetfixed-0-flag1-1-method-0-dist-0-horizon-1-weight_type--1-weekly-mpox-cases-China-area-1-06-15-2023.csv']


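Next, we read every matched file and accumulate its rows into a single DataFrame, tagging each row with the model, horizon, date and location parsed from the filename.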

In [4]:

model_no_col = "mod_num"
model_prefix = "SE-"

# Collect one frame per file and concatenate once at the end: calling
# pandas.concat inside the loop re-copies every accumulated row on each pass.
frames = []

for file in matched_files:
    # Metadata is encoded in the dash-separated filename stem, e.g.
    # performance-forecasting-Ensemble-...-horizon-1-...-China-area-1-05-18-2023.csv
    name_elems = file.split(".")[0].split("-")
    model = name_elems[2]
    horizon = name_elems[12]
    location = name_elems[-6]
    date = "-".join(name_elems[-3:])  # MM-DD-YYYY, kept as a string

    data = pandas.read_csv(os.path.join(input_path, file))
    # Rename the first CSV column by position. Assigning a fresh column list is
    # the supported way to do this; writing into `data.columns.values` mutates
    # the Index's backing array behind pandas' back.
    data.columns = [model_no_col, *data.columns[1:]]
    data["model"] = model_prefix + model
    data["horizon"] = horizon
    data["date"] = date
    data["location"] = location
    frames.append(data)

if not frames:
    # Fail here with a clear message instead of leaving `df` unusable for the
    # cells that follow.
    raise ValueError(f"No performance files found in {input_path!r}")

df = pandas.concat(frames, ignore_index=True)

In [5]:
# Preview the first five accumulated rows.
df.head(n=5)

Unnamed: 0,mod_num,MAE,MSE,Coverage 95%PI,WIS,model,horizon,date,location,AICc,RelativeLikelihood
0,2,0.011681,0.000136,100.0,1.61692,Ensemble,1,05-18-2023,China,,
1,3,0.327239,0.107085,100.0,1.698607,Ensemble,1,05-18-2023,China,,
2,4,0.407712,0.166229,100.0,1.731814,Ensemble,1,05-18-2023,China,,
3,2,26.887591,722.942544,0.0,22.720401,Ensemble,1,05-25-2023,China,,
4,3,27.136223,736.374584,0.0,22.689792,Ensemble,1,05-25-2023,China,,


In [6]:
# Sanity check: list any rows whose MAE exceeds 1000.
df.loc[df["MAE"] > 1000]

Unnamed: 0,mod_num,MAE,MSE,Coverage 95%PI,WIS,model,horizon,date,location,AICc,RelativeLikelihood


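Now calculate the average of each metric for every combination of model, model number, location and horizon. Note that the averaged WIS is then converted to a log10 scale.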

In [13]:
# Mean of every metric for each (model, model number, location, horizon).
summary = (
    df.groupby(["model", model_no_col, "location", "horizon"])[
        ["MAE", "MSE", "Coverage 95%PI", "WIS"]
    ]
    .mean()
    .reset_index()
)

# Label each row "<model>(<model number>)". The column is looked up through
# `model_no_col` (not a hard-coded name) so it stays in sync with the groupby
# above, and the label is built with vectorized string concatenation instead
# of a row-wise apply.
summary["model"] = summary["model"] + "(" + summary[model_no_col].astype(str) + ")"

# From here on the WIS column holds log10 of the mean WIS, not the raw value
# (a mean WIS of exactly 0 becomes -inf).
summary["WIS"] = np.log10(summary["WIS"])

# Keep only the columns that are exported, in their output order.
summary = summary[["model", "horizon", "location", "MSE", "MAE", "Coverage 95%PI", "WIS"]]

In [14]:
summary.head()

Unnamed: 0,model,horizon,location,MSE,MAE,Coverage 95%PI,WIS
0,SE-Ensemble(2),1,China,216.615076,11.641045,90.322581,0.904907
1,SE-Ensemble(2),2,China,463.107083,15.803686,86.666667,1.126823
2,SE-Ensemble(2),3,China,784.60662,19.586708,83.908046,1.401289
3,SE-Ensemble(2),4,China,1196.308427,23.64695,83.928571,1.798612
4,SE-Ensemble(2),1,Japan,4.335076,1.398077,100.0,-0.036839


Finally, output average metrics to the respective files

In [12]:
# Write one CSV per location into that location's dashboard folder.
for location in summary["location"].unique():
    output_path = f"../dashboard/output/unsmoothed/{location}"
    # Create the folder if it is missing; to_csv does not create parent
    # directories and would otherwise fail for a location seen for the first time.
    os.makedirs(output_path, exist_ok=True)
    location_summary = summary[summary["location"] == location]
    location_summary.to_csv(
        os.path.join(output_path, f"{model_prefix}average_metrics.csv"),
        index=False,
    )

In [6]:
# Write the summary for all locations combined into the top-level output folder.
combined_path = os.path.join("../dashboard/output/unsmoothed/", f"{model_prefix}average_metrics.csv")
summary.to_csv(combined_path, index=False)