In [None]:
import os

from dotenv import load_dotenv

from load_data import loading_and_pre_processing_pipeline

load_dotenv()

# Load the pre-processed records, order them by user id and keep only the first
# LIMIT rows; LIMIT is read from the process environment (after load_dotenv()).
feature_data = (
    loading_and_pre_processing_pipeline()
    .sort_values("user_id")
    .head(n=int(os.environ["LIMIT"]))
)

# nunique() counts distinct user ids directly and skips missing ids, whereas a
# set() built from the raw values can count missing ids more than once.
print(
    f'{len(feature_data)} records from {feature_data["user_id"].nunique()} users present. '
)
# "Complete" means: no missing value in any column of the row.
print(f"{len(feature_data.dropna(axis=0))} of which are complete.")
feature_data.head()

In [None]:
# Column names handed to the model, split into the two groups used below.
# The spelling of every entry must match the column names of the data frame.
numeric = [
    # heart-rate signal aggregates (presumably resting heart rate — confirm)
    "rhr_signal_mean",
    "rhr_signal_min",
    "rhr_signal_max",
    # step-count signal aggregates
    "steps_signal_mean",
    "steps_signal_min",
    "steps_signal_max",
    # vaccination and person-level attributes
    "days_since_last_dose",
    "vaccination_status",
    "fittness",
    "sex",
    "age",
    # variant shares
    "omicronba1_share",
    "omicronba2_share",
    "omicronba5_share",
    "delta_share",
]

# Symptom columns.
categorical = [
    "chills",
    "body_pain",
    "loss_of_taste_and_smell",
    "fatigue",
    "cough",
    "cold",
    "diarrhea",
    "sore_throat",
]

# Full model input: categorical columns first, then the numeric ones.
feature_names = [*categorical, *numeric]

# Label column the classifier is trained on.
target = "test_result"

In [None]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from typing import List

from log_reg_model import model


def train_model(features: List[str], target: str, data: pd.DataFrame, random_state=42):
    """Fit a fresh copy of the imported ``model`` on a user-grouped training split.

    Only rows with a non-missing ``target`` are used. They are split once with
    ``GroupShuffleSplit`` (33% test) using ``user_id`` as the group, so the
    records of one user never end up in both the training and the test set.

    Args:
        features: Names of the columns used as model input.
        target: Name of the label column; its values are cast to ``int``.
        data: Frame containing ``features``, ``target`` and a ``user_id`` column.
        random_state: Seed forwarded to ``GroupShuffleSplit``.

    Returns:
        A 5-tuple ``(fitted_model, X_test, y_test, test_positions, training_user_ids)``:
        the fitted model, the test features, the integer test labels, the
        positional indices of the test rows *within the labelled subset of
        ``data``* (not within ``data`` itself), and the list of distinct user
        ids that were part of the training set.
    """
    # Imported locally so this block stays self-contained.
    from copy import deepcopy

    # use only records with labels (test_result) present
    labelled = data.dropna(subset=[target])

    # Do the train/test split along user ids: the data of one group of users is the
    # test set, the data of another group is the training set, and no user is in both.
    splitter = GroupShuffleSplit(test_size=0.33, n_splits=1, random_state=random_state)
    train_positions, test_positions = next(
        splitter.split(labelled, groups=labelled["user_id"])
    )

    training_data = labelled.iloc[train_positions]
    testing_data = labelled.iloc[test_positions]

    # Fit a private copy: fitting the shared module-level `model` in place would
    # make every call return the same object, so a later call (e.g. with another
    # random_state) would silently refit the model returned by an earlier one.
    fitted_model = deepcopy(model)
    fitted_model.fit(training_data[features], training_data[target].astype(int))

    return (
        fitted_model,
        testing_data[features],
        testing_data[target].astype(int),
        test_positions,
        training_data["user_id"].unique().tolist(),
    )

In [None]:
from analysis_plots import plot_analysis

# Train once with the default seed; the last two return values (test positions
# and training user ids) are not needed for the diagnostic plot.
pipeline, X_test, y_test, _, __ = train_model(
    features=feature_names, target=target, data=feature_data
)

# The trailing 0.5 is passed through to plot_analysis unchanged
# (presumably the decision threshold — confirm in analysis_plots).
fig = plot_analysis(pipeline, X_test, y_test, feature_names, 0.5)
fig.savefig("model_metrics.png", dpi=300)

In [None]:
import ramda as R
from datetime import timedelta

# Length, in days, of the rolling window used for incidences (and for the
# column label built in predicted_incidence).
MEAN_DAYS = 7


def calculate_incidence(
    cases_column: str, cases_data: pd.DataFrame, window_days: int = MEAN_DAYS
) -> pd.DataFrame:
    """Turn per-record case flags into a rolling incidence per date.

    For every ``date`` the share of cases is computed as the sum of
    ``cases_column`` divided by the number of non-missing values. The
    incidence is the mean of that daily share over a trailing time window of
    ``window_days`` days, multiplied by 100,000.

    Args:
        cases_column: Column holding the case indicator (summed per date).
        cases_data: Frame with a datetime-like ``date`` column and
            ``cases_column``. It is not modified.
        window_days: Length of the rolling window in days. The same value is
            used to shift the reported dates (see below). Defaults to
            ``MEAN_DAYS``.

    Returns:
        A frame with the columns ``incidence`` and ``date``. Rows with any
        missing aggregate (for instance dates without a non-missing value,
        where the share is 0/0) are dropped.
    """
    daily = cases_data.groupby("date").agg({cases_column: ["sum", "count"]})
    # Flatten the (column, aggregate) header to just "sum" / "count".
    daily.columns = daily.columns.droplevel(0)

    daily["rate"] = daily["sum"].div(daily["count"])
    daily["incidence"] = daily.rolling(f"{window_days}D")["rate"].mean() * 100_000

    # .copy() so the date shift below writes to an independent frame instead of
    # a slice of `daily`.
    incidence = daily.reset_index().dropna(axis=0)[["incidence", "date"]].copy()

    # Each value is reported `window_days` after the last day of its window.
    # NOTE(review): the reason for this forward shift is not visible here — confirm.
    incidence["date"] = incidence["date"] + timedelta(days=window_days)
    return incidence


def calculate_questionnaire_incidence(
    feature_data: pd.DataFrame,
) -> pd.DataFrame:
    """Compute the incidence from the reported ``test_result`` values.

    Every record counts towards the daily denominator: a missing
    ``test_result`` is treated as 0, any other value is converted with
    ``int()`` (so ``True`` becomes 1 and ``False`` becomes 0).

    Args:
        feature_data: Frame with the columns ``test_result``, ``date`` and
            ``user_id``. It is not modified.

    Returns:
        The frame produced by ``calculate_incidence`` for ``test_result``.
    """
    # Pick the needed columns first and recode only the label. Recoding the
    # whole frame would also turn missing dates or user ids into 0.
    reported = feature_data.reset_index()[["test_result", "date", "user_id"]].copy()
    reported["test_result"] = reported["test_result"].map(
        lambda result: 0 if pd.isna(result) else int(result)
    )

    return calculate_incidence("test_result", reported)


@R.curry
def predicted_incidence(
    classifier,
    feature_data,
    feature_names,
    threshold,
    column_name="rolling mean detections",
):
    """Derive an incidence curve from the classifier's thresholded predictions.

    A record counts as a positive detection when the second column returned by
    ``classifier.predict_proba`` exceeds ``threshold``. Per user only the
    earliest positive detection is kept, while every negative record is kept.
    The combined records are passed to ``calculate_incidence``.

    Args:
        classifier: Fitted model exposing ``predict_proba``.
        feature_data: Frame with ``user_id``, ``date`` and the feature columns.
        feature_names: Columns passed to the classifier.
        threshold: Probability above which a record is a positive detection.
        column_name: Suffix of the detection column name
            (prefixed with ``f"{MEAN_DAYS}d "``).

    Returns:
        The incidence frame returned by ``calculate_incidence``.
    """
    detection_col = f"{MEAN_DAYS}d {column_name}"
    indexed = feature_data.set_index(["user_id", "date"])

    # predicted probability that a record (vital data + reported symptoms)
    # represents an infection
    infection_proba = classifier.predict_proba(indexed[feature_names])[:, 1]
    predictions = pd.DataFrame(
        data=infection_proba, index=indexed.index, columns=["probability"]
    )

    # threshold the probabilities into positive / negative predictions
    predictions[detection_col] = predictions["probability"] > threshold
    is_positive = predictions[detection_col]

    # a user detected as infected more than once contributes only the first detection
    first_positive_per_user = (
        predictions[is_positive]
        .reset_index()
        .sort_values(["user_id", "date"])
        .groupby(["user_id"])
        .agg({detection_col: "first", "date": lambda dates: list(dates)[0]})
        .reset_index()
    )
    negatives = predictions[~is_positive].reset_index()

    return calculate_incidence(
        detection_col, pd.concat([first_positive_per_user, negatives])
    )


def predict_incidence_runs(
    feature_data: pd.DataFrame, feature_names, threshold=0.2, n_runs: int = 10
) -> pd.DataFrame:
    """Nowcast the incidence over several train/test splits and summarise the runs.

    For each run a model is trained with ``random_state`` equal to the run
    number. The incidence is then predicted on all records of users that were
    not part of the training set, in three variants:

    * ``"all"``: every such record,
    * ``"no_test"``: only records without a ``test_result``,
    * ``"with_test"``: only records with a ``test_result``.

    Args:
        feature_data: Frame with ``user_id``, ``date``, ``test_result`` and the
            feature columns.
        feature_names: Columns passed to the model.
        threshold: Probability threshold forwarded to ``predicted_incidence``.
        n_runs: Number of train/test splits to evaluate (default 10).

    Returns:
        A frame with the columns ``date``, ``data``, ``mean`` and ``std``: the
        mean and standard deviation of the incidence across runs for every
        date and data variant.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    runs = []
    for run_id in range(n_runs):
        predictor, *_, training_user_ids = train_model(
            feature_names, target, feature_data, random_state=run_id
        )

        # Everything that did not belong to a training user, including records
        # without a test result (those never enter the train/test split).
        held_out = feature_data[~feature_data["user_id"].isin(training_user_ids)]
        has_test = held_out["test_result"].notna()
        subsets = {
            "all": held_out,
            "no_test": held_out[~has_test],
            "with_test": held_out[has_test],
        }

        for label, subset in subsets.items():
            nowcast = predicted_incidence(predictor, subset, feature_names, threshold)
            runs.append(
                nowcast.assign(run_id=run_id, data=label)[
                    ["incidence", "run_id", "date", "data"]
                ]
            )

    predicted_incidence_runs = pd.concat(runs).reset_index().sort_values("date")

    predicted_incidence_plot_data = (
        predicted_incidence_runs.groupby(["date", "data"])
        .agg({"incidence": ["mean", "std"]})
        .reset_index()
    )

    # Flatten the two-level header produced by the aggregation.
    predicted_incidence_plot_data.columns = ["date", "data", "mean", "std"]
    return predicted_incidence_plot_data

In [None]:
import plotly.graph_objects as go
from datetime import date, datetime

# Nowcast on all records inside the analysis window.
threshold = 0.6

# Keep records dated after 2021-11-01 and before yesterday. The masks are
# built from the "date" Series so they are plain 1-D boolean masks.
after = feature_data["date"] > pd.to_datetime(date(year=2021, month=11, day=1))
before = feature_data["date"] < pd.to_datetime(
    datetime.now().date() - timedelta(days=1)
)

working_data = feature_data[(before & after).to_numpy()]

questionnaire_incidence = calculate_questionnaire_incidence(working_data)

# Missing values (e.g. a std computed from a single run) are replaced by 0.
# The result is assigned: replacing without assignment would only change what
# the cell displays and leave the NaN values in the data that gets plotted.
predicted_incidence_plot_data = predict_incidence_runs(
    working_data, feature_names, threshold
).fillna(0)
predicted_incidence_plot_data

In [None]:
# Order the plot data chronologically so the traces are drawn left to right.
predicted_incidence_plot_data = predicted_incidence_plot_data.sort_values("date")

# Base plotly layout shared by every figure; per-figure settings are applied
# on top of it with a second update_layout() call.
default_layout = dict(
    title=dict(y=0.97, x=0.5, xanchor="center", yanchor="top"),
    font_size=12,
    autosize=True,
    height=600,
    margin={"l": 60, "r": 30, "b": 75, "t": 80, "pad": 0},
    template="plotly_white",
    hovermode="x",
    legend=dict(
        font_size=11,
        x=0,
        y=1,
        itemclick="toggleothers",
        itemdoubleclick="toggle",
        tracegroupgap=2,
    ),
    hoverlabel=dict(font_color="white", namelength=-1),
    xaxis=dict(
        showline=True,
        linewidth=1.5,
        linecolor="#455266",
        ticks="outside",
        tickwidth=1.5,
        rangemode="nonnegative",
    ),
    # No rangemode or fixed range is set for the y axis.
    yaxis=dict(
        showline=True,
        linewidth=1.5,
        linecolor="#455266",
        ticks="outside",
        tickwidth=1.5,
    ),
    showlegend=True,
)

# User-facing labels per output language. The outer keys are used in the file
# name of the exported figure; the inner keys are looked up while the traces
# and the layout are built.
# The "no_test" / "with_test" labels had been stored under the wrong language
# (English text in "de", German text in "en"); each string now sits under the
# language it is written in.
i18n = {
    "de": {
        "booster": "Mit Auffrischimpfung 18-59 Jahre",
        "base": "Grundimmunisierte  18-59 Jahre",
        "unvax": "Ungeimpfte 18-59 Jahre",
        "title": "Gemeldete Inzidenz vs. Machine Learning Nowcast",
        "y_axis_title": "Inzidenz",
        "all": "Nowcast",
        "no_test": "Nur daten ohne Tests",
        "with_test": "Nur daten mit Tests",
        "insecurity": "Modell Unsicherheit",
    },
    "en": {
        "booster": "With booster 18-59 years",
        "base": "Vaccinated  18-59 years",
        "unvax": "Unvaccinated 18-59 years",
        "title": "Reported Incidence vs. Machine Learning Nowcast",
        "y_axis_title": "Incidence",
        "all": "Nowcast",
        "no_test": "Without data with tests",
        "with_test": "Only data with tests",
        "insecurity": "Confidence interval",
    },
}


# NOTE(review): SKIP_DAYS is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
SKIP_DAYS = 5


@R.curry
def add_line(fig, data: pd.DataFrame, config: dict):
    """Add one nowcast variant to ``fig``: a mean line plus a mean ± std band.

    Two traces are appended to ``fig`` in place: a filled band between
    ``mean - std`` and ``mean + std`` (hidden from the legend, no hover) and
    the mean line itself. Both share a legend group so they toggle together.

    The legend labels are read from the module-level ``translation`` mapping,
    which must be defined before this function is called (it is the loop
    variable of the figure-building loop).

    Args:
        fig: Plotly figure to add the traces to.
        data: Frame with the columns ``data``, ``date``, ``mean`` and ``std``.
        config: Mapping with the keys ``key`` (value of the ``data`` column to
            plot, also the translation key), ``hue_color`` (fill color of the
            band) and ``line_color``.

    Returns:
        None.
    """
    key = config["key"]
    hue_color = config["hue_color"]
    line_color = config["line_color"]

    # Filter once and reuse the subset for all three columns.
    variant = data[data["data"] == key]
    x = list(variant["date"].apply(pd.Timestamp))
    y = variant["mean"].values
    y_err = variant["std"].values
    y_upper = list(y + y_err)
    y_lower = list(y - y_err)

    # The band is one closed polygon: the lower bound traversed from the last
    # date back to the first, followed by the upper bound from first to last.
    fig.add_trace(
        go.Scatter(
            name=translation["insecurity"],
            x=x[::-1] + x,  # x reversed, then x
            y=y_lower[::-1] + y_upper,  # lower reversed, then upper
            showlegend=False,
            legendgroup=translation[key],
            marker=dict(color="#444"),
            line=dict(width=1, color=line_color),
            mode="lines",
            fillcolor=hue_color,
            fill="toself",
            hoverinfo="skip",
        )
    )

    fig.add_trace(
        go.Scatter(
            y=y,
            x=x,
            name=translation[key],
            legendgroup=translation[key],
            mode="lines",
            hovertemplate="%{y:.0f}",
            line=dict(width=2.5, color=line_color),
        )
    )


# Build one figure per language. `translation` is also read by add_line(),
# so the loop variable name must stay as it is.
for language, translation in i18n.items():
    fig = go.Figure()

    # One mean line plus band per nowcast variant.
    draw_variant = add_line(fig, predicted_incidence_plot_data)
    for variant_config in (
        {
            "key": "all",
            "hue_color": "rgba(76, 89, 168, 0.3)",
            "line_color": "#4C59A8",
        },
        {
            "key": "no_test",
            "hue_color": "rgba(245, 240, 85, 0.3)",
            "line_color": "#F5F055",
        },
        {
            "key": "with_test",
            "hue_color": "rgba(236, 73, 245, 0.3)",
            "line_color": "#EC49F5",
        },
    ):
        draw_variant(variant_config)

    # Reference curve from the questionnaire data; the last point is left out.
    fig.add_trace(
        go.Scatter(
            y=questionnaire_incidence["incidence"].values[:-1],
            x=questionnaire_incidence["date"].values[:-1],
            name="questionnaire incidence",
            legendgroup="incidence",
            mode="lines",
            hovertemplate="%{y:.0f}",
            line=dict(width=2.5, color="red"),
        )
    )

    # Language-specific settings are applied on top of the shared defaults.
    layout = dict(
        title_text=translation["title"],
        yaxis_title=translation["y_axis_title"],
        hovermode="x unified",
        hoverlabel={"font": {"color": "black"}},
    )
    fig.update_layout(default_layout)
    fig.update_layout(layout)

    fig.write_html(
        f"incidence_nowcast_{language}.html",
        include_plotlyjs="cdn",
        full_html=False,
        config={"responsive": "true", "displayModeBar": False},
    )

# `fig` is the figure of the last language processed by the loop above.
fig.write_image("incidence_nowcast.png")
fig.show()