In [None]:
from load_data import pull_from_postgres
from psycopg2.sql import SQL
import pandas as pd

# Pull test result (feature f10), week start and user id for every record
# whose test week starts on or after 2021-10-01.
test_data = pull_from_postgres(SQL("""
    SELECT
        f10 test_result, test_week_start, user_id
    FROM
        datenspende_derivatives.homogenized_features
    WHERE
        test_week_start >= '2021-10-01'
    """))

# Expose the week start as a datetime column named `date` and drop the raw column.
test_data = (
    test_data
    .assign(date=lambda frame: pd.to_datetime(frame['test_week_start']))
    .drop(columns=['test_week_start'])
)

In [None]:
from datetime import timedelta

def calculate_incidence(
    cases_column: str,
    cases_data: pd.DataFrame,
    *,
    window: str = "7D",
    per: int = 100_000,
    date_shift: timedelta = timedelta(days=7),
) -> pd.DataFrame:
    """Compute a rolling incidence from per-row case flags.

    For every distinct ``date`` the share of positive rows is computed as
    ``sum(cases_column) / count(cases_column)`` (``count`` ignores missing
    values). That share is averaged over a trailing time window and scaled
    to "cases per ``per``".

    Args:
        cases_column: Name of the column holding the case flag (truthy/1 for a
            case, falsy/0 otherwise).
        cases_data: Frame with a ``date`` column and ``cases_column``. The
            ``date`` values must be datetime-like, because the rolling window
            is specified as a time offset.
        window: Trailing window passed to ``Series.rolling`` (default ``"7D"``).
        per: Population size the rate is scaled to (default 100 000).
        date_shift: Offset added to every date in the result (default 7 days).
            NOTE(review): presumably stamps the value at the end of the
            reporting week - confirm.

    Returns:
        A frame with the columns ``incidence`` and ``date`` (in that order),
        one row per date for which sum, count, rate and incidence are all
        defined.
    """
    per_date = cases_data.groupby("date")[cases_column].agg(["sum", "count"])
    per_date["rate"] = per_date["sum"].div(per_date["count"])
    per_date["incidence"] = per_date["rate"].rolling(window).mean() * per

    # Drop dates with any undefined value (e.g. count == 0 yields a NaN rate).
    complete_rows = per_date.reset_index().dropna(axis=0)

    # `assign` returns a new frame, so shifting the dates never writes into a
    # slice of `complete_rows` (avoids SettingWithCopyWarning).
    return complete_rows[["incidence", "date"]].assign(
        date=lambda frame: frame["date"] + date_shift
    )


def calculate_questionnaire_incidence(
        feature_data: pd.DataFrame,
) -> pd.DataFrame:
    """Derive an incidence curve from questionnaire test results.

    Every user contributes at most one positive row (the one with the
    earliest ``date``), while all negative rows are kept. Rows whose
    ``test_result`` is neither True nor False (e.g. missing) match neither
    filter and are therefore left out.

    Args:
        feature_data: Frame with the columns ``test_result``, ``date`` and
            ``user_id``.

    Returns:
        The result of ``calculate_incidence`` applied to the ``test_result``
        column of the filtered rows.
    """
    test_result = feature_data["test_result"]

    # Elementwise comparison on purpose: `is True` would test the Series object.
    only_first_positive_detections = (
        feature_data[test_result == True]
        .reset_index()
        # Sorting by date within each user makes "first" the earliest positive.
        .sort_values(["user_id", "date"])
        .groupby(["user_id"])
        .agg({"test_result": "first", "date": "first"})
        .reset_index()
    )
    all_negative_detections = feature_data[test_result == False].reset_index()

    detected_infections = pd.concat(
        [only_first_positive_detections, all_negative_detections]
    )

    return calculate_incidence("test_result", detected_infections)

# Incidence curve computed from the questionnaire test results in `test_data`.
questionnaire_incidence = calculate_questionnaire_incidence(test_data)

In [None]:
# Line chart of the survey-based incidence over time.
ax = questionnaire_incidence.plot(x="date", y="incidence")
ax.set(xlabel="Date", ylabel="Incidence as 7 day average per 100000")
ax.set_title("Incidence of self reported positive test results")

In [None]:
# Translate raw outcomes into readable labels; missing values (None / NaN)
# become 'no test'. Note that the replacement is applied to the whole frame.
test_labels = (
    test_data
    .replace({True: 'positive', False: 'negative', None: 'no test', float("nan"): 'no test'})
    .reset_index()
    .loc[:, ["test_result", "date", "user_id"]]
)
test_labels

In [None]:
# Count each label per date: one row per date, one column per label.
df = (
    test_labels
    .groupby('date')['test_result']
    .value_counts()
    .unstack('test_result')
)

# Normalise every row by its total so the stacked areas show relative shares.
ax = df.div(df.sum(axis=1), axis=0).plot.area(stacked=True)
ax.set_title('Relative shares of positive, negative and no tests')

In [None]:
# Officially reported 7-day incidence per 100k for location_level = 0.
# NOTE(review): level 0 presumably denotes the national aggregate - confirm.
incidence = pull_from_postgres(
    SQL("""
    SELECT date_of_report date, incidence_7d_per_100k official_incidence FROM coronacases.german_counties_incidence WHERE location_level = 0;
    """)
)
incidence

In [None]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Join survey-based and official incidence on `date` and restrict the range.
# NOTE(review): the query compares the `date` column against 20220101,
# presumably meaning "after 2022-01-01" - confirm the column is datetime.
df = (
    questionnaire_incidence
    .merge(incidence, on='date')
    .sort_values(by="date")
    .reset_index(drop=True)
    .query('date>20220101')
)

fig, ax = plt.subplots(figsize=(12, 6))
axb = ax.twinx()  # second y-axis sharing the same x-axis

# Left axis: officially reported incidence (drawn once).
ax1 = df.plot(x="date", y="official_incidence", ax=ax, label="incidence as officially reported", color='blue', legend=False)

# Right axis: incidence derived from the surveys.
ax2 = df.plot(x="date", y="incidence", ax=axb, label="incidence from surveys", color="red", legend=False)

# create legend manually so that the lines of both axes share one legend box
labels = ['incidence as officially reported', 'incidence from surveys']
blue_line = Line2D([0], [0], color='blue', label='official incidence')
red_line = Line2D([0], [0], color='red', label='self reported incidence')
ax.legend([blue_line, red_line], labels, loc='upper left')

# label axes: each y-axis label names the series drawn on that axis
ax.set_xlabel('Date')
ax.set_ylabel('Incidence as officially reported')
axb.set_ylabel('Incidence calculated from surveys as 7 day average per 100000')

fig.savefig('incidence.png')