In [None]:
# Legend-bound multi-selection on "element": the selected lines stay fully
# opaque while every other line is drawn at opacity 0.2.
selection = alt.selection_multi(fields=["element"], bind="legend")

(
    alt.Chart(xx)
    .mark_line()
    .encode(
        x=alt.X("category:T"),
        y="KL",
        color=alt.Color(
            "element",
            scale=alt.Scale(scheme="rainbow"),
            legend=alt.Legend(columns=2, labelLimit=1000),
        ),
        tooltip=alt.Tooltip(["element", "KL"]),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    )
    .properties(width=900)
    .add_selection(selection)
)


In [None]:
# Entropy series for each subcorpus over the same window and "W" frequency,
# each tagged with its subcorpus label so both can be drawn on one chart.
main_entropy = entropy_single(
    base_freq, "mainstream", "W", "2019-11-01", "2020-07-04"
).assign(subcorpus="mainstream")
cons_entropy = entropy_single(
    base_freq, "conspiracy", "W", "2019-11-01", "2020-07-04"
).assign(subcorpus="conspiracy")

full_ = pd.concat([main_entropy, cons_entropy])

(
    alt.Chart(full_)
    .mark_line()
    .encode(
        x="date:T",
        y="entropy",
        color="subcorpus",
        tooltip=alt.Tooltip("date:T"),
    )
    .properties(width=900)
)


In [None]:
def cumulative_dvr(
    base_freq: pd.DataFrame,
    subcorpus: Subcorpus = "full",
    freq: Literal["MS", "D", "W"] = "MS",
    start_date: str = "1990-01-01",
    end_date: str = "2020-07-01",
    filter_: pd.Series | None = None,
):
    """Build a DVR frame stamped with ``end_date`` and ``subcorpus``.

    Parameters
    ----------
    base_freq : pd.DataFrame
        Frequency table with a ``"date"`` column; rows outside
        ``[start_date, end_date]`` (inclusive) are dropped.
    subcorpus : Subcorpus
        Label written to the ``"subcorpus"`` column of the result.
    freq : {"MS", "D", "W"}
        Passed unchanged to ``LPA.create_dvr``.
    start_date, end_date : str
        Window bounds in ``YYYY-MM-DD`` format.
    filter_ : pd.Series | None
        When given, only rows where ``dvr["element"] == filter_`` are kept.
        ``None`` means no filtering.

    Returns
    -------
    pd.DataFrame
        The (optionally filtered) DVR with a fresh 0..n-1 index.
    """
    start = pd.to_datetime(start_date, format="%Y-%m-%d")
    end = pd.to_datetime(end_date, format="%Y-%m-%d")

    # Each comparison needs its own parentheses: `&` binds tighter than
    # `>=` / `<=`, so the unparenthesised form never builds the intended mask.
    in_window = (base_freq["date"] >= start) & (base_freq["date"] <= end)
    base_freq = base_freq[in_window].reset_index(drop=True)

    # NOTE(review): the windowed `base_freq` is not consumed below, and
    # `LPA.create_dvr` receives the frequency *string*. The original had
    # `freq = freq_window(base_freq, e, True, subcorpus=subcorpus)` commented
    # out just above this call -- confirm whether it should be restored.
    dvr = LPA.create_dvr(freq).assign(**{"date": end, "subcorpus": subcorpus})

    if filter_ is not None:
        # NOTE(review): `==` against a Series compares element-wise; if
        # `filter_` is meant as a set of allowed elements, `.isin` is needed.
        dvr = dvr[dvr["element"] == filter_]
    return dvr.reset_index(drop=True)


In [None]:
def entropy_single(base_freq, *dvr_args):
    """Compute one entropy value per date from a non-cumulative DVR.

    Parameters
    ----------
    base_freq
        Forwarded as the first argument of ``non_cumulative_dvr``.
    *dvr_args
        Remaining positional arguments forwarded to ``non_cumulative_dvr``.

    Returns
    -------
    pd.DataFrame
        Columns ``"date"`` and ``"entropy"``.
    """
    daily = non_cumulative_dvr(base_freq, *dvr_args)

    # element x date matrix of global weights; absent pairs count as 0.
    weights = daily.pivot(
        index="element", columns="date", values="global_weight"
    ).fillna(0)

    kld_res = np.sum(entropy(weights.to_numpy()), axis=0)

    # Take the dates from the pivot's own columns so they line up with the
    # matrix columns. `pivot` orders column labels, whereas
    # `daily["date"].drop_duplicates()` keeps order of appearance, so the two
    # can disagree when `daily` is not already sorted by date.
    return pd.DataFrame(
        {
            "date": pd.Series(weights.columns, name="date"),
            "entropy": kld_res,
        }
    )


In [None]:
def plot_seasonal_decompose(kldf, arg1="observed", arg2="trend"):
    """Plot two components of an additive seasonal decomposition as lines.

    Parameters
    ----------
    kldf : pd.DataFrame
        Frame with a ``"date"`` column, which becomes the time index passed
        to ``seasonal_decompose``.
    arg1, arg2 : str
        Attribute names read from the decomposition result
        (defaults: ``"observed"`` and ``"trend"``).

    Returns
    -------
    alt.Chart
        Line chart of both components over ``date``, coloured by component.
    """
    ts = kldf.set_index("date")
    decomposition = seasonal_decompose(ts, model="additive")

    # Label the first component with its real name; it was previously always
    # called "observed", which mislabelled the legend for any other `arg1`.
    long_form = (
        getattr(decomposition, arg1)
        .to_frame(name=arg1)
        .join(getattr(decomposition, arg2))
        .reset_index()
        .melt(id_vars="date")
    )
    return (
        alt.Chart(long_form)
        .mark_line()
        .encode(x="date:T", y="value", color="variable", tooltip=alt.Tooltip("date:T"))
        .properties(width=900)
    )


# NOTE(review): in the visible cells `kldf` is only assigned as a local inside
# `entropy_single`; confirm a notebook-level `kldf` exists before this runs.
plot_seasonal_decompose(kldf)


In [None]:
# DVR frames per subcorpus, each tagged with its subcorpus label, plus one
# frame built with the helper's defaults.
# NOTE(review): these calls pass `None` as the first argument (and no
# arguments at all for `monthly`), whereas the daily cell passes `base_freq`
# -- confirm `non_cumulative_dvr` supports both call shapes.
monthly_conspiracy = non_cumulative_dvr(None, "conspiracy")
monthly_mainstream = non_cumulative_dvr(None, "mainstream")
monthly_conspiracy["subcorpus"] = "conspiracy"
monthly_mainstream["subcorpus"] = "mainstream"
monthly = non_cumulative_dvr()


In [None]:
# Positional arguments shared by every daily call to non_cumulative_dvr.
daily_args = ("D", "2019-11-01", "2020-07-04")

# Build each subcorpus frame and label it straight away.
daily_conspiracy = non_cumulative_dvr(base_freq, "conspiracy", *daily_args)
daily_conspiracy["subcorpus"] = "conspiracy"

daily_mainstream = non_cumulative_dvr(base_freq, "mainstream", *daily_args)
daily_mainstream["subcorpus"] = "mainstream"

# Same arguments for the "full" subcorpus (left unlabelled, as before).
daily = non_cumulative_dvr(base_freq, "full", *daily_args)


In [None]:
def kld_double(dvr_1, dvr_2, epsilon=0):
    """Compute one KLD value per date shared by two DVR frames.

    Both frames must provide ``"date"``, ``"element"``, ``"subcorpus"`` and
    ``"global_weight"`` columns. Only dates present in *both* frames are
    evaluated, in ascending order.

    For each date the rows of both frames are pivoted into an
    element x subcorpus table. Each column is divided by
    ``1 + (number of missing entries) * epsilon`` and missing entries are then
    replaced with ``epsilon``. ``KLD_divergence_consecutive`` is applied to
    every row, the results are summed over rows, and the first entry is kept.

    Returns a DataFrame with columns ``"date"`` and ``"KLD"``.
    """
    # Dates that occur in both inputs (an intersection).
    shared_dates = set(dvr_1["date"].drop_duplicates().tolist()).intersection(
        dvr_2["date"].drop_duplicates().tolist()
    )

    dates, values = [], []
    for day in sorted(shared_dates):
        stacked = pd.concat(
            [
                dvr_1[dvr_1["date"] == day],
                dvr_2[dvr_2["date"] == day],
            ]
        )
        # NOTE(review): pivot orders the subcorpus columns by label, not by
        # argument position -- confirm the divergence direction is as intended.
        table = stacked.pivot(
            index="element", columns="subcorpus", values="global_weight"
        )
        missing_per_column = table.isna().sum()
        smoothed = (table / (1 + missing_per_column * epsilon)).fillna(epsilon)
        per_row = np.apply_along_axis(
            KLD_divergence_consecutive, 1, smoothed.to_numpy()
        )
        kld_res = np.sum(per_row, axis=0)
        dates.append(day)
        values.append(kld_res[0])

    return pd.DataFrame({"date": dates, "KLD": values})


In [None]:
# Per-date KLD between the mainstream and conspiracy daily frames, with
# epsilon smoothing for elements missing from one side.
comparative_kldf = kld_double(daily_mainstream, daily_conspiracy, epsilon=2e-5)

(
    alt.Chart(comparative_kldf)
    .mark_line()
    .encode(
        x="date:T",
        y="KLD",
        tooltip=alt.Tooltip("date:T"),
    )
    .properties(width=900)
)


In [None]:
# Put the KLD series on a continuous daily index: dates in `idx` that have no
# value are filled with 0, and dates outside `idx` are dropped by the reindex.
idx = pd.date_range("2019-11-01", "2020-07-01")
ts = comparative_kldf.set_index("date")
ts.index = pd.DatetimeIndex(ts.index)
ts = ts.reindex(idx, fill_value=0)

decomposition = seasonal_decompose(ts)  # , model='additive', period=241)

# Long format (index, variable, value) with the observed and trend components.
result = (
    decomposition.observed.to_frame(name="observed")
    .join(decomposition.trend)
    .reset_index()
    .melt(id_vars="index")
)

(
    alt.Chart(result)
    .mark_line()
    .encode(
        x="index:T",
        y="value",
        color="variable",
        tooltip=alt.Tooltip("index:T"),
    )
    .properties(width=900)
)

# result.resid.plot()
# result.observed.plot()


In [None]:
def word_over_time(
    full_dvr: pd.DataFrame,
    word: List[str] | str,
    save: bool = False,
    cumulative: bool = True,
) -> alt.Chart:
    """Chart ``global_weight`` over ``date`` for one or more elements.

    Parameters
    ----------
    full_dvr : pd.DataFrame
        DVR frame with ``"element"``, ``"date"`` and ``"global_weight"``
        columns (the original comment says it must be the full cumulative DVR).
    word : list[str] | str
        A single element, or an iterable of elements, to keep.
    save : bool
        When true, the chart is also written to ``results/random.html``.
    cumulative : bool
        ``True`` draws lines, ``False`` draws bars.

    Returns
    -------
    alt.Chart
        Chart with a legend-bound selection that fades unselected elements.
    """
    # A bare string is one word; anything else is treated as a collection.
    # (The previous `isinstance(list)` call was missing its first argument and
    # raised TypeError on every invocation.)
    words = [word] if isinstance(word, str) else list(word)
    dvr = full_dvr[full_dvr["element"].isin(words)]

    selection = alt.selection_multi(fields=["element"], bind="legend")
    base = alt.Chart(dvr)
    chart = base.mark_line() if cumulative else base.mark_bar()
    chart = (
        chart.encode(
            x=alt.X("date:T"),
            y="global_weight",
            color=alt.Color("element", legend=alt.Legend(columns=3, symbolLimit=300)),
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        )
        .add_selection(selection)
        .properties(width=900)
    )
    if save:
        chart.save("results/random.html")
    return chart


# Example: chart global_weight over date for the element "vaccin".
word_over_time(full_dvr, "vaccin")


In [None]:
# Month-start calendar from 1990-01-01 through 2022-01-01.
dr = pd.date_range(
    start=pd.Timestamp("1990-01-01"),
    end=pd.Timestamp("2022-01-01"),
    freq="MS",
)
s = dr.to_series(name="date")

# Rows for the element "world" only.
df = full_dvr[full_dvr["element"] == "world"]

# Outer-merge the calendar with the element's rows, zero-fill the months with
# no frequency, then relabel the values with the calendar itself.
ts = (
    pd.merge(s, df, right_on="date", left_index=True, how="outer")[
        "frequency_in_category"
    ]
    .fillna(0)
    .set_axis(dr)
)
ts
# ts = df.set_index("date")['frequency_in_category']
