In [38]:
from functools import reduce
from glob import glob
from typing import Literal, List

import altair as alt
import numpy as np
import pandas as pd
from more_itertools import consecutive_groups
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from floweaver import *


# Turn off Altair's row-count guard so that large frames can be passed to charts.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
from LPA import IterLPA, LPA
from algo import entropy, KLD_divergence_consecutive
from LOCO_LPA import create_freq, freq_window
from LOCO_LPA import Subcorpus

Create/read the DVR and create the LPA object (Iter-LPA, since the data is spread across many CSV files).

In [4]:
# Load the base frequency table and keep only rows dated 2019-10-28 or later.
freq = create_freq().loc[
    lambda frame: frame["date"].ge(pd.to_datetime("2019-10-28"))
]


KeyboardInterrupt: 

### Building DVR

`freq_window` creates the DVR for time-series LPA. `quantity` is the point in time at which to cut the DVR (currently only a year is supported). `cumulative` indicates whether the DVR should be cumulative (frequencies add up to the time set in `quantity`) or not. `direction` can be either `"to"` or `"from"`: for instance, if `quantity` is 2020, `"to"` means 1990-2020 and `"from"` means 2020-2021.

In [150]:
def cumulative_dvr(
    base_freq: pd.DataFrame,
    subcorpus: Subcorpus = "full",
    freq: Literal["MS", "D", "W"] = "MS",
    start_date: str = "1990-01-01",
    end_date: str = "2020-07-01",
    filter_: pd.Series | None = None,
):
    """Build a single cumulative DVR covering ``start_date``..``end_date``.

    Args:
        base_freq: Frequency table with (at least) a ``date`` column.
        subcorpus: Subcorpus label forwarded to ``freq_window`` and stamped
            on the result.
        freq: Currently unused; kept so the signature mirrors
            ``non_cumulative_dvr``.
        start_date: Inclusive lower bound, ``YYYY-MM-DD``.
        end_date: Inclusive upper bound, ``YYYY-MM-DD``; also used as the
            ``date`` value of every returned row.
        filter_: Optional element (or list-like of elements) to keep.
            ``None`` keeps every element.

    Returns:
        The DVR with ``date`` and ``subcorpus`` columns added and a fresh
        0..n-1 index.
    """
    s = pd.to_datetime(start_date, format="%Y-%m-%d")
    e = pd.to_datetime(end_date, format="%Y-%m-%d")

    # Each comparison needs its own parentheses: `&` binds tighter than
    # `>=` / `<=`, so the unparenthesised form does not express a date range.
    in_range = (base_freq["date"] >= s) & (base_freq["date"] <= e)
    base_freq = base_freq[in_range].reset_index(drop=True)

    # NOTE(review): restores the previously commented-out call so that
    # LPA.create_dvr receives a frequency table rather than the `freq` string;
    # confirm this matches freq_window's current signature.
    freq_df = freq_window(base_freq, e, True, subcorpus=subcorpus)
    dvr = LPA.create_dvr(freq_df).assign(**{"date": e, "subcorpus": subcorpus})

    if filter_ is not None:
        wanted = list(filter_) if pd.api.types.is_list_like(filter_) else [filter_]
        dvr = dvr[dvr["element"].isin(wanted)]
    return dvr.reset_index(drop=True)





In [None]:
# Build the cumulative DVR with all default arguments; `base_freq` must already exist in the kernel.
cumulative_dvr(base_freq)

In [4]:
def non_cumulative_dvr(
    base_freq: pd.DataFrame,
    subcorpus: Subcorpus = "full",
    freq: Literal["MS", "D", "W"] = "MS",
    start_date: str = "1990-01-01",
    end_date: str = "2020-07-01",
    filter_: pd.Series | None = None,
) -> pd.DataFrame:
    """Build one DVR per time window and stack them into a single frame.

    Args:
        base_freq: Frequency table forwarded to ``freq_window``.
        subcorpus: Subcorpus label forwarded to ``freq_window`` and stamped
            on every row.
        freq: Window frequency: ``"MS"`` (monthly), ``"W"`` (weekly) or
            ``"D"`` (daily).
        start_date: Start of the overall range, ``YYYY-MM-DD``.
        end_date: End of the overall range, ``YYYY-MM-DD``.
        filter_: Optional element (or list-like of elements) to keep in each
            window. ``None``, an empty list-like or a falsy scalar keeps
            every element.

    Returns:
        All per-window DVRs concatenated, each row carrying the window's
        first date in ``date``, with a fresh 0..n-1 index.
    """
    s = pd.to_datetime(start_date, format="%Y-%m-%d")
    e = pd.to_datetime(end_date, format="%Y-%m-%d")

    # Normalise the filter once. `if filter_:` is ambiguous (raises) for a
    # pandas Series, which is the annotated type.
    if filter_ is None:
        wanted = []
    elif pd.api.types.is_list_like(filter_):
        wanted = list(filter_)
    else:
        wanted = [filter_] if filter_ else []

    window_starts = pd.date_range(s, e, freq=freq)
    # freq[:1] turns "MS" into "M" (month end); for "D" and "W" it is the
    # same frequency again, so both ranges are identical.
    # NOTE(review): the two ranges are paired positionally, which lines up
    # month starts with month ends only when start_date falls on the first
    # day of a month.
    window_ends = pd.date_range(s, e, freq=freq[:1])
    # Only weekly and monthly windows are handed to freq_window as a range.
    conditional_dir = {"direction": "range"} if freq in ("W", "MS") else {}

    frames = []
    for dr in zip(window_starts, window_ends):
        if freq == "W":
            # Weekly window: the six days before the anchor date through the anchor.
            dr = dr[0] - pd.Timedelta("6D"), dr[0]
        freq_df = freq_window(
            base_freq, dr, False, subcorpus=subcorpus, **conditional_dir
        )
        dvr = LPA.create_dvr(freq_df).assign(**{"date": dr[0], "subcorpus": subcorpus})
        if wanted:
            dvr = dvr[dvr["element"].isin(wanted)]
        frames.append(dvr)
    return pd.concat(frames).reset_index(drop=True)


In [5]:
def kld_single(ncdvr, subcorpus):
    """Compute a KLD value between consecutive dates of a windowed DVR.

    Args:
        ncdvr: Long-format frame with ``element``, ``date`` and
            ``global_weight`` columns (one row per element per date).
        subcorpus: Label written to the ``subcorpus`` column of the result.

    Returns:
        Frame with ``date``, ``KLD`` and ``subcorpus`` columns. The first
        date is dropped from the labels, so the index starts at 1.
    """
    weights = ncdvr.pivot(
        index="element", columns="date", values="global_weight"
    ).fillna(0)
    # Take the labels from the pivot's own (date-ordered) columns so they
    # always line up with the columns of the matrix below, even when `ncdvr`
    # is not sorted by date.
    date_col = weights.columns.to_series().reset_index(drop=True)[1:]
    # KLD_divergence_consecutive is applied to each element's row; presumably
    # it yields one value per pair of adjacent dates (TODO confirm), and the
    # per-element values are then summed for each date.
    kld_res = np.sum(
        np.apply_along_axis(KLD_divergence_consecutive, 1, weights.to_numpy()),
        axis=0,
    )
    return pd.DataFrame(
        {
            "date": date_col,
            "KLD": kld_res,
            "subcorpus": subcorpus,
        }
    )

def entropy_single(base_freq, *dvr_args):
    """Compute a summed entropy value for every window of a non-cumulative DVR.

    Args:
        base_freq: Frequency table forwarded to ``non_cumulative_dvr``.
        *dvr_args: Remaining positional arguments for ``non_cumulative_dvr``
            (subcorpus, freq, start_date, end_date, ...).

    Returns:
        Frame with one row per window date and columns ``date`` and
        ``entropy``.
    """
    windowed = non_cumulative_dvr(base_freq, *dvr_args)
    weights = windowed.pivot(
        index="element", columns="date", values="global_weight"
    ).fillna(0)
    # Sum the per-element values returned by `entropy` down each date column.
    entropy_per_date = np.sum(entropy(weights.to_numpy()), axis=0)
    return pd.DataFrame(
        {
            # Labels come from the pivot's own columns so they stay aligned
            # with the matrix regardless of the row order of `windowed`.
            "date": weights.columns.to_series().reset_index(drop=True),
            "entropy": entropy_per_date,
        }
    )


Weekly

In [7]:
def show_kldf(kldf, rule_value):
    """Bar chart of KLD per date with a horizontal rule at ``rule_value``.

    Bars are coloured by whether their KLD is below ``rule_value``.

    Args:
        kldf: Frame with ``date`` and ``KLD`` columns. It is not modified.
        rule_value: Threshold used for bar colouring and the red rule.

    Returns:
        Layered Altair chart (bars + rule).
    """
    # Work on a derived frame so the caller's `kldf` does not silently gain
    # `color` / `y` columns.
    plot_df = kldf.assign(color=kldf["KLD"] < rule_value, y=rule_value)
    bars = (
        alt.Chart(plot_df)
        .mark_bar()
        .encode(x="date:O", y="KLD", color="color", tooltip=alt.Tooltip("date:T"))
    )
    rule = alt.Chart(plot_df).mark_rule(color="red").encode(y="y")
    return bars + rule


In [8]:
# ncdvr = non_cumulative_dvr(base_freq, "mainstream", "W", "2019-11-01", "2020-07-04")
# display(ncdvr)
def _low_kld_runs(kldf, cutoff):
    """Return the rows with KLD below ``cutoff`` and the (first, last) index labels of each run of two or more consecutive labels."""
    low = kldf[kldf["KLD"] < cutoff]
    runs = [
        (min(run), max(run))
        for run in (list(group) for group in consecutive_groups(low.index))
        if len(run) > 1
    ]
    return low, runs


def squeeze(base_freq, ncdvr, subcorpus):
    """Repeatedly merge adjacent low-KLD windows until none are consecutive.

    The cutoff is the median KLD of the initial windows and stays fixed.
    On each pass the first two dates of the first run of consecutive
    below-cutoff windows are merged into one window via ``squeeze_freq``.
    A chart is displayed and saved for the initial state and after every
    merge, and the current KLD table is written to CSV after every merge.

    Args:
        base_freq: Base frequency table, forwarded to ``squeeze_freq``.
        ncdvr: Windowed DVR as produced by ``non_cumulative_dvr``.
        subcorpus: Subcorpus label; also used in the output paths.

    Returns:
        The (possibly merged) windowed DVR. If there is nothing to merge,
        the input ``ncdvr`` is returned unchanged.
    """
    kldf = kld_single(ncdvr, subcorpus)
    cutoff = kldf["KLD"].median()
    chart = show_kldf(kldf, rule_value=cutoff)
    display(chart)
    # NOTE(review): this path uses "bar_chart" while the loop below writes to
    # "bar_charts"; confirm which directory is intended.
    chart.save(f"results/{subcorpus}/bar_chart/bar_iter_0.html")

    low_xentropy, groups = _low_kld_runs(kldf, cutoff)
    iter_ = 1
    while groups:
        first_label = groups[0][0]
        date = low_xentropy.loc[first_label, "date"]
        # Runs hold at least two consecutive labels, so first_label + 1 exists.
        next_date = low_xentropy.loc[first_label + 1, "date"]
        sq = squeeze_freq(base_freq, date, next_date, subcorpus)
        split_dvr = ncdvr[~ncdvr["date"].isin((date, next_date))]
        ncdvr = (
            pd.concat([split_dvr, sq])
            .sort_values(["date", "global_weight"], ascending=[True, False])
            .reset_index(drop=True)
        )
        kldf = kld_single(ncdvr, subcorpus)
        kldf.to_csv(f"results/{subcorpus}/final_kldf.csv", index=False)
        chart = show_kldf(kldf, rule_value=cutoff)
        display(chart)
        chart.save(f"results/{subcorpus}/bar_charts/bar_iter_{iter_}.html")
        low_xentropy, groups = _low_kld_runs(kldf, cutoff)
        iter_ += 1
    # Reached both after the final merge and when there was never anything to
    # merge; the latter case previously fell through and returned None.
    return ncdvr


def squeeze_freq(base_freq, min_date, max_date, subcorpus):
    """Collapse one subcorpus' frequencies in [min_date, max_date] into a single window.

    Rows of ``base_freq`` for ``subcorpus`` whose ``date`` lies in the
    inclusive range are summed per ``element``. The result is stamped with
    ``min_date`` and ``subcorpus``, given a ``global_weight`` (share of the
    total frequency) and sorted by descending frequency.
    """
    in_subcorpus = base_freq[base_freq["subcorpus"] == subcorpus]
    in_window = in_subcorpus["date"].between(min_date, max_date)
    totals = in_subcorpus[in_window].groupby("element", as_index=False)[
        "frequency_in_category"
    ].sum()
    merged = totals.assign(
        date=min_date,
        subcorpus=subcorpus,
        global_weight=lambda frame: frame["frequency_in_category"]
        / sum(frame["frequency_in_category"]),
    )
    return merged.sort_values(by=["date", "frequency_in_category"], ascending=False)


# squeezed_freq = squeeze(base_freq, ncdvr, "mainstream")


In [9]:
def create_and_cut(squeezed_freq, subcorpus):
    """Run LPA on the squeezed windows and write per-window signatures.

    A DVR is built as the per-element mean ``global_weight`` across
    windows, normalised to sum to 1, and saved to
    ``results/{subcorpus}/dvr.csv``. Each window (``date``) is then treated
    as an LPA category and the resulting signatures are written to
    ``results/{subcorpus}/sigs/sigs_<YYYY-MM-DD>.csv``.

    Args:
        squeezed_freq: Windowed DVR with ``element``, ``date``,
            ``global_weight`` and ``frequency_in_category`` columns.
        subcorpus: Subcorpus label used in every output path.

    Returns:
        The frame returned by ``LPA.cut``.
    """
    mean_weight = squeezed_freq.groupby("element")["global_weight"].mean()
    dvr = (mean_weight / mean_weight.sum()).reset_index()
    dvr.to_csv(f"results/{subcorpus}/dvr.csv", index=False)

    pvr = squeezed_freq.rename(
        columns={"date": "category", "global_weight": "local_weight"}
    ).drop(columns=["frequency_in_category"])
    lpa = LPA(dvr, epsilon_frac=2)
    lpa.create_arrays(pvr)
    frequency = pvr.sort_values("category").reset_index(drop=True)
    distances = lpa.create_distances(frequency)
    sigs = lpa.add_overused(distances)
    cut = lpa.cut(sigs, sig_length=None)

    for category, group in cut.groupby("category"):
        # Use the `subcorpus` argument here rather than the notebook-level
        # loop variable `sc`, so the function works outside that loop too.
        group.to_csv(
            f"results/{subcorpus}/sigs/sigs_{category.strftime('%Y-%m-%d')}.csv",
            index=False,
        )
    return cut


In [12]:
# Weekly pipeline, run once per subcorpus:
# windowed DVR -> merge adjacent low-KLD windows -> LPA signatures.
base_freq = create_freq()
for sc in ("mainstream", "conspiracy"): # MS!!!
    ncdvr = non_cumulative_dvr(base_freq, sc, "W", "2019-11-01", "2020-07-04")
    squeezed_freq = squeeze(base_freq, ncdvr, sc)
    sigs = create_and_cut(squeezed_freq, sc)

Unnamed: 0,category,element,KL,overused
0,2019-10-28,harry,0.007756,True
1,2019-10-28,covid-19,0.007466,False
2,2019-10-28,diana,0.007192,True
3,2019-10-28,missouri,0.006672,True
4,2019-10-28,nhs,0.006369,True
...,...,...,...,...
14495,2020-06-22,sars-cov,0.000231,True
14496,2020-06-22,emergency use authorization,0.000231,True
14497,2020-06-22,mba,0.000231,True
14498,2020-06-22,up,0.000230,True


In [11]:
# Monthly pipeline for the mainstream subcorpus over 2010-01 .. 2020-07.
ncdvr = non_cumulative_dvr(base_freq, "mainstream", "MS", "2010-01-01", "2020-07-01")
squeezed_freq = squeeze(base_freq, ncdvr, "mainstream")
# create_and_cut requires the subcorpus as its second argument.
cut = create_and_cut(squeezed_freq, "mainstream")
cut

In [13]:
# Display `df`; it is not assigned in any earlier cell of this notebook, so it must already exist in the kernel.
df

Unnamed: 0,category_x,element,KL_x,overused_x,category_y,KL_y,overused_y
0,2020-05-03,covid-19,0.0247545294318054,True,2020-06-07,0.0236721469551219,True
1,2020-05-03,u.s.,0.0077847321718413,True,2020-06-07,0.0046328563918316,True
2,2020-05-03,china,0.0058893700202807,True,2020-06-07,0.0082269371520048,True
3,2020-05-03,april,0.0056656690648523,True,2020-06-07,0.0029307336271205,True
4,2020-05-03,trump,0.0051293791092194,True,2020-06-07,0.0044280754646883,True
...,...,...,...,...,...,...,...
260,2020-05-03,european countries,0.0002127386167668,True,2020-06-07,0.0002758143430449,True
261,2020-05-03,thanathorn,0.0002118184615086,False,2020-06-07,0.0002118184615086,False
262,2020-05-03,u.s,0.0002097363258914,True,2020-06-07,0.0002156473391979,True
263,2020-05-03,fonda,0.0002083983089587,False,2020-06-07,0.0002083983089587,False


In [174]:
# Load every per-period signature file of the 10-year mainstream run.
dfl = [
    pd.read_csv(p)
    for p in glob("results/sigs/10_year_mainstream/normalized_mean*.csv")
]

# Elements whose KL exceeds 0.05 in at least one period.
dfkl = pd.concat([df[df["KL"]>0.05] for df in dfl])["element"].drop_duplicates().to_list()

# Keep only those elements. `.copy()` makes the column assignments below write
# to an independent frame instead of a slice of the concatenated data.
xx = pd.concat(dfl)
xx = xx[xx["element"].isin(dfkl)].copy()
# Sign the KL by direction: +KL when overused, -KL when underused.
xx["overused"] = xx["overused"].replace({False: -1, True: 1})
xx["KL"] = xx["KL"] * xx["overused"]

# Clicking a legend entry highlights that element and fades the others.
selection = alt.selection_multi(fields=["element"], bind="legend")

alt.Chart(xx).mark_area().encode(
    x="category:T",
    y=alt.Y("KL"),# stack="center", axis=None),
    color=alt.Color(
        "element",
        scale=alt.Scale(scheme="rainbow"),
        sort=alt.Sort(field="-KL"),
        legend=alt.Legend(columns=2, labelLimit=1000),
    ),
    tooltip=alt.Tooltip(["element", "KL"]),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
).properties(width=900).add_selection(selection) #.save("./10_year_mainstream.html")


In [178]:
# Signed KL of "covid-19" across periods, in chronological order.
xx.loc[xx["element"].eq("covid-19")].sort_values(by="category")

Unnamed: 0,category,element,KL,overused
498,2017-01-01,covid-19,-0.000341,-1
484,2018-11-01,covid-19,-0.000341,-1
461,2019-04-01,covid-19,-0.000341,-1
465,2019-06-01,covid-19,-0.000341,-1
454,2019-07-01,covid-19,-0.000341,-1
474,2019-12-01,covid-19,-0.000341,-1
4,2020-02-01,covid-19,0.007671,1
0,2020-03-01,covid-19,0.042864,1
0,2020-04-01,covid-19,0.059453,1
0,2020-05-01,covid-19,0.055873,1


In [176]:
# Rows flagged as underused (overused == -1 after the sign recoding).
xx.loc[xx["overused"].eq(-1)]

Unnamed: 0,category,element,KL,overused
267,2018-11-01,appellant,-0.000569,-1
303,2018-11-01,lovelock,-0.000510,-1
466,2018-11-01,levinson,-0.000353,-1
484,2018-11-01,covid-19,-0.000341,-1
274,2018-08-01,appellant,-0.000569,-1
...,...,...,...,...
271,2016-08-01,appellant,-0.000569,-1
300,2016-08-01,lovelock,-0.000510,-1
265,2019-01-01,appellant,-0.000569,-1
310,2019-01-01,lovelock,-0.000510,-1


In [170]:
# Line-chart version of the signed-KL plot: one line per element over the period (`category`).
# Clicking a legend entry keeps that element opaque and fades the others to 0.2.
selection = alt.selection_multi(fields=["element"], bind="legend")

alt.Chart(xx).mark_line().encode(
    x="category:T",
    y=alt.Y("KL"),
    color=alt.Color(
        "element",
        scale=alt.Scale(scheme="rainbow"),
        # sort=alt.Sort(field="-KL"),
        legend=alt.Legend(columns=2, labelLimit=1000),
    ),
    tooltip=alt.Tooltip(["element", "KL"]),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
).properties(width=900).add_selection(selection)

In [44]:
from glob import glob

# Read the first four matching signature files as all-string frames, each paired
# with its position `i`, which is later used as the merge suffix.
x = [
    (i, pd.read_csv(p).astype(str))
    for i, p in enumerate(
        glob("results/sigs/normalized_mean_mainstream/normalized_mean_sigs*.csv")
    )
    if i < 4
]
# df = pd.merge(x[0].head(20), x[1].head(20), how="outer", on="element")

len(x)  # df.head(10)
# df["source"] = df["category_x"] + " " + df["element"]
# df["target"] =  df["category_y"] + " " + df["element"]
# df.rename("")
# df = pd.merge(df, x[2].head(20), how="outer", on="element", suffixes=("","_z"))
# df
# Fold the frames into one wide table with successive outer merges on `element`.
# The accumulator is a (last_position, merged_frame) tuple; `[1]` extracts the frame.
# NOTE(review): merge suffixes are only applied to column names that collide, so
# which frames end up with a numeric suffix depends on how many files are merged;
# re-check the column names if the `i < 4` limit changes.
reduce(
    lambda left, right: (right[0], pd.merge(
        left[1], right[1], on="element", how="outer", suffixes=(left[0], right[0])
    )),
    x,
)[1]
# df[["source", "target"]]
# display(xx)
# xx.to_csv("test.csv", index=False)


Unnamed: 0,category0,element,KL0,overused0,category1,KL1,overused1,category2,KL2,overused2,category3,KL3,overused3
0,2020-05-03,covid-19,0.0247545294318054,True,2020-06-07,0.0236721469551219,True,2020-06-28,0.0270670840480662,True,2020-01-05,0.0089595908569335,False
1,2020-05-03,u.s.,0.0077847321718413,True,2020-06-07,0.0046328563918316,True,2020-06-28,0.0041177981828556,True,2020-01-05,0.0061404200512852,True
2,2020-05-03,china,0.0058893700202807,True,2020-06-07,0.0082269371520048,True,2020-06-28,0.0034936929335136,True,2020-01-05,0.0005911206281374,True
3,2020-05-03,april,0.0056656690648523,True,2020-06-07,0.0029307336271205,True,2020-06-28,0.0037463484377562,True,2020-01-05,0.0002919784727251,True
4,2020-05-03,trump,0.0051293791092194,True,2020-06-07,0.0044280754646883,True,2020-06-28,0.0050119416450989,True,2020-01-05,0.0048770121683603,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269,,welts,,,,,,,,,2020-01-05,0.0002813524117464,True
1270,,kyoto protocol,,,,,,,,,2020-01-05,0.000280975177699,True
1271,,fahrenheit,,,,,,,,,2020-01-05,0.0002803480823618,True
1272,,selma,,,,,,,,,2020-01-05,0.0002803071210553,True


In [11]:
# KLD per subcorpus plotted against the row `index` column.
# `new_full` is not created in any cell of this notebook, so it must already exist in the kernel.
chart = alt.Chart(new_full).mark_line().encode(
    x="index:O", y="KLD", color="subcorpus", tooltip=alt.Tooltip("date:T")
).properties(width=900)
chart

In [None]:
# Weekly windowed DVR for the mainstream subcorpus (2019-11-01 .. 2020-07-04).
ncdvr = non_cumulative_dvr(base_freq, "mainstream", "W", "2019-11-01", "2020-07-04")


Daily

In [20]:
# Entropy over time for both subcorpora, drawn as one line per subcorpus.
# NOTE(review): this cell sits under the "Daily" heading but requests weekly ("W") windows.
main_entropy = entropy_single(base_freq, "mainstream", "W", "2019-11-01", "2020-07-04").assign(**{"subcorpus": "mainstream"})
cons_entropy = entropy_single(base_freq, "conspiracy", "W", "2019-11-01", "2020-07-04").assign(**{"subcorpus": "conspiracy"})
full_ = pd.concat([main_entropy, cons_entropy])
alt.Chart(full_).mark_line().encode(
    x="date:T", y="entropy", color="subcorpus", tooltip=alt.Tooltip("date:T")
).properties(width=900)

In [33]:
# Weekly windowed DVR for the conspiracy subcorpus; rebinds `ncdvr`.
ncdvr = non_cumulative_dvr(base_freq, "conspiracy", "W", "2019-11-01", "2020-07-04")


Unnamed: 0,element,frequency_in_category,global_weight,date
256818,covid-19,74,0.013314,2020-05-31
256819,u.s.,62,0.011155,2020-05-31
256820,america,39,0.007017,2020-05-31
256821,facebook,22,0.003958,2020-05-31
256822,bill gates,20,0.003598,2020-05-31
256823,gates,19,0.003418,2020-05-31
256824,trump,19,0.003418,2020-05-31
256825,tech,19,0.003418,2020-05-31
256826,illegal aliens,18,0.003239,2020-05-31
256827,soros,18,0.003239,2020-05-31


In [43]:
# Total element frequency per window, to check how much data each window holds.
ncdvr.groupby("date")["frequency_in_category"].sum()
# ncdvr[ncdvr["date"]==pd.to_datetime("2020-02-23")].head(20)

date
2019-11-03    12344
2019-11-10     8018
2019-11-17     8987
2019-11-24     7783
2019-12-01     8632
2019-12-08     9652
2019-12-15    10029
2019-12-22    10643
2019-12-29     9448
2020-01-05     8796
2020-01-12    11626
2020-01-19     7855
2020-01-26    10787
2020-02-02    12242
2020-02-09    14662
2020-02-16    10716
2020-02-23    13905
2020-03-01    12883
2020-03-08    12705
2020-03-15    11210
2020-03-22    13601
2020-03-29    15281
2020-04-05    18436
2020-04-12    22551
2020-04-19    23414
2020-04-26    24415
2020-05-03    23958
2020-05-10    26772
2020-05-17    45723
2020-05-24    35023
2020-05-31     5558
2020-06-07     2644
2020-06-14     1145
2020-06-21     1458
2020-06-28     1150
Name: frequency_in_category, dtype: int64

In [15]:
def plot_seasonal_decompose(kldf, arg1="observed", arg2="trend", period=None):
    """Plot two components of an additive seasonal decomposition.

    Args:
        kldf: Frame with a ``date`` column plus the series to decompose.
        arg1: Name of the first ``seasonal_decompose`` result attribute to
            plot (e.g. ``"observed"``, ``"trend"``, ``"seasonal"``,
            ``"resid"``).
        arg2: Name of the second result attribute to plot.
        period: Optional seasonal period forwarded to ``seasonal_decompose``.
            ``None`` keeps the previous behaviour, where statsmodels infers
            the period from the index frequency.

    Returns:
        Altair line chart with one line per selected component.

    Raises:
        ValueError: Raised by ``seasonal_decompose`` when the series is
            shorter than two complete cycles of the period in use.
    """
    ts = kldf.set_index("date")
    result = seasonal_decompose(ts, model="additive", period=period)
    components = (
        getattr(result, arg1)
        # Label the first component with its real name instead of always "observed".
        .to_frame(name=arg1)
        .join(getattr(result, arg2))
        .reset_index()
        .melt(id_vars="date")
    )
    return alt.Chart(components).mark_line().encode(
        x="date:T", y="value", color="variable", tooltip=alt.Tooltip("date:T")
    ).properties(width=900)

# NOTE: as recorded in this cell's output, this call raised
# "ValueError: x must have 2 complete cycles requires 104 observations" because `kldf` had only 34 rows.
plot_seasonal_decompose(kldf)

ValueError: x must have 2 complete cycles requires 104 observations. x only has 34 observation(s)

In [None]:
# Monthly (default "MS") windowed DVRs per subcorpus and for the full corpus.
# `non_cumulative_dvr` requires the base frequency table as its first argument,
# so pass `base_freq` explicitly, as the daily cell does.
monthly_conspiracy = non_cumulative_dvr(base_freq, "conspiracy")
monthly_mainstream = non_cumulative_dvr(base_freq, "mainstream")
monthly_conspiracy["subcorpus"] = "conspiracy"
monthly_mainstream["subcorpus"] = "mainstream"
monthly = non_cumulative_dvr(base_freq)

In [None]:
# Daily windowed DVRs over 2019-11-01 .. 2020-07-04, per subcorpus and for the full corpus.
daily_args = ("D", "2019-11-01", "2020-07-04")
daily_conspiracy = non_cumulative_dvr(base_freq, "conspiracy", *daily_args)
daily_mainstream = non_cumulative_dvr(base_freq, "mainstream", *daily_args)
# The pivot in kld_double uses this column to tell the two frames apart.
daily_conspiracy["subcorpus"] = "conspiracy"
daily_mainstream["subcorpus"] = "mainstream"
daily = non_cumulative_dvr(base_freq, "full", *daily_args)

In [133]:
def kld_double(dvr_1, dvr_2, epsilon=0):
    """KLD between two windowed DVRs for every date present in both.

    For each shared date the two frames are stacked and pivoted to an
    element x subcorpus weight table. Each column is divided by
    ``1 + (number of missing elements in that column) * epsilon``, the
    missing weights are then filled with ``epsilon``, and
    ``KLD_divergence_consecutive`` is applied per element row and summed.

    Returns:
        Frame with ``date`` and ``KLD`` columns, ordered by date.
    """
    shared_dates = set(dvr_1["date"].drop_duplicates().tolist()).intersection(
        dvr_2["date"].drop_duplicates().tolist()
    )
    dates, klds = [], []
    for day in sorted(shared_dates):
        stacked = pd.concat(
            [dvr_1[dvr_1["date"] == day], dvr_2[dvr_2["date"] == day]]
        )
        weights = stacked.pivot(
            index="element", columns="subcorpus", values="global_weight"
        )
        missing_per_column = weights.isna().sum()
        smoothed = (
            (weights / (1 + missing_per_column * epsilon)).fillna(epsilon).to_numpy()
        )
        per_element = np.apply_along_axis(KLD_divergence_consecutive, 1, smoothed)
        dates.append(day)
        klds.append(np.sum(per_element, axis=0)[0])
    return pd.DataFrame({"date": dates, "KLD": klds})


In [161]:
# Daily KLD between the mainstream and conspiracy DVRs; epsilon=2e-5 replaces missing element weights.
comparative_kldf = kld_double(daily_mainstream, daily_conspiracy, epsilon=2e-5)
alt.Chart(comparative_kldf).mark_line().encode(
    x="date:T", y="KLD", tooltip=alt.Tooltip("date:T")
).properties(width=900)

In [162]:
# Decompose the comparative KLD series and plot observed vs. trend.
ts = comparative_kldf.set_index("date")
# Reindex onto a gap-free daily calendar; days without a KLD value become 0.
idx = pd.date_range("2019-11-01", "2020-07-01")
ts.index = pd.DatetimeIndex(ts.index)
ts = ts.reindex(idx, fill_value=0)
result = seasonal_decompose(ts) #, model='additive', period=241)
# Rebinds `result` from the decomposition object to a long-format frame for plotting.
result = result.observed.to_frame(name="observed").join(result.trend).reset_index().melt(id_vars='index')
alt.Chart(result).mark_line().encode(
    x="index:T", y="value", color="variable", tooltip=alt.Tooltip("index:T")
).properties(width=900)

# result.resid.plot()
# result.observed.plot()

In [31]:
def before_and_after(dvr: pd.DataFrame, date: str) -> None:
    """Display the first 20 DVR rows for the day before ``date`` and for ``date`` itself.

    Args:
        dvr: Frame with a ``date`` column.
        date: Any value ``pd.to_datetime`` can parse.
    """
    day = pd.to_datetime(date)
    previous_day = day - pd.to_timedelta("1D")
    display(dvr[dvr["date"] == previous_day].head(20))
    display(dvr[dvr["date"] == day].head(20))


### Visualization
Create a visualization of change over time in a random sample of words:

In [None]:
def word_over_time(
    full_dvr: pd.DataFrame, word: List[str] | str, save: bool = False, cumulative: bool = True
) -> alt.Chart:
    """Plot the ``global_weight`` of one or more elements over time.

    Args:
        full_dvr: DVR with ``element``, ``date`` and ``global_weight``
            columns (intended to be the full cumulative DVR).
        word: A single element or a collection of elements; matching is
            exact (``isin``), not by substring.
        save: When true, also write the chart to ``results/random.html``.
        cumulative: Draw lines when true, bars otherwise.

    Returns:
        The Altair chart, with a legend-bound selection that fades the
        elements that are not selected.
    """
    # A bare string is one element; anything else is treated as a collection.
    words = [word] if isinstance(word, str) else list(word)
    dvr = full_dvr[full_dvr["element"].isin(words)]
    selection = alt.selection_multi(fields=["element"], bind="legend")
    base = alt.Chart(dvr)
    chart = base.mark_line() if cumulative else base.mark_bar()
    chart = (
        chart.encode(
            x=alt.X("date:T"),
            y="global_weight",
            color=alt.Color("element", legend=alt.Legend(columns=3, symbolLimit=300)),
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        )
        .add_selection(selection)
        .properties(width=900)
    )
    if save:
        chart.save("results/random.html")
    return chart

# Elements are matched exactly, so this only selects rows whose element is the literal string "vaccin".
word_over_time(full_dvr, "vaccin")


### Time series analysis of one word

In [None]:
# Build a gap-free monthly series of the frequency of "world" from 1990-01 to 2022-01.
dr = pd.date_range(
    pd.to_datetime("1990-01-01", format="%Y-%m-%d"),
    pd.to_datetime("2022-01-01", format="%Y-%m-%d"),
    freq="MS",
)
s = dr.to_series(name="date")
df = full_dvr[full_dvr["element"] == "world"]
# Outer-merge against the month calendar so months without data appear, then fill them with 0.
ts = (
    pd.merge(s, df, right_on="date", left_index=True, how="outer")[
        "frequency_in_category"
    ]
    .fillna(0)
    .reset_index(drop=True)
)
# NOTE(review): assigning `dr` as the index assumes the merged rows are in calendar order
# with exactly one row per month; verify against `full_dvr`.
ts.index = dr
ts
# ts = df.set_index("date")['frequency_in_category']


Seasonal decomposition of time series data

In [None]:
# Additive decomposition of the monthly series; each component is plotted in turn.
result = seasonal_decompose(ts, model='additive')
result.trend.plot()
result.seasonal.plot()
result.resid.plot()
result.observed.plot()

Augmented Dickey-Fuller (ADF) test for time series data:

In [None]:
# Expanding-window ADF p-value (element [1] of the adfuller result) per element, from 4 observations on.
x = full_dvr.groupby("element")["global_weight"].expanding(4).apply(lambda x: adfuller(x)[1])
# Full ADF result for each element's complete series.
# NOTE(review): `cum_dvr` is not created in any cell of this notebook; it must already exist in the kernel.
for e in cum_dvr["element"].drop_duplicates().tolist():
    print(e, adfuller(cum_dvr[cum_dvr["element"] == e]["global_weight"]))

### Mutual Information

In [None]:
# Total global_weight across all elements for 1990-02-01.
sum(full_dvr[full_dvr["date"] == "1990-02-01"]["global_weight"])

In [None]:
# NOTE(review): this calls `full_dvr` as a function and rebinds the name to its result,
# while other cells index `full_dvr` as a DataFrame; confirm which builder was intended here.
full_dvr = full_dvr(base_freq, "iraq")
# P = actual Q = model
P = full_dvr["global_weight"]
# Q is the running (expanding) mean of P up to and including each row.
Q = full_dvr["global_weight"].expanding(1).mean()

In [None]:
# Plot the pointwise term P * log(P / Q) against a monthly calendar (1990-09 .. 2021-01).
# NOTE(review): the date range and the P/Q series must have matching lengths; verify against `full_dvr`.
pd.DataFrame.from_dict(
    {
        "date": pd.date_range(
            pd.to_datetime("1990-09-01", format="%Y-%m-%d"),
            pd.to_datetime("2021-01-01", format="%Y-%m-%d"),
            freq="MS",
        ),
        "kld": (P * np.log(P / Q)),
    }
).plot(x="date", y="kld")


In [None]:
# Display the current `full_dvr`.
full_dvr