In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import altair as alt
from typing import Literal
from more_itertools import consecutive_groups
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

# Turn off Altair's row-count limit so charts built from the full KLD frames
# are not rejected for being too large.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
from LPA import LPA
from algo import KLD_distance_consecutive
from LOCO_LPA import create_freq, freq_window
from LOCO_LPA import Subcorpus

In [3]:
def non_cumulative_dvr(
    base_freq: pd.DataFrame,
    subcorpus: Subcorpus = "full",
    freq: Literal["MS", "D", "W"] = "MS",
    start_date: str = "1990-01-01",
    end_date: str = "2020-07-01",
    filter_: pd.Series | None = None,
) -> pd.DataFrame:
    """Build one DVR per time window and stack them into a single frame.

    The span ``[start_date, end_date]`` is cut into windows of frequency
    ``freq``. For every window the matching slice of ``base_freq`` is obtained
    through ``freq_window`` and turned into a DVR with ``LPA.create_dvr``; each
    DVR is tagged with the window's start date and the subcorpus name.

    Args:
        base_freq: Frequency table handed to ``freq_window``.
        subcorpus: Subcorpus selector forwarded to ``freq_window`` and written
            into the ``subcorpus`` column of the result.
        freq: Window frequency: ``"MS"`` (monthly), ``"W"`` (weekly) or
            ``"D"`` (daily).
        start_date: First date of the span, formatted ``YYYY-MM-DD``.
        end_date: Last date of the span, formatted ``YYYY-MM-DD``.
        filter_: Optional element filter. A list-like (e.g. a ``pd.Series``)
            keeps the rows whose ``element`` is one of its values; a scalar
            keeps the rows whose ``element`` equals it. ``None`` keeps all.

    Returns:
        The concatenated per-window DVRs with a fresh ``RangeIndex``.

    Raises:
        ValueError: From ``pd.concat`` when the span yields no window at all.
    """
    start = pd.to_datetime(start_date, format="%Y-%m-%d")
    end = pd.to_datetime(end_date, format="%Y-%m-%d")

    # Loop-invariant: only weekly and monthly windows are passed to
    # ``freq_window`` as an explicit date range.
    window_kwargs = {"direction": "range"} if freq in ("W", "MS") else {}

    # Window starts use ``freq`` itself; window ends use its first letter
    # ("MS" -> "M", i.e. month ends). ``zip`` stops at the shorter sequence,
    # so a trailing window without a matching end is dropped.
    window_starts = pd.date_range(start, end, freq=freq)
    window_ends = pd.date_range(start, end, freq=freq[:1])

    frames = []
    for window in zip(window_starts, window_ends):
        if freq == "W":
            # Weekly windows are the seven days ending on the weekly anchor.
            window = window[0] - pd.Timedelta("6D"), window[0]
        # NOTE(review): the meaning of the positional ``False`` flag is defined
        # by ``freq_window`` and is not visible here.
        freq_df = freq_window(
            base_freq, window, False, subcorpus=subcorpus, **window_kwargs
        )
        dvr = LPA.create_dvr(freq_df).assign(
            **{"date": window[0], "subcorpus": subcorpus}
        )
        # ``if filter_:`` raised "truth value of a Series is ambiguous" for the
        # documented Series input, so test for None explicitly.
        if filter_ is not None:
            if pd.api.types.is_list_like(filter_):
                # Membership test; ``==`` against a Series would align on the
                # index instead of matching values.
                dvr = dvr[dvr["element"].isin(filter_)]
            else:
                dvr = dvr[dvr["element"] == filter_]
        frames.append(dvr)

    return pd.concat(frames).reset_index(drop=True)


In [4]:
def kld_single(ncdvr, subcorpus):
    """Compute one summed KLD value per date step of a stacked DVR frame.

    ``ncdvr`` is pivoted into an element-by-date matrix of ``global_weight``
    (missing combinations become 0). ``KLD_distance_consecutive`` is applied
    to every element row and the per-row results are summed over elements.

    Args:
        ncdvr: Frame with ``element``, ``date`` and ``global_weight`` columns.
        subcorpus: Label written into the ``subcorpus`` column of the result.

    Returns:
        DataFrame with columns ``date``, ``KLD`` and ``subcorpus``. The first
        distinct date is skipped, so the index labels start at 1.
    """
    weights_by_date = ncdvr.pivot(
        index="element", columns="date", values="global_weight"
    )
    weight_matrix = weights_by_date.fillna(0).to_numpy()

    # presumably yields one value per pair of neighbouring dates for each
    # element row -- confirm against algo.KLD_distance_consecutive
    per_element = np.apply_along_axis(KLD_distance_consecutive, 1, weight_matrix)
    kld_per_step = per_element.sum(axis=0)

    # Dates are taken in order of first appearance; callers are expected to
    # pass ``ncdvr`` sorted by date so they line up with the pivot columns.
    distinct_dates = ncdvr["date"].drop_duplicates().reset_index(drop=True)
    step_dates = distinct_dates.iloc[1:]

    return pd.DataFrame(
        {"date": step_dates, "KLD": kld_per_step, "subcorpus": subcorpus}
    )


In [5]:
def show_kldf(kldf, rule_value):
    """Draw the KLD series as a bar chart with a horizontal threshold line.

    Bars are coloured by whether their ``KLD`` lies strictly below
    ``rule_value``; a red rule marks ``rule_value`` itself. The caller's frame
    is not modified because the function works on a deep copy.

    Args:
        kldf: Frame with ``date`` and ``KLD`` columns.
        rule_value: Threshold used for both the colouring and the rule.

    Returns:
        The layered Altair chart (bars + rule).
    """
    plot_df = deepcopy(kldf)
    below_threshold = (plot_df["KLD"] < rule_value).to_numpy()
    plot_df["color"] = below_threshold

    x_encoding = alt.X("date:O", axis=alt.Axis(labels=False))
    y_encoding = alt.Y("KLD", scale=alt.Scale(domain=[0, 1]))
    color_encoding = alt.Color("color", legend=None)
    bars = (
        alt.Chart(plot_df)
        .mark_bar()
        .encode(x=x_encoding, y=y_encoding, color=color_encoding)
        .properties(height=250)
    )

    # Constant column that the rule layer reads its y position from.
    plot_df["y"] = rule_value
    threshold_line = alt.Chart(plot_df).mark_rule(color="red").encode(y="y")

    return bars + threshold_line


In [6]:
def _low_kld_runs(low_kld):
    """Return ``(first, last)`` index labels of every run of 2+ consecutive labels."""
    runs = (list(run) for run in consecutive_groups(low_kld.index))
    return [(min(run), max(run)) for run in runs if len(run) > 1]


def squeeze(base_freq, ncdvr, subcorpus):
    """Repeatedly merge neighbouring low-KLD dates until none are adjacent.

    The cutoff is the median KLD of the initial series and stays fixed for the
    whole run. While two or more consecutive steps fall below it, the first
    such pair of dates is merged via ``squeeze_freq`` and the KLD series is
    recomputed.

    Side effects: displays a bar chart per iteration and saves it to
    ``results/{subcorpus}/bar_charts/bar_iter_{n}.html``; after every merge the
    current KLD table is written to ``results/{subcorpus}/final_kldf.csv``.

    Args:
        base_freq: Raw frequency table passed on to ``squeeze_freq``.
        ncdvr: Stacked per-date DVR frame (see ``non_cumulative_dvr``).
        subcorpus: Subcorpus name; used for filtering and for output paths.

    Returns:
        The squeezed DVR frame. If the very first pass finds nothing to merge,
        ``ncdvr`` is returned unchanged (previously this case fell through and
        returned ``None``).
    """
    kldf = kld_single(ncdvr, subcorpus)
    cutoff = kldf["KLD"].median()

    # Build the iteration-0 chart once and reuse it for display and export.
    initial_chart = show_kldf(kldf, rule_value=cutoff)
    display(initial_chart)
    initial_chart.save(f"results/{subcorpus}/bar_charts/bar_iter_0.html")

    low_xentropy = kldf[kldf["KLD"] < cutoff]
    groups = _low_kld_runs(low_xentropy)
    iter_ = 1
    while groups:
        # Only the first two dates of the first run are merged per pass; the
        # runs are recomputed afterwards because every KLD value can shift.
        first_label = groups[0][0]
        date = low_xentropy.loc[first_label, "date"]
        next_date = low_xentropy.loc[first_label + 1, "date"]

        sq = squeeze_freq(base_freq, date, next_date, subcorpus)
        split_dvr = ncdvr[~ncdvr["date"].isin((date, next_date))]
        ncdvr = (
            pd.concat([split_dvr, sq])
            .sort_values(["date", "global_weight"], ascending=[True, False])
            .reset_index(drop=True)
        )

        kldf = kld_single(ncdvr, subcorpus)
        kldf.to_csv(f"results/{subcorpus}/final_kldf.csv", index=False)
        kldf_barchart = show_kldf(kldf, rule_value=cutoff)
        display(kldf_barchart)
        kldf_barchart.save(f"results/{subcorpus}/bar_charts/bar_iter_{iter_}.html")

        low_xentropy = kldf[kldf["KLD"] < cutoff]
        groups = _low_kld_runs(low_xentropy)
        iter_ += 1

    return ncdvr


def squeeze_freq(base_freq, min_date, max_date, subcorpus):
    """Aggregate one subcorpus over a date span into a single weighted table.

    Rows of ``base_freq`` belonging to ``subcorpus`` and dated within
    ``[min_date, max_date]`` (both ends inclusive) are summed per element.

    Args:
        base_freq: Frame with ``element``, ``date``, ``subcorpus`` and
            ``frequency_in_category`` columns.
        min_date: Lower bound of the span; also the ``date`` label of every
            returned row.
        max_date: Upper bound of the span.
        subcorpus: Subcorpus to keep.

    Returns:
        DataFrame with columns ``element``, ``frequency_in_category``,
        ``date``, ``subcorpus`` and ``global_weight`` (the element's share of
        the summed frequency), sorted by descending frequency.
    """
    in_subcorpus = base_freq["subcorpus"] == subcorpus
    not_before_start = base_freq["date"] >= min_date
    not_after_end = base_freq["date"] <= max_date
    window_rows = base_freq[in_subcorpus & not_before_start & not_after_end]

    totals = window_rows.groupby("element", as_index=False).agg(
        {"frequency_in_category": "sum"}
    )
    grand_total = sum(totals["frequency_in_category"])

    labelled = totals.assign(
        date=min_date,
        subcorpus=subcorpus,
        global_weight=totals["frequency_in_category"] / grand_total,
    )
    return labelled.sort_values(
        by=["date", "frequency_in_category"], ascending=False
    )

In [7]:
def create_and_cut(squeezed_freq, subcorpus):
    """Derive a DVR from the squeezed frame, run LPA and export the signatures.

    The DVR is the per-element mean of ``global_weight`` across dates,
    renormalised to sum to 1. Each date is then treated as an LPA category.

    Side effects: writes ``results/{subcorpus}/dvr.csv`` and one
    ``results/{subcorpus}/sigs/sigs_<YYYY-MM-DD>.csv`` per category.

    Args:
        squeezed_freq: Frame with ``element``, ``date``, ``global_weight`` and
            ``frequency_in_category`` columns (the output of ``squeeze``).
        subcorpus: Subcorpus name used in the output paths.

    Returns:
        The frame returned by ``LPA.cut``.
    """
    mean_weight = squeezed_freq.groupby("element")["global_weight"].mean()
    dvr = (mean_weight / mean_weight.sum()).reset_index()
    dvr.to_csv(f"results/{subcorpus}/dvr.csv", index=False)

    # LPA naming: dates become categories, per-date weights become local weights.
    lpa_column_names = {"date": "category", "global_weight": "local_weight"}
    pvr = squeezed_freq.rename(columns=lpa_column_names).drop(
        columns=["frequency_in_category"]
    )

    lpa = LPA(dvr, epsilon_frac=2)
    # Must run before create_distances, exactly as in the original call order.
    lpa.create_arrays(pvr)
    frequency = pvr.sort_values("category").reset_index(drop=True)
    distances = lpa.create_distances(frequency)
    sigs = lpa.add_overused(distances)
    cut = lpa.cut(sigs, sig_length=None)

    for category, signature in cut.groupby("category"):
        stamp = category.strftime("%Y-%m-%d")
        signature.to_csv(f"results/{subcorpus}/sigs/sigs_{stamp}.csv", index=False)
    return cut


In [9]:
base_freq = create_freq()
# Run the full pipeline per subcorpus on monthly ("MS") windows:
# per-window DVRs -> squeeze adjacent low-KLD months -> LPA signatures.
for subcorpus in ("mainstream", "conspiracy"):
    ncdvr = non_cumulative_dvr(base_freq, subcorpus, "MS", "2019-08-01", "2020-07-04")
    squeezed_freq = squeeze(base_freq, ncdvr, subcorpus)
    sigs = create_and_cut(squeezed_freq, subcorpus)
    # A bare ``sigs`` inside a loop body renders nothing in a notebook, so
    # show a short preview explicitly; the full tables are written to CSV.
    display(sigs.head())