In [1]:
import os
from glob import glob
from typing import List, Tuple

import altair as alt
import pandas as pd

from visualize import timeline

# Turn off Altair's max-rows check so the charts below accept the full data frames.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
def most_significant(
    l,
    i,
    exclude=("laden", "osama", "marilyn", "michael jackson", "hussein"),
) -> Tuple[pd.DataFrame, List[str]]:
    """Select the elements with the largest total absolute KL across several frames.

    Parameters
    ----------
    l : list of pandas.DataFrame
        Frames to concatenate; each must provide "element" and "KL" columns.
    i : int
        Number of top-ranked elements to keep before exclusions are applied.
    exclude : iterable of str, optional
        Elements to drop from the ranked selection. Defaults to the list of
        names that was previously hard-coded in the function body.

    Returns
    -------
    (pandas.DataFrame, list)
        The rows of the concatenated input whose "element" is among the
        selected ones (index reset), and the selected elements ordered by
        descending score.

    Notes
    -----
    Exclusions are applied after the top-``i`` cut, so fewer than ``i``
    elements are returned whenever an excluded element ranks in the top ``i``.
    """
    df = pd.concat(l)
    # Score each element by the sum of |KL| over all of its rows.
    ranked = (
        df.groupby("element")["KL"]
        .apply(lambda c: c.abs().sum())
        .sort_values(ascending=False)
        .head(i)
        .index.to_list()
    )
    excluded = set(exclude)
    ms = [e for e in ranked if e not in excluded]
    return df[df["element"].isin(ms)].reset_index(drop=True), ms

### vis 1
Frequencies of the top 15 words in the dvrs

In [7]:
# One timeline per subcorpus, restricted to the 15 highest-weighted dvr elements.
for subcorpus in ("conspiracy", "mainstream"):
    # Rank the dvr table by global_weight and keep the names of the first 15 rows.
    dvr = (
        pd.read_csv(f"results/{subcorpus}/dvr.csv")
        .sort_values("global_weight", ascending=False)
        .loc[:, "element"]
        .head(15)
    )

    squeezed_freq = pd.read_csv(
        f"results/{subcorpus}/squeezed_freq.csv", parse_dates=["date"]
    )

    # Keep only the selected elements and add a "YYYY-MM" label
    # (first seven characters of the stringified date) for the x axis.
    pdf = (
        squeezed_freq.loc[squeezed_freq["element"].isin(dvr)]
        .reset_index(drop=True)
        .assign(xdate=lambda frame: frame["date"].astype(str).str[:7])
    )

    display(
        timeline(
            pdf,
            x="xdate",
            y="global_weight",
            subcorpus=subcorpus,
            stack="center",
            order=dvr.to_list(),
            name=f"vis_1_{subcorpus}_timeline",
        )
    )


### vis 2
Distances from the dvr for the most significant words

In [5]:
# One timeline per subcorpus showing, for every word column in
# max_distances.csv, its KL value in each category.
for subcorpus in ("mainstream", "conspiracy"):
    df = pd.read_csv(f"results/{subcorpus}/max_distances.csv")
    # Every column of the distances table is treated as one word to plot;
    # captured before "category" is added so it is not plotted as a word.
    ms = df.columns.to_list()
    # Category labels are derived from the sorted file names under sigs/:
    # drop the 5-character file-name prefix and the ".csv" extension.
    # os.path.basename is used instead of splitting on "/" so this also works
    # where glob returns backslash-separated paths.
    # NOTE(review): assumes the rows of max_distances.csv are in the same
    # order as the sorted sigs files — confirm against the producing script.
    df = df.assign(
        category=sorted(
            os.path.basename(p)[5:-4]
            for p in glob(f"results/{subcorpus}/sigs/*.csv")
        )
    )
    # Long format: one row per (category, element) pair with its KL value.
    msdf = pd.melt(
        df[ms + ["category"]],
        id_vars="category",
        var_name="element",
        value_name="KL",
    )
    display(
        timeline(
            msdf,
            x="category",
            y="KL",
            subcorpus=subcorpus,
            stack=None,
            order=ms,
            name=f"vis_2_{subcorpus}_timeline",
        )
    )


### vis 3
Frequencies of the most significant words, distance-wise

In [6]:
# One timeline per subcorpus showing the frequencies of the words that appear
# as columns of max_distances.csv.
for subcorpus in ("mainstream", "conspiracy"):
    # The column names of the distances table are the words to plot.
    df = pd.read_csv(f"results/{subcorpus}/max_distances.csv")
    ms = df.columns.to_list()

    squeezed_freq = pd.read_csv(
        f"results/{subcorpus}/squeezed_freq.csv", parse_dates=["date"]
    )

    # Keep only those words and add a "YYYY-MM" label
    # (first seven characters of the stringified date) for the x axis.
    pdf = (
        squeezed_freq.loc[squeezed_freq["element"].isin(ms)]
        .reset_index(drop=True)
        .assign(xdate=lambda frame: frame["date"].astype(str).str[:7])
    )

    display(
        timeline(
            pdf,
            x="xdate",
            y="global_weight",
            subcorpus=subcorpus,
            stack="center",
            order=ms,
            name=f"vis_3_{subcorpus}_timeline",
        )
    )
