In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import altair as alt
from typing import Literal, List
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import StandardScaler

# Lift Altair's default row limit so charts built from large DataFrames in this
# notebook are not rejected for exceeding the maximum number of rows.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [6]:
from LOCO_TLPA import create_metadata, create_freq

In [7]:
# Build the metadata table with the project helper imported from LOCO_TLPA;
# the bare name on the last line renders the frame as the cell output.
metadata = create_metadata()
metadata

Unnamed: 0,category,date
0,C00001,2016-12-30
1,C00003,2017-08-04
5,C00008,2018-02-21
9,C0000c,2018-02-05
10,C0000d,2020-05-08
...,...,...
23926,C07467,2020-05-28
23927,C07468,2016-01-26
23928,C0746a,2018-01-05
23929,C0746b,2017-04-19


### Data exploration on metadata

In [12]:
# df = pd.read_csv("data/loco/metadata.csv", parse_dates=["date"])
print("Dates after 1990:")
# Count how many rows are dated after 1990 (True) versus not (False).
# The two output columns are named explicitly through rename_axis / reset_index(name=...)
# instead of renaming the auto-generated "index" / "date" columns: those automatic
# names changed in pandas 2.0 (they became "date" / "count"), which would leave the
# chart below without its "is after 1990" field.
count = (
    (metadata["date"].dt.year > 1990)
    .value_counts()
    .rename_axis("is after 1990")
    .reset_index(name="amount")
)
display(
    alt.Chart(count)
    .mark_bar()
    .encode(x="is after 1990", y="amount", tooltip="amount")
    .configure_mark(color="blue")
)

Dates after 1990:


  for col_name, dtype in df.dtypes.iteritems():


In [55]:
# NOTE(review): `df` is not assigned anywhere above this cell (the read_csv call in the
# earlier cell is commented out), so this cell depends on leftover kernel state.
# Keep only the rows dated after the year 2000, then count rows per subcorpus.
df = df.loc[df["date"].dt.year.gt(2000)]
subcorpus_chart = alt.Chart(df).mark_bar(color="orange").encode(
    x=alt.X("subcorpus"),
    y="count()",
    tooltip=alt.Tooltip("count()"),
)
subcorpus_chart.properties(height=100)


In [57]:
# NOTE(review): scratch arithmetic on two hard-coded counts whose origin is not shown in
# this notebook — presumably the two per-subcorpus bar heights from the chart above. TODO confirm.
49115 + 13564

62679

In [13]:
# Derive a month key from the first seven characters of the date's string form, then
# count the non-null `category` entries for every (subcorpus, month) pair.
# NOTE(review): the recorded run of this cell failed with a NameError because `df`
# was not defined in the kernel at that point.
df["dt"] = df["date"].astype("str").str.slice(0, 7)
grouped = df.groupby(by=["subcorpus", "dt"])["category"].count()


NameError: name 'df' is not defined

In [58]:
thresh = 20
# (subcorpus, month) groups whose count is strictly above `thresh`.
above_thresh = grouped[grouped > thresh]
display(above_thresh)
# Total per subcorpus within those groups, followed by the grand total.
display(above_thresh.groupby(level=[0]).sum())
display(above_thresh.sum())
# Number of qualifying months per subcorpus.
display(above_thresh.reset_index(level=1).groupby(level=0)["dt"].count())

subcorpus   dt     
conspiracy  2011-01      21
            2011-02      30
            2011-04      25
            2011-05      41
            2011-09      30
                       ... 
mainstream  2020-03    1498
            2020-04    2162
            2020-05    2874
            2020-06    3323
            2020-07     145
Name: category, Length: 297, dtype: int64

subcorpus
conspiracy    12896
mainstream    48480
Name: category, dtype: int64

61376

subcorpus
conspiracy    104
mainstream    193
Name: dt, dtype: int64

In [59]:
# Sum `frequency_in_category` per (subcorpus, month), using the same seven-character
# month key as `grouped`.
base = (
    pd.read_csv("results/base_freq.csv")
    .assign(dt=lambda frame: frame["date"].astype("str").str[:7])
    .groupby(["subcorpus", "dt"])["frequency_in_category"]
    .sum()
)
# Restrict to the (subcorpus, month) pairs whose count in `grouped` exceeds 30 and
# total the frequencies per subcorpus.
# NOTE(review): the cut-off here is 30 while the preceding cell uses thresh = 20 — confirm intentional.
busy_months = grouped[grouped > 30].index
base.loc[busy_months].groupby(level=0).sum()

subcorpus
conspiracy    1810640
mainstream    4790162
Name: frequency_in_category, dtype: int64

In [None]:
# Keep rows dated after 1990 and plot the row count per calendar month,
# one facet row per subcorpus.
df = df.loc[df["date"].dt.year.gt(1990)]
print("amount of articles per month")
monthly_bars = alt.Chart(df).mark_bar(color="orange").encode(
    x=alt.X("yearmonth(date):T"),
    y="count()",
    tooltip=alt.Tooltip("count()"),
)
monthly_bars.properties(width=900).facet(row="subcorpus").interactive()

In [None]:
# Reload the metadata table from disk, replacing the filtered `df` used by the cells above.
# NOTE(review): unlike the commented-out read near the top of this section, this call does
# not pass parse_dates, so "date" keeps whatever dtype read_csv infers — confirm intended.
df = pd.read_csv("data/loco/metadata.csv")

In [None]:
print("articles per year")
# Bar chart of the row count per calendar year.
yearly_bars = alt.Chart(df).mark_bar(color="red").encode(
    x=alt.X("year(date):T"),
    y="count()",
)
yearly_bars.properties(width=300)


In [None]:
print("conspiracy count")
# Bar chart of the row count per subcorpus.
subcorpus_bars = alt.Chart(df).mark_bar(color="green")
subcorpus_bars.encode(x="subcorpus", y="count()")

#### Data exploration on frequency data

These functions are "filters": each one offers a different way to pick interesting words from the corpus, which can then be viewed over time. `biggest_delta` selects the words whose weight changed the most over time, `popularity` selects the words whose weight is above a given threshold, and `rand` selects a random sample of words to show.

In [None]:
def biggest_delta(dvr: pd.DataFrame) -> pd.Series:
    """Flag the elements whose global weight differs the most from the 2020 window.

    For each baseline year (1990, 2000, 2010) the DVR built from that year's
    frequency window is inner-joined on "element" with the DVR built from the
    2020 window, and the 20 elements with the largest absolute difference in
    "global_weight" are collected.

    Args:
        dvr: Frame with an "element" column to be filtered.

    Returns:
        Boolean Series aligned with ``dvr``: True where the element is one of
        the collected top movers.
    """
    # The 2020 reference does not depend on the baseline year, so it is built
    # once instead of being recomputed on every loop iteration.
    # NOTE(review): freq_window is called here as (year, True) while other cells
    # call it as freq_window(base_freq, year, cumulative=..., ...) — confirm
    # which signature is current.
    reference_dvr = LPA.create_dvr(freq_window(2020, True))
    top_movers = set()
    for baseline_year in (1990, 2000, 2010):
        delta_dvr = pd.merge(
            LPA.create_dvr(freq_window(baseline_year, True)),
            reference_dvr,
            on="element",
            how="inner",
        )
        # _x is the baseline window, _y the 2020 window (default merge suffixes).
        delta_dvr["delta"] = (
            delta_dvr["global_weight_y"] - delta_dvr["global_weight_x"]
        ).abs()
        top_movers.update(
            delta_dvr.sort_values("delta", ascending=False).head(20)["element"]
        )
    return dvr["element"].isin(top_movers)

def popularity(dvr: pd.DataFrame, weight: float=0.005) -> pd.Series:
    """Mask the rows of ``dvr`` whose "global_weight" is strictly above ``weight``.

    Args:
        dvr: Frame with a "global_weight" column.
        weight: Exclusive lower bound on the global weight.

    Returns:
        Boolean Series aligned with ``dvr``.
    """
    global_weights = dvr["global_weight"]
    return global_weights.gt(weight)

def rand(dvr: pd.DataFrame, amount: int =50) -> pd.Series:
    """Mask the rows of ``dvr`` whose element appears in a fixed random sample.

    The sample is drawn from the "element" column of the module-level
    ``base_freq`` frame with ``random_state=42``, so it is the same on every call.

    Args:
        dvr: Frame with an "element" column.
        amount: Number of ``base_freq`` rows to sample.

    Returns:
        Boolean Series aligned with ``dvr``.
    """
    sampled_elements = base_freq["element"].sample(amount, random_state=42)
    return dvr["element"].isin(sampled_elements)

In [None]:
def filtered_cumulative(filter_):
    """Stack the filtered DVR of every month start from 1990-01 through 2022-01.

    For each month start a DVR is built from ``freq_window(date, True)``, tagged
    with that date, and indexed with ``filter_``. The monthly pieces are
    concatenated and expanded to every (element, date) combination; combinations
    that were not present get a "global_weight" of 0.

    Args:
        filter_: Anything accepted by ``DataFrame.__getitem__`` on the monthly
            DVR — presumably one of the filter functions (``biggest_delta``,
            ``popularity``, ``rand``), applied via callable indexing. TODO confirm.

    Returns:
        DataFrame with one row per (element, date) pair, "date" as datetime and
        missing "global_weight" values replaced by 0.
    """
    monthly_dvrs = []
    for date in pd.date_range("1990-01-01", "2022-01-01", freq="MS"):
        dvr = LPA.create_dvr(freq_window(date, True))
        dvr["date"] = date
        monthly_dvrs.append(dvr[filter_].reset_index(drop=True))
    cum_dvr = pd.concat(monthly_dvrs)
    # Casting "date" to category makes the groupby emit every (element, date)
    # combination, including unobserved ones, which fillna(0) then zero-fills.
    # observed=False is passed explicitly because pandas has deprecated the implicit
    # default and is switching it to True, which would silently drop those rows.
    cum_dvr["date"] = cum_dvr["date"].astype("category")
    cum_dvr = cum_dvr.groupby(
        ["element", "date"], as_index=False, observed=False
    ).first()
    cum_dvr["date"] = pd.to_datetime(cum_dvr["date"])
    cum_dvr["global_weight"] = cum_dvr["global_weight"].fillna(0)
    return cum_dvr

### Different DVRs

In [None]:
# Load the frequency table and keep only the rows dated on or after 2019-10-28.
freq = create_freq().loc[
    lambda frame: frame["date"] >= pd.to_datetime("2019-10-28")
]

In [None]:
def _weekly_freq(corpus_freq: pd.DataFrame) -> pd.DataFrame:
    """Re-label each row's category with its week and total per (week, element).

    Rows are bucketed with ``resample("W", on="date")``; the bucket label replaces
    the "category" column, and the numeric columns are summed for every
    (category, element) pair.
    """
    weekly = pd.concat(
        [
            week_rows.assign(category=week_label)
            for week_label, week_rows in corpus_freq.resample("W", on="date")
        ],
        ignore_index=True,
    )
    # numeric_only=True keeps the sum restricted to numeric columns: without it,
    # pandas >= 2.0 tries to sum the datetime "date" column and raises TypeError.
    return weekly.groupby(["category", "element"], as_index=False).sum(
        numeric_only=True
    )


# Split the frequency table by subcorpus and build one DVR per subcorpus.
# NOTE(review): `IterLPA` is not imported in the visible part of this notebook.
cons_freq = freq[freq["subcorpus"] == "conspiracy"].reset_index(drop=True)
cons_dvr = IterLPA.create_dvr(cons_freq)
mains_freq = freq[freq["subcorpus"] == "mainstream"].reset_index(drop=True)
mains_dvr = IterLPA.create_dvr(mains_freq)

mains_dvr.to_csv("results/lpa_mainstream_dvr.csv", index=False)
cons_dvr.to_csv("results/lpa_conspiracy_dvr.csv", index=False)

# Weekly frequency tables, built by the same helper for both subcorpora.
cons_weekly_freq = _weekly_freq(cons_freq)
mains_weekly_freq = _weekly_freq(mains_freq)


In [None]:
# Build one LPA object per subcorpus from its DVR, create signatures from the weekly
# frequency tables with sig_length=150, and write each result to CSV.
# NOTE(review): `LPA` is not imported in the visible part of this notebook, and the
# meaning of epsilon_frac=2 / sig_length=150 is not shown here — confirm against the LPA module.
cons_lpa = LPA(cons_dvr, epsilon_frac=2)
mains_lpa = LPA(mains_dvr, epsilon_frac=2)
cons_sigs = cons_lpa.create_and_cut(cons_weekly_freq, sig_length=150)
mains_sigs = mains_lpa.create_and_cut(mains_weekly_freq, sig_length=150)
cons_sigs.to_csv("results/lpa_conspiracy_signatures.csv", index=False)
mains_sigs.to_csv("results/lpa_mainstream_signatures.csv", index=False)

In [None]:
# Per-subcorpus output of non_cumulative_dvr for the range 2019-10-28 to 2020-07-05.
# NOTE(review): non_cumulative_dvr is not imported in the visible part of this notebook;
# "W" is presumably a weekly step and the meaning of "full" is not shown here — TODO confirm.
cons_ncdvr = non_cumulative_dvr(cons_freq,"full", "W", "2019-10-28", "2020-07-05")
mains_ncdvr = non_cumulative_dvr(mains_freq,"full", "W", "2019-10-28", "2020-07-05")

# Copy each frame's "global_weight" into "frequency_in_category" (mutating the frame in
# place), then feed the frame without "global_weight" back into LPA.create_dvr.
cons_ncdvr["frequency_in_category"] = cons_ncdvr["global_weight"]
cons_avg_dvr = LPA.create_dvr(cons_ncdvr.drop(columns=["global_weight"]))
mains_ncdvr["frequency_in_category"] = mains_ncdvr["global_weight"]
mains_avg_dvr = LPA.create_dvr(mains_ncdvr.drop(columns=["global_weight"]))

# Persist both resulting DVRs as CSV files.
mains_avg_dvr.to_csv("results/temporal_lpa_mainstream_dvr.csv", index=False)
cons_avg_dvr.to_csv("results/temporal_lpa_conspiracy_dvr.csv", index=False)

In [None]:
# Same signature pipeline as for the plain DVRs, but starting from the DVRs in
# cons_avg_dvr / mains_avg_dvr; the weekly frequency tables are reused as input.
# NOTE(review): `LPA` is not imported in the visible part of this notebook.
avg_cons_lpa = LPA(cons_avg_dvr, epsilon_frac=2)
avg_mains_lpa = LPA(mains_avg_dvr, epsilon_frac=2)
avg_cons_sigs = avg_cons_lpa.create_and_cut(cons_weekly_freq, sig_length=150)
avg_mains_sigs = avg_mains_lpa.create_and_cut(mains_weekly_freq, sig_length=150)

# Write both signature tables to CSV.
avg_cons_sigs.to_csv("results/temporal_lpa_conspiracy_signatures.csv", index=False)
avg_mains_sigs.to_csv("results/temporal_lpa_mainstream_signatures.csv", index=False)


In [None]:
def _window_dvr(year, direction, subcorpus):
    """Build the DVR of the cumulative `base_freq` window for one year, direction and subcorpus."""
    window = freq_window(
        base_freq, year, cumulative=True, direction=direction, subcorpus=subcorpus
    )
    return LPA.create_dvr(window)


# One DVR per subcorpus for the window "to" 2019 and one for the window "from" 2020.
# NOTE(review): the *_2021 names are built with year 2020 and direction "from";
# `base_freq`, `freq_window` and `LPA` are not defined in the visible part of this notebook.
conspiracy_dvr_2019 = _window_dvr(2019, "to", "conspiracy")
conspiracy_dvr_2021 = _window_dvr(2020, "from", "conspiracy")
mainstream_dvr_2019 = _window_dvr(2019, "to", "mainstream")
mainstream_dvr_2021 = _window_dvr(2020, "from", "mainstream")

## Unit Test

In [None]:
# Spot-check values for a single element across the saved result files.
element = "america"
category = "C00f0d"
date = "2005-11-01"
subcorpus = "conspiracy"

# Rows of the base frequency table for this element within the chosen subcorpus.
base = pd.read_csv("results/base_freq.csv")
element_in_subcorpus = (base["element"] == element) & (base["subcorpus"] == subcorpus)
display(base[element_in_subcorpus])
# specific_category = base[base["category"] == category].sort_values(
#     "frequency_in_category", ascending=False
# )
# display(
#     specific_category["frequency_in_category"]
#     / sum(specific_category["frequency_in_category"])
# )

# Rows for the same element in the saved tw_freq table.
tw_freq_df = pd.read_csv("results/conspiracy1/tw_freq.csv")
display(tw_freq_df[tw_freq_df["element"] == element])
# month = tw_freq_df[tw_freq_df["date"] == pd.to_datetime(date)]
# display(month[month["element"] == element])
# display(month)

# Rows for the same element, and for "covid-19", in the saved DVR.
dvr = pd.read_csv("results/conspiracy1/dvr.csv")
display(dvr[dvr["element"] == element])
dvr[dvr["element"] == "covid-19"]


In [None]:
for subcorpus in ("conspiracy", "mainstream"):
    # dvr = (
    #     pd.read_csv(f"results/{subcorpus}/dvr.csv")
    #     .sort_values("global_weight", ascending=False)["element"]
    #     .head(20)
    # )
    squeezed_freq = pd.read_csv(f"results/{subcorpus}/squeezed_freq.csv")
    # Each element's share of the summed global weight; show the 20 largest shares.
    weight_per_element = squeezed_freq.groupby("element")["global_weight"].sum()
    weight_share = weight_per_element / weight_per_element.sum()
    display(weight_share.sort_values(ascending=False).head(20))
