In [1]:
from svglib import svglib
from reportlab.graphics import renderPM

import pandas as pd
import altair as alt
import numpy as np
from pathlib import Path
from helpers import read, write

import bottleneck as bn
from LPA import Corpus, sockpuppet_distance, PCA
from math import floor
from scipy.spatial.distance import cdist, cityblock

from visualize import sockpuppet_matrix, timeline, plot_pca,facet_timeline

alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [2]:
def svg_to_eps(name):
    svg = svglib.svg2rlg(f"{name}.svg")
    renderPM.drawToFile(svg, f"{name}.eps", dpi=300)
    renderPM.drawToFile(svg, f"{name}.png", fmt="PNG", dpi=300)

Micro

In [3]:
freq = pd.read_csv(f"data/death_cause/np_freq/0_normalized.csv").dropna(subset=["element"])
display(freq)
nfreq= freq.pivot(index="document", columns="element", values="frequency_in_document").fillna(0)
nfreq = (nfreq.T/ nfreq.sum(axis=1)).T
nfreq = nfreq.melt(var_name="element", value_name="frequency_in_document", ignore_index=False).reset_index()
nfreq=nfreq[nfreq["frequency_in_document"]>0].reset_index(drop=True)
display(nfreq)


corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures(distance="JSD")

Unnamed: 0,document,element,frequency_in_document
0,1990,Acute hepatitis,0.000031
1,1990,Alcohol use disorders,0.000022
2,1990,Alzheimer's disease and other dementias,0.000105
3,1990,Cardiovascular diseases,0.002264
4,1990,Chronic kidney disease,0.000113
...,...,...,...
925,2019,Poisonings,0.000010
926,2019,Protein-energy malnutrition,0.000027
927,2019,Road injuries,0.000155
928,2019,Self-harm,0.000098


Unnamed: 0,document,element,frequency_in_document
0,1990,Acute hepatitis,0.003822
1,1991,Acute hepatitis,0.003751
2,1992,Acute hepatitis,0.003682
3,1993,Acute hepatitis,0.003583
4,1994,Acute hepatitis,0.003452
...,...,...,...
925,2015,Tuberculosis,0.024444
926,2016,Tuberculosis,0.023874
927,2017,Tuberculosis,0.023332
928,2018,Tuberculosis,0.022503


In [4]:
# pick elements

full_sig = []
freq_bump = []
top_sig = []

indices = [0, 1, 2, 3, 4, 5]
for i, ix in enumerate(indices):
    full_sig.append(sigs[0][ix])  # .head(10))
    freq_bump.append(freq[freq["element"].isin(full_sig[i].index)])
    if i > 0:
        full_sig[i] = pd.merge(
            full_sig[i - 1], full_sig[i], left_index=True, right_index=True, how="outer"
            )


def pop_and_melt(l):
    return (
        l.pop()
        .rename_axis(index="cause of death")
        .reset_index()
        .melt(id_vars="cause of death", var_name="year", value_name="distance")
    )  # fillna


sig_specific = pop_and_melt(full_sig)
freq_heads = pd.concat(freq_bump)
# .rename_axis(index="cause of death")
#     .reset_index()
#     .melt(id_vars="cause of death", var_name="year", value_name="distance")
# )
specific = ["HIV/AIDS", "Cardiovascular diseases"]
freq_heads = freq_heads[freq_heads["document"].isin(1990 + np.array(indices))]
specific_freq_heads = freq_heads[freq_heads["element"].isin(specific)]
display(
    alt.Chart(specific_freq_heads)
    .mark_line(point=True)
    .encode(
        x=alt.X("document:O", title="Year"),
        y=alt.Y("frequency_in_document:Q", title="Frequency"),
        color=alt.Color("element:N", title="Cause of Death"),
    )
    .properties(
        title="Causes of Death - Distribution",
        width=300,
        height=200,
    )
)
chart = (alt.Chart(sig_specific[sig_specific["cause of death"].isin(specific)])
    .mark_line(point=True)
    .encode(
        x=alt.X("year:O", title="Year"),
        y=alt.Y("distance:Q", title="Distance from DVR"),
        color=alt.Color("cause of death:N", title="Cause of Death"),
    )
    .properties(
        title="Causes of Death - Change in Signature",
        width=300,
        height=200,
    ))
display(chart)

In [10]:
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures()
# ms = sigs[1].columns.to_list()
ms = ["Diarrheal diseases"]
ff = (
    nfreq[nfreq["element"].isin(ms)]
    .reset_index(drop=True)
    .rename(columns={"document": "year", "frequency_in_document": "value"})
)
ff["facet"] = "Frequency"
distances = (
    (sigs[1].melt(var_name="element", value_name="value", ignore_index=False))
    .rename_axis(index="year")
    .reset_index()
)
distances["facet"] = "Distance"
dd = pd.concat([ff, distances])
display(
    facet_timeline(
        dd[dd["element"].isin(ms)],
        x="year",
        y="value",
        title=ms[0],
        stack=False,
        name="sotu_timeline",
        color="#f05039",
        width=600
    )
    .facet(
        row=alt.Row("facet:O", title=None, header=alt.Header(labelFontWeight="bold"))
    )
    .properties(title=f'Distance and Frequency of the Element "{ms[0]}"')
    .resolve_scale(y="independent")
    .configure_title(orient="top", anchor="middle")
)
#green = 32a852
#bright_red =  f05039
# blue 3d65a5



In [11]:
#svg_to_eps(f"results/cod/diarrheal_freq_and_distance")


In [12]:
for i, title in enumerate(("Democrat", "Republican")):
    df = pd.read_csv(f"data/us_elections/np_freq/{i}_normalized.csv")
    corpus = Corpus(df)
    dvr = corpus.create_dvr(equally_weighted=True)
    sigs = corpus.create_signatures(distance="JSD",epsilon=1 / (len(dvr) * 2))
    # for sig in sigs[0]:
    #     write(
    #         Path(f"results/us_elections/sigs/{title.lower()}"),
    #         sig.to_frame().reset_index().rename(columns={"index": "state"}),
    #         name=f"us_elections_{sig.name}",
    #         color=True,
    #     )
    # display(timeline(sigs[1], x="year", y="distance", corpus=title, stack=False,order=,name=f"final_results/{title.lower()}_timeline"))
    mx = sockpuppet_distance(corpus, corpus, res="matrix")
    pca_res = PCA(mx)
    print(pca_res[1])
    display(plot_pca(pca_res[0], labels=mx.columns.to_list()).properties(title=title))





FileNotFoundError: [Errno 2] No such file or directory: 'data/us_elections/np_freq/0_normalized.csv'

Sotu

In [39]:
freq = pd.read_csv(f"data/sotu/np_freq/0.csv").dropna(subset=["element"])
display(freq)
nfreq= freq.pivot(index="document", columns="element", values="frequency_in_document").fillna(0)
nfreq = (nfreq.T/ nfreq.sum(axis=1)).T
nfreq = nfreq.melt(var_name="element", value_name="frequency_in_document", ignore_index=False).reset_index()
nfreq=nfreq[nfreq["frequency_in_document"]>0].reset_index(drop=True)
display(nfreq)

Unnamed: 0,element,frequency_in_document,document
0,fellow,1,1790
1,immediately,1,1790
2,impression,1,1790
3,receive,1,1790
4,happiness,1,1790
...,...,...,...
330900,vaccinate,1,2022
330901,fueled,1,2022
330902,crisis,3,2022
330903,helped,3,2022


Unnamed: 0,document,element,frequency_in_document
0,1946,aaa,0.000072
1,1894,aana,0.000267
2,1807,aaron,0.000929
3,2000,aaron,0.000257
4,1798,abandon,0.000992
...,...,...,...
330894,2020,zone,0.000964
330895,1901,zoological,0.000111
330896,1983,zooming,0.000370
330897,1859,zuloaga,0.000719


In [16]:
freq = pd.read_csv(f"data/sotu/np_freq/0.csv")

corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures()
# ms = sigs[1].columns.to_list()
ms = ["world"]
ff = (
    nfreq[nfreq["element"].isin(ms)]
    .reset_index(drop=True)
    .rename(columns={"document": "year", "frequency_in_document": "value"})
)
ff["facet"] = "Frequency"
# freq_timeline = timeline(
#     ff,
#     x="document",
#     y="frequency_in_document",
#     title=ms[0],
#     stack=False,
#     name="sotu_timeline",
# ).properties(height=100)
distances = (
    (sigs[1].melt(var_name="element", value_name="value", ignore_index=False))
    .rename_axis(index="year")
    .reset_index()
)
distances["facet"] = "Distance"
dd = pd.concat([ff, distances])
# distance_timeline = timeline(
#     distances[distances["element"].isin(ms)],
#     x="index",
#     y="distance",
#     title="",
#     stack=False,
#     name="sotu_timeline",
# ).properties(height=100, title="")
display(
    facet_timeline(
        dd[dd["element"].isin(ms)],
        x="year",
        y="value",
        title=ms[0],
        stack=False,
        name="sotu_timeline",
        color="#3d65a5"
    )
    .facet(
        row=alt.Row("facet:O", title=None, header=alt.Header(labelFontWeight="bold"))
    )
    .properties(title=f'Distance and Frequency of the Element "{ms[0]}"')
    .resolve_scale(y="independent")
    .configure_title(orient="top", anchor="middle")
)
#green = 32a852
#bright_red =  f05039
# blue 3d65a5
# facet_timeline(    distances[distances["element"].isin(ms)],
#     x="index",
#     y="distance",
#     title="",
#     stack=False,
#     name="sotu_timeline",)


In [15]:
svg_to_eps(f"results/sotu_{ms[0]}")

Failed to load input file! (Error reading file 'results/sotu_world.svg': failed to load external entity "results/sotu_world.svg")


AttributeError: 'NoneType' object has no attribute 'renderScale'

LOCO conspiracy

In [17]:
corp = "loco_conspiracy"
freq = pd.read_csv(f"data/{corp}/freq.csv")
nfreq= freq.pivot(index="document", columns="element", values="frequency_in_document").fillna(0)
nfreq = (nfreq.T/ nfreq.sum(axis=1)).T
nfreq = nfreq.melt(var_name="element", value_name="frequency_in_document", ignore_index=False).reset_index()
nfreq=nfreq[nfreq["frequency_in_document"]>0].reset_index(drop=True)
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures()
most_significant = (
    sigs[1]
    .melt(var_name="element", value_name="distance", ignore_index=False)
    .rename_axis(index="year")
    .reset_index()
)
dd = timeline(
    most_significant,
    x="year",
    y="distance",
    title="Most Significant Elements",
    stack=False,
    name=f"final_results/{corp}_timeline",
).properties(title="Distnace of Most Significant Elements")
ff = timeline(
    nfreq[nfreq["element"].isin(most_significant["element"].drop_duplicates())].sort_values("document"),
    x="document",
    y="frequency_in_document",
    title="Most Significant Elements",
    stack=False,
    name=f"final_results/{corp}_timeline",
).properties(title="Most Significant Elements")
dd.save(f"final_results/{corp}_distance_timeline.html")
ff.save(f"final_results/{corp}_frequency_timeline.html")

FileNotFoundError: [Errno 2] No such file or directory: 'data/loco_conspiracy/freq.csv'

In [28]:
corp = "loco_mainstream"
freq = pd.read_csv(f"data/{corp}/freq.csv")
nfreq= freq.pivot(index="document", columns="element", values="frequency_in_document").fillna(0)
nfreq = (nfreq.T/ nfreq.sum(axis=1)).T
nfreq = nfreq.melt(var_name="element", value_name="frequency_in_document", ignore_index=False).reset_index()
nfreq=nfreq[nfreq["frequency_in_document"]>0].reset_index(drop=True)
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures()
most_significant = (
    sigs[1]
    .melt(var_name="element", value_name="distance", ignore_index=False)
    .rename_axis(index="year")
    .reset_index()
)
dd = timeline(
    most_significant,
    x="year",
    y="distance",
    title="Most Significant Elements",
    stack=False,
    name=f"final_results/{corp}_timeline",
).properties(title="Distnace of Most Significant Elements")
ff = timeline(
    nfreq[nfreq["element"].isin(most_significant["element"].drop_duplicates())].sort_values("document"),
    x="document",
    y="frequency_in_document",
    title="Most Significant Elements",
    stack=False,
    name=f"final_results/{corp}_timeline",
).properties(title="Most Significant Elements")
dd.save(f"final_results/{corp}_distance_timeline.html")
ff.save(f"final_results/{corp}_frequency_timeline.html")

In [13]:
corp = "loco_conspiracy"
freq = pd.read_csv(f"data/{corp}/freq.csv")
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures(distance="JSD")
# for sig in sigs[0]:
#     write(Path("results")/corp/"sigs", sig.reset_index(), f"{corp}_{sig.name}", color=True)
sockpuppet_matrix(sockpuppet_distance(corpus, corpus)).properties(
    title=("Mainstream Articles")
).configure_axis(title=None).properties(width=900, height=900)
# .save(
#     f"final_results/loco_mainstream_sockpuppets.html"
# )


In [37]:
word = ["obama"]
# nfreq = freq.pivot(
#     index="document", columns="element", values="frequency_in_document"
# ).fillna(0)
# nfreq = (nfreq.T / nfreq.sum(axis=1)).T
# nfreq = nfreq.melt(
#     var_name="element", value_name="frequency_in_document", ignore_index=False
# ).reset_index()
# nfreq = nfreq[nfreq["frequency_in_document"] > 0].reset_index(drop=True)
# nfreq = nfreq[nfreq["element"].isin(word)].reset_index(drop=True)
corp = "loco_mainstream"
mfreq = pd.read_csv(f"data/{corp}/freq.csv")
mcorpus = Corpus(mfreq)
mdvr = mcorpus.create_dvr(equally_weighted=True)
msigs = mcorpus.create_signatures()
mmost_significant = (
    msigs[1]
    .melt(var_name="element", value_name="distance", ignore_index=False)
    .rename_axis(index="year")
    .reset_index()
)
mmost_significant = mmost_significant[mmost_significant["element"].isin(word)]
mmost_significant["facet"] = "Mainstream"


ccorp = "loco_conspiracy"
cfreq = pd.read_csv(f"data/{ccorp}/freq.csv")
ccorpus = Corpus(cfreq)
cdvr = ccorpus.create_dvr(equally_weighted=True)
csigs = ccorpus.create_signatures()
cmost_significant = (
    csigs[1]
    .melt(var_name="element", value_name="distance", ignore_index=False)
    .rename_axis(index="year")
    .reset_index()
)
cmost_significant = cmost_significant[cmost_significant["element"].isin(word)]
cmost_significant["facet"] = "Conspiracy"

dd = pd.concat([mmost_significant, cmost_significant])
dd = dd[pd.to_datetime(dd["year"]) >= pd.to_datetime("2010-01-01")]
display(
    facet_timeline(
        dd,
        x="year",
        y="distance",
        title="",
        stack=False,
        order=word,
        name="",
        color="#3d65a5"
    )
    .facet(
        row=alt.Row("facet:O", title=None, header=alt.Header(labelFontWeight="bold"))
    )
    .properties(title=f'Element "{word[0]}" in Consporatoty and Mainstream Articles')
    .resolve_scale(y="independent")
    .configure_title(orient="top", anchor="middle")
)
# dd.save(f"final_results/{corp}_distance_timeline.html")
# ff.save(f"final_results/{corp}_frequency_timeline.html")


In [3]:
word = ["china"]
corp = "loco_mainstream"
mfreq = pd.read_csv(f"data/{corp}/freq.csv")

mnfreq = mfreq.pivot(
    index="document", columns="element", values="frequency_in_document"
).fillna(0)
mnfreq = (mnfreq.T / mnfreq.sum(axis=1)).T
mnfreq = mnfreq.melt(
    var_name="element", value_name="frequency_in_document", ignore_index=False
).reset_index()
mnfreq = mnfreq[mnfreq["frequency_in_document"] > 0].reset_index(drop=True)
mnfreq = mnfreq[mnfreq["element"].isin(word)].reset_index(drop=True)

mcorpus = Corpus(mfreq)
mdvr = mcorpus.create_dvr(equally_weighted=True)
msigs = mcorpus.create_signatures()

print(mnfreq)
mnfreq["facet"] = "Mainstream"
ccorp = "loco_conspiracy"
cfreq = pd.read_csv(f"data/{ccorp}/freq.csv")

cnfreq = cfreq.pivot(
    index="document", columns="element", values="frequency_in_document"
).fillna(0)
cnfreq = (cnfreq.T / cnfreq.sum(axis=1)).T
cnfreq = cnfreq.melt(
    var_name="element", value_name="frequency_in_document", ignore_index=False
).reset_index()
cnfreq = cnfreq[cnfreq["frequency_in_document"] > 0].reset_index(drop=True)
cnfreq = cnfreq[cnfreq["element"].isin(word)].reset_index(drop=True)


ccorpus = Corpus(cfreq)

cdvr = ccorpus.create_dvr(equally_weighted=True)
csigs = ccorpus.create_signatures()

cnfreq["facet"] = "Mainstream"

dd = pd.concat([mnfreq, cnfreq])
dd = dd[pd.to_datetime(dd["document"]) >= pd.to_datetime("2010-01-01")]
display(
    facet_timeline(
        dd,
        x="document",
        y="frequency_in_document",
        title="",
        stack=False,
        order=word,
        name="",
        color="#3d65a5"
    )
    .facet(
        row=alt.Row("facet:O", title=None, header=alt.Header(labelFontWeight="bold"))
    )
    .properties(title=f'Element "{word[0]}" in Consporatoty and Mainstream Articles')
    .resolve_scale(y="independent")
    .configure_title(orient="top", anchor="middle")
)
# dd.save(f"final_results/{corp}_distance_timeline.html")
# ff.save(f"final_results/{corp}_frequency_timeline.html")


       document element  frequency_in_document
0    2000-05-01   china               0.000783
1    2000-07-01   china               0.000287
2    2000-08-01   china               0.000414
3    2000-09-01   china               0.000252
4    2000-10-01   china               0.003264
..          ...     ...                    ...
216  2020-03-01   china               0.002321
217  2020-04-01   china               0.001734
218  2020-05-01   china               0.001604
219  2020-06-01   china               0.001552
220  2020-07-01   china               0.000894

[221 rows x 3 columns]
