In [3]:
import pandas as pd
import altair as alt
import numpy as np
from pathlib import Path
from helpers import read, write

import bottleneck as bn
from LPA import Corpus, sockpuppet_distance
from math import floor
from scipy.spatial.distance import cdist, pdist
from scipy.stats import chi2_contingency, chi2
import matplotlib.pyplot as plt

from visualize import sockpuppet_matrix, timeline

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [17]:
ix = -1
lenn = 20
freq = pd.read_csv(f"data/sotu/np_freq/0.csv")
year = freq["document"].drop_duplicates().sort_values().iloc[ix]
this_freq = freq[freq["document"] == year].drop(columns=["document"])
this_freq["frequency_in_document"] = (
    this_freq["frequency_in_document"] / this_freq["frequency_in_document"].sum()
)

jsd_corpus = Corpus(freq)
dvr = jsd_corpus.create_dvr(equally_weighted=True).reset_index()
jsd_sigs = jsd_corpus.create_signatures(distance="JSD", alpha=0.5, sig_length=10)
jsd_freq = this_freq[
    this_freq["element"].isin(jsd_sigs[0][ix].head(lenn).index)
].set_index("element")
# for sig in jsd_sigs[0]:
#     write(Path(f"results/sotu/sigs/jsd"), table=sig.reset_index(), name=f"{sig.name}", color=True)
epsilon = 1e-5

klde_corpus = Corpus(freq)
dvr = klde_corpus.create_dvr(equally_weighted=True).reset_index()
klde_sigs = klde_corpus.create_signatures(
    distance="KLDe", epsilon=epsilon, sig_length=10
)
klde_freq = this_freq[
    this_freq["element"].isin(klde_sigs[0][ix].head(lenn).index)
].set_index("element")

display(
    pd.merge(
        pd.merge(
            jsd_sigs[0][ix], dvr.set_index("element"), left_index=True, right_index=True
        ).head(lenn),
        jsd_freq,
        left_index=True,
        right_index=True,
        how="left",
    )
    .fillna(0)
    .rename(
        columns={
            year: "JSD_distance",
            "index": "rank_in_dvr",
            "frequency_in_document": "local_weight",
        }
    )
)


display(
    pd.merge(
        pd.merge(
            klde_sigs[0][ix],
            dvr.set_index("element"),
            left_index=True,
            right_index=True,
        ).head(lenn),
        klde_freq,
        left_index=True,
        right_index=True,
        how="left",
    )
    .fillna(epsilon)
    .rename(
        columns={
            year: "KLDe_distance",
            "index": "rank_in_dvr",
            "frequency_in_document": "local_weight",
        }
    )
)

display(
    pd.concat(
        [jsd_sigs[0][ix].reset_index(), klde_sigs[0][ix].reset_index()], axis=1
    ).head(lenn)
)

display(
    pd.merge(
        jsd_sigs[0][ix].head(lenn),
        klde_sigs[0][ix].head(lenn),
        left_index=True,
        right_index=True,
    ).rename(columns={"2022_x": "JSD_distance", "2022_y": "KLDe_distance"})
)
# corpus.distance_matrix.matrix
# for i in range(len(sigs[0])):
#     write(Path("results/sotu/sigs/klde/"), sigs[0][i].to_frame().reset_index(), f"sig_{i}", color=True)

ERROR! Session/line number was not unique in database. History logging moved to new session 667


Unnamed: 0,JSD_distance,rank_in_dvr,global_weight,local_weight
government,-0.00257,1,0.00857,0.000917
american,0.002219,8,0.004875,0.015888
let,0.002195,87,0.001259,0.008555
get,0.001897,307,0.000601,0.006111
tonight,0.001872,153,0.000966,0.007027
putin,0.001759,5432,1.8e-05,0.003666
america,0.001645,21,0.003015,0.010694
ukrainian,0.00162,6072,1.4e-05,0.003361
cost,0.001497,151,0.000974,0.006111
pas,0.001486,526,0.000401,0.004583


Unnamed: 0,KLDe_distance,rank_in_dvr,global_weight,local_weight
government,-0.027587,1,0.00857,0.000917
interest,-0.022432,23,0.002774,1e-05
putin,0.021763,5432,1.8e-05,0.003666
ukrainian,0.020522,6072,1.4e-05,0.003361
present,-0.019436,30,0.002458,1e-05
part,-0.019185,31,0.002431,1e-05
subject,-0.015205,38,0.001999,1e-05
let,0.014022,87,0.001259,0.008555
ukraine,0.013833,4237,2.9e-05,0.00275
treaty,-0.01377,41,0.00184,1e-05


Unnamed: 0,index,2022,index.1,2022.1
0,government,-0.00257,government,-0.027587
1,american,0.002219,interest,-0.022432
2,let,0.002195,putin,0.021763
3,get,0.001897,ukrainian,0.020522
4,tonight,0.001872,present,-0.019436
5,putin,0.001759,part,-0.019185
6,america,0.001645,subject,-0.015205
7,ukrainian,0.00162,let,0.014022
8,cost,0.001497,ukraine,0.013833
9,pas,0.001486,treaty,-0.01377


Unnamed: 0,JSD_distance,KLDe_distance
government,-0.00257,-0.027587
let,0.002195,0.014022
putin,0.001759,0.021763
ukrainian,0.00162,0.020522


In [18]:
freq = pd.read_csv(f"data/sotu/np_freq/0.csv")
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures(distance="JSD", alpha=0.5, sig_length=10)
sockpuppet_matrix(sockpuppet_distance(corpus, corpus, heuristic=False)).properties(
    title=("State of the Union Address")
).configure_axis(title=None).properties(width=900, height=900)
# save(
#     f"results/sotu/jsd_sotu_sockpuppets.html"
# )
# sigs[1].to_csv("results/sotu/top_30_most_changing_distance.csv")

In [46]:
df = sigs[1].reset_index()
# ms = df.columns.to_list()
ms = [
    "america",
    "world",
    "job",
    "government",
    "tax",
    "help",
    "child",
    "economic",
    "war",
    "people",
    "duty",
    "family",
]
# df = df.assign(
#     **{
#         "category": sorted(
#             [p.split("/")[-1][5:-4] for p in glob(f"results/{subcorpus}/sigs/*.csv")]
#         )
#     }
# )
msdf = df.melt(id_vars="index", var_name="element", value_name="distance").rename(
    columns={"index": "year"}
)
msdf = msdf[msdf["element"].isin(ms)]
# msdf = pd.melt(
#     df[ms + ["document"]], id_vars="document", var_name="element", value_name="KL"
#
timeline(
    msdf,
    x="year",
    y="distance",
    corpus="sotu",
    stack=None,
    order=ms,
    name=f"abcde",
).save("results/sotu/jsd_timeline.html")


# frqq = freq[freq["element"].isin(ms)].rename(columns={"document":"year"})

# display(
#     timeline(
#         frqq,
#         x="year",
#         y="frequency_in_document",
#         corpus="sotu",
#         stack=None,
#         order=ms,
#         name=f"abcde",
#     )
# )

In [47]:
freq1 = pd.read_csv(f"data/sotu/np_freq/0.csv")
freq1 = freq1.pivot_table(
    index="document", columns="element", values="frequency_in_document", aggfunc="sum"
).fillna(0)
freq1 = freq1.div(freq1.sum(axis=1), axis=0)
freq1 = freq1.melt(
    value_name="frequency_in_document", var_name="element", ignore_index=False
).reset_index()
freq1 = freq1[freq1["element"].isin(ms)].rename(columns={"document": "year"})
# freq1
timeline(
    freq1,
    x="year",
    y="frequency_in_document",
    corpus="sotu",
    stack=None,
    order=ms,
    name=f"abcde",
).save("results/sotu/freq_timeline.html")

In [59]:
from glob import glob

presidents = (
    pd.DataFrame(
        [Path(p).stem.split("_") for p in glob("data/sotu/raw/*")],
        columns=["president", "year"],
    )
    .sort_values(by="year")
    .reset_index(drop=True)
)
presidents["year"] = presidents["year"].astype(int)


# indices = presidents.query("president =='Roosevelt' & year <= 1909").index
# presidents.loc[indices,'president'] = 'Theodore Roosevelt'

# indices = presidents.query("president == 'Roosevelt'").index
# presidents.loc[indices,'president'] = 'Franklin D. Roosevelt'

# indices = presidents.query("president =='Bush' & year <= 1992").index
# presidents.loc[indices,'president'] = 'George H. W. Bush'

# indices = presidents.query("president == 'Bush'").index
# presidents.loc[indices,'president'] = 'George W. Bush'

# indices = presidents.query("president =='Johnson' & year <= 1869").index
# presidents.loc[indices,'president'] = 'Andrew Johnson'

# indices = presidents.query("president == 'Johnson'").index
# presidents.loc[indices,'president'] = 'Lyndon B. Johnson'

# indices = presidents.query("president =='Adams' & year <= 1801").index
# presidents.loc[indices,'president'] = 'John Adams'

# indices = presidents.query("president == 'Adams'").index
# presidents.loc[indices,'president'] = 'John Quincy Adams'

# indices = presidents.query("president =='Harrison' & year <= 1841").index
# presidents.loc[indices,'president'] = 'William Henry Harrison'

# indices = presidents.query("president == 'Harrison'").index
# presidents.loc[indices,'president'] = 'Benjamin Harrison'

# presidents.loc[35:38, "president"] = ["Quincy Adams"] * 4
# presidents.loc[210:217, "president"] = ["Bush Junior"] * 8
# presidents.loc[198:201, "president"] = ["Bush Senior"] * 4
# presidents.loc[111:118, "president"] = ["Theodore Roosevelt"] * 8
# presidents.loc[143:154, "president"] = ["Franklin D. Roosevelt"] * 12

In [22]:
# d = {}
# for p, l in presidents.groupby("president")["year"].apply(list).to_dict().items():
#     d[p] = ""
#     for y in l:
#         with open(f"data/sotu/raw/{p.strip()}_{y}.txt", "r") as f:
#             text = f.read()
#         d[p] += text

In [78]:
freq = pd.read_csv(f"data/sotu/np_freq/0.csv")
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures(distance="JSD", alpha=0.5, sig_length=100)
spd = sockpuppet_distance(corpus, corpus, group_by="president", reduce_df=presidents)
pivoted = spd.pivot(index="Corpus 1", columns="Corpus 2", values="value").fillna(0)
aa = pivoted.to_numpy()
full = aa + aa.T
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


def PCA_(df):
    df = StandardScaler().fit_transform(df)
    pca = PCA(n_components=2)
    pcdf = pca.fit_transform(df)
    print(pca.explained_variance_ratio_)
    return pcdf


res = PCA_(full)
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots()
ddd = pd.DataFrame(res, columns=["x", "y"]).assign(
    **{"pres": pivoted.columns.to_list()}
)
points = alt.Chart(ddd).mark_point().encode(x="x", y="y")
text = points.mark_text(align="left", baseline="middle", dx=7).encode(text="pres")
(points + text).properties(height=900, width=900).save("results/sotu/pca.html")

# x,y = res[:,0], res[:,1]
# plt.scatter(x,y)

# for i, txt in enumerate():
#     ax.annotate(txt, (x[i], y[i]))

# combs = list(cwr(presidents["president"].unique(), 2))
# matrix = copy(corpus.signature_matrix)

# d = dict(zip(["pres1", "pres2"], list(zip(*combs)))) | {"value": []}
# for pres1, pres2 in combs:
#     ix1 = presidents[presidents["president"] == pres1].index
#     ix2 = presidents[presidents["president"] == pres2].index
#     mx1 = matrix.matrix[ix1, :]
#     mx2 = matrix.matrix[ix2, :]
#     # mx1 = mx1[:, ~np.all(matrix.matrix == 0, axis=0)]
#     # mx2 = mx2[:, ~np.all(matrix.matrix == 0, axis=0)]
#     cdist_= cdist(mx1, mx2, metric="cityblock")
#     d["value"].append(cdist_.sum() / np.multiply(*cdist_.shape))

# df = pd.DataFrame(d)[["pres2","pres1","value"]]
# df["value"] /= df["value"].max()
# display(df)

# sockpuppet_matrix(df)

[0.69865487 0.13947634]


In [None]:
from sklearn.decomposition import PCA
PCA(sockpuppet_distance(corpus, corpus, group_by="president", reduce_df=presidents)

In [56]:
freq = pd.read_csv(f"data/sotu/np_freq/0.csv")
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures(distance="JSD", sig_length=100)

presidents = (
    pd.DataFrame(
        [Path(p).stem.split("_") for p in glob("data/sotu/raw/*")],
        columns=["president", "year"],
    )
    .sort_values(by="year")
    .reset_index(drop=True)
)
presidents["year"] = presidents["year"].astype(int)
presidents["groups"] = (
    presidents["president"] != presidents["president"].shift()
).cumsum()
presidents = (
    presidents.reset_index()
    .groupby("groups")
    .agg({"index": list, "president": "first"})
)
pd.DataFrame(
    [np.abs(np.array(sigs[0])[ix]).sum() for ix in presidents["index"].to_list()]
)  # .plot.hist()
pd.merge(
    pd.DataFrame(
        [np.abs(np.array(sigs[0])[ix]).sum() for ix in presidents["index"].to_list()]
    ),
    presidents,
    left_index=True,
    right_index=True,
).drop(columns=["index"]).sort_values(
    by=0
)  # .plot.hist()
# combs = list(cwr(reduce_df.index, 2))
# d = dict(zip([c1n, c2n], list(zip(*combs)))) | {"value": []}
# ixs = [reduce_df.loc[[i1, i2], "index"].tolist() for i1, i2 in combs]
# for ix1, ix2 in ixs:
#     submatrix = cdist_[ix1][:, ix2]
#     d["value"].append(submatrix.sum() / np.multiply(*submatrix.shape))
# reduce_df.loc[reduce_df.duplicated(subset=group_by), group_by] += " "
# df = pd.DataFrame(d).replace(reduce_df[group_by].to_dict())[[c2n, c1n, "value"]]
# save(
#     f"results/sotu/jsd_sotu_sockpuppets.html"
# )
# sigs[1].to_csv("results/sotu/top_30_most_changing_distance.csv")

Unnamed: 0,0,president
10,0.052718,Polk
32,0.12534,Eisenhower
26,0.13476,Wilson
11,0.140012,Taylor
22,0.177083,Cleveland
43,0.184663,Trump
20,0.194405,Cleveland
24,0.194895,Theodore Roosevelt
7,0.201748,Jackson
8,0.203832,Buren


In [24]:
P = np.array([0.1, 0.2, 0.7, 0])
Q = np.array([0.4, 0.2, 0.2, 0.2])
M = (P + Q) / 2


def KL(P, Q):
    return (P - Q) * np.log2(np.where(P == 0, Q, P) / Q)


def KL(P, Q):
    return P * np.log2(np.where(P == 0, Q, P) / Q)


def over_under(P, Q):
    return np.where(P < Q, -1, 1)


def KLD(P, Q):
    return over_under(P, Q) * KL(P, Q)


def JSD(P, Q):
    M = (P + Q) / 2
    return over_under(P, Q) * (KL(P, M) + KL(Q, M))


print(KLD(P, M))
print(KLD(Q, M))
print(KLD(M, Q))

print(JSD(P, Q))
print(KLD(P, Q))

[ 0.13219281  0.          0.44620094 -0.        ]
[0.27122876 0.         0.233985   0.2       ]
[0.16951798 0.         0.52646625 0.1       ]
[-0.13903595  0.          0.21221594 -0.2       ]
[ 0.2         0.          1.26514845 -0.        ]


In [25]:
freq = pd.read_csv(f"data/sotu/np_freq/0.csv")
# pop = pd.read_csv("data/us_elections/population.csv")
# pop = (
#     pop.filter(freq["document"].drop_duplicates().astype(str).tolist() + ["state"])
#     .melt(var_name="document", id_vars=["state"])
#     .rename(columns={"state": "element"})
# )
# pop["value"] = pop["value"].str.replace(",", "").astype(int)
# pop["document"] = pop["document"].astype(int)
# df = pd.merge(freq, pop, how="left", on=["document", "element"])
# df["frequency_in_document"] /= df["value"]
# df = df.drop(columns=["value", "party_simplified"])
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
corpus.matrix.normalize()
sample1 = corpus.matrix.matrix[0].copy()
sample2 = corpus.matrix.matrix[0].copy()
# sample2
sample2[sample2 == 0] = 2.07
res = chi2_contingency([sample1, sample2])
print(res[0])
# orig = pd.DataFrame([sample2]).T
# orig[orig[0]>1]
print(np.sum((sample1 - sample2) ** 2 / (sample1 + sample2)))
chi2.stats(len(sample1), moments="mvsk")
# sigs = corpus.create_signatures(epsilon=1 / (len(dvr) * 2))

# for sig in sigs[0]:
#     name = sig.name
#     sig = sig.rename("KL").reset_index()
#     # wordlist[name] = sig[sig["index"] == word].loc[:, "KL"].iloc[0]
#     write(
#         Path("results/sotu"),
#         sig,
#         f"sigs/sigs_with_prevelence_0.75_{name}_epsilon_{1 / (len(dvr) * 2)}",
#         color=True,
#     )
# display(timeline(
#     freq,
#     x="document",
#     y="frequency_in_document",
#     corpus="sotu",
#     stack="center",
#     order=dvr["element"].tolist(),
#     name=f"sotu",
# ))
# display(
#     sockpuppet_matrix(sockpuppet_distance(corpus, corpus))
#     .properties(title="State of the Union Address")
#     .configure_axis(title=None)
# )
# sockpuppet_matrix(sockpuppet_distance(corpus, corpus)).properties(
#     title=("State of the Union Address")
# ).configure_axis(title=None).save(
#     f"results/sotu/sotu_sockpuppets.html"
# )

20469.69498778683
40938.39


(20136.0, 40272.0, 0.019932344846421823, 0.0005959475566150178)

In [26]:
from visualize import timeline

# order = dvr["element"].tolist()[:40]
# timeline(
#     freq[freq["element"].isin(order)],x="document",y="frequency_in_document",corpus="sotu",stack="center",order=order,name=f"sotu"
# ).save("results/sotu/timelines/sotu40.html")

order = dvr["element"].tolist()[:40]
# sigs[1]
timeline(
    freq[freq["element"].isin(order)],
    x="document",
    y="frequency_in_document",
    corpus="sotu",
    stack="center",
    order=order,
    name=f"sotu",
).properties(
    width=1600
)  # .save("results/sotu/timelines/sotu40.html")

sigs[1]

Unnamed: 0,america,world,american,program,state,job,government,tonight,united,help,...,subject,need,budget,public,know,new,family,today,million,duty
1790,-0.001508,-1.879612e-03,-0.002438,-0.000812,1.112512e-04,-0.000625,-0.000002,-0.000483,0.000169,-0.000656,...,-0.001000,5.576473e-09,-0.000475,0.001218,0.000075,-2.141887e-04,-5.591483e-04,-4.597158e-04,-0.000838,0.000239
1791,-0.001508,-1.879612e-03,-0.002438,-0.000812,9.369756e-04,-0.000625,-0.000081,-0.000483,0.001924,-0.000656,...,-0.000134,-1.472672e-04,-0.000475,0.001018,-0.000013,-4.300513e-07,-5.591483e-04,-4.597158e-04,-0.000838,0.000042
1792,-0.001508,-1.879612e-03,-0.002438,-0.000812,-1.714957e-07,-0.000625,-0.000904,-0.000483,-0.000004,-0.000656,...,0.000892,-1.029065e-03,-0.000475,0.000127,-0.000620,-1.916492e-04,-2.881199e-07,-4.597158e-04,-0.000838,-0.000124
1793,-0.001508,-5.191899e-04,-0.002438,-0.000812,3.620422e-03,-0.000625,-0.000796,-0.000483,0.004931,-0.000656,...,0.000009,-1.029065e-03,-0.000475,0.000648,-0.000620,-1.509560e-04,-5.591483e-04,-4.597158e-04,-0.000838,0.000123
1794,-0.001508,-7.706326e-04,-0.000651,-0.000812,5.920042e-04,-0.000625,0.000168,-0.000483,0.000745,-0.000656,...,-0.000201,-2.170810e-04,-0.000475,-0.000014,-0.000620,-3.871475e-04,-2.246104e-05,-4.597158e-04,-0.000838,0.000004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0.001525,1.825786e-07,0.003646,-0.000114,-9.753554e-04,0.000492,-0.001650,0.002605,-0.000406,0.000040,...,-0.001000,4.563545e-06,-0.000475,-0.001088,0.000052,5.135315e-04,1.226032e-03,9.008435e-06,0.000035,-0.000034
2019,0.001170,1.422875e-07,0.002618,-0.000812,-7.578368e-06,0.000381,-0.002317,0.001806,0.000140,-0.000195,...,-0.001000,-2.227956e-04,0.000006,-0.001805,0.000193,6.798096e-04,4.449517e-04,1.271065e-04,0.000146,-0.000479
2020,0.002367,2.382679e-05,0.002372,-0.000158,-2.412359e-05,0.000702,-0.001421,0.001010,0.000031,-0.000019,...,-0.001000,-5.103559e-04,-0.000117,-0.000824,-0.000012,5.148849e-04,1.009984e-03,2.195935e-05,0.000862,-0.001058
2021,0.002941,4.024959e-04,0.004150,-0.000111,-1.969695e-03,0.003919,-0.002169,0.000209,-0.000531,0.000477,...,-0.001000,1.562156e-04,-0.000475,-0.000412,0.002098,-2.141887e-04,1.112566e-03,2.967071e-04,0.000751,-0.001058


NameError: name 'pd' is not defined

In [28]:
sockpuppet_matrix(sockpuppet_distance(corpus, corpus)).properties(
    title=("State of the Union Address")
).configure_axis(title=None).properties(width=900, height=900)
# .save(
# f"results/sotu/1e-9_sotu_sockpuppets_prevelence_0_5.html"
# )

In [29]:
freq = pd.read_csv(f"data/sotu/np_freq/0.csv")
corpus = Corpus(freq)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures(distance="JSD", alpha=0.5)
sockpuppet_matrix(sockpuppet_distance(corpus, corpus)).properties(
    title=("State of the Union Address")
).configure_axis(title=None).properties(width=900, height=900).save(
    f"results/sotu/jsd_sotu_sockpuppets.html"
)

In [9]:
from collections import Counter

pd.DataFrame.from_dict(Counter("asfdsljghdfasdfkas"))
[(k, v) for k, v in Counter("asfdsljghdfasdfkas").items()]

Unnamed: 0,x
0,"(a, 3)"
1,"(s, 4)"
2,"(f, 3)"
3,"(d, 3)"
4,"(l, 1)"
5,"(j, 1)"
6,"(g, 1)"
7,"(h, 1)"
8,"(k, 1)"


In [30]:
eras = [i for i in range(1790, 2023, 29)]
for i in range(len(eras) - 1):
    freq_i = freq[
        (freq["document"] >= eras[i]) & (freq["document"] < eras[i + 1])
    ].reset_index(drop=True)
    corpus = Corpus(freq_i)
    dvr = corpus.create_dvr(equally_weighted=True)
    dvr.to_csv(f"results/sotu/periods/dvr_{eras[i]}-{eras[i+1]-1}.csv", index=False)
    sigs = corpus.create_signatures(
        distance="JSD", sig_length=100
    )  # 1 / (len(dvr) * 20000)
    for sig in sigs[0]:
        write(
            Path("results/sotu/periods/sigs"),
            sig.reset_index(),
            f"{eras[i]}-{eras[i+1]-1}_{sig.name}",
            color=True,
        )
    sockpuppet_matrix(sockpuppet_distance(corpus, corpus)).properties(
        title=("State of the Union Address")
    ).configure_axis(title=None).properties(width=900, height=900).save(
        f"results/sotu/periods/{eras[i]}-{eras[i+1]-1}_sotu_sockpuppets.html"
    )

wrote 1790-1818_1790
wrote 1790-1818_1791
wrote 1790-1818_1792
wrote 1790-1818_1793
wrote 1790-1818_1794
wrote 1790-1818_1795
wrote 1790-1818_1796
wrote 1790-1818_1797
wrote 1790-1818_1798
wrote 1790-1818_1799
wrote 1790-1818_1800
wrote 1790-1818_1801
wrote 1790-1818_1802
wrote 1790-1818_1803
wrote 1790-1818_1804
wrote 1790-1818_1805
wrote 1790-1818_1806
wrote 1790-1818_1807
wrote 1790-1818_1808
wrote 1790-1818_1809
wrote 1790-1818_1810
wrote 1790-1818_1811
wrote 1790-1818_1812
wrote 1790-1818_1813
wrote 1790-1818_1814
wrote 1790-1818_1815
wrote 1790-1818_1816
wrote 1790-1818_1817
wrote 1790-1818_1818
wrote 1819-1847_1819
wrote 1819-1847_1820
wrote 1819-1847_1821
wrote 1819-1847_1822
wrote 1819-1847_1823
wrote 1819-1847_1824
wrote 1819-1847_1825
wrote 1819-1847_1826
wrote 1819-1847_1827
wrote 1819-1847_1828
wrote 1819-1847_1829
wrote 1819-1847_1830
wrote 1819-1847_1831
wrote 1819-1847_1832
wrote 1819-1847_1833
wrote 1819-1847_1834
wrote 1819-1847_1835
wrote 1819-1847_1836
wrote 1819-18

In [31]:
print(1 / len(dvr * 2))
1e-4

0.0001293493726555426


0.0001

In [32]:
ddd = freq[(freq["document"] >= 1848) & (freq["document"] < 1876)].reset_index(
    drop=True
)
corpus = Corpus(ddd)
dvr = corpus.create_dvr(equally_weighted=True)
sigs = corpus.create_signatures(epsilon=1 / (len(dvr) * 2))
sockpuppet_matrix(sockpuppet_distance(corpus, corpus)).properties(
    title=("State of the Union Address")
).configure_axis(title=None).properties(width=900, height=900).save(
    f"results/sotu/1848-1876_sotu_sockpuppets.html"
)

In [33]:
def sockpuppet_matrix_(spd):
    c1n, c2n = (c for c in spd.columns if c != "value")
    return (
        alt.Chart(spd)
        .mark_rect()
        .encode(
            x=alt.X(f"{c1n}:O", axis=alt.Axis(orient="top")),
            y=f"{c2n}:O",
            color="value",
        )
    )