In [1]:
import pandas as pd
import altair as alt
import numpy as np
from pathlib import Path
from helpers import read

import bottleneck as bn
# import re
# from algo import KLD_distance_overused
# import string
# from glob import glob
# from collections import Counter
# from datetime import datetime
from xLPA import Corpus, sockpuppet_distance
from math import floor
from scipy.spatial.distance import cdist

from visualize import sockpuppet_matrix
# Turn off Altair's row-count limit on chart data so the larger frames built
# in this notebook can be plotted.
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [5]:
# Scratch check: bottleneck's nanmean on a small NaN-free list.
bn.nanmean([1,2,4])

2.3333333333333335

In [2]:
# Load the 1976-2020 presidential returns and rename the columns to the
# element / document / frequency_in_document layout handed to Corpus in the
# following cells: state -> element, year -> document, votes -> frequency.
df = (
    pd.read_csv("data/us_elections/1976-2020-president.csv")
    .loc[:, ["year", "candidatevotes", "party_simplified", "state"]]
    # Keep the two major parties only; the party column stays in the frame so
    # later cells can select one party at a time.
    .loc[lambda votes: votes["party_simplified"].isin(["DEMOCRAT", "REPUBLICAN"])]
    .rename(
        columns={
            "state": "element",
            "year": "document",
            "candidatevotes": "frequency_in_document",
        }
    )
    .reset_index(drop=True)
)
# cs = []
# for party in ["DEMOCRAT", "DEMOCRAT"]:
#     corpus = Corpus(df[df["party_simplified"] == party].reset_index(drop=True))
#     dvr = corpus.create_dvr()
#     sigs = corpus.create_signatures(epsilon=1/(len(dvr) * 2))
#     cs.append(corpus)
# spd = sockpuppet_distance(*cs)
# sockpuppet_matrix(spd, "DEMOCRAT", "DEMOCRAT 1") #.save("dem_vs_dem.html")

In [5]:
# Build the corpus from the Democratic rows only, then derive its DVR and the
# signatures computed against it.
democrat_votes = df.loc[df["party_simplified"].eq("DEMOCRAT")].reset_index(drop=True)
corpus = Corpus(democrat_votes)
dvr = corpus.create_dvr(equally_weighted=True)
# epsilon is half the reciprocal of the DVR length.
signature_epsilon = 1 / (2 * len(dvr))
sigs = corpus.create_signatures(epsilon=signature_epsilon)


In [6]:
# Inspect the full value returned by create_signatures.
sigs

([WEST VIRGINIA           0.004230
  ARKANSAS                0.002622
  INDIANA                 0.002463
  OKLAHOMA                0.002408
  PENNSYLVANIA            0.001775
  OHIO                    0.001208
  NEW YORK                0.001047
  KANSAS                  0.001045
  ALABAMA                 0.000947
  ILLINOIS                0.000888
  MISSOURI                0.000885
  NORTH DAKOTA            0.000834
  KENTUCKY                0.000821
  MASSACHUSETTS           0.000778
  TENNESSEE               0.000741
  IOWA                    0.000676
  SOUTH DAKOTA            0.000520
  MINNESOTA               0.000357
  RHODE ISLAND            0.000264
  NEW JERSEY              0.000233
  LOUISIANA               0.000232
  NEBRASKA                0.000213
  MISSISSIPPI             0.000159
  WISCONSIN               0.000127
  CONNECTICUT             0.000116
  WYOMING                 0.000084
  MICHIGAN                0.000071
  MONTANA                 0.000069
  MARYLAND          

In [60]:
# Spot-check the rows for California in the 1976 document.
df.loc[df["document"].eq(1976) & df["element"].eq("CALIFORNIA")]

Unnamed: 0,document,frequency_in_document,party_simplified,element
8,1976,3882244,REPUBLICAN,CALIFORNIA
9,1976,3742284,DEMOCRAT,CALIFORNIA


In [52]:
# Persist one signature file per election year plus the DVR.
# pandas' to_csv does not create missing parent folders, so make sure the
# output tree exists before writing (this also covers results/us_elections/).
FIRST_ELECTION_YEAR = 1976
YEARS_BETWEEN_ELECTIONS = 4
signatures_dir = Path("results/us_elections/signatures")
signatures_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): the file names assume sigs[0] is ordered chronologically with
# no election year missing — confirm against Corpus.create_signatures.
for i, s in enumerate(sigs[0]):
    s.to_csv(signatures_dir / f"{FIRST_ELECTION_YEAR + i * YEARS_BETWEEN_ELECTIONS}.csv")
dvr.to_csv("results/us_elections/dvr.csv", index=False)
# sigs[0][8].to_csv("2008.csv")

In [54]:
# Look up the CALIFORNIA entry of the first signature in sigs[0].
sigs[0][0].loc["CALIFORNIA"]

0.0

In [34]:
# Maps the numeric suffix of each data/elections/results_<num> input file to
# the year label used as that election's document key.
# NOTE(review): the keys look like Knesset numbers; 21 -> 2018 is presumably a
# stand-in that keeps the two 2019 elections distinct as documents — confirm.
num_to_year = dict(
    zip(
        range(14, 26),
        (1996, 1999, 2003, 2006, 2009, 2013, 2015, 2018, 2019, 2020, 2021, 2022),
    )
)


### Preprocessing

In [35]:
# heb_cols = ["שם ישוב", "סמל ישוב", "בזב", "מצביעים", "פסולים", "כשרים"]
# eng_cols = ["town_name", "town_code", "bzb", "voters", "valid_votes", "invalid_votes"]
# years = ("2015", "2019a", "2019b", "2020", "2021") #  ("2015 - hebrew", "2019 - hebrew",
# drop_cols = {
#     "2019b": ["סמל ועדה"],
#     "2020": ["סמל ועדה", "Unnamed: 37"],
#     "2021": ["סמל ועדה", "Unnamed: 46"],
# } | {y: [] for y in years[:-3]}
# Bookkeeping columns that are not party vote counts. Which of them are present
# differs between result files, so they are dropped with errors="ignore".
NON_PARTY_COLUMNS = [
    "קלפי",
    "פיצול",
    "שם ישוב",
    "כתובת",
    "בעלי זכות בחירה",
    "בזב",
    "נפה",
    "מצביעים",
    "כשרים",
    "פסולים",
    "מספר קלפי",
    "ברזל",
    "סמל ועדה",
    "שופט",
    "ריכוז",
    "סמל קלפי",
]
# Rows labelled "external envelopes" / "double envelopes" (literal
# translations) are pooled under one synthetic locality code.
ENVELOPE_ROW_NAMES = ["מעטפות חיצוניות", "מעטפות כפולות"]
ENVELOPE_CODE = 99999

full_df = []
# kv_df = []
for num, year in num_to_year.items():
    # The per-file frame gets its own name: reusing `df` here would overwrite
    # the US-elections frame that other cells of this notebook read from `df`.
    if num <= 20:
        # Elections up to number 20 are read from Excel workbooks.
        year_results = pd.read_excel(f"data/elections/results_{num}.xls")
    else:
        # Every CSV except results_25 is decoded as ISO-8859-8; results_25 is
        # read with read_csv's default encoding.
        csv_options = {} if num == 25 else {"encoding": "iso8859_8"}
        year_results = pd.read_csv(f"data/elections/results_{num}.csv", **csv_options)
    print(year)
    year_results.loc[
        year_results["שם ישוב"].isin(ENVELOPE_ROW_NAMES),
        "סמל ישוב",
    ] = ENVELOPE_CODE
    year_results = year_results.drop(columns=NON_PARTY_COLUMNS, errors="ignore").rename(
        columns={"סמל ישוב": "element"}
    )
    # Sum the remaining (party) columns per locality code, then go long:
    # one row per (locality, party column) tagged with the election's year.
    full_df.append(
        year_results.groupby("element", as_index=False)
        .sum()
        .melt(id_vars=["element"], value_name="frequency_in_document")
        .assign(document=year)
    )
#     kv_df.append(df[["town_code", "town_name"]])
#     emet_df.append(df[df["party"] == "אמת"][["document","town_code", "votes"]].rename(columns={"town_code": "element", "votes": "frequency_in_document"}).reset_index(drop=True))

# Stack the per-election frames, keep rows with a positive count and normalise
# the one label that carries a trailing space.
full_df = pd.concat(full_df)
has_votes = full_df["frequency_in_document"] > 0
full_df = full_df[has_votes].reset_index(drop=True).replace({"אמת ": "אמת"})

# One CSV per distinct value of the melted "variable" column, written without
# that column. sort=False keeps the groups in first-appearance order.
for variable, variable_rows in full_df.groupby("variable", sort=False):
    variable_rows.drop(columns=["variable"]).to_csv(
        f"data/elections/parties/{variable}.csv", index=False
    )
# emet_df["element"] = emet_df["element"].replace(9999, 99999)
# emet_df
# From here on "element" holds the former "variable" values and the locality
# code lives in "city".
full_df = full_df.rename(columns={"element": "city", "variable": "element"})



1996
1999
2003
2006
2009
2013
2015
2018
2019
2020
2021
2022


In [38]:
# Two independent copies of the rows for city code 3000 (the "city" column is
# dropped), used below to compare that locality with itself.
cs = [
    full_df.loc[full_df["city"] == city_code].reset_index(drop=True).drop(columns=["city"])
    for city_code in (3000, 3000)
]


In [5]:
# Locality lookup read from bycode2021.xlsx, with the two source columns
# renamed to "name" and "element".
k = ["שם יישוב", "סמל יישוב"]
v = ["name", "element"]
kv = pd.read_excel("data/elections/bycode2021.xlsx")[k].rename(columns=dict(zip(k, v)))
# Append a row for the synthetic code 99999. The row is built column-wise so
# "element" stays numeric: building it from a transposed single-column frame
# makes both columns object dtype and upcasts kv["element"] to object on concat.
envelope_row = pd.DataFrame({"name": ["מעטפות חיצוניות"], "element": [99999]})
kv = pd.concat([kv, envelope_row], ignore_index=True)

# pd.DataFrame.from_dict(dict(zip(v, )), orient="columns")

In [40]:
# Build one Corpus (with DVR and signatures) per frame in `cs`, then plot the
# sockpuppet distance between the two.
css = []
for city_votes in cs:
    corpus = Corpus(city_votes)
    dvr = corpus.create_dvr()
    corpus.create_signatures(epsilon=1 / (len(dvr) * 2))
    css.append(corpus)

# print(css)
spd = sockpuppet_distance(*css)
# `cs` holds city code 3000, which the locality lookup names Jerusalem
# (Tel Aviv is 5000), so the chart is labelled accordingly.
sockpuppet_matrix(spd, "Jerusalem", "Jerusalem 2")

  for col_name, dtype in df.dtypes.iteritems():


### LPA

In [6]:
# NOTE(review): `emet_df` is not assigned in any visible cell (only
# commented-out lines mention it), so this cell fails on a fresh kernel —
# confirm where the frame is supposed to come from.
corpus = Corpus(freq=emet_df)
dvr = corpus.create_dvr()
# Join the DVR with the locality lookup on their shared columns and show the
# first 20 rows.
dvr.merge(kv).head(20)


Unnamed: 0,element,global_weight,name
0,3000,0.105517,ירושלים
1,6100,0.05349,בני ברק
2,99999,0.047087,מעטפות חיצוניות
3,70,0.045611,אשדוד
4,5000,0.043848,תל אביב -יפו
5,7400,0.032748,נתניה
6,6600,0.030517,חולון
7,9000,0.0297,באר שבע
8,7900,0.028214,פתח תקווה
9,8300,0.021468,ראשון לציון


In [7]:
# epsilon is the reciprocal of (DVR length x epsilon_frac); the bare name on
# the last line displays the value.
epsilon_frac = 2
dvr_length = len(dvr)
epsilon = 1 / (dvr_length * epsilon_frac)
epsilon


0.00039184952978056425

In [8]:
# 75% of the corpus length, rounded down. The spelling "prevelent" matches the
# keyword expected by create_signatures.
prevelent = floor(0.75 * len(corpus))
print(prevelent)
signature_options = dict(
    epsilon=epsilon,
    most_significant=30,
    sig_length=500,
    prevelent=prevelent,
)
signatures, most_significant, temporary_array = corpus.create_signatures(**signature_options)
# Preview the first ten entries of the first signature.
display(signatures[0].head(10))


9


99999   -0.322609
1309    -0.087919
294      0.071905
6600    -0.071485
289      0.070993
633      0.061211
5000    -0.058087
139      0.055845
167      0.052052
7400    -0.051288
Name: 1996, dtype: float64