In [None]:
%matplotlib ipympl
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
def load_try_encodings(file: str):
    """
    Read a pipe-delimited text file into a DataFrame.

    UTF-8 is attempted first; if decoding fails the file is re-read as
    ISO-8859-1.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    pd.DataFrame
    """
    shared_options = {"delimiter": "|"}
    try:
        frame = pd.read_csv(file, encoding="utf-8", **shared_options)
    except UnicodeDecodeError:
        frame = pd.read_csv(file, encoding="ISO-8859-1", **shared_options)
    return frame


def load_year(
    voter_history: str,
    voter_list: str,
):
    """
    Load one election's voter list and flag which voters appear in the history file.

    Parameters
    ----------
    voter_history : str
        Path to the pipe-delimited voter-history export. Must contain the
        columns "Voter ID Number" and "Election Date". Every voter ID present
        in this file is marked as having voted.
    voter_list : str
        Path to the pipe-delimited voter-list export. Must contain the
        columns "Voter ID Number" and "Date of Birth".

    Returns
    -------
    pd.DataFrame
        The voter list indexed by "Voter ID Number", with two added columns:
        "age" (whole years between birth date and the election date) and
        "voted" (bool).
    """
    history = load_try_encodings(voter_history)
    vlist = load_try_encodings(voter_list)

    # Header names may carry surrounding whitespace; normalise them so the
    # column lookups below work for every file.
    history = history.rename(columns=str.strip).set_index("Voter ID Number")
    vlist = vlist.rename(columns=str.strip).set_index("Voter ID Number")

    # Manual corrections: registrations whose birth date is clearly wrong and
    # cannot be repaired are dropped from both tables.
    drop_ids = [
        # erroneously entered reg with birth year of 1191
        # re-registered with different id the same year
        "02CLS2791002",
        # 2015 birthdays that are unclear how to correct
        "01GSR0112000",  #    01/01/1812
        "01MCN0112006",  #    01/01/1812
        "01ANE0112001",  #    01/01/1812
        "01WXO0109000",  #    01/01/1809
        # 2013 birthdays that are unclear how to correct
        "01ACE0108001",  #    01/01/1808
        "01SEN0108008",  #    01/01/1808
        "01HPL0108001",  #    01/01/1808
        "01RKL0108001",  #    01/01/1808
        "01ARA0108004",  #    01/01/1808
        "01MRT0108003",  #    01/01/1808
        "01DOA0107001",  #    01/01/1807
        "01CBN0108003",  #    01/01/1808
        "01BRD0108006",  #    01/01/1808
        "01PAN0107002",  #    01/01/1807
        "01VAY0105002",  #    01/01/1805
        "01LNN0107002",  #    01/01/1807
        "01MJA0107011",  #    01/01/1807
        # 2012 birthdays that are unclear how to correct
        "09GAA0487001",  #    09/04/1487
        # 2011 birthdays that are unclear how to correct
        "01QLO0105000",  #   01/01/1805
        "01KAA0106007",  #   01/01/1806
        "01LSH0108007",  #   01/01/1808
        "01KJN0111026",  #   01/01/1811
        "01AMN0108001",  #   01/01/1808
    ]
    # errors="ignore": most of these IDs only exist in one particular year's files.
    vlist = vlist.drop(drop_ids, errors="ignore")
    history = history.drop(drop_ids, errors="ignore")

    # Author's note carried over from the original cell: the following IDs are
    # listed next to the remark "I modified the birth date year from 0980 to 1980"
    # (04WDA0180001 is noted as "in 2011"). The correction is not applied in code.
    #   04WDA0180001, 12GDA0186001, 01DML1591001, 08WJB0481002
    birth_dates = pd.to_datetime(vlist["Date of Birth"])

    # The election date is taken from the first history row; this will break
    # if a history file ever covers more than one election.
    elec_date = pd.to_datetime(history["Election Date"].iloc[0])

    # Whole years elapsed, using the mean Gregorian year length.
    vlist["age"] = (elec_date - birth_dates) // timedelta(days=365.2425)
    # A voter "voted" when their ID appears in the history file.
    vlist["voted"] = vlist.index.isin(history.index)
    return vlist

In [None]:
# Load every November election from 2011 to 2022.  Each call takes
# (voter-history file, voter-list file) and returns the voter list with
# "age" and "voted" columns added.  File names are relative paths.

# Even-year elections.
voters_2022 = load_year(
    "11-8-22 Voter History 49ANP_269498.txt", "49VOT_267488 nov 2022 voting list.txt"
)
voters_2020 = load_year(
    "11-3-20 Voter History 49ANP_225530.txt", "49VOT_224084 november 2020 election.txt"
)
voters_2018 = load_year(
    "11-6-18 Voter History 49ANP_162771.txt", "49VOT_162354 - Nov 6 2018 Election.txt"
)
voters_2016 = load_year(
    "11-8-16 Voter History 49ANP_140283.txt", "49VOT_139226 - Nov 8 2016.txt"
)
voters_2014 = load_year(
    "11-4-2014 State Election 49ANP_120872.txt", "49VOT_120372 - nov 4 2014.txt"
)
voters_2012 = load_year(
    "11.6.2012 StatePres 49ANP_103892.txt", "49VOT_103340 Nov 2012 election.txt"
)

# Odd-year elections.
voters_2021 = load_year(
    "11-2-21 Voter History 49ANP_239643.txt", "49VOT_238743 Nov 2021 election.txt"
)
voters_2019 = load_year(
    "11-5-19 Voter History 49ANP_202374.txt", "49VOT_199524 - Nov 2019 election.txt"
)
voters_2017 = load_year(
    "11.7.17 Voter History 49ANP_150723.txt", "49VOT_150177 - nov 7 2017.txt"
)
voters_2015 = load_year(
    "11.3.15 Voter History 49ANP_129528.txt", "49VOT_128567 - Nov 3, 2015.txt"
)
voters_2013 = load_year(
    "11.5.13 Municipal Election 49ANP_112159.txt", "49VOT_111500 - nov 5, 2013.txt"
)
voters_2011 = load_year(
    "11.8.2011 Voter History 49ANP_91255.txt", "49VOT_90931 Nov 2011 election.txt"
)

In [None]:
# Columns kept in the combined table, in display order.
# Deliberately left out: the mailing-address columns, "Unnamed: 25"/"Unnamed: 26",
# "Record Sequence Number" and "Title".
# NOTE(review): the mailing street column header appears to differ between exports
# (its dash shows up as '¿' or '?'), so it would need reconciling before being added.
col_order = [
    "Last Name",
    "Middle Name",
    "First Name",
    "voted",
    "age",
    "Date of Birth",
    "Date of Registration",
    "Residential Address Street Number",
    "Residential Address Street Name",
    "univ_housing_name",
    "Residential Address Street Suffix",
    "Residential Address Apartment Number",
    "Residential Address Zip Code",
    "Gender F/M",
    "Voter Status",
    "Party Affiliation",
    "Ward Number",
    "Precinct Number",
    "Congressional District Number",
    "Senatorial District Number",
    "State Representative District",
]

# Oldest election first so the frames line up with keys 2011..2022.
yearly_frames = [
    voters_2011,
    voters_2012,
    voters_2013,
    voters_2014,
    voters_2015,
    voters_2016,
    voters_2017,
    voters_2018,
    voters_2019,
    voters_2020,
    voters_2021,
    voters_2022,
]

voters = (
    pd.concat(yearly_frames, keys=np.arange(2011, 2023))
    # University housing name column, filled in later; it makes things like
    # a groupby over MIT dorms easier.
    .assign(univ_housing_name="NA")
    .loc[:, col_order]
    .rename_axis(index=["year", "Voter ID Number"])
)

In [None]:
# Show the combined table, indexed by (year, Voter ID Number).
# NOTE(review): rows include voter names and birth dates; clear this output before sharing.
voters

In [None]:
def turnout_by_year_key(df, key, binn=None):
    """
    Calculate turnout per year based on the variable *key*.

    Parameters
    ----------
    df : pd.DataFrame
        Expected to have an outer (multi)index level of *year* and a boolean
        "voted" column.
    key : str
        The column to use for value_counts. e.g. "age"
    binn : optional
        Unused. Kept (now optional) so existing positional calls keep working.

    Returns
    -------
    pd.DataFrame
        Indexed by (year, *key*), with integer columns "voted" and
        "registered" and a float column "turnout" (voted / registered).
    """

    def _count_year(year_df):
        # Everyone in the list counts as registered; the subset flagged
        # "voted" gives the numerator.
        registered = year_df[key].value_counts().sort_index()
        voted = year_df.loc[year_df["voted"], key].value_counts().sort_index()
        counts = pd.DataFrame({"voted": voted, "registered": registered})
        # Key values with nobody voting are missing from `voted`; count them as 0.
        return counts.fillna(0).astype(int)

    # Use the frame that was passed in, not a notebook-level global.
    years = df.index.unique(level=0)
    out = pd.concat([_count_year(df.loc[year]) for year in years], keys=years)
    out.index = out.index.set_names(["year", key])
    out["turnout"] = out["voted"] / out["registered"]
    return out


# Per-age registered/voted counts and turnout for each election year;
# the next cell bins this table into 4-year age groups.
df = turnout_by_year_key(voters, "age")
df

In [None]:
# Bin the per-age table into 4-year age groups and recompute turnout per (year, group).
grouped = df.reset_index()
# Bin edges are 18, 22, ..., 110; include_lowest=True makes the first bin contain age 18.
age_groups = pd.cut(grouped["age"], np.arange(18, 114, 4), include_lowest=True)
grouped["age_group"] = age_groups
# .sum() adds up the "voted"/"registered" counts in each bin.  It also sums the
# per-age "turnout" and "age" columns, which is meaningless: "age" is dropped here
# and "turnout" is recomputed from the summed counts at the end of the cell.
grouped = (
    grouped.groupby(["year", "age_group"])
    .sum()
    .sort_index()
    .drop("age", axis=1)
    .reset_index()
)
# Interval mid points give a numeric x position for each bin when plotting.
mid_points = [g.mid for g in grouped["age_group"]]
grouped["mid_points"] = mid_points  # convenience for plotting down the line
# Turn each Interval into a "left-right" string label (e.g. "18-22") so the
# groups can be looked up by name in the MultiIndex.  The left edge is rounded
# because include_lowest nudges the first edge slightly below 18.
# Author's note: there is probably a cleaner way to do this.
grouped["age_group"] = [
    f"{int(np.round(g.left))}-{int(g.right)}" for g in grouped["age_group"]
]

# Index by (year, age_group) and recompute turnout from the summed counts.
grouped.index = pd.MultiIndex.from_frame(grouped[["year", "age_group"]])
grouped = grouped.drop(["year", "age_group"], axis=1)
grouped["turnout"] = grouped["voted"] / grouped["registered"]
grouped

### Turnout vs registration by age group (Municipal Elections)


In [None]:
def turnout_bar_graph(df, ax=None):
    """
    Draw stacked bars of voters vs. registered non-voters per age bin.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns "mid_points" (bar x positions), "voted" and
        "registered".
    ax : matplotlib Axes, optional
        Axes to draw on; the current pyplot axes are used when omitted.
    """
    ax = plt.gca() if ax is None else ax
    bar_options = {"width": 3.75}
    centers = df["mid_points"]
    voted = df["voted"]

    # Bottom layer: people who voted.
    ax.bar(centers, voted, color="tab:green", label="Voted", **bar_options)
    # Top layer: the remainder of the registered voters, stacked on the voters.
    ax.bar(
        centers,
        df["registered"] - voted,
        bottom=voted,
        color="gray",
        label="Registered - did not vote",
        **bar_options,
    )
    ax.set_xlim([17, 85])


# One stacked bar chart per election year, most recent first.
# The grid is sized from the number of years so that no election is silently
# left out (a fixed 3x3 grid only has room for 9 of the 12 years).
years = np.arange(2011, 2023)
n_cols = 3
n_rows = int(np.ceil(len(years) / n_cols))
fig, axs = plt.subplots(n_rows, n_cols, figsize=(16, 6), sharex=True, sharey=True)

for ax, year in zip(axs.flat, years[::-1]):
    ax.set_title(f"{year}")
    turnout_bar_graph(grouped.loc[year], ax)
# Hide any panels left over when the year count does not fill the grid.
for unused_ax in axs.flat[len(years):]:
    unused_ax.set_visible(False)

fig.supxlabel("Age (4 year bins)")
# The legend is identical for every panel, so draw it once on the last one used.
ax.legend()
plt.tight_layout()
# plt.tight_layout()

In [None]:
# Horizontal bars for 2022: registered voters in gray with voters overlaid in green.
ages = np.arange(18, 79, 4)
groups = [f"{lower}-{lower+4}" for lower in ages]

df = grouped.loc[2022]
bar_width = 3.75
fig, ax = plt.subplots(figsize=(6, 14))
# "registered" is drawn first so the "voted" bars sit on top of it.
for column, color, label in (
    ("registered", "gray", "Registered"),
    ("voted", "tab:green", "Voted"),
):
    ax.barh(df["mid_points"], df[column], height=bar_width, color=color, label=label)
# Ticks sit at the centre of each 4-year bin.
ax.set_yticks(ages + 2, labels=groups, fontsize=20)
ax.legend(fontsize=25)
# Reversed limits put the youngest group at the top.
ax.set_ylim([85, 17])
plt.tight_layout()

In [None]:
# One line per age group across election years: turnout (%) and raw votes cast.
ages = np.arange(18, 75, 4)
groups = [f"{i}-{i+4}" for i in ages]
colors = plt.cm.viridis(ages / ages.max())

# (column, scale, y label).  "turnout" is stored as a 0-1 fraction, so it is
# scaled by 100 to match its "Turnout %" axis label.
panels = (
    ("turnout", 100, "Turnout %"),
    ("voted", 1, "Turnout Number"),
)
for column, scale, ylabel in panels:
    fig, ax = plt.subplots()
    for color, group in zip(colors, groups):
        # xs(level=1) selects one age group across all years.
        (grouped[column].xs(group, level=1) * scale).plot(
            ax=ax, label=group, style="o--", color=color
        )
    ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)
    ax.set_title("Turnout by Age and Year")
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Election Year")
    fig.tight_layout()
    ax.grid()

In [None]:
from cycler import cycler

# Two panels over age bins, one line per election year:
# left = turnout in percent, right = number of votes cast.
years = np.arange(2011, 2023)
fig, axs = plt.subplots(1, 2, figsize=(12, 4.5))
tab20_colors = plt.get_cmap("tab20").colors
for panel in axs:
    panel.set_prop_cycle(cycler(color=tab20_colors))

for year in years:
    # Even years are drawn dashed and slightly transparent, odd years solid.
    alpha, linestyle = (0.8, "--") if year % 2 == 0 else (1, "-")
    year_rows = grouped.loc[year]
    bin_centers = year_rows["mid_points"].values
    series_per_panel = (
        year_rows["turnout"].values * 100,
        year_rows["voted"].values,
    )
    for panel, values in zip(axs, series_per_panel):
        panel.plot(
            bin_centers, values, "o", linestyle=linestyle, label=year, alpha=alpha
        )

labelsize = 15
for panel, title, ylabel in zip(
    axs, ("Turnout %", "Turnout Numbers"), ("%", "Number of Votes Cast")
):
    panel.set_title(title)
    panel.set_ylabel(ylabel)
    panel.grid()
fig.supxlabel("Age (4 yr bins)", size=labelsize)
plt.legend()
plt.tight_layout()

In [None]:
# Turnout (%) vs age bin for the odd-year elections, newest first.
odd_years = (2021, 2019, 2017, 2015, 2013, 2011)

fig, ax = plt.subplots()
ax.set_prop_cycle(cycler(color=plt.get_cmap("tab20").colors))
for year in odd_years:
    year_rows = grouped.loc[year]
    ax.plot(
        year_rows["mid_points"].values,
        # "turnout" is a 0-1 fraction; scale it to match the "%" axis label.
        year_rows["turnout"].values * 100,
        "o--",
        label=str(year),
        alpha=0.8,
    )
ax.set_title("Turnout vs Age")
ax.legend()
ax.set_xlabel("Age (4 yr bins)")
ax.set_ylabel("Turnout %")

In [None]:
# Number of votes cast vs age bin for the odd-year elections, newest first.
odd_years = (2021, 2019, 2017, 2015, 2013, 2011)

fig, ax = plt.subplots()
ax.set_title("Turnout vs Age")
ax.grid()
for year in odd_years:
    year_rows = grouped.loc[year]
    ax.plot(
        year_rows["mid_points"].values,
        year_rows["voted"].values,
        "o--",
        label=str(year),
        alpha=0.8,
    )
ax.legend()
ax.set_xlabel("Age (4 yr bins)")
ax.set_ylabel("Turnout (Vote count)")
fig.tight_layout()

In [None]:
# Registered voters vs age bin for the odd-year elections, newest first.
odd_years = (2021, 2019, 2017, 2015, 2013, 2011)

fig, ax = plt.subplots()
# This figure shows registration counts, so the title says so rather than "Turnout".
ax.set_title("Registered Voters vs Age")
ax.grid()
for year in odd_years:
    year_rows = grouped.loc[year]
    ax.plot(
        year_rows["mid_points"].values,
        year_rows["registered"].values,
        "o--",
        label=str(year),
        alpha=0.8,
    )
ax.legend()
ax.set_xlabel("Age (4 yr bins)")
ax.set_ylabel("Registered Voters")
fig.tight_layout()

# University Housing/Dorms



In [None]:
from collections import defaultdict
from collections.abc import Iterable


def find_housing_idxs(df: pd.DataFrame, housing_locations: dict):
    """
    Tag rows of *df* that belong to named housing locations.

    Parameters
    ----------
    df : pd.DataFrame
        Needs "Residential Address Street Name" and, when any location gives
        street numbers, "Residential Address Street Number". Modified in
        place: "univ_housing_name" is set to the location name on every
        matching row.
    housing_locations : dict
        Maps a location name to either one ``(street_num, street_name)``
        address or a list of such addresses. ``street_num`` may be ``None``
        (any number), a single number, or an iterable of accepted numbers.

    Returns
    -------
    defaultdict
        Location name -> boolean array with one entry per row of *df*, plus
        the key "all" holding the element-wise OR of every location.
    """

    def _address_mask(address):
        numbers, target_street = address[0], address[1]
        # Note the direction: a row matches when ITS street value is a
        # substring of the configured street name.
        street_hits = [
            row_street in target_street
            for row_street in df["Residential Address Street Name"]
        ]
        if numbers is None:
            return street_hits
        if not isinstance(numbers, Iterable):
            # Wrap a single number so it is handled like a multi-number address.
            numbers = (numbers,)
        number_hits = [
            row_num in numbers for row_num in df["Residential Address Street Number"]
        ]
        return np.logical_and(street_hits, number_hits)

    # Each location starts as an all-False mask and accumulates its addresses.
    indices = defaultdict(lambda: np.zeros(len(df), dtype=bool))
    for name, spec in dict(housing_locations).items():
        # A list means a complex with several addresses (e.g. Holden Green).
        addresses = spec if isinstance(spec, list) else [spec]
        for address in addresses:
            indices[name] |= _address_mask(address)

    for name, mask in indices.items():
        df.loc[mask, "univ_housing_name"] = name
    indices["all"] = np.any(list(indices.values()), axis=0)
    return indices

### Harvard Grad Dorms

In [None]:
# These halls are matched by name only: street number None, and the
# upper-cased hall name used as the street name to look for.
gsas_dorms = ["richards hl", "perkins hl", "conant hl", "child hl"]
# Rebinds gsas_dorms from a list to the {name: (street_num, street_name)} mapping.
gsas_dorms = {name: (None, name.upper()) for name in gsas_dorms}
# Also writes each hall's name into voters["univ_housing_name"] for matching rows.
gsas_idx = find_housing_idxs(voters, gsas_dorms)

In [None]:
# Bar-chart grid for the Harvard grad dorms, one panel per year, newest first.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
fig.suptitle("Harvard Grad Dorms")

# NOTE(review): university_housing_bar_chart is not defined in the visible cells.
# `year` and `bar_width` stay notebook-level in case that helper reads them.
year = 2022
bar_width = 0.75
for offset, ax in enumerate(axs.flat, start=1):
    year = years[-offset]
    ax.set_title(str(year))
    university_housing_bar_chart(ax, year, gsas_idx["all"], voters)
# Shorten the tick labels on the last panel to their first word.
ax.set_xticklabels([tick.get_text().split()[0] for tick in ax.get_xticklabels()])
ax.legend()
plt.tight_layout()

In [None]:
# Registered/voted/turnout per (year, housing name).
df = turnout_by_year_key(voters, "univ_housing_name")

# Regroup so the housing name is the outer index level and year the inner one.
# Each (name, year) pair holds a single row, so .mean() leaves the values
# unchanged apart from converting the integer counts to floats.
# Author's note: reorder_levels did not work out, hence the groupby.
df = df.groupby(["univ_housing_name", "year"]).mean()
df

In [None]:
# Registered / voted / turnout over the years, one line per grad dorm.
fig, axs = plt.subplots(1, 3, figsize=(12, 4.5), constrained_layout=True)
panel_specs = (
    ("registered", "Registered"),
    ("voted", "Voted"),
    ("turnout", "Turnout %"),
)
for dorm in gsas_dorms:
    short_name = dorm.split()[0]
    dorm_rows = df.loc[dorm]
    for panel, (column, title) in zip(axs, panel_specs):
        panel.plot(dorm_rows[column], "o--", label=short_name)
        panel.set_title(title)
# pyplot calls act on the current axes.
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

## Harvard Law School

In [None]:
# Harvard Law School halls, matched by name only (street number None).
law_hall_names = (
    "dane hl",
    "ames hl",
    "shaw hl",
    "story hl",
    "holmes hl",
    "hastings hl",
    "north hl",
)
hvd_law_dorms = {hall: (None, hall.upper()) for hall in law_hall_names}
hvd_law_idx = find_housing_idxs(voters, hvd_law_dorms)

# Bar-chart grid, one panel per year, newest first.
# NOTE(review): university_housing_bar_chart is not defined in the visible cells.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("Harvard Law Dorms Turnout")
for offset, ax in enumerate(axs.flat, start=1):
    year = years[-offset]
    ax.set_title(str(year))
    university_housing_bar_chart(ax, year, hvd_law_idx["all"], voters)
    ax.tick_params(axis="x", labelrotation=90)
# Shorten the tick labels on the last panel to their first word.
ax.set_xticklabels([tick.get_text().split()[0] for tick in ax.get_xticklabels()])
ax.legend()
plt.tight_layout()

# Turnout table with housing name as the outer index level and year as the inner one.
df = (
    turnout_by_year_key(voters, "univ_housing_name")
    .groupby(["univ_housing_name", "year"])
    .mean()
)

# Registered / voted / turnout over the years, one line per hall.
fig, axs = plt.subplots(1, 3, figsize=(12, 4.5), constrained_layout=True)
panel_specs = (
    ("registered", "Registered"),
    ("voted", "Voted"),
    ("turnout", "Turnout %"),
)
for dorm in hvd_law_dorms:
    short_name = dorm.split()[0]
    dorm_rows = df.loc[dorm]
    for panel, (column, title) in zip(axs, panel_specs):
        panel.plot(dorm_rows[column], "o--", label=short_name)
        panel.set_title(title)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

## Harvard Housing

still missing some big ones. see map here: [https://osa.gse.harvard.edu/files/gse-osa/files/hu_housing_map.pdf](https://osa.gse.harvard.edu/files/gse-osa/files/hu_housing_map.pdf)

would also be cool to normalize by how many units are in each building


In [None]:
# Harvard University housing: name -> (street number(s) or None, street name).
# A street number of None accepts any number on that street.
harvard_housing = {
    "Peabody Terrace": (None, "PEABODY TER"),
    "Holden Green": [
        (None, "HOLDEN GRN"),
        (list(range(10, 38 + 2, 2)), "HOLDEN ST"),
    ],  # a list means several addresses for one complex; find_housing_idxs ORs them together
    "29 Garden St": (29, "GARDEN ST"),
    "Botanic Gardens": (None, "FERNALD DR"),
    "Kirkland Court": ((37, 39, 31), "KIRKLAND ST"),
    "10 Akron": (10, "AKRON ST"),
    "Ware St": (
        (9, 11, 13, 15, 17, 19),
        "WARE ST",
    ),  # author's note: the matcher won't differentiate between 13 and 13A Ware St, so it should pick up both — TODO confirm
    "Prescott": (list(range(85, 95 + 1, 2)), "PRESCOTT ST"),
}

# TODO: add Haskins Hall, Beckwith Circle, Terry Terrace

In [None]:
# Boolean row masks per Harvard housing location (plus "all"); also writes the
# location name into voters["univ_housing_name"] for matching rows.
harvard_housing_idx = find_housing_idxs(voters, harvard_housing)

In [None]:
# Bar-chart grid for Harvard University housing, one panel per year, newest first.
# NOTE(review): university_housing_bar_chart is not defined in the visible cells.
fig, axs = plt.subplots(3, 3, figsize=(17, 8), sharex=True, sharey=True)
plt.suptitle("Harvard University Housing Turnout")
for offset, ax in enumerate(axs.flat, start=1):
    year = years[-offset]
    ax.set_title(str(year))
    university_housing_bar_chart(ax, year, harvard_housing_idx["all"], voters)
    ax.tick_params(axis="x", labelrotation=90)
ax.legend()
plt.tight_layout()

# Turnout table with housing name as the outer index level and year as the inner one.
df = (
    turnout_by_year_key(voters, "univ_housing_name")
    .groupby(["univ_housing_name", "year"])
    .mean()
)

# Registered / voted / turnout over the years, one line per location.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
panel_specs = (
    ("registered", "Registered"),
    ("voted", "Voted"),
    ("turnout", "Turnout %"),
)
for dorm in harvard_housing:
    short_name = dorm.split()[0]
    dorm_rows = df.loc[dorm]
    # The turnout panel carries the legend, so it gets the full name.
    line_labels = (short_name, short_name, dorm)
    for panel, (column, title), label in zip(axs, panel_specs, line_labels):
        panel.plot(dorm_rows[column], "o--", label=label)
        panel.set_title(title)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

## Undergrad houses

In [None]:
# Harvard undergraduate houses, matched by name only (street number None).
ugrad_house_names = (
    "Leverett",
    "Pforzheimer",
    "Adams",
    "Currier",
    "Cabot",
    "Dunster",
    "Eliot",
    "Kirkland",
    "Lowell",
    "Mather",
    "Quincy",
    "Winthrop",
)
# Key: display name ("Adams House"); value: (None, upper-cased name to look for).
harvard_ugrad_houses = {
    f"{house} House": (None, f"{house} House".upper()) for house in ugrad_house_names
}
harvard_ugrad_idx = find_housing_idxs(voters, harvard_ugrad_houses)

In [None]:
# Bar-chart grid for the Harvard undergraduate houses, one panel per year, newest first.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("Harvard Undergrad house turnout")

# NOTE(review): university_housing_bar_chart is not defined in the visible cells.
# `year` and `bar_width` stay notebook-level in case that helper reads them.
year = 2022
bar_width = 0.75
for offset, ax in enumerate(axs.flat, start=1):
    year = years[-offset]
    ax.set_title(str(year))
    university_housing_bar_chart(ax, year, harvard_ugrad_idx["all"], voters)
    ax.tick_params(axis="x", labelrotation=90)
# Shorten the tick labels on the last panel to their first word.
ax.set_xticklabels([tick.get_text().split()[0] for tick in ax.get_xticklabels()])
ax.legend()
plt.tight_layout()

# Turnout table with housing name as the outer index level and year as the inner one.
df = (
    turnout_by_year_key(voters, "univ_housing_name")
    .groupby(["univ_housing_name", "year"])
    .mean()
)

# Registered / voted / turnout over the years, one line per house.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
panel_specs = (
    ("registered", "Registered"),
    ("voted", "Voted"),
    ("turnout", "Turnout %"),
)
for dorm in harvard_ugrad_houses:
    short_name = dorm.split()[0]
    dorm_rows = df.loc[dorm]
    line_labels = (short_name, short_name, dorm)
    for panel, (column, title), label in zip(axs, panel_specs, line_labels):
        panel.plot(dorm_rows[column], "o--", label=label)
        panel.set_title(title)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

### 1st year

In [None]:
# Harvard first-year halls, matched by name only (street number None).
# Names must not carry stray whitespace: the lookup string is built as
# "<NAME> HL", so a trailing space would produce a double space and could
# never match.
first_year_hall_names = [
    "Greenough",
    "Hurlbut",
    "Pennypacker",
    "Wigglesworth",
    "Grays",
    "Matthews",
    "Weld",
    "Apley",
    "Hollis",
    "Holworthy",
    "Lionel",
    # NOTE(review): this becomes "MASS HALL HL"; confirm how the voter file
    # records this hall.
    "Mass Hall",
    "Mower",
    "Stoughton",
    "Straus",
    "Canaday",
    "Thayer",
]

# Key: display name ("Weld HL"); value: (None, upper-cased name to look for).
harvard_1st_year = {
    f"{hall} HL": (None, f"{hall} HL".upper()) for hall in first_year_hall_names
}
harvard_1st_idx = find_housing_idxs(voters, harvard_1st_year)

In [None]:
# Bar-chart grid for the Harvard first-year halls, one panel per year, newest first.
# This cell uses the first-year masks and names, not the undergrad-house ones.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)
plt.suptitle("Harvard first-year dorm turnout")

# NOTE(review): university_housing_bar_chart is not defined in the visible cells.
# `year` and `bar_width` stay notebook-level in case that helper reads them.
year = 2022
bar_width = 0.75
for i, ax in enumerate(axs.reshape(-1)):
    year = years[-i - 1]
    ax.set_title(f"{year}")
    university_housing_bar_chart(ax, year, harvard_1st_idx["all"], voters)
    ax.tick_params(axis="x", labelrotation=90)
# Shorten the tick labels on the last panel to their first word.
ax.set_xticklabels([l.get_text().split()[0] for l in ax.get_xticklabels()])
ax.legend()
plt.tight_layout()

df = turnout_by_year_key(voters, "univ_housing_name")

# groupby puts the housing name on the outer index level and year on the inner one
df = df.groupby(["univ_housing_name", "year"]).mean()

# Halls that matched no voter rows are absent from the table; skip them
# instead of failing on the lookup.
halls_with_data = set(df.index.get_level_values("univ_housing_name"))

fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
axs[0].set_title("Registered")
axs[1].set_title("Voted")
axs[2].set_title("Turnout %")
for dorm in harvard_1st_year.keys():
    if dorm not in halls_with_data:
        continue
    axs[0].plot(df.loc[dorm]["registered"], "o--", label=dorm.split()[0])
    axs[1].plot(df.loc[dorm]["voted"], "o--", label=dorm.split()[0])
    axs[2].plot(df.loc[dorm]["turnout"], "o--", label=dorm)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

## MIT


### Undergrad

These dorms are recorded with real addresses rather than house names

In [None]:
# MIT undergraduate dorms: name -> (street number(s), street name).
# A tuple of numbers means any of those numbers on that street counts.
mit_dorms = {
    "Baker House": (362, "MEMORIAL DR"),
    "Burton Conner": (410, "MEMORIAL DR"),
    "East Campus": (3, "AMES ST"),
    "MacGregor House": (450, "MEMORIAL DR"),
    "Maseeh Hall": (305, "MEMORIAL DR"),
    "McCormick Hall": (320, "MEMORIAL DR"),
    "New House": (tuple(range(471, 476 + 1)), "MEMORIAL DR"),
    "Next House": (500, "MEMORIAL DR"),
    "New Vassar": (189, "VASSAR ST"),
    "Random Hall": (290, "MASSACHUSETTS AVE"),
    "Simmons Hall": (tuple(range(229, 243 + 1)), "VASSAR ST"),
}

# Also writes each dorm's name into voters["univ_housing_name"] for matching rows.
mit_ugrad_idx = find_housing_idxs(voters, mit_dorms)

In [None]:
# Bar-chart grid for the MIT undergraduate dorms, one panel per year, newest first.
fig, axs = plt.subplots(3, 3, figsize=(16, 6), sharex=True, sharey=True)

# NOTE(review): university_housing_bar_chart is not defined in the visible cells.
# `year` and `bar_width` stay notebook-level in case that helper reads them.
year = 2022
plt.suptitle("MIT Undergrad Dorm Turnout over the years")
bar_width = 0.75
for offset, ax in enumerate(axs.flat, start=1):
    year = years[-offset]
    university_housing_bar_chart(ax, year, mit_ugrad_idx["all"], voters)
    ax.set_title(str(year))
    ax.tick_params(axis="x", labelrotation=90)
ax.legend()
plt.tight_layout()

# Turnout table with housing name as the outer index level and year as the inner one.
df = (
    turnout_by_year_key(voters, "univ_housing_name")
    .groupby(["univ_housing_name", "year"])
    .mean()
)

# Registered / voted / turnout over the years, one line per dorm.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
panel_specs = (
    ("registered", "Registered"),
    ("voted", "Voted"),
    ("turnout", "Turnout %"),
)
for dorm in mit_dorms:
    dorm_rows = df.loc[dorm]
    # Only the first panel uses the shortened (first-word) label.
    line_labels = (dorm.split()[0], dorm, dorm)
    for panel, (column, title), label in zip(axs, panel_specs, line_labels):
        panel.plot(dorm_rows[column], "o--", label=label)
        panel.set_title(title)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

### MIT Grad


Unlike Harvard, MIT has a convenient, easy-to-find table of its graduate residences:


[https://studentlife.mit.edu/grad-residences](https://studentlife.mit.edu/grad-residences)


In [None]:
# MIT graduate residences: name -> (street number, street name).
mit_grad_housing = {
    "70 Amherst": (70, "AMHERST ST"),
    "Ashdown": (235, "ALBANY ST"),
    "Edgerton": (143, "ALBANY ST"),
    "Grad Tower": (45, "HAYWARD ST"),
    "Sidney Pacific": (70, "PACIFIC ST"),
    "Tang Hall": (550, "MEMORIAL DR"),
    "The Warehouse": (224, "ALBANY ST"),
    # NOTE(review): spelled "MEMORIAL DRIVE" here but "MEMORIAL DR" everywhere else
    # in this notebook — confirm which form the voter file uses.
    "Westgate": (540, "MEMORIAL DRIVE"),
}
# Also writes each residence's name into voters["univ_housing_name"] for matching rows.
mit_grad_idx = find_housing_idxs(voters, mit_grad_housing)

In [None]:
# Bar-chart grid for MIT graduate housing: 4x3 panels, one per year, newest first.
fig, axs = plt.subplots(4, 3, figsize=(16, 6), sharex=True, sharey=True)

# NOTE(review): university_housing_bar_chart is not defined in the visible cells.
# `year` and `bar_width` stay notebook-level in case that helper reads them.
year = 2022
plt.suptitle("MIT Grad Housing Turnout")
bar_width = 0.75
for offset, ax in enumerate(axs.flat, start=1):
    year = years[-offset]
    university_housing_bar_chart(ax, year, mit_grad_idx["all"], voters)
    ax.set_title(str(year))
    ax.tick_params(axis="x", labelrotation=90)
ax.legend()
plt.tight_layout()

# Turnout table with housing name as the outer index level and year as the inner one.
df = (
    turnout_by_year_key(voters, "univ_housing_name")
    .groupby(["univ_housing_name", "year"])
    .mean()
)

# Registered / voted / turnout over the years, one line per residence.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
panel_specs = (
    ("registered", "Registered"),
    ("voted", "Voted"),
    ("turnout", "Turnout %"),
)
for dorm in mit_grad_housing:
    dorm_rows = df.loc[dorm]
    # Only the first panel uses the shortened (first-word) label.
    line_labels = (dorm.split()[0], dorm, dorm)
    for panel, (column, title), label in zip(axs, panel_specs, line_labels):
        panel.plot(dorm_rows[column], "o--", label=label)
        panel.set_title(title)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

## Lesley

todo

[https://lesley.edu/students/housing/residence-halls](https://lesley.edu/students/housing/residence-halls)


cool interactive map here: [https://map.concept3d.com/?id=31#!ce/238?m/3276?s/](https://map.concept3d.com/?id=31#!ce/238?m/3276?s/)

In [None]:
# Lesley University residence halls: name -> (street number(s), street name).
lesley_housing = {
    "Doble": (30, "MELLEN ST"),
    "Compass House": (14, "WENDELL ST"),
    "Everett House": (28, "WENDELL ST"),
    "Jenckes House": (31, "MELLEN ST"),
    "Kidder House": ((2, 4), "SAINT JOHNS RD"),
    # Prefixed so it does not share a name with Harvard's "Kirkland House".
    "[Lesley] Kirkland House": (61, "OXFORD ST"),
    "Kris House": (68, "OXFORD ST"),
    "Lawrence Hall": (99, "BRATTLE ST"),
    "MacKenzie Hall": (36, "MELLEN ST"),
    "Malloch Hall": (38, "MELLEN ST"),
    "Mellen House": (24, "MELLEN ST"),
    "Rousmaniere House": (6, "SAINT JOHNS RD"),
    "Wendell House": (63, "OXFORD ST"),
    "White Hall": (33, "EVERETT ST"),
    "Wilbur House": (78, "OXFORD ST"),
    "Wilson House": ((16, 18), "WENDELL ST"),
    "Winthrop Hall": ([1, 3, 5, 7], "SAINT JOHNS RD"),
    "Wolfard Hall": (34, "MELLEN ST"),
}
lesley_housing_idx = find_housing_idxs(voters, lesley_housing)

# Bar-chart grid: 4x3 panels, one per year, newest first.
fig, axs = plt.subplots(4, 3, figsize=(16, 6), sharex=True, sharey=True)

# NOTE(review): university_housing_bar_chart is not defined in the visible cells.
# `year` and `bar_width` stay notebook-level in case that helper reads them.
year = 2022
plt.suptitle("Lesley Housing Turnout")
bar_width = 0.75
for offset, ax in enumerate(axs.flat, start=1):
    year = years[-offset]
    university_housing_bar_chart(ax, year, lesley_housing_idx["all"], voters)
    ax.set_title(str(year))
    ax.tick_params(axis="x", labelrotation=90)
ax.legend()
plt.tight_layout()

# Turnout table with housing name as the outer index level and year as the inner one.
df = (
    turnout_by_year_key(voters, "univ_housing_name")
    .groupby(["univ_housing_name", "year"])
    .mean()
)

# Registered / voted / turnout over the years, one line per hall.
fig, axs = plt.subplots(1, 3, figsize=(16, 6), constrained_layout=True)
panel_specs = (
    ("registered", "Registered"),
    ("voted", "Voted"),
    ("turnout", "Turnout %"),
)
for dorm in lesley_housing:
    dorm_rows = df.loc[dorm]
    # Only the first panel uses the shortened (first-word) label.
    line_labels = (dorm.split()[0], dorm, dorm)
    for panel, (column, title), label in zip(axs, panel_specs, line_labels):
        panel.plot(dorm_rows[column], "o--", label=label)
        panel.set_title(title)
plt.ylabel("Turnout %")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

In [None]:
# Sanity check: how many 2012 voter rows were tagged with each Lesley hall name.
np.unique(
    voters[lesley_housing_idx["all"]].loc[2012]["univ_housing_name"], return_counts=True
)

In [None]:
# Spot check of a single (year, housing name) entry of the turnout table.
# Note: this rebinds `df` to the (year, name)-ordered table.
df = turnout_by_year_key(voters, "univ_housing_name")
df.loc[2011, "shaw hl"]

In [None]:
# Housing name assigned to each 2012 voter row that matched a Lesley address.
voters[lesley_housing_idx["all"]].loc[2012]["univ_housing_name"]