In [234]:
import pandas as pd

In [235]:
from datetime import datetime
from math import floor


def get_congress_from_term_dates(term_dict):
    """
    Return the number of the Congress that was in session on a term's start date.

    Only the term's start date is consulted. A term may begin part-way through
    a Congress (special elections, appointments, etc.), so the start date is
    mapped to the Congress whose two-year window contains it.

    Congress boundaries used:
        * From the 74th Congress on, a Congress begins on January 3 of an odd
          year (the 74th began on 1935-01-03).
        * Earlier Congresses began on March 4 of an odd year, starting with
          the 1st Congress on 1789-03-04.

    The number is derived from calendar years instead of a day count divided
    by 365.25, so the result does not drift across non-leap century years
    (1900, 2100).

    Args:
        term_dict (dict): A dictionary containing term dates information.
            Expected structure:
            {
                'start': str,  # 'YYYY-MM-DD'
            }
            Any other keys are ignored.

    Returns:
        int: The Congress number.

    Raises:
        KeyError: If 'start' is missing from term_dict.
        ValueError: If 'start' is not in 'YYYY-MM-DD' format.
    """
    start_date = datetime.strptime(term_dict["start"], "%Y-%m-%d")
    month_day = (start_date.month, start_date.day)

    if (start_date.year, *month_day) >= (1935, 1, 3):
        # Modern era: Congresses turn over on January 3 of odd years.
        anchor_congress, anchor_year, turnover = 74, 1935, (1, 3)
    else:
        # Before the 74th Congress: turnover was on March 4 of odd years.
        anchor_congress, anchor_year, turnover = 1, 1789, (3, 4)

    # A date that falls before the turnover day still belongs to the Congress
    # that was seated in the previous calendar year's cycle.
    congress_year = start_date.year
    if month_day < turnover:
        congress_year -= 1

    return anchor_congress + (congress_year - anchor_year) // 2

In [236]:
# Load the processed bios. With the default read orientation the JSON's
# top-level keys land on the columns, so transpose to get one row per key.
processed = pd.read_json("../../data/processed/processed-bios.json").transpose()

In [237]:
# Fetch the current and the historical legislator rosters.
currentLegs = pd.read_json(
    "https://theunitedstates.io/congress-legislators/legislators-current.json"
)
historicalLegs = pd.read_json(
    "https://theunitedstates.io/congress-legislators/legislators-historical.json"
)

# For every legislator, turn the list of terms into a {congress number: seat}
# dict. The seat is the term's "district" when that key is present, otherwise
# the term's "state". When several terms map to the same congress number the
# last one in the list wins.
for df in (currentLegs, historicalLegs):
    df.loc[:, "district"] = df["terms"].apply(
        lambda term_list: {
            get_congress_from_term_dates(entry): (
                entry["state"] if "district" not in entry else entry["district"]
            )
            for entry in term_list
        }
    )

In [None]:
# Stack both rosters into one frame (the original row labels are kept).
df = pd.concat([currentLegs, historicalLegs])

temp = df["district"]

# Normalise every legislator's {congress: seat} dict:
#   * drop congresses numbered 99 or lower,
#   * make sure congresses 100 through 118 are all present (None when the
#     legislator has no term in that congress),
#   * order the keys ascending.
# Congress numbers above 118 that already exist in a dict are kept as they are.
df["district"] = df["district"].apply(
    lambda by_congress: dict(
        sorted(
            {
                **{congress: None for congress in range(100, 119)},
                **{
                    congress: seat
                    for congress, seat in by_congress.items()
                    if congress > 99
                },
            }.items()
        )
    )
)

In [None]:
# Remove every character that is not an ASCII letter, a space or a hyphen
# from the name columns (accented letters and apostrophes are removed too).
for col in ("familyName", "givenName"):
    processed[col] = processed[col].str.replace(
        pat=r"[^a-zA-Z -]", repl="", regex=True
    )

In [242]:
# Pull the "bioguide" entry out of each row's "id" dict into its own column.
df["usCongressBioId"] = df["id"].map(lambda ids: ids["bioguide"])

In [245]:
import utils

# Keep a legislator only when every term's state is either empty or one of
# the values of utils.state_dict; anyone with a term elsewhere is dropped.
df = df[
    df["terms"].apply(
        lambda term_list: all(
            not entry["state"] or entry["state"] in utils.state_dict.values()
            for entry in term_list
        )
    )
]

In [255]:
# Attach each bio's {congress: seat} dict from the legislator frame.
# The join key is named explicitly: without `on`, pandas joins on every
# column the two frames happen to share, which silently changes (or breaks,
# since "district" holds unhashable dicts) if `processed` ever gains a
# "district" column. Bios with no matching legislator keep a missing value.
left = pd.merge(
    processed,
    df[["usCongressBioId", "district"]],
    how="left",
    on="usCongressBioId",
)

In [260]:
# Keep only congresses numbered 100 and above in each bio's district dict.
# `left` comes from a left merge, so bios without a matching legislator hold
# a missing value (not a dict) here; those are passed through untouched
# instead of raising AttributeError on `.keys()`.
left["district"] = left["district"].apply(
    lambda by_congress: (
        {congress: seat for congress, seat in by_congress.items() if congress > 99}
        if isinstance(by_congress, dict)
        else by_congress
    )
)

In [91]:
# Write `processed` back to the file it was loaded from, keyed by row label.
# NOTE(review): this persists `processed`, not the merged `left` frame —
# confirm that is intended.
processed.to_json(
    path_or_buf="../../data/processed/processed-bios.json",
    orient="index",
)