In [None]:
import gc
import io
import logging
import os
import sys
import ast
import re

import pandas as pd
from legcop import LegiScan

sys.path.append(os.getcwd())
from leg_eff_secrets import LEGISCAN_API_KEY

# Notebook-wide logging: timestamped records at INFO level and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

## Read in files

In [None]:
def load_dataset(state, year):
    """
    Load the LegiScan bill data for one (state, year), using an on-disk cache.

    If ``../../data/raw/{state}-{year}.json`` exists it is read and returned.
    Otherwise (or when the cached file cannot be read) the dataset is
    downloaded through the LegiScan API, every ``/bill/`` file in the archive
    is concatenated into one DataFrame, and that frame is written to the cache
    path before being returned.

    Only takes a single (state, year) pair at a time - loop over it if you
    want more than one.

    Parameters
    ----------
    state
        State code passed to LegiScan and used in the cache file name
        (e.g. ``"NY"``).
    year
        Year passed to LegiScan and used in the cache file name.

    Returns
    -------
    pandas.DataFrame
        The bills, with a fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If the download response's status is not ``"OK"``.
    ValueError
        If the downloaded archive contains no ``/bill/`` files.
    """
    cache_path = f"../../data/raw/{state}-{year}.json"

    if os.path.exists(cache_path):
        logger.info("Dataset already downloaded.")
        try:
            bills_df = pd.read_json(cache_path).reset_index(drop=True)
            logger.info("Dataset loaded into memory.")
            return bills_df
        except FileNotFoundError:
            # The file disappeared between the exists() check and the read;
            # fall through and download it again.
            logger.info("File not found.")
        except Exception:
            # Best effort: record what went wrong (with traceback), then fall
            # through to a fresh download instead of aborting.
            logger.exception(
                "Looks like your files are downloaded, but there's something wrong. "
                + "If you aren't sure what to do, deleting the files and running"
                + " the code again is usually a safe idea."
            )

    logger.info(f"Dataset for {state}-{year} is either missing or incomplete.")

    legis = LegiScan(LEGISCAN_API_KEY)

    # The first entry returned for this state/year identifies what to download.
    first_entry = legis.get_dataset_list(state=state, year=year)[0]

    logger.info(
        "Starting dataset download. This can take my laptop up to around 5 minutes, especially for large datasets."
    )
    dataset = legis.get_dataset(
        access_key=first_entry["access_key"],
        session_id=first_entry["session_id"],
    )
    assert (
        dataset["status"] == "OK"
    ), f"LegiScan dataset download returned status {dataset['status']!r}"

    logger.info("Download complete. Starting pre-processing.")

    readable_dataset = legis.recode_zipfile(dataset)

    # One transposed frame per bill file found in the archive.
    list_of_bill_dfs = [
        pd.read_json(io.StringIO(readable_dataset.read(file).decode("UTF-8"))).T
        for file in readable_dataset.namelist()
        if "/bill/" in file
    ]

    # Release the raw download before building the combined frame.
    del dataset, readable_dataset

    if not list_of_bill_dfs:
        raise ValueError(
            f"Downloaded dataset for {state}-{year} contains no '/bill/' files."
        )

    bills_df = pd.concat(list_of_bill_dfs).reset_index(drop=True)

    logger.info("Pre-processing complete. Saving to disk.")

    # Make sure the cache directory exists so the save cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    bills_df.to_json(cache_path, index=False)

    return bills_df


# Load (or download and cache) the NY 2021 bills.
ny_2021_bills = load_dataset(state="NY", year=2021)

In [None]:
# Re-read the cached NY 2021 bills straight from disk, with a fresh 0..n-1 index.
ny_2021_bills = pd.read_json("../../data/raw/NY-2021.json").reset_index(drop=True)

In [None]:
# get rid of resolutions: keep only rows whose bill number starts with "A" or "S"
ny_2021_bills = ny_2021_bills.loc[
    lambda df: (
        df["bill_number"].str.startswith("A")
        | df["bill_number"].str.startswith("S")
    )
].reset_index(drop=True)


# for each bill...
# make a SAME_AS column
def get_same_as(sasts_column: pd.Series) -> pd.Series:
    """
    Return, for every row, the ``"sast_bill_number"`` of the first entry in
    that row's list, or ``None`` when the list is empty.
    """

    def first_sast_number(sasts):
        if len(sasts) == 0:
            return None
        return sasts[0]["sast_bill_number"]

    return sasts_column.apply(first_sast_number)


# First "same as" bill number for each row (None when the row has none).
ny_2021_bills["same_as"] = ny_2021_bills["sasts"].pipe(get_same_as)

# make a LAW column (bool) that's True if bill enacted OR True if the same_as is enacted
#  bill is enacted


def expand_progress(progress_list):
    """
    Translate a bill's progress entries into lower-cased status labels.

    Each entry's ``"event"`` code is looked up in the table below and the
    label is lower-cased; labels come back in the same order as the input.
    An empty input gives an empty list.
    """
    event_labels = {
        0: "N/A Pre-filed or pre-introduction",
        1: "Introduced",
        2: "Engrossed",
        3: "Enrolled",
        4: "Passed",
        5: "Vetoed",
        6: "Failed",  # Limited support based on state
        7: "Override",
        8: "Chaptered",
        9: "Refer",
        10: "Report Pass",
        11: "Report DNP",
        12: "Draft",
    }

    if not len(progress_list):
        return []

    return [event_labels[step["event"]].lower() for step in progress_list]


def expand_history(history_list):
    """Return the lower-cased ``"action"`` text of every history entry, in order."""
    if not len(history_list):
        return []

    return [entry["action"].lower() for entry in history_list]


def expand_votes(vote_list):
    """Return the lower-cased ``"desc"`` text of every vote entry, in order."""
    if not len(vote_list):
        return []

    return [entry["desc"].lower() for entry in vote_list]


# Flatten the nested progress / history / votes records into lists of
# lower-cased strings.
ny_2021_bills["exp_progress"] = ny_2021_bills["progress"].apply(expand_progress)
ny_2021_bills["exp_history"] = ny_2021_bills["history"].apply(expand_history)
ny_2021_bills["exp_votes"] = ny_2021_bills["votes"].apply(expand_votes)

# get BILL
# expand_progress lower-cases every label, so the lookup has to be lower-case
# as well (a capitalised "Introduced" can never be found).
ny_2021_bills["BILL"] = ny_2021_bills["exp_progress"].apply(
    lambda x: "introduced" in x if x is not None else False
)
# get AIC (check history?)

# get ABC (check history?)

# get PASS (check bill ID + history)
# The first character of the bill number ("A" or "S") is mapped to the
# chamber name; any other first character raises KeyError.
ny_2021_bills["chamber_of_origin"] = (
    ny_2021_bills["bill_number"]
    .str[0]
    .apply(lambda x: {"A": "assembly", "S": "senate"}[x])
)

# PASS is True when some history entry is exactly "passed <chamber_of_origin>".
ny_2021_bills["PASS"] = ny_2021_bills.apply(
    lambda row: f"passed {row['chamber_of_origin']}" in row["exp_history"],
    axis=1,
)

# get LAW
# LAW is True when any history entry contains the substring "signed".
ny_2021_bills["LAW"] = ny_2021_bills["exp_history"].apply(
    lambda history_list: any("signed" in x for x in history_list)
)

In [None]:
def standardize_bill_number_length(bill_number: str) -> str:
    """
    Normalise a bill number to an upper-cased first character followed by a
    zero-padded remainder.

    The first character is split off and upper-cased. If the remainder
    contains any ASCII letter, its final character is dropped. The remainder
    is then left-padded with zeros to at least five characters.
    """
    prefix, digits = bill_number[0], bill_number[1:]

    if re.search(r"[a-zA-Z]", digits) is not None:
        digits = digits[:-1]

    return prefix.upper() + digits.rjust(5, "0")

In [None]:
# Show the expanded history of one randomly sampled bill that has no vote
# description containing "committee vote".
ny_2021_bills.loc[
    ny_2021_bills["exp_votes"].apply(
        lambda descs: all("committee vote" not in desc for desc in descs)
    )
].sample()["exp_history"].iloc[0]

In [None]:
# Show the expanded votes of one randomly sampled bill whose `votes` value is truthy.
ny_2021_bills.loc[
    ny_2021_bills["votes"].astype(bool), "exp_votes"
].sample().iloc[0]

In [None]:
# Standardised number of the bill that substituted each bill, or None.
# The token is read from the most recent history entry that actually contains
# "substituted by", rather than from whatever entry happens to be last.
ny_2021_bills["substituted_by"] = ny_2021_bills["exp_history"].apply(
    lambda history_list: next(
        (
            standardize_bill_number_length(action.split()[-1])
            for action in reversed(history_list)
            if "substituted by" in action
        ),
        None,
    )
)

enacted_nos = ny_2021_bills[ny_2021_bills["LAW"]]["bill_number"]
# NOTE(review): this overwrites ny_2021_bills with ONLY the rows that are not
# enacted themselves but whose substitute is enacted; every other row is gone
# for all later cells. Confirm that dropping the rest is intended.
ny_2021_bills = ny_2021_bills[
    ~ny_2021_bills["bill_number"].isin(enacted_nos)
    & ny_2021_bills["substituted_by"].isin(enacted_nos)
]
# phew

In [None]:
# Expanded history of the first row whose bill number is "S07237".
ny_2021_bills.loc[
    ny_2021_bills["bill_number"] == "S07237", "exp_history"
].iloc[0]

In [None]:
# Bill numbers of every row flagged as PASS.
passed_bill_nos = ny_2021_bills.loc[ny_2021_bills["PASS"], "bill_number"]

In [None]:
# Print the same-as number of every non-passed bill whose same-as bill passed.
# `value in series` tests the Series' index labels, not its values, so the
# membership test is done against a set of the passed bill numbers.
passed_bill_no_set = set(passed_bill_nos)
for same_as in ny_2021_bills.loc[~ny_2021_bills["PASS"], "same_as"]:
    if same_as in passed_bill_no_set:
        print(same_as)

# NOTE(review): re-run and confirm nothing is printed before relying on the
# original conclusion that this doesn't look like it'll be a problem.

In [None]:
# Per bill, count the sponsors whose sponsor_type_id is 1.
# NOTE(review): ny_2023_bills is never assigned in the visible notebook - confirm where it is loaded.
# ny_2023_bills['primary_sponsor'] =
ny_2023_bills["sponsors"].apply(
    lambda sponsors: len(
        [sponsor for sponsor in sponsors if sponsor["sponsor_type_id"] == 1]
    )
)

# TODO: use sponsors

In [None]:
# Sponsors of the row labelled 0.
# NOTE(review): ny_2023_bills is never assigned in the visible notebook - confirm where it is loaded.
ny_2023_bills.loc[0]["sponsors"]

In [None]:
# The raw history column.
# NOTE(review): ny_2023_bills is never assigned in the visible notebook - confirm where it is loaded.
ny_2023_bills["history"]

In [None]:
# Sponsors of the first row whose PASS flag equals True.
# NOTE(review): ny_2023_bills (and its PASS column) is never created in the visible notebook - confirm.
ny_2023_bills.loc[ny_2023_bills["PASS"] == True, "sponsors"].iloc[0]

In [None]:
# Progress of the row labelled 10593.
# NOTE(review): ny_2023_bills is never assigned in the visible notebook - confirm where it is loaded.
ny_2023_bills.loc[10593]["progress"]

In [None]:
# Progress of a hand-picked set of row labels.
# NOTE(review): ny_2023_bills is never assigned in the visible notebook - confirm where it is loaded.
ny_2023_bills.loc[
    [10593, 6591, 7458, 3557, 3914, 7868, 1148, 7295, 2352, 2062],
    "progress",
]

In [None]:
# Sort bills by the number of progress entries (a None value counts as 1).
# sort_values hands the whole column to `key`, so the length has to be taken
# per element rather than on the column itself.
# NOTE(review): ny_2023_bills is never assigned in the visible notebook - confirm where it is loaded.
ny_2023_bills.sort_values(
    by="progress",
    key=lambda col: col.apply(lambda x: len(x) if x is not None else 1),
)

In [None]:
# Progress of the first row whose bill number is "A07628".
# NOTE(review): ny_2023_bills is never assigned in the visible notebook - confirm where it is loaded.
ny_2023_bills.loc[
    ny_2023_bills["bill_number"] == "A07628", "progress"
].iloc[0]

In [None]:
# Progress of the first row whose bill number is "S06564".
# NOTE(review): ny_2023_bills is never assigned in the visible notebook - confirm where it is loaded.
ny_2023_bills.loc[
    ny_2023_bills["bill_number"] == "S06564", "progress"
].iloc[0]

In [None]:
# Row with the second-longest history list.
# NOTE(review): bills_df is only a local inside load_dataset in the visible notebook - confirm where the global comes from.
bills_df.sort_values(
    by="history",
    ascending=False,
    key=lambda col: col.map(len),
).iloc[1]

In [None]:
# All bills, longest history list first.
# NOTE(review): bills_df is only a local inside load_dataset in the visible notebook - confirm where the global comes from.
bills_df.sort_values(
    by="history",
    ascending=False,
    key=lambda col: col.map(len),
)  # .iloc[1]

In [None]:
# The raw sponsors column.
# NOTE(review): bills_df is only a local inside load_dataset in the visible notebook - confirm where the global comes from.
bills_df.loc[:, "sponsors"]  # .apply(lambda x: list(x)[0])

In [None]:
def find_same_as(df_row):
    """
    Placeholder for a per-row "same as" lookup; not implemented yet.

    The definition originally had no body, which is a syntax error. It now
    raises so the cell can be defined and any accidental call fails loudly.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("find_same_as has not been implemented yet")

In [None]:
# Row(s) whose bill number equals the standardised form of "a7560b".
# The previous version used each lower-cased bill number as a regex pattern
# against the upper-cased standardised number, which can never match; compare
# the two directly, both in upper case.
# NOTE(review): bills_df is only a local inside load_dataset in the visible notebook - confirm where the global comes from.
bills_df.loc[
    bills_df["bill_number"].str.upper() == standardize_bill_number_length("a7560b")
]