In [1]:
import io
import logging
import os
import re
import sys

import pandas as pd
from legcop import LegiScan

sys.path.append(os.getcwd())
from leg_eff_secrets import LEGISCAN_API_KEY

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

## Read in files

In [852]:
def load_dataset(state, year):
    """
    checks if datasets are in memory. loads them in if they are, downloads them if they aren't.
    only takes a single (state,year) tuple at a time - loop over it if you want more than one.
    returns a tuple of dfs: (bills, people, votes)
    """

    if os.path.exists(f"../../data/raw/{state}-{year}.json"):
        logger.info("Dataset already downloaded.")
        try:
            bills_df = pd.read_json(
                f"../../data/raw/{state}-{year}.json",
            ).reset_index(drop=True)
            logger.info("Dataset loaded into memory.")
            return bills_df
        except FileNotFoundError:
            logger.info("File not found.")
            pass
        except Exception as e:
            logger.error(
                "Looks like your files are downloaded, but there's something wrong. "
                + "If you aren't sure what to do, deleting the files and running"
                + " the code again is usually a safe idea."
                + f"Error: {e}"
            )

    logger.info(f"Dataset for {state}-{year} is either missing or incomplete.")

    legis = LegiScan(LEGISCAN_API_KEY)

    dataset_list = legis.get_dataset_list(state=state, year=year)

    ACCESS_KEY = dataset_list[0]["access_key"]
    SESSION_ID = dataset_list[0]["session_id"]

    del dataset_list

    logger.info(
        "Starting dataset download. This can take my laptop up to around 5 minutes, especially for large datasets."
    )
    dataset = legis.get_dataset(access_key=ACCESS_KEY, session_id=SESSION_ID)
    del ACCESS_KEY, SESSION_ID
    assert dataset["status"] == "OK"

    logger.info("Download complete. Starting pre-processing.")

    readable_dataset = legis.recode_zipfile(dataset)
    namelist = readable_dataset.namelist()

    list_of_bill_dfs = []

    for file in namelist:
        if "/bill/" in file:
            content = readable_dataset.read(file)
            list_of_bill_dfs.append(
                pd.read_json(io.StringIO(content.decode("UTF-8"))).T
            )

    del content, file, dataset, readable_dataset, namelist

    bills_df = pd.concat(list_of_bill_dfs)

    logger.info("Pre-processing complete. Saving to disk.")

    bills_df.reset_index(drop=True).to_json(
        f"../../data/raw/{state}-{year}.json", index=False
    )

    return bills_df.reset_index(drop=True)


ny_2021_bills = load_dataset("NY", 2021)

2024-08-31 09:34:58,923 - INFO - Dataset already downloaded.
2024-08-31 09:35:00,244 - INFO - Dataset loaded into memory.


In [853]:
# read in NY Senate API for home rule, program bill, actual sponsor.

# see read_senate_api.py

senate_NY_2021_bills = pd.read_json(
    os.path.join("..", "..", "data", "raw", "ny_senate_bills.json"),
)

In [855]:
# get rid of resolutions
ny_2021_bills = ny_2021_bills[
    ny_2021_bills["bill_number"].str.startswith("A")
    | ny_2021_bills["bill_number"].str.startswith("S")
].reset_index(drop=True)

# uncomment this once you've read in the ny senate data from 2021-22:
# ny_2021_bills = ny_2021_bills.merge(
#     ny_senate_data[columns], # home rule, program bill, actual sponsor.
#     left_on="bill_number",
#     right_on="whatever the ny senate one calls its bill number",
#     how='left'
# )


# make a SAME_AS column
def get_same_as(sasts_column: pd.Series) -> pd.Series:
    return sasts_column.apply(
        # check sast `type_id` == 1 for same_as.
        lambda x: x[0]["sast_bill_number"] if len(x) > 0 else None
    )


ny_2021_bills["same_as"] = get_same_as(ny_2021_bills["sasts"])


def expand_progress(progress_list):
    progress_dict = {
        0: "N/A Pre-filed or pre-introduction",
        1: "Introduced",
        2: "Engrossed",
        3: "Enrolled",
        4: "Passed",
        5: "Vetoed",
        6: "Failed",  # Limited support based on state
        7: "Override",
        8: "Chaptered",  # what bills are chaptered?
        9: "Refer",
        10: "Report Pass",
        11: "Report DNP",
        12: "Draft",
    }

    if len(progress_list) == 0:
        return []

    templist = []

    for update in progress_list:
        templist.append(progress_dict[update["event"]].lower())
    return templist


def expand_history(history_list):
    if len(history_list) == 0:
        return []

    templist = []

    for update in history_list:
        templist.append(update["action"].lower())
    return templist


def expand_votes(vote_list):
    if len(vote_list) == 0:
        return []

    templist = []

    for update in vote_list:
        templist.append(update["desc"].lower())
    return templist


ny_2021_bills["exp_progress"] = ny_2021_bills["progress"].apply(expand_progress)
ny_2021_bills["exp_history"] = ny_2021_bills["history"].apply(expand_history)
ny_2021_bills["exp_votes"] = ny_2021_bills["votes"].apply(expand_votes)

# get BILL
ny_2021_bills["bill"] = ny_2021_bills["exp_progress"].apply(
    lambda x: ("Introduced" in x) if x is not None else False
)
# get AIC (check history?)
ny_2021_bills["aic"] = ny_2021_bills["exp_history"].apply(
    lambda historyList: (
        any(("committee" in event) for event in historyList)
        or any(("reading" in event) for event in historyList)
        or any(("third reading" in event) for event in historyList)
        or any(("report" in event) for event in historyList)
        or any(("amend and recommit" in event) for event in historyList)
        or any(("amend (t) and recommit" in event) for event in historyList)
        # which of the following count as actions in committee?
        # or any(("enacting clause stricken" in event) for event in historyList)
        # or any(("print number" in event) for event in historyList)
        # or any(("to attorney-general for opinion" in event) for event in historyList)
        # or any(("held for consideration" in event) for event in historyList)
    )
)

# get ABC (check history?)
# # this, as written, is just "did it make it out of committee". probably rewrite?
ny_2021_bills["out_of_committee"] = ny_2021_bills["exp_votes"].apply(
    lambda votelist: (
        any(bool(re.search("committee: favorable$", vote)) for vote in votelist)
        # so as not to count, e.g. 'assembly codes committee: favorable refer to committee rules'
    )
)

# get PASS (check bill ID + history)
ny_2021_bills["chamber_of_origin"] = (
    ny_2021_bills["bill_number"]
    .str[0]
    .apply(lambda x: {"A": "assembly", "S": "senate"}[x])
)

# actually do 'passed_senate' and 'passed_assembly' columns so senators can get
# credit for bills passing senate but not for bills passing assembly
ny_2021_bills["pass"] = ny_2021_bills.apply(
    lambda row: f"passed {row['chamber_of_origin']}" in row["exp_history"],
    axis=1,
)


def standardize_bill_number_length(bill_number: str) -> str:
    bill_letter = bill_number[0]
    bill_number = bill_number[1:]

    if bool(re.search(r"[a-zA-Z]", bill_number)):
        bill_number = bill_number[:-1]

    if len(bill_number) < 5:
        bill_number = "0" * (5 - len(bill_number)) + bill_number
    return bill_letter.upper() + bill_number


ny_2021_bills["substituted_by"] = (
    # check -- does this give the same answer as using RAST?
    ny_2021_bills["exp_history"]
    .apply(
        lambda history_list: (
            history_list[-1].split()[-1]
            if any("substituted by" in x for x in history_list)
            else None
        )
    )
    .apply(lambda x: standardize_bill_number_length(x) if x is not None else None)
)

enacted_nos = ny_2021_bills[ny_2021_bills["law"]]["bill_number"]

# get LAW
ny_2021_bills["law"] = ny_2021_bills["exp_history"].apply(
    lambda history_list: any("signed" in x for x in history_list)
) | ny_2021_bills["substituted_by"].isin(enacted_nos)

TOTAL_INTRODUCED = ny_2021_bills["bill"].sum()
TOTAL_AIC = ny_2021_bills["aic"].sum()
TOTAL_OUT_OF_COMMITTEE = ny_2021_bills["out_of_committee"].sum()
TOTAL_PASS = ny_2021_bills["pass"].sum()
TOTAL_LAW = ny_2021_bills["law"].sum()

In [825]:
[x["name"] for x in ny_2021_bills.iloc[0].sponsors]

['Kevin Cahill',
 'Nick Perry',
 'Steven Cymbrowitz',
 'Sandra Galef',
 'Michael Montesano',
 'Crystal Peoples-Stokes',
 'Fred Thiele']

In [832]:
[x["name"] for x in ny_2021_bills.iloc[0].sponsors if x["sponsor_type_id"] == 1]

['Kevin Cahill',
 'Steven Cymbrowitz',
 'Sandra Galef',
 'Michael Montesano',
 'Crystal Peoples-Stokes',
 'Fred Thiele']

In [809]:
# action out of committee

ny_2021_bills["votes"].apply(
    lambda row: [
        v["date"]
        for v in row["votes"]
        if bool(re.search("committee: favorable", v["desc"].lower()))
        & (row["chamber_of_origin"] in v["desc"].lower())
    ]
)

TypeError: list indices must be integers or slices, not str

In [783]:
# if the date on a vote is after it gets out of committee?
# TODO: try this with action/history inst of vote
ny_2021_bills.apply(
    lambda row: row["votes"] if row["out_of_committee"] else False, axis=1
)

0                                                    False
1                                                    False
2                                                    False
3                                                    False
4        [{'roll_call_id': 1208155, 'date': '2022-05-24...
                               ...                        
19629                                                False
19630                                                False
19631                                                False
19632                                                False
19633                                                False
Length: 19634, dtype: object

In [772]:
# ny_2021_bills['out_of_committee'] =
ny_2021_bills["votes"].apply(
    lambda row: (
        [
            row["vote"]["date"]
            for vote in row["votes"]
            if re.search("committee: favorable$", row["exp_votes"])
        ]
        # so as not to count, e.g. 'assembly codes committee: favorable refer to committee rules'
    ),
    axis=1,
)

# .apply(
#    lambda row: [x for x in [
#        vote["date"] if re.search("committee: favorable$", vote['desc']) else None
#        for vote in row["votes"]
#    ] if x],
#    axis=1,
# )  # ['exp_votes'].sample().iloc[0]

0        []
1        []
2        []
3        []
4        []
         ..
19629    []
19630    []
19631    []
19632    []
19633    []
Name: votes, Length: 19634, dtype: object

In [856]:
ny_2021_bills["substituted_by"] = (
    # check -- does this give the same answer as using RAST?
    ny_2021_bills["exp_history"]
    .apply(
        lambda history_list: (
            history_list[-1].split()[-1]
            if any("substituted by" in x for x in history_list)
            else None
        )
    )
    .apply(lambda x: standardize_bill_number_length(x) if x is not None else None)
)

enacted_nos = ny_2021_bills[ny_2021_bills["law"]]["bill_number"]
ny_2021_bills[
    ~ny_2021_bills["bill_number"].isin(enacted_nos)
    & ny_2021_bills["substituted_by"].isin(enacted_nos)
]
# phew

Unnamed: 0,bill_id,change_hash,session_id,session,url,state_link,completed,status,status_date,progress,...,exp_progress,exp_history,exp_votes,bill,aic,out_of_committee,chamber_of_origin,pass,law,substituted_by
65,1389584,3cafe1b41dca2fe09392f155c7267219,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/A00086/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2021-01-06,"[{'date': '2021-01-06', 'event': 1}, {'date': ...",...,"[introduced, refer, report pass, refer, report...","[referred to transportation, amend and recommi...",[assembly transportation committee: favorable ...,False,True,True,assembly,False,False,S05354
92,1390074,9b26147ef858f1143c1d184ede1dab4e,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/A00113/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2021-01-06,"[{'date': '2021-01-06', 'event': 1}, {'date': ...",...,"[introduced, refer, report pass]","[referred to judiciary, reported, advanced to ...",[assembly judiciary committee: favorable],False,True,True,assembly,False,False,S00290
107,1389862,b37728d0702872abd433343b0b88619a,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/A00128/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2021-01-06,"[{'date': '2021-01-06', 'event': 1}, {'date': ...",...,"[introduced, refer, report pass, refer, report...","[referred to alcoholism and drug abuse, report...",[assembly alcoholism and drug abuse committee:...,False,True,True,assembly,False,False,S06044
125,1390850,2bb617a8948177a6cbc8080e0f206780,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/A00146/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2021-01-06,"[{'date': '2021-01-06', 'event': 1}, {'date': ...",...,"[introduced, refer, report pass, refer, refer,...","[referred to health, reported referred to ways...",[assembly health committee: favorable refer to...,False,True,True,assembly,False,False,S01594
139,1390507,186b242fc3f1c75f8dc727f7c925c4d0,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/A00160/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2021-01-06,"[{'date': '2021-01-06', 'event': 1}, {'date': ...",...,"[introduced, refer, report pass, refer, report...","[referred to health, reported referred to ways...",[assembly health committee: favorable refer to...,False,True,True,assembly,False,False,S02122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19430,1622452,089b3840b02eb6bde3305d6ff80812c9,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/S09419/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2022-05-25,"[{'date': '2022-05-25', 'event': 1}, {'date': ...",...,"[introduced, refer]","[referred to environmental conservation, commi...",[],False,True,False,senate,False,False,A07710
19450,1623064,01fb5a6ca37f00932e45acf5a8dc4902,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/S09439/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2022-05-27,"[{'date': '2022-05-27', 'event': 1}, {'date': ...",...,"[introduced, refer]","[referred to local government, committee disch...",[],False,True,False,senate,False,False,A09326
19452,1623075,43af03448e27e1b57ea99ca196f4730f,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/S09441/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2022-05-27,"[{'date': '2022-05-27', 'event': 1}, {'date': ...",...,"[introduced, refer]","[referred to procurement and contracts, commit...",[],False,True,False,senate,False,False,A07919
19458,1623090,9783f742c571b008238db898d34aaf55,1813,"{'session_id': 1813, 'state_id': 32, 'year_sta...",https://legiscan.com/NY/bill/S09447/2021,https://www.nysenate.gov/legislation/bills/202...,1,1,2022-05-27,"[{'date': '2022-05-27', 'event': 1}, {'date': ...",...,"[introduced, refer]","[referred to codes, committee discharged and c...",[],False,True,False,senate,False,False,A07079


In [845]:
passed_bill_nos = [
    standardize_bill_number_length(bill["bill_number"])
    for bill in ny_2021_bills[ny_2021_bills["pass"]]
]

TypeError: string indices must be integers, not 'str'

In [841]:
ny_2021_bills["same_as"]

0        S05653
1        S04133
2          None
3          None
4        S03275
          ...  
19629      None
19630    A40001
19631    A40002
19632    A41001
19633    A41002
Name: same_as, Length: 19634, dtype: object

In [840]:
for _, row in ny_2021_bills[~ny_2021_bills["pass"]].iterrows():
    if row["same_as"] in passed_bill_nos:
        print(row["same_as"])

# glad this doesn't look like it'll be a problem

TypeError: 'int' object is not subscriptable

In [None]:
# ny_2023_bills['primary_sponsor'] =
ny_2023_bills["sponsors"].apply(
    lambda spons_dict: [x for x in spons_dict if x["sponsor_type_id"] == 1]
).apply(len)

# TODO: use sponsors

In [203]:
ny_2021_bills[ny_2021_bills["exp_progress"].apply(lambda x: "chaptered" in x)][
    "bill_number"
]
# TODO: cross-check with senate api: are these all chapter amendments?
# doesn't matter too much, since this would only be chapter amendments that pass

4        A00025
75       A00096
87       A00108
97       A00118
105      A00126
          ...  
19489    S09478
19628    S09617
19630    S50001
19631    S50002
19632    S51001
Name: bill_number, Length: 1654, dtype: object