# LegiScan API
I've been working with the [LegiScan API](https://legiscan.com/legiscan) to download information about and text of anti-trans bills to work with for my final project. (Here's the [user manual](https://legiscan.com/misc/LegiScan_API_User_Manual.pdf).) Someone helpfully created a Python module, [pylegiscan](https://github.com/poliquin/pylegiscan), to access LegiScan through Python. [Jonathan Soma's](https://investigate.ai/azcentral-text-reuse-model-legislation/01-downloading-one-million-pieces-of-legislation-from-legiscan/) tutorial was extremely helpful for getting started.

Below is an example, downloading the text of over 300 anti-LGBTQ (mostly anti-trans) bills introduced in states and the US in the 2023 legislative session.

## Imports

In [None]:
import zipfile
import base64
import io
import glob
import time
import json
import os
import requests
import mimetypes
import csv
import pandas as pd
import numpy as np

from pypdf import PdfReader
from base64 import b64decode

from bs4 import BeautifulSoup

[Getting an absolute path from an interactive shell](https://bobbyhadz.com/blog/python-nameerror-name-file-is-not-defined)

# U.S. state names : abbreviations dict
Need this to turn the state names from the spreadsheet into abbreviations.

In [None]:
# United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.
#
# Canonical URL: https://gist.github.com/rogerallen/1583593
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# https://en.wikipedia.org/wiki/ISO_3166-2:US

# Lookup table: full state/district/territory name -> two-letter code.
# Used by swapStateAbbrev() to convert the spreadsheet's "State" column.
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
    # Identity entry: presumably lets federal rows already labelled "US"
    # pass through the lookup unchanged -- TODO confirm against the CSV.
    "US": "US"
}

In [None]:
def swapStateAbbrev(state):
    """Translate a full state/territory name into its two-letter code.

    Looks ``state`` up in ``us_state_to_abbrev`` and returns the matching
    abbreviation, or ``None`` when the name is not a key of that dict.
    """
    return us_state_to_abbrev.get(state)

In [None]:
swapStateAbbrev("Wisconsin")

## pylegiscan

To talk to LegiScan's API, we're borrowing some code from [pylegiscan](https://github.com/poliquin/pylegiscan). Since it isn't a package you can install with `pip`, it wound up being easier for distribution to just cut and paste it here.

In [None]:
# Taken from https://github.com/poliquin/pylegiscan/blob/master/pylegiscan/legiscan.py

# import os — we already imported these above
# import json
# import requests
from urllib.parse import urlencode
from urllib.parse import quote_plus

# current aggregate status of bill
# (numeric LegiScan status code -> human-readable label)
BILL_STATUS = {1: "Introduced",
               2: "Engrossed",
               3: "Enrolled",
               4: "Passed",
               5: "Vetoed",
               6: "Failed/Dead"}

# significant steps in bill progress.
# Codes 1-6 carry the same labels as BILL_STATUS; 7-11 are additional
# progress events.
BILL_PROGRESS = {1: "Introduced",
                 2: "Engrossed",
                 3: "Enrolled",
                 4: "Passed",
                 5: "Vetoed",
                 6: "Failed/Dead",
                 7: "Veto Override",
                 8: "Chapter/Act/Statute",
                 9: "Committee Referral",
                10: "Committee Report Pass",
                11: "Committee Report DNP"}


# Leftover module docstring from the copied legiscan.py. In the middle of a
# cell it is just a string expression and has no effect.
"""
Interact with LegiScan API.

"""

# a helpful list of valid legiscan state abbreviations (no Puerto Rico)
# NOTE(review): the federal code 'us' is not listed here, although this
# notebook's name->abbreviation table includes "US". Nothing in the visible
# code validates against this list.
STATES = ['ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
          'hi', 'ia', 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me',
          'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm',
          'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
          'ut', 'va', 'vt', 'wa', 'wi', 'wv', 'wy']

class LegiScanError(Exception):
    """Error raised for a failed LegiScan request.

    Used for a non-OK HTTP response and for an API payload whose
    ``status`` field is ``"ERROR"``.
    """

class LegiScan(object):
    """Small client for the LegiScan JSON API.

    Every public method builds a query URL with ``_url``, fetches and
    validates it with ``_get``, and returns the relevant member of the
    decoded JSON response.
    """

    # HTTPS, so the API key carried in the query string is not sent in
    # cleartext.
    BASE_URL = 'https://api.legiscan.com/?key={0}&op={1}&{2}'

    # Seconds to wait on the server before a request is abandoned; without
    # a timeout a stalled connection would block the notebook forever.
    TIMEOUT = 30

    def __init__(self, apikey=None):
        """LegiScan API.  State parameters should always be passed as
           USPS abbreviations.  Bill numbers and abbreviations are case
           insensitive.  Register for API at http://legiscan.com/legiscan

           If ``apikey`` is omitted the key is read from
           ``config.LEGISCAN_API_KEY``; the ``config`` module must already
           be imported at module level when that fallback is used.
        """
        if apikey is None:
            apikey = config.LEGISCAN_API_KEY
        self.key = apikey.strip()

    def _url(self, operation, params=None):
        """Build a URL for querying the API.

        ``params`` may be ``None`` (no extra parameters), an already
        encoded query string, or anything ``urlencode`` accepts.
        """
        if not isinstance(params, str) and params is not None:
            params = urlencode(params)
        elif params is None:
            params = ''
        return self.BASE_URL.format(self.key, operation, params)

    def _get(self, url):
        """Get and parse JSON from API for a url.

        Raises ``LegiScanError`` when the HTTP response is not OK or when
        the decoded payload reports ``status == "ERROR"``.  A request that
        exceeds ``TIMEOUT`` seconds raises the exception ``requests``
        itself uses for timeouts.
        """
        req = requests.get(url, timeout=self.TIMEOUT)
        if not req.ok:
            raise LegiScanError('Request returned {0}: {1}'\
                    .format(req.status_code, url))
        data = json.loads(req.content)
        if data['status'] == "ERROR":
            raise LegiScanError(data['alert']['message'])
        return data

    def get_session_list(self, state):
        """Get list of available sessions for a state."""
        url = self._url('getSessionList', {'state': state})
        data = self._get(url)
        return data['sessions']

    def get_dataset_list(self, state=None, year=None):
        """Get list of available datasets, with optional state and year
           filtering.  Both filters are sent when both are given.
        """
        # Collect only the filters that were supplied, so state and year
        # can be combined instead of one silently overriding the other.
        filters = {}
        if state is not None:
            filters['state'] = state
        if year is not None:
            filters['year'] = year
        data = self._get(self._url('getDatasetList', filters))
        # return the list of datasets
        return data['datasetlist']

    def get_dataset(self, id, access_key):
        """Get a single dataset by its identifier and access key.

           Returns the ``dataset`` member of the response.  (Per the
           LegiScan manual this carries the archive itself, base64
           encoded -- confirm before relying on the exact fields.)
        """
        url = self._url('getDataset', {'id': id, 'access_key': access_key})
        data = self._get(url)
        return data['dataset']

    def get_master_list(self, state=None, session_id=None):
        """Get list of bills for the current session in a state or for
           a given session identifier.

           ``state`` takes precedence when both arguments are given.
           Raises ``ValueError`` when neither is given.
        """
        if state is not None:
            url = self._url('getMasterList', {'state': state})
        elif session_id is not None:
            url = self._url('getMasterList', {'id': session_id})
        else:
            raise ValueError('Must specify session identifier or state.')
        data = self._get(url)
        # return every value of the 'masterlist' mapping as a list
        return [data['masterlist'][i] for i in data['masterlist']]

    def get_bill(self, bill_id=None, state=None, bill_number=None):
        """Get primary bill detail information including sponsors, committee
           references, full history, bill text, and roll call information.

           This function expects either a bill identifier or a state and bill
           number combination.  The bill identifier is preferred, and required
           for fetching bills from prior sessions.
        """
        if bill_id is not None:
            url = self._url('getBill', {'id': bill_id})
        elif state is not None and bill_number is not None:
            url = self._url('getBill', {'state': state, 'bill': bill_number})
        else:
            raise ValueError('Must specify bill_id or state and bill_number.')
        return self._get(url)['bill']

    def get_bill_text(self, doc_id):
        """Get bill text, including date, draft revision information, and
           MIME type.  Bill text is base64 encoded to allow for PDF and Word
           data transfers.
        """
        url = self._url('getBillText', {'id': doc_id})
        return self._get(url)['text']

    def get_amendment(self, amendment_id):
        """Get amendment text including date, adoption status, MIME type, and
           title/description information.  The amendment text is base64 encoded
           to allow for PDF and Word data transfer.
        """
        url = self._url('getAmendment', {'id': amendment_id})
        return self._get(url)['amendment']

    def get_supplement(self, supplement_id):
        """Get supplement text including type of supplement, date, MIME type
           and text/description information.  Supplement text is base64 encoded
           to allow for PDF and Word data transfer.
        """
        url = self._url('getSupplement', {'id': supplement_id})
        return self._get(url)['supplement']

    def get_roll_call(self, roll_call_id):
        """Roll call detail for individual votes and summary information."""
        data = self._get(self._url('getRollcall', {'id': roll_call_id}))
        return data['roll_call']

    def get_sponsor(self, people_id):
        """Sponsor information including name, role, and a followthemoney.org
           person identifier.
        """
        url = self._url('getSponsor', {'id': people_id})
        return self._get(url)['person']

    def search(self, state, bill_number=None, query=None, year=2, page=1):
        """Get a page of results for a search against the LegiScan full text
           engine; returns a paginated result set.

           Specify a bill number or a query string.  Year can be an exact year
           or a number between 1 and 4, inclusive.  These integers have the
           following meanings:
               1 = all years
               2 = current year, the default
               3 = recent years
               4 = prior years
           Page is the result set page number to return.

           ``year`` and ``page`` are only sent with a ``query`` search; a
           ``bill_number`` search sends just the state and bill.
        """
        if bill_number is not None:
            params = {'state': state, 'bill': bill_number}
        elif query is not None:
            params = {'state': state, 'query': query,
                      'year': year, 'page': page}
        else:
            raise ValueError('Must specify bill_number or query')
        data = self._get(self._url('search', params))['searchresult']
        # return a summary of the search and the results as a dictionary;
        # 'summary' is popped first so only result entries remain in data
        summary = data.pop('summary')
        results = {'summary': summary, 'results': [data[i] for i in data]}
        return results

    def __str__(self):
        # NOTE(review): this embeds the API key, so displaying the client
        # in a notebook cell writes the key into the saved output.
        return '<LegiScan API {0}>'.format(self.key)

    def __repr__(self):
        return str(self)

# Connect to LegiScan

Using pylegiscan, you just pass your API key to `LegiScan` and you're good to go. I keep mine in a local `config.py` module (as `LEGISCAN_API_KEY`) so it isn't hard-coded in the notebook.

In [None]:
# The API key is read from a local ``config`` module (attribute
# LEGISCAN_API_KEY) instead of being written into the notebook.
import config

api_key = config.LEGISCAN_API_KEY
# Module-level client; the later cells call its methods as ``legis.<method>``.
legis = LegiScan(api_key)

# Read in my anti-trans bills csv
Downloaded from [LegiAlerts.com](LegiAlerts.com) on February 25, 2023. There are more bills now.

In [None]:
df = pd.read_csv('anti-lgbtq-bills-tracker.csv', usecols=['State','Number','URL'])

In [None]:
df['Abbreviation'] = df.loc[:,'State']

In [None]:
df = df.reindex(columns=['State','Abbreviation', 'Number', 'URL'])
df.head()

In [None]:
df['Abbreviation'] = df['Abbreviation'].map(swapStateAbbrev)
df.head()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-tracker.csv')

In [None]:
df['Bill ID'] = ''
df['Bill Text'] = ''
df.head()

---
# Get the bill IDs from state data
1. Look at the state abbreviation
2. Open and load the corresponding filepath to the JSON
3. Find the bill ID
4. Add it in this df

## Get bill ID for one bill

In [None]:
abbrv = df['Abbreviation'][0]
billnum = df['Number'][0]

In [None]:
abbrv

In [None]:
billnum

In [None]:
# Load the LegiScan JSON for the bill picked above and display it.
filepath = f"/Users/gabriel/Documents/GitHub/legiscan/azcentral-text-reuse-model-legislation/notebooks/20230224-legiscan-JSONs/{abbrv}/bill/{billnum}.json"
# A context manager closes the file promptly (a bare open() inside
# json.load leaves that to the garbage collector); JSON is read as UTF-8.
with open(filepath, encoding="utf-8") as f:
    data = json.load(f)
data

In [None]:
data['bill']['texts'][0]['doc_id']

## Get bill ID for all bills
SUCCESS! Exported to CSV.

In [None]:
# Reload the tracker, keeping only the columns this notebook works with.
# A callable is used for usecols because the CSV saved earlier in the
# notebook does not yet contain 'Bill ID' / 'Bill Text'; a fixed list would
# raise ValueError on a fresh top-to-bottom run. The callable keeps
# whichever of these columns exist and drops everything else, including
# the unnamed index column.
wanted_columns = {'State', 'Abbreviation', 'Number', 'URL', 'Bill ID', 'Bill Text'}
df = pd.read_csv('anti-lgbtq-bills-abbrevs-tracker.csv', usecols=lambda col: col in wanted_columns)

In [None]:
df.head()

In [None]:
def getbillid(abbrv, billnum, base_dir="/Users/gabriel/Documents/GitHub/legiscan/azcentral-text-reuse-model-legislation/notebooks/20230224-legiscan-JSONs"):
    """Return the LegiScan ``bill_id`` stored in a downloaded bill JSON.

    Parameters:
        abbrv: state abbreviation; names the sub-directory under ``base_dir``.
        billnum: bill number; the file read is ``<billnum>.json``.
        base_dir: root directory of the downloaded LegiScan JSON files.
            Defaults to the location used when this notebook was written.

    Returns:
        The value of ``data['bill']['bill_id']`` from the file.

    Raises:
        FileNotFoundError: if no JSON file exists for this state/bill.
        KeyError: if the file lacks the ``bill`` / ``bill_id`` keys.
    """
    filepath = f"{base_dir}/{abbrv}/bill/{billnum}.json"
    # Close the file deterministically; this runs once per row of the table.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    return data['bill']['bill_id']
df['Bill ID'] = df.apply(lambda row: getbillid(row['Abbreviation'],row['Number']), axis=1)

In [None]:
df.head()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-tracker.csv')

---
# Get doc ID for one bill


In [None]:
# Try the doc_id lookup on a single row (index 2) before doing every bill.
abbrv = df['Abbreviation'][2]
billnum = df['Number'][2]
filepath = f"/Users/gabriel/Documents/GitHub/legiscan/azcentral-text-reuse-model-legislation/notebooks/20230224-legiscan-JSONs/{abbrv}/bill/{billnum}.json"
# Context manager so the file handle is closed as soon as it has been read.
with open(filepath, encoding="utf-8") as f:
    data = json.load(f)
# doc_id of the first entry in the bill's 'texts' list
data['bill']['texts'][0]['doc_id']

---
# Get doc IDs for all bills

In [None]:
# Take a real copy: plain assignment only creates a second name for the same
# DataFrame, so later edits to df would also change the "backup".
dfbackup = df.copy()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-tracker.csv')

In [None]:
data['bill']['texts'][0]['doc_id']

In [None]:
df['Doc ID'] = ''

In [None]:
def getdocid(abbrv, billnum, base_dir="/Users/gabriel/Documents/GitHub/legiscan/azcentral-text-reuse-model-legislation/notebooks/20230224-legiscan-JSONs"):
    """Return the ``doc_id`` of the first text listed for a downloaded bill.

    Parameters:
        abbrv: state abbreviation; names the sub-directory under ``base_dir``.
        billnum: bill number; the file read is ``<billnum>.json``.
        base_dir: root directory of the downloaded LegiScan JSON files.
            Defaults to the location used when this notebook was written.

    Returns:
        ``data['bill']['texts'][0]['doc_id']`` as an ``int``, or ``None``
        when the bill has no usable text entry. A progress line is printed
        in both cases.

    Raises:
        FileNotFoundError: if no JSON file exists for this state/bill.
    """
    filepath = f"{base_dir}/{abbrv}/bill/{billnum}.json"
    # Close the file deterministically; this runs once per row of the table.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    try:
        doc_id = int(data['bill']['texts'][0]['doc_id'])
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing keys, an empty or null 'texts' list, or a doc_id that is
        # not a number all mean there is no document to fetch.
        print(abbrv, billnum, "doc_id not found")
        return None
    print(abbrv, billnum, doc_id)
    return doc_id
df['Doc ID'] = df.apply(lambda row: getdocid(row['Abbreviation'],row['Number']), axis=1)

In [None]:
df.head()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-docids-tracker.csv')
# Take a real copy: plain assignment only creates a second name for the same
# DataFrame, so later edits to df would also change the "backup".
dfbackup = df.copy()

---
# Get bill text for all bills
The API returns each document base64-encoded (so that PDF and Word files can be transferred inside the JSON response), so I decode these in another notebook.

In [None]:
def getbilltext(abbrv, doc_id):
    """Fetch the text record for one document from LegiScan.

    Parameters:
        abbrv: state abbreviation; used only in the failure message.
        doc_id: document identifier; converted with ``int()`` before it is
            passed to the module-level ``legis`` client.

    Returns:
        Whatever ``legis.get_bill_text`` returns, or ``None`` when the id
        cannot be converted or the request fails. The fetched record is
        printed on success; a failure line is printed otherwise.
    """
    try:
        # int() rejects NaN/None (rows without a document) and turns a
        # float such as 2674851.0 into a plain integer for the API call.
        docid = int(doc_id)
        billtextinfo = legis.get_bill_text(docid)
    except Exception as err:
        # Deliberately best-effort so one bad row does not abort the run.
        # Catching Exception rather than using a bare except lets Ctrl-C
        # still interrupt the loop.
        print('failure', abbrv, doc_id, err)
        return None
    print(billtextinfo)
    return billtextinfo

In [None]:
getbilltext(df['Abbreviation'][1], df['Doc ID'][1])

In [None]:
legis.get_bill(bill_id=df['Bill ID'][0])

In [None]:
id = int(df['Doc ID'][57])

In [None]:
id

In [None]:
2674851

In [None]:
# Only works to get doc when doc_id is an int.
# Likely reason (TODO confirm): rows without a document leave missing values
# in the 'Doc ID' column, so pandas stores the column as floats, and a raw
# value would be sent to the API as e.g. "2674851.0" instead of "2674851".
legis.get_bill_text(id)

In [None]:
df['Bill Text Info'] = df.apply(lambda row: getbilltext(row['Abbreviation'],row['Doc ID']), axis=1)

In [None]:
# Take a real copy: plain assignment only creates a second name for the same
# DataFrame, so later edits to df would also change the "backup".
dfbackup = df.copy()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-docids-billinfo-tracker.csv')