# Downloading bill data from LegiScan

- [x]  Using `anti-lgbtq-bills-tracker.csv`, get states and URLs
- [ ]  Get the master list for the current legislative session for every state on that list
- [ ]  Use the master list for each state to look up all bill IDs using the URLs list
- [ ]  Use the bill IDs to find the doc IDs
- [ ]  Use the doc IDs to download the bill texts
- [ ]  Find a [better PDF-to-text parser](https://www.reddit.com/r/Python/comments/ql4xkf/extract_text_from_pdf/) for reading the bill texts
- [ ]  Find a spell checker and look up other ways to clean up the bill texts
- [ ]  Throw all the bill texts into one big .txt file
- [ ]  Run through spaCy and Tracery with it again

## Imports

In [2]:
import zipfile
import base64
import io
import glob
import time
import json
import os
import requests
import mimetypes
import csv
import pandas as pd
import numpy as np

from pypdf import PdfReader
from base64 import b64decode

from bs4 import BeautifulSoup

[Getting an absolute path from an interactive shell](https://bobbyhadz.com/blog/python-nameerror-name-file-is-not-defined)

# U.S. state names : abbreviations dict

In [None]:
# United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.
#
# Canonical URL: https://gist.github.com/rogerallen/1583593
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.  Data originally from Wikipedia at the url:
# https://en.wikipedia.org/wiki/ISO_3166-2:US

us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
    "US": "US"
}

In [None]:
def swapStateAbbrev(state):
    """Translate a full state/territory name into its two-letter code.

    The name is looked up in ``us_state_to_abbrev``; a name that is not a
    key of that dict yields ``None``.
    """
    return us_state_to_abbrev.get(state)

In [None]:
swapStateAbbrev("Wisconsin")

## pylegiscan

To talk to LegiScan's API, we're borrowing some code from [pylegiscan](https://github.com/poliquin/pylegiscan). Since it isn't a package you can install with `pip`, it wound up being easier for distribution to just cut and paste it here.

In [None]:
# Taken from https://github.com/poliquin/pylegiscan/blob/master/pylegiscan/legiscan.py

import os
import json
import requests
from urllib.parse import urlencode
from urllib.parse import quote_plus

# current aggregate status of bill
BILL_STATUS = {1: "Introduced",
               2: "Engrossed",
               3: "Enrolled",
               4: "Passed",
               5: "Vetoed",
               6: "Failed/Dead"}

# significant steps in bill progress.
BILL_PROGRESS = {1: "Introduced",
                 2: "Engrossed",
                 3: "Enrolled",
                 4: "Passed",
                 5: "Vetoed",
                 6: "Failed/Dead",
                 7: "Veto Override",
                 8: "Chapter/Act/Statute",
                 9: "Committee Referral",
                10: "Committee Report Pass",
                11: "Committee Report DNP"}


"""
Interact with LegiScan API.

"""

# a helpful list of valid legiscan state abbreviations (no Puerto Rico)
STATES = ['ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
          'hi', 'ia', 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me',
          'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm',
          'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
          'ut', 'va', 'vt', 'wa', 'wi', 'wv', 'wy']

class LegiScanError(Exception):
    """Error raised for a failed or rejected LegiScan API request."""

class LegiScan(object):
    """Minimal client for the LegiScan API.

    Each public method builds the URL for one API operation, fetches it and
    returns the relevant part of the decoded JSON response.  State
    parameters should always be passed as USPS abbreviations.  Bill numbers
    and abbreviations are case insensitive.  Register for an API key at
    http://legiscan.com/legiscan
    """

    # The API key travels in the query string, so use HTTPS rather than
    # plain HTTP to keep it from crossing the network in clear text.
    BASE_URL = 'https://api.legiscan.com/?key={0}&op={1}&{2}'

    # Seconds to wait for the server before a request is abandoned.
    TIMEOUT = 60

    def __init__(self, apikey=None):
        """Create a client.

        Args:
            apikey: LegiScan API key.  When omitted, the key is read from
                ``config.LEGISCAN_API_KEY`` and, failing that, from the
                ``LEGISCAN_API_KEY`` environment variable.

        Raises:
            ValueError: if no key is passed and none can be found.
        """
        if apikey is None:
            apikey = self._default_key()
        self.key = apikey.strip()

    @staticmethod
    def _default_key():
        """Look up an API key in the local config module or the environment."""
        try:
            # Imported here so the class works even when the notebook cell
            # that imports `config` has not been run yet.
            import config
            return config.LEGISCAN_API_KEY
        except (ImportError, AttributeError):
            pass
        try:
            return os.environ['LEGISCAN_API_KEY']
        except KeyError:
            raise ValueError(
                'No API key given: pass apikey, define '
                'config.LEGISCAN_API_KEY, or set LEGISCAN_API_KEY.'
            ) from None

    def _url(self, operation, params=None):
        """Build a URL for querying the API.

        ``params`` may be a mapping (URL-encoded here), an already encoded
        query string (used as is), or None (no extra parameters).
        """
        if params is None:
            params = ''
        elif not isinstance(params, str):
            params = urlencode(params)
        return self.BASE_URL.format(self.key, operation, params)

    def _get(self, url):
        """Get and parse JSON from API for a url.

        Raises:
            LegiScanError: if the HTTP request is not OK or the response
                body carries the status "ERROR".
        """
        req = requests.get(url, timeout=self.TIMEOUT)
        if not req.ok:
            raise LegiScanError('Request returned {0}: {1}'\
                    .format(req.status_code, url))
        data = json.loads(req.content)
        if data['status'] == "ERROR":
            raise LegiScanError(data['alert']['message'])
        return data

    def get_session_list(self, state):
        """Get list of available sessions for a state."""
        url = self._url('getSessionList', {'state': state})
        data = self._get(url)
        return data['sessions']

    def get_dataset_list(self, state=None, year=None):
        """Get list of available datasets, with optional state and year
           filtering.  Both filters are sent when both are given.
        """
        params = {}
        if state is not None:
            params['state'] = state
        if year is not None:
            params['year'] = year
        # An empty mapping encodes to an empty query string, which matches
        # the unfiltered request.
        data = self._get(self._url('getDatasetList', params))
        return data['datasetlist']

    def get_dataset(self, id, access_key):
        """Get a single dataset, identified by the session id and the access
           key reported for it by get_dataset_list.  Returns the 'dataset'
           part of the response.
        """
        url = self._url('getDataset', {'id': id, 'access_key': access_key})
        data = self._get(url)
        return data['dataset']

    def get_master_list(self, state=None, session_id=None):
        """Get list of bills for the current session in a state or for
           a given session identifier.  When both are given the state is
           used.

        Raises:
            ValueError: if neither state nor session_id is given.
        """
        if state is not None:
            url = self._url('getMasterList', {'state': state})
        elif session_id is not None:
            url = self._url('getMasterList', {'id': session_id})
        else:
            raise ValueError('Must specify session identifier or state.')
        data = self._get(url)
        # return a list of the entries, dropping the keys of the mapping
        return list(data['masterlist'].values())

    def get_bill(self, bill_id=None, state=None, bill_number=None):
        """Get primary bill detail information including sponsors, committee
           references, full history, bill text, and roll call information.

           This function expects either a bill identifier or a state and bill
           number combination.  The bill identifier is preferred, and required
           for fetching bills from prior sessions.

        Raises:
            ValueError: if neither form of identification is complete.
        """
        if bill_id is not None:
            url = self._url('getBill', {'id': bill_id})
        elif state is not None and bill_number is not None:
            url = self._url('getBill', {'state': state, 'bill': bill_number})
        else:
            raise ValueError('Must specify bill_id or state and bill_number.')
        return self._get(url)['bill']

    def get_bill_text(self, doc_id):
        """Get bill text, including date, draft revision information, and
           MIME type.  Bill text is base64 encoded to allow for PDF and Word
           data transfers.  Note that this takes a document id, not a bill id.
        """
        url = self._url('getBillText', {'id': doc_id})
        return self._get(url)['text']

    def get_amendment(self, amendment_id):
        """Get amendment text including date, adoption status, MIME type, and
           title/description information.  The amendment text is base64 encoded
           to allow for PDF and Word data transfer.
        """
        url = self._url('getAmendment', {'id': amendment_id})
        return self._get(url)['amendment']

    def get_supplement(self, supplement_id):
        """Get supplement text including type of supplement, date, MIME type
           and text/description information.  Supplement text is base64 encoded
           to allow for PDF and Word data transfer.
        """
        url = self._url('getSupplement', {'id': supplement_id})
        return self._get(url)['supplement']

    def get_roll_call(self, roll_call_id):
        """Roll call detail for individual votes and summary information."""
        data = self._get(self._url('getRollcall', {'id': roll_call_id}))
        return data['roll_call']

    def get_sponsor(self, people_id):
        """Sponsor information including name, role, and a followthemoney.org
           person identifier.
        """
        url = self._url('getSponsor', {'id': people_id})
        return self._get(url)['person']

    def search(self, state, bill_number=None, query=None, year=2, page=1):
        """Get a page of results for a search against the LegiScan full text
           engine; returns a paginated result set.

           Specify a bill number or a query string.  Year can be an exact year
           or a number between 1 and 4, inclusive.  These integers have the
           following meanings:
               1 = all years
               2 = current year, the default
               3 = recent years
               4 = prior years
           Page is the result set page number to return.  Year and page are
           only sent with a query search, not with a bill number search.

        Returns:
            A dict with the search 'summary' and the list of 'results'.

        Raises:
            ValueError: if neither bill_number nor query is given.
        """
        if bill_number is not None:
            params = {'state': state, 'bill': bill_number}
        elif query is not None:
            params = {'state': state, 'query': query,
                      'year': year, 'page': page}
        else:
            raise ValueError('Must specify bill_number or query')
        data = self._get(self._url('search', params))['searchresult']
        # split the summary off; every remaining entry is one search hit
        summary = data.pop('summary')
        return {'summary': summary, 'results': list(data.values())}

    def __str__(self):
        # Never print the whole key: notebook output is saved and shared.
        hint = self.key[-4:] if len(self.key) > 8 else ''
        return '<LegiScan API key=***{0}>'.format(hint)

    def __repr__(self):
        return str(self)

# Connect to LegiScan

Using pylegiscan, you just pass your API key to `LegiScan` and you're good to go. I keep mine in a local `config.py` module as `LEGISCAN_API_KEY`, but you can also just paste yours in place of `config.LEGISCAN_API_KEY` below.

In [None]:
import config

api_key = config.LEGISCAN_API_KEY
legis = LegiScan(api_key)

If you wanted to search for bills based on state or text, that's easy to do.

# Read in my anti-trans bills csv

In [3]:
df = pd.read_csv('anti-lgbtq-bills-tracker.csv', usecols=['State','Number','URL'])

FileNotFoundError: [Errno 2] No such file or directory: 'anti-lgbtq-bills-tracker.csv'

In [None]:
df['Abbreviation'] = df.loc[:,'State']

In [None]:
df = df.reindex(columns=['State','Abbreviation', 'Number', 'URL'])
df.head()

In [None]:
df['Abbreviation'] = df['Abbreviation'].map(swapStateAbbrev)
df.head()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-tracker.csv')

In [None]:
df['Bill ID'] = ''
df['Bill Text'] = ''
df.head()

---
# Get the bill IDs from state data
1. Look at the state abbreviation
2. Open and load the corresponding filepath to the JSON
3. Find the bill ID
4. Add it in this df

## Get bill ID for one bill

In [None]:
abbrv = df['Abbreviation'][0]
billnum = df['Number'][0]

In [None]:
abbrv

In [None]:
billnum

In [None]:
filepath = f"/Users/gabriel/Documents/GitHub/legiscan/azcentral-text-reuse-model-legislation/notebooks/20230224-legiscan-JSONs/{abbrv}/bill/{billnum}.json"
data = json.load(open(filepath))
data

In [None]:
data['bill']['texts'][0]['doc_id']

## Get bill ID for all bills
SUCCESS! Exported to CSV.

In [None]:
df = pd.read_csv('anti-lgbtq-bills-abbrevs-tracker.csv', usecols=['State','Abbreviation','Number','URL','Bill ID','Bill Text'])

In [None]:
df.head()

In [None]:
def getbillid(abbrv, billnum, json_dir="/Users/gabriel/Documents/GitHub/legiscan/azcentral-text-reuse-model-legislation/notebooks/20230224-legiscan-JSONs"):
    """Return the LegiScan bill ID stored in a bill's JSON file.

    Args:
        abbrv: state abbreviation; names the sub-directory of ``json_dir``.
        billnum: bill number; names the JSON file inside ``<abbrv>/bill``.
        json_dir: directory holding the extracted LegiScan JSON files.
            Defaults to the original author's local checkout.

    Returns:
        The value of ``data['bill']['bill_id']`` from the file.

    Raises:
        FileNotFoundError: if there is no JSON file for this bill.
        KeyError: if the file lacks the expected keys.
    """
    filepath = f"{json_dir}/{abbrv}/bill/{billnum}.json"
    # The with-block closes the file even if parsing fails.
    with open(filepath, encoding="utf-8") as bill_file:
        data = json.load(bill_file)
    return data['bill']['bill_id']
df['Bill ID'] = df.apply(lambda row: getbillid(row['Abbreviation'],row['Number']), axis=1)

In [None]:
df.head()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-tracker.csv')

---
# Get doc ID for one bill


In [None]:
abbrv = df['Abbreviation'][2]
billnum = df['Number'][2]
filepath = f"/Users/gabriel/Documents/GitHub/legiscan/azcentral-text-reuse-model-legislation/notebooks/20230224-legiscan-JSONs/{abbrv}/bill/{billnum}.json"
data = json.load(open(filepath))
data['bill']['texts'][0]['doc_id']

---
# Get doc IDs for all bills

In [None]:
# Plain assignment would only create a second name for the same DataFrame;
# copy it so later edits to df cannot change the backup.
dfbackup = df.copy()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-tracker.csv')

In [None]:
data['bill']['texts'][0]['doc_id']

In [None]:
df['Doc ID'] = ''

In [None]:
def getdocid(abbrv, billnum, json_dir="/Users/gabriel/Documents/GitHub/legiscan/azcentral-text-reuse-model-legislation/notebooks/20230224-legiscan-JSONs"):
    """Return the doc ID of the first text listed in a bill's JSON file.

    Args:
        abbrv: state abbreviation; names the sub-directory of ``json_dir``.
        billnum: bill number; names the JSON file inside ``<abbrv>/bill``.
        json_dir: directory holding the extracted LegiScan JSON files.
            Defaults to the original author's local checkout.

    Returns:
        The doc ID as an int, or None when the bill lists no texts or the
        doc ID is missing or not convertible to an int.

    Raises:
        FileNotFoundError: if there is no JSON file for this bill.
    """
    filepath = f"{json_dir}/{abbrv}/bill/{billnum}.json"
    with open(filepath, encoding="utf-8") as bill_file:
        data = json.load(bill_file)
    try:
        doc_id = int(data['bill']['texts'][0]['doc_id'])
    except (KeyError, IndexError, TypeError, ValueError):
        # Only "no usable doc ID" is treated as a soft failure; anything
        # else (e.g. an interrupt) is allowed to propagate.
        print(abbrv, billnum, "doc_id not found")
        return None
    print(abbrv, billnum, doc_id)
    return doc_id
df['Doc ID'] = df.apply(lambda row: getdocid(row['Abbreviation'],row['Number']), axis=1)

In [None]:
df.head()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-docids-tracker.csv')
# Copy rather than alias, so later edits to df do not alter the backup too.
dfbackup = df.copy()

---
# Get bill text for all bills
> In progress.
I forgot that I need a doc ID for this instead of the bill ID. Durr.

In [None]:
def getbilltext(abbrv, doc_id):
    """Download the bill text record for one document from LegiScan.

    Args:
        abbrv: state abbreviation, used only in the progress messages.
        doc_id: LegiScan document ID (not the bill ID); anything ``int()``
            accepts.

    Returns:
        Whatever ``legis.get_bill_text`` returns for the document, or None
        when the doc ID is missing/invalid or the download fails.
    """
    try:
        docid = int(doc_id)
    except (TypeError, ValueError, OverflowError):
        # Rows without a doc ID hold None/NaN; skip them without a request.
        print(f"failure: {abbrv} has no usable doc ID ({doc_id!r})")
        return None
    try:
        billtextinfo = legis.get_bill_text(docid)
    except Exception as err:
        # Best effort: one failed download must not abort a whole
        # DataFrame.apply run, but say which document failed and why.
        print(f"failure: {abbrv} {docid}: {err}")
        return None
    print(abbrv, " ", docid)
    print(billtextinfo)
    return billtextinfo

In [None]:
getbilltext(df['Abbreviation'][1], df['Doc ID'][1])

In [None]:
legis.get_bill(bill_id=df['Bill ID'][0])

In [None]:
id = int(df['Doc ID'][57])

In [None]:
id

In [None]:
2674851

In [None]:
# Only works to get doc when doc_id is an int. God knows why
legis.get_bill_text(id)

In [None]:
df['Bill Text Info'] = df.apply(lambda row: getbilltext(row['Abbreviation'],row['Doc ID']), axis=1)

In [None]:
# Take a real copy; `dfbackup = df` would just be another name for df.
dfbackup = df.copy()

In [None]:
df.to_csv('anti-lgbtq-bills-abbrevs-docids-billinfo-tracker.csv')

---
# Make a column with the MIME? Or just go right into the text parsing?
> I can't brain anymore! Stop here for today (Sunday 2023-02-26)

In [None]:
df2 = df

In [None]:
df2['MIME'] = ''

In [None]:
def getmime(index):
    """Return the MIME type recorded in a row's 'Bill Text Info'.

    Args:
        index: either a row (a pandas Series or a dict, as passed by
            ``DataFrame.apply(..., axis=1)``) or an index label of ``df2``.

    Returns:
        The 'mime' entry of the bill text info, or None when the info is
        missing or is not a dict.
    """
    if isinstance(index, (dict, pd.Series)):
        # apply(axis=1) hands over the whole row, so read the cell from it
        # instead of using the row as a lookup key.
        info = index.get('Bill Text Info')
    else:
        try:
            info = df2['Bill Text Info'][index]
        except (KeyError, IndexError, TypeError):
            return None
    if isinstance(info, dict):
        return info.get('mime')
    return None

In [None]:
df2['MIME'] = df2.apply(lambda row: getmime(row), axis=1)

In [None]:
df2.tail()

## Function to decode from base64 into PDF file

In [None]:
# create function
def decodepdf(bill_text):
    """Decode a base64-encoded PDF bill text and save it to disk.

    The file is written to ``bill_id-<bill_id>.pdf`` in the current working
    directory.

    Args:
        bill_text: mapping with the base64 payload under 'doc' and the bill
            identifier under 'bill_id'.

    Raises:
        binascii.Error: if the payload contains non-base64 characters.
        ValueError: if the decoded data does not start with the PDF
            signature.
    """
    # Decode the Base64 string, making sure that it contains only valid characters
    pdf_bytes = b64decode(bill_text['doc'], validate=True)

    # Perform a basic validation to make sure that the result is a valid PDF file
    # Be aware! The magic number (file signature) is not 100% reliable solution to validate PDF files
    # Moreover, if you get Base64 from an untrusted source, you must sanitize the PDF contents
    if pdf_bytes[0:4] != b'%PDF':
        raise ValueError('Missing the PDF file signature')

    bill_id_name = bill_text['bill_id']

    # Write the PDF contents to a local file; the with-block closes it even
    # if the write fails.
    with open(f"bill_id-{bill_id_name}.pdf", "wb") as pdf_file:
        pdf_file.write(pdf_bytes)

# Merge it with function to get text from PDF and output it to a txt file

In [None]:
# create function
def decodepdftotext(bill_text):
    """Decode a base64-encoded PDF bill text and save it as PDF and as text.

    Writes ``bill_id-<bill_id>.pdf`` and ``bill_id-<bill_id>.txt`` to the
    current working directory; the text file holds the extracted text of
    every page, in page order.

    Args:
        bill_text: mapping with the base64 payload under 'doc' and the bill
            identifier under 'bill_id'.

    Raises:
        binascii.Error: if the payload contains non-base64 characters.
        ValueError: if the decoded data does not start with the PDF
            signature.
    """
    # Decode the Base64 string, making sure that it contains only valid characters
    pdf_bytes = b64decode(bill_text['doc'], validate=True)

    # Perform a basic validation to make sure that the result is a valid PDF file
    # Be aware! The magic number (file signature) is not 100% reliable solution to validate PDF files
    # Moreover, if you get Base64 from an untrusted source, you must sanitize the PDF contents
    if pdf_bytes[0:4] != b'%PDF':
        raise ValueError('Missing the PDF file signature')

    bill_id_name = bill_text['bill_id']
    pdf_path = f"bill_id-{bill_id_name}.pdf"

    # Write the PDF contents to a local file
    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(pdf_bytes)

    # Pull the text out page by page and join once, rather than growing a
    # string inside the loop.
    reader = PdfReader(pdf_path)
    text = "".join(page.extract_text() for page in reader.pages)

    # Explicit UTF-8 so the output does not depend on the platform's
    # default encoding.
    with open(f"bill_id-{bill_id_name}.txt", "w", encoding="utf-8") as text_file:
        text_file.write(text)

## Do it for all the bill texts

In [None]:
# Convert every bill text delivered as a PDF; all other MIME types are
# skipped in this pass.
for bill_text in bill_texts:
    if bill_text['mime'] != "application/pdf":
        continue
    decodepdftotext(bill_text)

Finally I extracted all the text from these PDFs. Now how do I deal with the HTML? Let's make a list of just those so I can deal with them properly:

In [None]:
for i in range(33):
    bill_id = bill_texts[i]['bill_id']

# Deal with the HTML-formatted bill texts

In [None]:
# Keep only the bill texts whose MIME type marks them as HTML.
htmlbills = [
    bill_text for bill_text in bill_texts
    if bill_text['mime'] == "text/html"
]
htmlbills[0:3]

In [None]:
htmlbills[0]

In [None]:
def b64tohtml(bill_text):
    """Decode a base64-encoded HTML bill text and save it as HTML and text.

    Writes ``bill_id-<bill_id>.html`` (the decoded bytes) and
    ``bill_id-<bill_id>.txt`` (the text with markup stripped) to the current
    working directory.

    Args:
        bill_text: mapping with the base64 payload under 'doc' and the bill
            identifier under 'bill_id'.

    Raises:
        binascii.Error: if the payload contains non-base64 characters.
    """
    # Decode the Base64 string, making sure that it contains only valid characters
    html_bytes = b64decode(bill_text['doc'], validate=True)

    bill_id_name = bill_text['bill_id']

    # Write the HTML contents to a local file
    with open(f"bill_id-{bill_id_name}.html", "wb") as h:
        print(h)
        h.write(html_bytes)

    # Parse the raw bytes so BeautifulSoup works out the document's encoding
    # itself, instead of re-reading the file with the platform default.
    # Naming the parser keeps the result the same from machine to machine.
    soup = BeautifulSoup(html_bytes, "html.parser")
    with open(f"bill_id-{bill_id_name}.txt", "w", encoding="utf-8") as text_file:
        text_file.write(soup.get_text())

In [None]:
b64tohtml(htmlbills[9])

In [None]:
len(htmlbills)

In [None]:
# Convert each HTML bill text in turn.
for html_bill in htmlbills:
    b64tohtml(html_bill)

In [None]:
with open('bill_id-1633355.html') as f:
    print(f)

In [None]:
htmlbills[9]

In [None]:
def b64tohtmlonly(bill_text):
    """Decode a base64-encoded HTML bill text and save it as an HTML file.

    Writes the decoded bytes to ``bill_id-<bill_id>.html`` in the current
    working directory; no text extraction is done.

    Args:
        bill_text: mapping with the base64 payload under 'doc' and the bill
            identifier under 'bill_id'.

    Raises:
        binascii.Error: if the payload contains non-base64 characters.
    """
    # Decode the Base64 string, making sure that it contains only valid characters
    html_bytes = b64decode(bill_text['doc'], validate=True)

    bill_id_name = bill_text['bill_id']

    # Write the HTML contents to a local file; the with-block closes the
    # handle even if the write fails.
    with open(f"bill_id-{bill_id_name}.html", "wb") as h:
        print(h)
        h.write(html_bytes)

In [None]:
# 8 and 9 are the problematic ones
b64tohtmlonly(htmlbills[8])
b64tohtmlonly(htmlbills[9])

In [None]:
with open('bill_id-1633355.html') as f:
    print(f)

In [None]:
bill_details[0]

In [None]:
# test one file to see if it'll work
with open("billtxts/1632709.txt", "rb") as f:
    txt = f.readlines()
print(txt)

# Add text from txt files to bill details

In [None]:
# Attach the contents of each bill's text file to its details record.
# NOTE(review): the file is read in binary mode, so 'text' becomes a list of
# bytes lines rather than a string - confirm that is what is wanted.
for detail in bill_details:
    filename = detail['bill_id']
    with open(f"billtxts/{filename}.txt", "rb") as f:
        detail['text'] = f.readlines()

In [None]:
bill_details[0]['text']

In [None]:
type(bill_details[0])

In [None]:
# The first record's keys serve as the CSV header for every row.
keys = bill_details[0].keys()

with open('bill_details.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    for detail in bill_details:
        dict_writer.writerow(detail)

# More stuff below

## Get the bill info for the first result
This is a dict.

In [None]:
print(bills['results'][0])

## Get the bill text for the first result
This is a base64 encoded PDF, as we can see in 'mime':'application/pdf'.

In [None]:
testbilltext = legis.get_bill_text(2631259)
print(testbilltext)

I struggled with opening this bill text and finally found the solution [here!](https://base64.guru/developers/python/examples/decode-pdf)

In [None]:
# Define the Base64 string of the PDF file
b64 = testbilltext['doc']

# Decode the Base64 string, making sure that it contains only valid characters.
# The result is deliberately not called `bytes`: at notebook top level that
# would replace the builtin for every later cell.
pdf_bytes = b64decode(b64, validate=True)

# Perform a basic validation to make sure that the result is a valid PDF file
# Be aware! The magic number (file signature) is not 100% reliable solution to validate PDF files
# Moreover, if you get Base64 from an untrusted source, you must sanitize the PDF contents
if pdf_bytes[0:4] != b'%PDF':
    raise ValueError('Missing the PDF file signature')

# Write the PDF contents to a local file
with open('2631259.pdf', 'wb') as f:
    f.write(pdf_bytes)

Finally, extracting the text from the entire PDF by finding the number of pages and getting their text one by one.

In [None]:
reader = PdfReader("2631259.pdf")
# Build the text from scratch: appending to `text` without first defining it
# in this cell fails unless an earlier cell happened to create it.
text = "".join(page.extract_text() for page in reader.pages)

In [None]:
print(text)

In [None]:
# Explicit UTF-8 so the write does not depend on the platform's default
# encoding; the with-block closes the file even if the write fails.
with open('2631259.txt', 'w', encoding='utf-8') as g:
    g.write(text)

I previously extracted the wrong text, because get_bill_text takes doc_id as the argument and not bill id!

# Next steps
I think I need to [create a pandas dataframe](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_records.html#pandas.DataFrame.from_records) to hold all the bills['results'] info, then add an empty column for the bill text, then go through each of the bill IDs and download their text, accounting for each data type (some doc, some txt, some PDF, some HTML? check using the get_bill_text 'mime'). 

Then I have to clean it up, removing things like the numbers above. Then I can begin to use NLP tools to mess around with them.

In [None]:
df = pd.DataFrame.from_records(bills['results'])
df.shape

I just realized that bills is only the first 50 results, from the first page of the search query. To get them all I have to iterate through the pages.

In [None]:
# Collect search result pages 1 through 9 (range(1, 10) stops before 10).
# The number of pages is hard-coded here rather than taken from the search
# results, so billslist ends up with nine entries at indexes 0-8.
billslist = []
for i in range(1,10):
    bills = legis.search(state='ALL', query='biological sex', page=i)
    billslist.append(bills)

In [None]:
# Iterate the list itself: it is indexed from 0, so looping over
# range(1, 10) skipped the first page and ran off the end at index 9.
for page_results in billslist:
    print(page_results['summary'])

In [None]:
len(billslist)

In [None]:
billslist

## Different approach
I don't know what's going on with my queries! I did something weird to the page numbers. Maybe it would make more sense to:
1. Download the [tracker](https://docs.google.com/spreadsheets/d/1fTxHLjBa86GA7WCT-V6AbEMGRFPMJndnaVGoZZX4PMw/edit#gid=0) as a CSV
2. Use the URL to get the state & bill number for each bill
3. Use pylegiscan get_bill to get the bill info, including doc ID
4. Add it to a pandas dataframe
5. Use the doc ID to get_bill_text
6. Download the text and add that to the dataframe
7. Clean up all the text

Or, should I try downloading just a few more for now and see what results when I play with them?

Looking at the [code](https://github.com/alliraine/legialerts/blob/main/main.py) used to update the Google Sheet in the first place using the LegiScan API, they pull the legislative session master list and turn that into a CSV.

In [None]:
mainlist = legis.get_master_list(state=None,session_id=2031)

In [None]:
len(mainlist)

In [None]:
df = pd.DataFrame.from_records(mainlist)

In [None]:
df.shape

In [None]:
list(df.columns)

In [None]:
df[400:425]

In [None]:
antibillsdf = pd.read_csv("anti-lgbtq-bills-tracker.csv")
antibillsdf.shape

In [None]:
antibillsdf.head()

In [None]:
us_state_to_abbrev["Alaska"]

In [None]:
antibillsdf['State'] = antibillsdf['State'].replace(us_state_to_abbrev)
antibillsdf.head()

In [None]:
antibillsdf.columns

In [None]:
# get just the state abbrev and bill number from the whole CSV
statebillnodf = antibillsdf.filter(['State','Number','URL'])

In [None]:
statebillnodf.head()

In [None]:
len(antibillsdf)

In [None]:
urllist = list(antibillsdf['URL'])

In [None]:
stateabbvs = list(antibillsdf['State'])
billnos = list(antibillsdf['Number'])

In [None]:
stateabbvs[0]

In [None]:
billnos[0]

God this is such a dumb way to do this, but it's the way I know how!

In [None]:
antibillsdf.URL

In [None]:
len(antibillsdf.URL)

In [None]:
# Look up every tracked bill by state and bill number.
billinfos = []
for state_abbrev, bill_number in zip(stateabbvs, billnos):
    # Blank cells come out of pandas as float NaN, which never equals the
    # string "nan"; test for missing values properly (the string check is
    # kept in case the column was converted to text).
    if pd.isna(bill_number) or bill_number == "nan":
        continue
    billinfo = legis.get_bill(bill_id=None, state=state_abbrev, bill_number=bill_number)
    billinfos.append(billinfo)

# Example code continues below

In [None]:
# bills = legis.search(state='tx', query='abortion')
# bills['summary'] # how many results did we get?

You can also get single bills, one at a time, as long as you know their ID in the LegiScan database.

In [None]:
legis.get_bill('1635057')

In [None]:
df2.shape

In [None]:
df2[df2.url.isin(urllist)]

In [None]:
df2.url

In [None]:
df3 = df2['url'].isin(antibillsdf['URL'])

# LegiScan Datasets

It'd take forever to download the bills one at a time, so we take advantage of LegiScan's [datasets](https://legiscan.com/datasets) capability. They're a whole set of bill data for each session of the legislature.

In [None]:
datasets = legis.get_dataset_list()
dataset = legis.get_dataset(datasets[20]['session_id'], datasets[20]['access_key'])
dataset.keys()

In [None]:
print(df3)

They come in a _really_ weird format, though: a [base64-encoded](https://en.wikipedia.org/wiki/Base64) zip file. So first we need to convert the base64 zipfile into a normal file, then unzip it!

In [None]:
z_bytes = base64.b64decode(dataset['zip'])
z = zipfile.ZipFile(io.BytesIO(z_bytes))
z.extractall("./sample-data")

It creates a lot lot lot lot lot of `.json` files. For example, let's take a look at a sample of what we just extracted.

In [None]:
import glob

filenames = glob.glob("./sample-data/*/*/bill/*", recursive=True)
filenames[:15]

Each file has all sorts of information about the bill, but **none of the text of the bill itself!** You can see for yourself:

In [None]:
import json

json_data = json.load(open("./sample-data/AK/2017-2018_30th_Legislature/bill/SCR10.json"))
json_data

You _can_ download the bill text if you have the ID, but... for some reason we don't do this. I'm going to be honest: I don't remember why. Maybe it's because they're older versions? They're incomplete? I truly have forgotten.

In [None]:
doc = legis.get_bill_text('2015157')
contents = base64.b64decode(doc['doc'])
with open("filename.html", "wb") as file:
    file.write(contents)

What we're going to need is the **URL to the published version.**

In [None]:
json_data['bill']['texts'][-1]

We're going to need the URL to the published version from _every single one of those JSON files_.

# Download and extract all of the datasets from LegiScan

In [None]:
datasets = legis.get_dataset_list()
len(datasets)

Downloading and extracting all 583 is going to take a while, so we'll use a progress bar from [tqdm](https://github.com/tqdm/tqdm) to keep track of where we're at.

In [None]:
import tqdm.notebook

total = len(datasets)
# tqdm.notebook.tqdm is the notebook progress bar; the older
# tqdm.tqdm_notebook spelling is deprecated.
for dataset in tqdm.notebook.tqdm(datasets, total=total):
    session_id = dataset['session_id']
    access_key = dataset['access_key']
    details = legis.get_dataset(session_id, access_key)
    # Each dataset arrives as a base64-encoded zip archive: decode it in
    # memory, then unpack it under ./bill_data.
    z_bytes = base64.b64decode(details['zip'])
    z = zipfile.ZipFile(io.BytesIO(z_bytes))
    z.extractall("./bill_data")

In [None]:
print(df3.index.where(df3[1] == True))

In [None]:
df3.index.isin("True")

# Converting the many JSON files to single CSV file

The data isn't doing us much good sitting around as a zillion json files, so we'll convert them into a CSV file with the pieces of information we're interested in. Those pieces are:

* State
* Bill title
* Bill URL

In [None]:
filenames = glob.glob("bill_data/*/*/bill/*.json")
len(filenames)

In [None]:
filenames[:5]

If we want to process over a million rows, it's going to take a while! To speed things up we're going to turn to [swifter](https://github.com/jmcarpenter2/swifter), a package that can parallelize work on pandas dataframes. It's pretty easy to use:

**without swifter:**

```python
df = pd.Series(filenames).apply(process_json)
```

**with swifter:**

```python
df = pd.Series(filenames).swifter.apply(process_json)
```

And it does all the hard work for you! You just use it and hope for the best.

In [None]:
import json
import os
import swifter
import pandas as pd

def process_json(filename):
    """Summarise one LegiScan bill JSON file as a pandas Series.

    Args:
        filename: path to a JSON file with a top-level 'bill' object.

    Returns:
        A Series with bill_id, code (the file name without its extension),
        bill_number, title, description, state, session (the session name),
        filename, status and status_date.  A 'url' entry, taken from the
        state_link of the last listed text, is present only when the bill
        lists at least one text.

    Raises:
        KeyError: if one of the mandatory fields is missing from the file.
    """
    with open(filename, encoding="utf-8") as file:
        # The string "0000-00-00" (presumably a placeholder for a missing
        # date) is swapped for JSON null so it loads as None.
        json_str = file.read().replace('"0000-00-00"', 'null')
    content = json.loads(json_str)['bill']

    bill_data = {
        'bill_id': content['bill_id'],
        'code': os.path.splitext(os.path.basename(filename))[0],
        'bill_number': content['bill_number'],
        'title': content['title'],
        'description': content['description'],
        'state': content['state'],
        'session': content['session']['session_name'],
        'filename': filename,
        'status': content['status'],
        'status_date': content['status_date'],
    }

    try:
        bill_data['url'] = content['texts'][-1]['state_link']
    except (KeyError, IndexError, TypeError):
        # No texts listed (or no link on the last one): leave 'url' out.
        pass

    return pd.Series(bill_data)

df = pd.Series(filenames).swifter.apply(process_json)
df.head()

And now we'll save it to prepare for the next step: **inserting it into a database.**

In [None]:
df.to_csv("data/bills-with-urls.csv", index=False)