In [132]:
import requests
from collections import Counter
from datetime import datetime

In [67]:
def openstates_v2_request (payload, timeout=30):
    """
    Send a query to the Open States v2 GraphQL endpoint.

    payload: string corresponding to a query. It is sent unchanged as the
        request body, so it must already be in 'query= {...}' form.
    timeout: seconds to wait for the server before giving up (default 30).
    return: JSON object (the decoded response body).
    raises: requests.HTTPError when the server answers with a 4xx/5xx status,
        requests.Timeout when no answer arrives within `timeout` seconds.

    NOTE: API_KEY is read from the notebook's global namespace and must be
    defined before this function is called.
    """
    headers = {
        'x-api-key': API_KEY,
        'Content-Type': "application/x-www-form-urlencoded",
    }
    url = "https://openstates.org/graphql"
    # Without an explicit timeout a stalled connection blocks the cell forever.
    response = requests.post(url, data=payload, headers=headers, timeout=timeout)
    # Surface HTTP failures here instead of letting callers hit a confusing
    # KeyError or JSON decoding error further down.
    response.raise_for_status()
    return response.json()

def get_bill(identifier, state_name="California", session_year="20192020"):
    """
    Look up a bill by its identifier (e.g. 'AB 2') within one state and session.

    Only the bill's id field is requested. The return value is whatever
    openstates_v2_request returns for the query.
    """
    query_template = 'query= {bill(jurisdiction: "%s", session: "%s", identifier: "%s") {id}}'
    arguments = (state_name, session_year, identifier)
    return openstates_v2_request(query_template % arguments)

def all_CA_bills(after=""):
    """
    Request one page (at most 100) of California 2019-2020 session bills.

    after: pagination cursor; the page returned starts after this cursor.
        The default empty string is sent as-is.

    Fields requested per bill: (openstates) id, identifier (eg. 'AB 2'),
    title, classification, updatedAt, createdAt and fromOrganization
    (id and name). The query also asks for totalCount and the pageInfo
    cursors (startCursor, endCursor, hasNextPage).
    """
    query_template = (
        'query= { bills(first: 100, jurisdiction: "California", session: "20192020", after: "%s") '
        '{ totalCount pageInfo { startCursor endCursor hasNextPage } '
        'edges { bill:node { id identifier title classification updatedAt createdAt '
        'fromOrganization { id name} } } } } '
    )
    return openstates_v2_request(query_template % after)

def find_bill (subject, state_name="California", session_year="20192020"):
    """
    Request the first page (at most 100) of bills tagged with a subject.

    subject: subject string to filter on.
    state_name: jurisdiction to search (default "California").
    session_year: legislative session to search (default "20192020").
    return: whatever openstates_v2_request returns for the query. The fields
        requested per bill are id, identifier, title, classification,
        updatedAt, createdAt and fromOrganization (id and name).
    """
    # state_name and session_year are interpolated into the query; previously
    # the query hard-coded California / 20192020 and ignored both arguments.
    query_template = (
        'query= { bills(first: 100, jurisdiction: "%s", session: "%s", subject: "%s") '
        '{ totalCount pageInfo { startCursor endCursor hasNextPage } '
        'edges { bill:node { id identifier title classification updatedAt createdAt '
        'fromOrganization { id name} } } } } '
    )
    payload = query_template % (state_name, session_year, subject)
    return openstates_v2_request(payload)

In [6]:
# Fetch the first page of results: the query asks for at most 100 bills and,
# with no cursor supplied, starts from the beginning.
batch_100_bills = all_CA_bills()

In [8]:
# Seed the master list with the first page of results and record how many
# bills the API reports in total. The last line confirms the page was full.
first_page = batch_100_bills['data']['bills']
total_bills = first_page['totalCount']
CA_list_of_bills = list(first_page['edges'])
len(CA_list_of_bills) == 100

True

In [9]:
# Follow the pagination cursor until the API reports no further pages,
# adding each page's bills to the master list as it arrives.
page_info = batch_100_bills['data']['bills']['pageInfo']
while page_info['hasNextPage']:
    batch_100_bills = all_CA_bills(after=page_info['endCursor'])
    CA_list_of_bills.extend(batch_100_bills['data']['bills']['edges'])
    page_info = batch_100_bills['data']['bills']['pageInfo']

In [10]:
# Check that we've downloaded the 'totalCount' of bills: prints whether the
# number of stored records matches the count reported by the API, followed by
# that reported count.
print(total_bills == len(CA_list_of_bills), total_bills)

True 3497


In [13]:
# One (identifier, title) tuple per downloaded record: element [0] of each
# tuple is the identifier (eg. "AB 2"), element [1] is the title.
CA_bill_ids_and_titles = []
for entry in CA_list_of_bills:
    record = entry['bill']
    CA_bill_ids_and_titles.append((record['identifier'], record['title']))

In [15]:
# Every downloaded record should have produced exactly one (identifier, title) tuple.
len(CA_bill_ids_and_titles) == total_bills

True

In [16]:
# Count how often each (identifier, title) pair occurs and keep the pairs
# seen more than once.
count = Counter(CA_bill_ids_and_titles)
duplicate_bills = [pair for pair, occurrences in count.items() if occurrences > 1]
# Just the identifiers of those duplicated pairs.
duplicate_session_ids = [identifier for identifier, _title in duplicate_bills]

In [17]:
# Number of (identifier, title) pairs that occur more than once.
len(duplicate_session_ids)

743

In [68]:
# Spot-check one of the duplicated identifiers (index 32 is an arbitrary
# sample) by printing every stored record that carries it.
sample_records = [
    entry for entry in CA_list_of_bills
    if entry['bill']['identifier'] == duplicate_session_ids[32]
]
for entry in sample_records:
    print(entry)

{'bill': {'id': 'ocd-bill/3d6e691a-36ab-4acc-ac8d-3d4fc8da3bd9', 'identifier': 'AB 912', 'title': 'Marine invasive species: ballast water and biofouling management requirements.', 'classification': ['bill'], 'updatedAt': '2019-03-07 18:09:17.550450+00:00', 'createdAt': '2019-03-07 18:09:17.550432+00:00', 'fromOrganization': {'id': 'ocd-organization/e70e812e-44d2-46b0-9eae-2f3b9a1d5150', 'name': 'Assembly'}}}
{'bill': {'id': 'ocd-bill/7db9f396-5e81-48fd-a7a8-d449185c1cc8', 'identifier': 'AB 912', 'title': 'Marine invasive species: ballast water and biofouling management requirements.', 'classification': ['bill'], 'updatedAt': '2019-03-07 06:09:32.982145+00:00', 'createdAt': '2019-03-07 06:09:32.982128+00:00', 'fromOrganization': {'id': 'ocd-organization/98623fa0-f9c3-47ae-80bb-51465349ed71', 'name': 'Senate'}}}


In [46]:
# Group the records whose identifier belongs to a duplicated
# (identifier, title) pair: identifier -> list of records with that identifier.
# A set makes the membership test constant-time instead of scanning the list
# once per record.
duplicate_id_set = set(duplicate_session_ids)
duplicate_dictionary = {}
for bill in CA_list_of_bills:
    identifier = bill['bill']['identifier']
    if identifier in duplicate_id_set:
        # setdefault creates the list on first sight of an identifier.
        duplicate_dictionary.setdefault(identifier, []).append(bill)

In [47]:
# Number of distinct identifiers that have a duplicated (identifier, title) pair.
len(duplicate_dictionary)

743

In [56]:
# Collect the duplicated Assembly measures (identifier starting with 'A')
# that have at least one record attributed to the Senate. Each appended item
# is the full list of records sharing that identifier.
assembly_bills_with_bad_senate_version = []
for identifier, versions in duplicate_dictionary.items():
    has_senate_copy = any(
        version['bill']['fromOrganization']['name'] == 'Senate'
        for version in versions
    )
    # The dictionary key is the identifier shared by every record in `versions`.
    if has_senate_copy and identifier[:1] == 'A':
        # Append the records for THIS identifier; the previous code indexed the
        # dictionary with `item`, a name this loop never binds.
        assembly_bills_with_bad_senate_version.append(versions)

In [57]:
# Number of duplicated 'A...' identifiers that have a Senate-attributed record.
len(assembly_bills_with_bad_senate_version)

138

In [58]:
# Collect the duplicated Senate measures (identifier starting with 'S')
# that have at least one record attributed to the Assembly. Each appended item
# is the full list of records sharing that identifier.
senate_bills_with_bad_assembly_version = []
for identifier, versions in duplicate_dictionary.items():
    has_assembly_copy = any(
        version['bill']['fromOrganization']['name'] == 'Assembly'
        for version in versions
    )
    # The dictionary key is the identifier shared by every record in `versions`.
    if has_assembly_copy and identifier[:1] == 'S':
        # Append the records for THIS identifier; the previous code indexed the
        # dictionary with `item`, a name this loop never binds.
        senate_bills_with_bad_assembly_version.append(versions)

In [59]:
# Number of duplicated 'S...' identifiers that have an Assembly-attributed record.
len(senate_bills_with_bad_assembly_version)

605

In [60]:
# The two groups together should account for every duplicated (identifier, title) pair.
len(senate_bills_with_bad_assembly_version) + len(assembly_bills_with_bad_senate_version) == len(duplicate_bills)

True

In [65]:
# If a bill is a duplicate, the "bad" version is the one whose originating
# chamber doesn't match its identifier. That is, an "AB", "ACR" or "AJR" bill
# shouldn't be coming from the Senate, and an "SB", "SJR" or "SCR" bill
# shouldn't emanate from the Assembly.

to_purge = [
    entry
    for entry in CA_list_of_bills
    if entry['bill']['identifier'] in duplicate_session_ids
    and (
        (entry['bill']['identifier'][:1] == 'A'
         and entry['bill']['fromOrganization']['name'] == 'Senate')
        or (entry['bill']['identifier'][:1] == 'S'
            and entry['bill']['fromOrganization']['name'] == 'Assembly')
    )
]

In [66]:
# Each duplicated (identifier, title) pair should contribute exactly one
# wrong-chamber record to purge.
len(duplicate_bills) == len(to_purge)

True

In [75]:
# Print every stored record whose identifier is "AB 73".
ab_73_records = [
    entry for entry in CA_list_of_bills
    if entry['bill']['identifier'] == "AB 73"
]
for entry in ab_73_records:
    print(entry)

{'bill': {'id': 'ocd-bill/73e6c505-0527-4948-97bc-431682ead5c0', 'identifier': 'AB 73', 'title': 'Budget Act of 2019.', 'classification': ['bill'], 'updatedAt': '2019-01-05 00:07:43.397702+00:00', 'createdAt': '2019-01-05 00:07:43.397684+00:00', 'fromOrganization': {'id': 'ocd-organization/e70e812e-44d2-46b0-9eae-2f3b9a1d5150', 'name': 'Assembly'}}}
{'bill': {'id': 'ocd-bill/8c924d4a-9f49-468f-9fec-cf37990d5362', 'identifier': 'AB 73', 'title': 'Privacy: lodging and common carriers: state emergencies: Disaster Response-Emergency Operations Account.', 'classification': ['bill', 'appropriation'], 'updatedAt': '2019-03-08 18:09:03.449627+00:00', 'createdAt': '2019-03-07 06:09:06.380465+00:00', 'fromOrganization': {'id': 'ocd-organization/98623fa0-f9c3-47ae-80bb-51465349ed71', 'name': 'Senate'}}}


In [95]:
# Earlier we looked at bills that had both identical identifiers and identical
# titles. How about bills that just have the same identifier?
CA_bill_ids = []
for entry in CA_list_of_bills:
    CA_bill_ids.append(entry['bill']['identifier'])
bill_id_count = Counter(CA_bill_ids)
duplicate_identifiers_bills = [
    identifier
    for identifier, occurrences in bill_id_count.items()
    if occurrences > 1
]

In [106]:
# 9 cases where there is a duplicate identifier but unique title: identifiers
# shared by several records that are not among the duplicated
# (identifier, title) pairs.
# Sets keep the membership tests constant-time instead of scanning a list on
# every iteration.
known_duplicate_ids = set(duplicate_session_ids)
edge_cases = [
    identifier
    for identifier in duplicate_identifiers_bills
    if identifier not in known_duplicate_ids
]
edge_case_ids = set(edge_cases)
# identifier -> list of records carrying that identifier
edge_dictionary = {}
for bill in CA_list_of_bills:
    identifier = bill['bill']['identifier']
    if identifier in edge_case_ids:
        # setdefault creates the list on first sight of an identifier.
        edge_dictionary.setdefault(identifier, []).append(bill)

In [114]:
# This is ALL dupes: every identifier carried by more than one record,
# regardless of title, mapped to the list of records that carry it.
# A set makes the membership test constant-time instead of scanning the list
# once per record.
shared_identifier_set = set(duplicate_identifiers_bills)
all_duplicates_dictionary = {}
for bill in CA_list_of_bills:
    identifier = bill['bill']['identifier']
    if identifier in shared_identifier_set:
        # setdefault creates the list on first sight of an identifier.
        all_duplicates_dictionary.setdefault(identifier, []).append(bill)

In [115]:
# Number of identifiers carried by more than one record, regardless of title.
len(all_duplicates_dictionary)

752

In [126]:
# Let's truncate the timestamps to day precision: the first 10 characters
# (YYYY-MM-DD) of each timestamp string are parsed into a datetime at midnight.
# The records are modified in place, and they are the same dict objects that
# are stored in CA_list_of_bills.
# Only string values are converted, so re-running this cell is a no-op instead
# of failing when it tries to slice an already-converted datetime.
for versions in all_duplicates_dictionary.values():
    for bill in versions:
        record = bill['bill']
        for field in ('updatedAt', 'createdAt'):
            if isinstance(record[field], str):
                record[field] = datetime.strptime(record[field][:10], "%Y-%m-%d")

In [130]:
# Tally, per creation date, the records whose identifier's first letter
# differs from the first letter of the originating chamber's name
# (i.e. the "bad" copies).
time_error_dict = dict(Counter(
    entry['bill']['createdAt']
    for versions in all_duplicates_dictionary.values()
    for entry in versions
    if entry['bill']['identifier'][:1] != entry['bill']['fromOrganization']['name'][:1]
))

In [131]:
# Display how many wrong-chamber records were created on each date.
time_error_dict

{datetime.datetime(2019, 3, 7, 0, 0): 748,
 datetime.datetime(2019, 3, 8, 0, 0): 4}