In [49]:
import time
import pickle

In [2]:
from urllib.parse import quote
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup as bs

In [3]:
import pandas

## Build Bill URL and Request Page

In [4]:
# Base address of the bill look-up script; buildBillURL appends the query
# string (session and bill id) to this value.
bill_root = "http://www.ncleg.net/gascripts/BillLookUp/BillLookUp.pl?"

In [5]:
# Query-string prefixes for the two parameters used by buildBillURL.
session_head = "Session="
bill_head = "BillID="

### Example

In [6]:
# Example inputs used for the first test request below.
session = "2015"
bill_id = "H1"

In [7]:
def buildBillURL(session, bill_id):
    """Build the look-up URL for one bill.

    Parameters
    ----------
    session : str or any value convertible with ``str``
        Session query value, e.g. ``"2015"`` or ``"2015E4"``.
    bill_id : str or any value convertible with ``str``
        Bill identifier, e.g. ``"H1"``.

    Returns
    -------
    str
        ``bill_root`` followed by ``Session=<session>&BillID=<bill_id>``.

    Both values are converted to ``str`` and percent-encoded, so a non-string
    session (e.g. the integer 2015) no longer raises ``TypeError`` and
    characters such as ``&``, ``=`` or spaces cannot corrupt the query string.
    Plain alphanumeric values produce exactly the same URL as simple
    concatenation.
    """
    query_parts = [
        session_head + quote(str(session), safe=""),
        bill_head + quote(str(bill_id), safe=""),
    ]
    return bill_root + "&".join(query_parts)

In [8]:
# Build the URL for the example session/bill and display it.
bill_url = buildBillURL(session=session, bill_id=bill_id)
bill_url

'http://www.ncleg.net/gascripts/BillLookUp/BillLookUp.pl?Session=2015&BillID=H1'

In [9]:
# Open the example bill page. The response is not wrapped in ``with`` here;
# it is closed explicitly by a later ``page.close()`` cell.
page = urlopen(bill_url)

In [10]:
# HTTP status code of the response.
page.status

200

In [11]:
# Read the response body and show only its first 200 bytes.
# Note: read() consumes the stream, so the body cannot be read again from `page`.
page.read()[:200]

b'<!doctype html>\n<html>\n<head>\n\t<meta name="description" content="The Official Site of the North Carolina General Assembly.">\n\t<meta name="keywords" content="NCGA, North Carolina General Assembly, offi'

In [12]:
# Close the response opened with urlopen(bill_url).
page.close()

In [13]:
# Fetch house bills H1..H4 for the example session and keep the raw body of
# every response whose status code is below 300.
pages = []
for bill_number in range(1, 5):
    bill_id = "H{}".format(bill_number)
    bill_url = buildBillURL(session=session, bill_id=bill_id)
    with urlopen(bill_url) as response:
        if response.status >= 300:
            continue
        body = response.read()
        pages.append(body)

In [14]:
# Number of page bodies collected in `pages`.
len(pages)

4

### Load Session Values

In [15]:
# Session values: parse the saved <option> elements into a table with one row
# per session (the value sent in the query string and the displayed name).
with open("data/ncga_sessionToSearch", 'r') as f:
    session_html = f.read()

session_soup = bs(session_html, 'html.parser')

session_records = []
for option in session_soup.find_all('option'):
    record = {"query_value": option['value'], "name": option.text.strip()}
    session_records.append(record)

sessions = pandas.DataFrame(session_records)
sessions.shape

(33, 2)

In [16]:
# Keep only the first five rows of the sessions table and display them.
sessions_of_interest = sessions[:5]
sessions_of_interest

Unnamed: 0,name,query_value
0,2016 Extra Session 4,20150000.0
1,2016 Extra Session 3,2015000.0
2,2016 Extra Session 2,201500.0
3,2016 Extra Session 1,20150.0
4,2015-2016 Session,2015.0


#### Identifying a "Bad Page" / "Not a Bill" Page

In [17]:
# URL for bill id H11111 in session 2015, used as the example of a
# "not a bill" page in the cells that follow.
bad_page_ex = "http://www.ncleg.net/gascripts/BillLookUp/BillLookUp.pl?Session=2015&BillID=H11111"

In [18]:
# Read the body inside a ``with`` block so the connection is always closed,
# and keep the raw bytes in `bad_page`: unlike the response object, bytes can
# be parsed more than once (a response stream is exhausted after one read).
with urlopen(bad_page_ex) as bad_response:
    bad_page = bad_response.read()
bad_soup = bs(bad_page, 'html.parser')

In [19]:
# Find the body "title"
bad_soup.find('div', {"id":"title"})

<div id="title">Bill H11111 Not Found</div>

In [20]:
# Create a flag
bad_soup.find('div', {"id":"title"}).text.lower().find("not found") > -1

True

In [40]:
def isEmptyPage(page):
    """Return True when ``page`` is not a real bill page.

    Parameters
    ----------
    page : str, bytes or file-like
        Markup to parse; passed straight to BeautifulSoup.

    Returns
    -------
    bool
        True if the ``<div id="title">`` text contains "not found"
        (case-insensitive), or if the page has no such div at all.
        False otherwise.
    """
    soup = bs(page, 'html.parser')
    title = soup.find('div', {"id": "title"})
    if title is None:
        # find() returns None when the element is missing (empty body, error
        # page, already-consumed response). Such a page is not a bill, so
        # report it as empty instead of raising AttributeError on None.text.
        return True
    return "not found" in title.text.lower()

In [27]:
# Check the helper against the known "not found" example.
# NOTE(review): `bad_page` was already handed to BeautifulSoup in an earlier
# cell; if it is a response object its stream may be exhausted by now —
# confirm this cell still works after Restart & Run All.
isEmptyPage(bad_page)

True

#### Try the scrape...

In [69]:
# Query values of the selected sessions as a plain list; the scrape loop
# iterates over this list.
sess = sessions_of_interest['query_value'].tolist()
sess

['2015E4', '2015E3', '2015E2', '2015E1', '2015']

In [75]:
%%time

pages = []
for ses in sess:
    print('Session ' + ses)
    for house_flag in ["H", "S"]:
        print('  House ' + house_flag)
        stop = False
        counter = 1
        repeat_flag = False
        while not stop:
            if counter == 1 or counter % 100 == 0:
                print('\tBill ' + str(counter))
            time.sleep(0.75)
            bill_id = house_flag + str(counter)
            url = buildBillURL(session=ses, bill_id=bill_id)
            with urlopen(url) as page:
                if page.status < 300:
                    page = page.read()
                    if not isEmptyPage(page):
                        pages.append({'session' : ses, 'house' : house_flag, 'page' : page, 'bill' : counter})
                        counter += 1
                    else:
                        stop = True
                    repeat_flag = False
                else:
                    if repeat_flag == True:
                        stop = True
                    else:
                        repeat_flag = True
         ## DO NOT COMMENT  FOR TESTING
         #   if counter >= 1:
         #       stop = True
        print('  Total bills in {} session {}: {}'.format(house_flag, ses, counter-1))
                
#
print('\nTotal pages scrapped: {}'.format(len(pages)))
                
# Pickle the data
print('\nPickling data...')
with open('data/test_pages.pkl', 'wb') as f1:
    pickle.dump(pages, f1)
    
print('\nAll done!')
print('\n')

Session 2015E4
  House H
	Bill 1
  Total bills in H session 2015E4: 24
  House S
	Bill 1
  Total bills in S session 2015E4: 7
Session 2015E3
  House H
	Bill 1
  Total bills in H session 2015E3: 6
  House S
	Bill 1
  Total bills in S session 2015E3: 1
Session 2015E2
  House H
	Bill 1
  Total bills in H session 2015E2: 3
  House S
	Bill 1
  Total bills in S session 2015E2: 1
Session 2015E1
  House H
	Bill 1
  Total bills in H session 2015E1: 3
  House S
	Bill 1
  Total bills in S session 2015E1: 2
Session 2015
  House H
	Bill 1
	Bill 100
	Bill 200
	Bill 300
	Bill 400
	Bill 500
	Bill 600
	Bill 700
	Bill 800
	Bill 900
	Bill 1000
	Bill 1100
  Total bills in H session 2015: 1150
  House S
	Bill 1
	Bill 100
	Bill 200
	Bill 300
	Bill 400
	Bill 500
	Bill 600
	Bill 700
	Bill 800
	Bill 900
  Total bills in S session 2015: 903

Total pages scrapped: 2100

Pickling data...

All done!


CPU times: user 3min 18s, sys: 5.8 s, total: 3min 24s
Wall time: 1h 25min 3s
