In [50]:
import time
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup as bs
import pickle

In [20]:
# Input: pickled table of bill links, read with pickle.load in the next cell.
bill_links_file = "data/bill_links_all.pkl"
# Output: pickle that receives the scraped 'Filed' bill pages at the end of the run.
bill_texts_file = "data/bill_texts_filed.pkl"

In [24]:
# Restore the bill-link table saved by an earlier step.
# NOTE: pickle.load executes arbitrary code from the file, so only use it on
# pickles produced by this project.
with open(bill_links_file, mode='rb') as links_fh:
    bill_links = pickle.load(links_fh)

In [25]:
# Preview the first rows to confirm the table loaded and to see its columns.
bill_links.head()

Unnamed: 0,session,house,bill,label,html,pdf
0,20150000.0,H,1,b'Filed',/Sessions/2015E4/Bills/House/HTML/H1v0.html,/Sessions/2015E4/Bills/House/PDF/H1v0.pdf
1,20150000.0,H,1,b'Edition 1',/Sessions/2015E4/Bills/House/HTML/H1v1.html,/Sessions/2015E4/Bills/House/PDF/H1v1.pdf
2,20150000.0,H,1,b'Edition 2',/Sessions/2015E4/Bills/House/HTML/H1v2.html,/Sessions/2015E4/Bills/House/PDF/H1v2.pdf
3,20150000.0,H,1,b'Adopted',/Sessions/2015E4/Bills/House/HTML/H1v3.html,/Sessions/2015E4/Bills/House/PDF/H1v3.pdf
4,20150000.0,H,2,b'Filed',/Sessions/2015E4/Bills/House/HTML/H2v0.html,/Sessions/2015E4/Bills/House/PDF/H2v0.pdf


In [26]:
# Preview the last rows as well, to see how the end of the table looks.
bill_links.tail()

Unnamed: 0,session,house,bill,label,html,pdf
6291,2015,S,902,b'Resolution 2016-22',/Sessions/2015/Bills/Senate/HTML/S902v2.html,/Sessions/2015/Bills/Senate/PDF/S902v2.pdf
6292,2015,S,903,b'Filed',/Sessions/2015/Bills/Senate/HTML/S903v0.html,/Sessions/2015/Bills/Senate/PDF/S903v0.pdf
6293,2015,S,903,b'Edition 1',/Sessions/2015/Bills/Senate/HTML/S903v1.html,/Sessions/2015/Bills/Senate/PDF/S903v1.pdf
6294,2015,S,903,b'Edition 2',/Sessions/2015/Bills/Senate/HTML/S903v2.html,/Sessions/2015/Bills/Senate/PDF/S903v2.pdf
6295,2015,S,903,b'Resolution 2016-23',/Sessions/2015/Bills/Senate/HTML/S903v3.html,/Sessions/2015/Bills/Senate/PDF/S903v3.pdf


In [30]:
# Summarise the version labels: row count, number of distinct labels, most common one.
bill_links["label"].describe()

count         6296
unique         478
top       b'Filed'
freq          2100
Name: label, dtype: object

In [34]:
# Keep only the rows whose label is the bytes value b'Filed'
# (the labels in this table are bytes, not str, so the comparison must be too).
is_filed = bill_links["label"] == b'Filed'
bill_links_filed = bill_links.loc[is_filed]

In [36]:
# Spot-check the filtered table: every row shown should carry the b'Filed' label.
bill_links_filed.head()

Unnamed: 0,session,house,bill,label,html,pdf
0,20150000.0,H,1,b'Filed',/Sessions/2015E4/Bills/House/HTML/H1v0.html,/Sessions/2015E4/Bills/House/PDF/H1v0.pdf
4,20150000.0,H,2,b'Filed',/Sessions/2015E4/Bills/House/HTML/H2v0.html,/Sessions/2015E4/Bills/House/PDF/H2v0.pdf
7,20150000.0,H,3,b'Filed',/Sessions/2015E4/Bills/House/HTML/H3v0.html,/Sessions/2015E4/Bills/House/PDF/H3v0.pdf
9,20150000.0,H,4,b'Filed',/Sessions/2015E4/Bills/House/HTML/H4v0.html,/Sessions/2015E4/Bills/House/PDF/H4v0.pdf
11,20150000.0,H,5,b'Filed',/Sessions/2015E4/Bills/House/HTML/H5v0.html,/Sessions/2015E4/Bills/House/PDF/H5v0.pdf


In [37]:
# (rows, columns) of the filtered table; the row count is the number of pages to scrape.
bill_links_filed.shape

(2100, 6)

In [38]:
# Print the relative HTML path of the first few filed bills, one per line.
for html_path in bill_links_filed.head()["html"]:
    print(html_path)

/Sessions/2015E4/Bills/House/HTML/H1v0.html
/Sessions/2015E4/Bills/House/HTML/H2v0.html
/Sessions/2015E4/Bills/House/HTML/H3v0.html
/Sessions/2015E4/Bills/House/HTML/H4v0.html
/Sessions/2015E4/Bills/House/HTML/H5v0.html


In [52]:
%%time

# Download the HTML page of every 'Filed' bill version, then pickle the results.
#
# Produces two lists of dicts:
#   bill_texts - one entry per page fetched, keys: session, house, bill, text
#                ('text' holds the raw bytes returned by the server)
#   missed     - one entry per page that could not be fetched, keys: session,
#                house, bill
# Every bill ends up in exactly one of the two lists.

root = "http://www.ncleg.net/"
max_attempts = 2  # one initial request plus a single retry
bill_texts = []
missed = []

print("Scraping bill 'Filed' texts...")

for i, bill in enumerate(bill_links_filed.itertuples()):

    # Progress message every 100 bills.
    if i % 100 == 0:
        print('Retrieveing text for Bill {}-{}-{}...'.format(bill.session, bill.house, bill.bill))

    # root ends with "/" and the stored paths start with "/", so the URL has a
    # double slash after the host; kept as-is because this is the form the
    # recorded run used.
    url = root + bill.html

    page_bytes = None
    for attempt in range(max_attempts):
        try:
            with urlopen(url) as response:
                # Only a success status counts as a usable page.
                if response.status < 300:
                    page_bytes = response.read()
        except (HTTPError, URLError):
            # HTTPError: the server answered with an error status.
            # URLError: the request never got an answer (DNS failure, refused
            # connection, ...). Either way, fall through and retry if allowed,
            # so one bad request does not abort the whole scrape.
            pass
        if page_bytes is not None:
            break

    record = {'session': bill.session,
              'house': bill.house,
              'bill': bill.bill}
    if page_bytes is None:
        # All attempts failed, whatever the reason: remember the bill so it
        # can be re-fetched later instead of silently dropping it.
        missed.append(record)
    else:
        record['text'] = page_bytes
        bill_texts.append(record)

    # Pause between bills to avoid hammering the server.
    time.sleep(1)

#
print('\nTotal pages scrapped: {}'.format(len(bill_texts)))
print('\nTotal pages skipped: {}'.format(len(missed)))

# Pickle the data
print('\nPickling data...')
with open(bill_texts_file, 'wb') as f1:
    pickle.dump(bill_texts, f1)

with open('data/missed_filed.pkl', 'wb') as f1:
    pickle.dump(missed, f1)


print('\nAll done!')
print('\n')

Scraping bill 'Filed' texts...
Retrieveing text for Bill 2015E4-H-1...
Retrieveing text for Bill 2015-H-54...
Retrieveing text for Bill 2015-H-154...
Retrieveing text for Bill 2015-H-254...
Retrieveing text for Bill 2015-H-354...
Retrieveing text for Bill 2015-H-454...
Retrieveing text for Bill 2015-H-554...
Retrieveing text for Bill 2015-H-654...
Retrieveing text for Bill 2015-H-754...
Retrieveing text for Bill 2015-H-854...
Retrieveing text for Bill 2015-H-954...
Retrieveing text for Bill 2015-H-1054...
Retrieveing text for Bill 2015-S-4...
Retrieveing text for Bill 2015-S-104...
Retrieveing text for Bill 2015-S-204...
Retrieveing text for Bill 2015-S-304...
Retrieveing text for Bill 2015-S-404...
Retrieveing text for Bill 2015-S-504...
Retrieveing text for Bill 2015-S-604...
Retrieveing text for Bill 2015-S-704...
Retrieveing text for Bill 2015-S-804...

Total pages scrapped: 2098

Total pages skipped: 2

Pickling data...

All done!


CPU times: user 4.48 s, sys: 2.19 s, total: 6.67

In [53]:
# Show the bills that could not be downloaded, so they can be checked by hand.
missed

[{'bill': 1, 'house': 'H', 'session': '2015'},
 {'bill': 2, 'house': 'H', 'session': '2015'}]