# Web Scraper for the Presidential Documents Collection

In [3]:
# Base
import numpy as np
import pandas as pd
import json
import re
import string
from os import listdir
import math
import time

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Webscraping
from bs4 import BeautifulSoup
import urllib.request

## Methods

In [5]:
# Generate the URLs to Expand the Target Year-Month on the Webpage
def get_expandURLs(year):
    URLs = []
    # year = 2015
    for i in ["01","02","03","04","05","06","07","08","09","10","11","12"]:
        URL = ("https://www.gpo.gov/fdsys/browse/collection.action?collectionCode=CPD" +
            "&browsePath=" + str(year) + "%2F" + i + 
               "&isCollapsed=false&leafLevelBrowse=false&isDocumentResults=true")
        URLs.append(URL)
    return URLs

In [6]:
# Function to extract Table Info for Each Year-Month
#* Involves Call to Website *#
def get_dataScrape(URLs):
    rowsAll = []
    for url in URLs:
        print(url)
        # Get HTML w/ Exaned Year-Month
        content = urllib.request.urlopen(url)
        soup = BeautifulSoup(content)

        # Extract Table Only
        table = soup.find("table", {"class": "browse-node-table"})

        # Make each TD a Row in a Dataframe
        rows = table.findAll("td")
        # print(rows[0:5])
        rowsAll = rowsAll + rows
        print(len(rowsAll))
        
    # rowsAll = pd.DataFrame(rowsAll)
    return rowsAll

In [7]:
def structure_dataScrape(rows):
    #print(len(rows))
    #print(len(rows)/4)
    
    # Recreate the 4 Column Table
    master = []
    for i in np.arange(0,len(rows),4):
        # print(i)
        master.append(rows[i:i+4])
    master2 = pd.DataFrame(master)
    
    return master2

In [8]:
def clean_dataScrape(master):
# Take Data from Original Scrape and Clean
    newMaster = pd.DataFrame()
    newMaster['title'] = [x.getText() for x in master[0]]
    # newMaster = pd.DataFrame(newMaster)
    newMaster['subtitle'] = [x.getText() for x in master[1]]
    newMaster['subtitle'] = [x.strip('\n') for x in newMaster['subtitle']]
    
    newMaster['pdf'] = np.NaN
    newMaster['txt'] = np.NaN
    newMaster['more'] = np.NaN
    for i in range(0,len(master)):
        for a in master[2][i].find_all('a', href=True):
            if a.contents == ['PDF']:
                newMaster.loc[i,'pdf'] = a['href']
            elif a.contents == ['Text']:
                newMaster.loc[i,'txt'] = a['href']
            elif a.contents == []:
                newMaster.loc[i,'more'] = a['href']
    return newMaster

In [12]:
def write_dataScrape(cleanMaster):
    for i in range(0,len(cleanMaster)):
        print(i)
        content = urllib.request.urlopen(cleanMaster['txt'].ix[i])
        soup = BeautifulSoup(content)

        fileName = "./"+str(yr)+"/"+cleanMaster['fileName'].ix[i]+".txt"

        with open(fileName, 'w') as f:
            text = soup.getText()
            text = text.strip('\n')
            f.write(text)

In [2]:
def write_docContent(link, file, yr):
        content = urllib.request.urlopen(link)
        soup = BeautifulSoup(content)

        fileName = "./"+str(yr)+"/"+file+".txt"

        with open(fileName, 'w') as f:
            text = soup.getText()
            text = text.strip('\n')
            f.write(text)

In [10]:
pwd

'/Users/hopeemac/Documents/Education/Classes/UVA MSDS (OLD)/16S/Text Mining/CPD'

In [None]:
time.sleep(30) # time in second

### Run Loop to get Base data over all yrs

In [None]:
for yr in range(1993,2015):
    URLs = get_expandURLs(yr)
    rows = get_dataScrape(URLs)
    master = structure_dataScrape(rows)
    cleanMaster = clean_dataScrape(master)
    cleanMaster['fileName'] = [file[0:file.find(" - ")-1] for file in cleanMaster['title']]
    cleanMaster['year'] = yr
    outputFileName = 'cpd_'+str(yr)+'.csv'
    cleanMaster.to_csv(outputFileName, index = False)
    # write_dataScrape(cleanMaster)
    print("Done "+str(yr)+"!")

## Run Loop to get Documents for all yrs

In [29]:
yr

2014

In [36]:
listdir('./'+str(yr))

['DCPD-201400001.txt',
 'DCPD-201400002.txt',
 'DCPD-201400003.txt',
 'DCPD-201400004.txt',
 'DCPD-201400005.txt',
 'DCPD-201400006.txt',
 'DCPD-201400007.txt',
 'DCPD-201400008.txt',
 'DCPD-201400009.txt',
 'DCPD-201400010.txt',
 'DCPD-201400011.txt',
 'DCPD-201400012.txt',
 'DCPD-201400013.txt',
 'DCPD-201400014.txt',
 'DCPD-201400015.txt',
 'DCPD-201400016.txt',
 'DCPD-201400017.txt',
 'DCPD-201400018.txt',
 'DCPD-201400019.txt',
 'DCPD-201400020.txt',
 'DCPD-201400021.txt',
 'DCPD-201400022.txt',
 'DCPD-201400023.txt',
 'DCPD-201400024.txt',
 'DCPD-201400025.txt',
 'DCPD-201400026.txt',
 'DCPD-201400027.txt',
 'DCPD-201400028.txt',
 'DCPD-201400029.txt',
 'DCPD-201400030.txt',
 'DCPD-201400031.txt',
 'DCPD-201400032.txt',
 'DCPD-201400033.txt',
 'DCPD-201400034.txt',
 'DCPD-201400035.txt',
 'DCPD-201400036.txt',
 'DCPD-201400037.txt',
 'DCPD-201400038.txt',
 'DCPD-201400039.txt',
 'DCPD-201400040.txt',
 'DCPD-201400041.txt',
 'DCPD-201400042.txt',
 'DCPD-201400043.txt',
 'DCPD-2014

In [5]:
for yr in [2010]:
    cleanMaster = pd.read_csv('cpd_'+str(yr)+'.csv')
    scapedDocs = listdir('./'+str(yr))
    for i in range(0,len(cleanMaster)):
        print(i)
        if cleanMaster['fileName'].ix[i]+".txt" not in scapedDocs:
            print("scraping",cleanMaster['fileName'].ix[i])
            write_docContent(cleanMaster['txt'].ix[i], cleanMaster['fileName'].ix[i], yr)
    print("Done "+str(yr)+"!")
    
    print("Sleeping...")
    time.sleep(60) # time in second
    print("Back at it again!")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
scraping DCPD-201000133
132
scraping DCPD-201000134
133
scraping DCPD-201000136
134
scraping DCPD-201000137
135
scraping DCPD-201000138
136
scraping DCPD-201000135
137
scraping DCPD-201000139
138
scraping DCPD-201000140


KeyboardInterrupt: 

In [25]:
years = list(np.arange(1993,2014)).reverse(); years

### Take the More Info Pages and Get Metadata

In [132]:
# Updated for Space b/t Breaks in Tags Section of Metadata
def get_metadata(url):
    content = urllib.request.urlopen(url)
    soup = BeautifulSoup(content)
    table = soup.findAll("table", {"class": "page-details-budget-metadata-table"})
    keyTable = table[1]
    metaRows = keyTable.findAll("td")
    
    metaIndex = [metaRows[i].getText() for i in np.arange(0,len(metaRows),2)]
    new = [metaRows[i].contents for i in np.arange(1,len(metaRows),2)]
    # print(len(new))
    
    #print(metaIndex)
    
    if 'Subjects' in metaIndex:
        newTags = []
        #  print(new[6])
        for item in new[metaIndex.index('Subjects')]:
            # print(item.string)
            if item.string is None:
                continue
            else:
                newTags.append(item)

        s = "; "
        string = s.join(newTags)
        string

        new[metaIndex.index('Subjects')] = [string]
        
    if 'Notes' in metaIndex:
        del new[metaIndex.index('Notes')]
        del metaIndex[metaIndex.index('Notes')]
        
    if 'SuDoc Class Number' in metaIndex:
        del new[metaIndex.index('SuDoc Class Number')]
        del metaIndex[metaIndex.index('SuDoc Class Number')]
        
    metaDataDF = pd.DataFrame(new, index = metaIndex)
    metaDataDF = metaDataDF.transpose()
    return metaDataDF

### Get Metadata for Each Yr

In [124]:
yr = '2009'

In [125]:
cleanMaster = pd.read_csv('cpd_'+yr+'.csv')
cleanMaster[0:2]

Unnamed: 0,title,subtitle,pdf,txt,more,fileName,year
0,44 WCPD 1593 - Digest of Other White House Ann...,"Supplementary Materials. Friday, January 2, 2009.",https://www.gpo.gov/fdsys/pkg/WCPD-2009-01-05/...,https://www.gpo.gov/fdsys/pkg/WCPD-2009-01-05/...,search/pagedetails.action;jsessionid=IHBnbOg-l...,44 WCPD 159,2009
1,44 WCPD 1594 - Nominations Submitted to the Se...,"Supplementary Materials. Friday, January 2, 2009.",https://www.gpo.gov/fdsys/pkg/WCPD-2009-01-05/...,https://www.gpo.gov/fdsys/pkg/WCPD-2009-01-05/...,search/pagedetails.action;jsessionid=IHBnbOg-l...,44 WCPD 159,2009


In [136]:
set1 = range(199,len(cleanMaster))

In [127]:
len(cleanMaster)

1097

In [137]:
# Iterate through the Rows in cleanMaster

for i in set1:
    print(i)
    url = "https://www.gpo.gov/fdsys/"+cleanMaster['more'][i]
    meta = get_metadata(url)
    meta['title'] = cleanMaster['title'][i]
    if i == 0:
        metaAll = meta
    else:
        metaAll = metaAll.append(meta)
    print(i," complete")
    
    if(i != 0 and i%200 == 0):
        print("Sleeping...")
        time.sleep(60) # time in second
        print("Back at it again!")
    

199
199  complete
200
200  complete
Sleeping...
Back at it again!
201
201  complete
202
202  complete
203
203  complete
204
204  complete
205
205  complete
206
206  complete
207
207  complete
208
208  complete
209
209  complete
210
210  complete
211
211  complete
212
212  complete
213
213  complete
214
214  complete
215
215  complete
216
216  complete
217
217  complete
218
218  complete
219
219  complete
220
220  complete
221
221  complete
222
222  complete
223
223  complete
224
224  complete
225
225  complete
226
226  complete
227
227  complete
228
228  complete
229
229  complete
230
230  complete
231
231  complete
232
232  complete
233
233  complete
234
234  complete
235
235  complete
236
236  complete
237
237  complete
238
238  complete
239
239  complete
240
240  complete
241
241  complete
242
242  complete
243
243  complete
244
244  complete
245
245  complete
246
246  complete
247
247  complete
248
248  complete
249
249  complete
250
250  complete
251
251  complete
252
252  complet

In [112]:
metaAll.iloc[262]

Category                               Budget and Presidential Materials
Collection                         Compilation of Presidential Documents
Document Categories    Executive Orders : Regulatory planning and rev...
Document Category                                                    NaN
Event Date                                              January 30, 2009
Locations                                                            NaN
Names                                                                NaN
Page Number Range                                                    NaN
President                                                Barack H. Obama
Publication Title                                                    NaN
Publisher              Office of the Federal Register, National Archi...
SuDoc Class Number                                                   NaN
Subjects               Government agencies and employees : Regulatory...
title                  DCPD-200900091  - Executive 

In [135]:
len(metaAll)

199

In [113]:
cleanMaster.iloc[262]

title       DCPD-200900199  - Remarks on the United States...
subtitle       Addresses and Remarks. Monday, March 30, 2009.
pdf         https://www.gpo.gov/fdsys/pkg/DCPD-200900199/p...
txt         https://www.gpo.gov/fdsys/pkg/DCPD-200900199/h...
more        search/pagedetails.action;jsessionid=kBVnbPvEb...
fileName                                       DCPD-200900199
year                                                     2009
Name: 262, dtype: object

In [141]:
len(metaAll) == len(cleanMaster)

True

In [134]:
len(cleanMaster)

1097

In [138]:
cleanMasterPlus = cleanMaster.merge(right = metaAll, on = 'title')

In [139]:
! pwd

/Users/hopeemac/Documents/Education/Classes/UVA MSDS (OLD)/16S/Text Mining/CPD


In [140]:
cleanMasterPlus.to_csv('./cpd_wMeta_'+yr+'.csv')

In [122]:
# metaAll.to_csv('./cpd_wMeta_'+yr+'.csv')

### Add Dates and Presidents to Metadata Files

In [None]:
# extract dates
for i in range(len(files)):    
    df = pd.read_csv('C:/Users/brian/Documents/UVA/Text Mining/Final Project/metadata/' + files[i])
    df['date'] = [x.split('.')[len(x.split('.'))-2].split(', ')[1] if len(x.split('.')[1])>0 else x.split(', ')[1] for x in df['subtitle']]
    df.to_csv('C:/Users/brian/Documents/UVA/Text Mining/Final Project/metadata/with_date_' + files[i])

# extract speaker from date
for i in range(len(files)):    
    df = pd.read_csv('C:/Users/brian/Documents/UVA/Text Mining/Final Project/metadata/with_date_' + files[i])
    prez = []
    for j in range(len(df['year'])):
        start_diff = [days_between(df['date'][j] + ", " + str(df['year'][j]),x) for x in terms['Start Date']]
        prez.append(terms['President'][start_diff.index(min([x for x in start_diff if x > 0])) - 1])
    df['President'] = prez
       
    df.to_csv('C:/Users/brian/Documents/UVA/Text Mining/Final Project/metadata/with_date_and_prez_' + files[i])

In [44]:
# read in presidential terms
terms = pd.read_csv('presidential_terms.csv')
terms['Start Date'] = [x.split(', ')[1] + ", " + x.split(', ')[2] for x in terms['Start Date']]
terms['End Date'] = [x.split(', ')[1] + ", " + x.split(', ')[2] for x in terms['End Date']]

In [52]:
def days_between(d1, d2):
    d1 = datetime.strptime(d1, "%A, %B %d, %Y")
    d2 = datetime.strptime(d2,  "%B %d, %Y")
    return (d2 - d1).days

In [64]:
y = 'Friday, January 15, 1993.'

In [76]:
for yr in range(1993,2017):
    df = pd.read_csv(datafilepath+'cpd_'+str(yr)+'.csv', encoding = 'iso-8859-15')
    # scapedDocs = listdir('./'+str(yr))
    df['date'] = [x.split('.')[len(x.split('.'))-2].lstrip() if len(x.split('.')[1])>0 else x.lstrip().replace(".","") for x in df['subtitle']]
    # [x.split('.')[len(x.split('.'))-2].split(', ')[1] if len(x.split('.')[1])>0 else x.split(', ')[1] for x in df['subtitle']]
    df.to_csv(datafilepath+'all_cpd_'+str(yr)+'.csv')

In [67]:
x.split('.')[1].lstrip() if len(x.split('.')[1])>0 else x.lstrip()

'Tuesday, January 26, 1993'

In [71]:
x.split('.')[1].lstrip() if len(x.split('.')[1])>0 else x.lstrip().replace(".","")

'Friday, January 15, 1993'

In [61]:
x = df.loc[100,'subtitle']; x

'Appointments and Nominations. Tuesday, January 26, 1993.'

In [62]:
x.split('.')[1]

' Tuesday, January 26, 1993'

In [31]:
df['date'] = [x.split('.')[1] for x in df['subtitle']]

In [38]:
df.loc[0,'date']

'Friday, January 1, 1993'

In [9]:
datafilepath = '/Users/hopeemac/Documents/Education/Classes/UVA MSDS (OLD)/16S/Text Mining/CPD/CPD/metadata/'

In [10]:
yr = '1993'
cleanMaster = pd.read_csv(datafilepath+'cpd_'+str(yr)+'.csv')

In [11]:
cleanMaster[0:5]

Unnamed: 0,title,subtitle,pdf,txt,more,fileName,year
0,29 WCPD 1 - Statement by Press Secretary Fitzw...,"Statements Other Than Presidential. Friday, Ja...",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg1,1993
1,29 WCPD 1 - Remarks at a State Dinner Hosted b...,"Addresses and Remarks. Saturday, January 2, 1993.",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg1-2,1993
2,29 WCPD 2 - The President's News Conference Wi...,"Interviews With the News Media. Sunday, Januar...",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg2,1993
3,29 WCPD 8 - Proclamation 6521--National Sancti...,"Proclamations. Monday, January 4, 1993.",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg8,1993
4,29 WCPD 9 - Recess Appointment of Gregory Stew...,"Appointments and Nominations. Monday, January ...",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg9,1993


In [48]:
# define datediff function
from datetime import datetime

In [50]:
df = pd.read_csv(datafilepath+'all_cpd_'+str(yr)+'.csv', encoding = 'iso-8859-15')

In [51]:
df[0:5]

Unnamed: 0.1,Unnamed: 0,title,subtitle,pdf,txt,more,fileName,year,date
0,0,29 WCPD 1 - Statement by Press Secretary Fitzw...,"Statements Other Than Presidential. Friday, Ja...",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg1,1993,"Friday, January 1, 1993"
1,1,29 WCPD 1 - Remarks at a State Dinner Hosted b...,"Addresses and Remarks. Saturday, January 2, 1993.",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg1-2,1993,"Saturday, January 2, 1993"
2,2,29 WCPD 2 - The President's News Conference Wi...,"Interviews With the News Media. Sunday, Januar...",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg2,1993,"Sunday, January 3, 1993"
3,3,29 WCPD 8 - Proclamation 6521--National Sancti...,"Proclamations. Monday, January 4, 1993.",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg8,1993,"Monday, January 4, 1993"
4,4,29 WCPD 9 - Recess Appointment of Gregory Stew...,"Appointments and Nominations. Monday, January ...",https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,https://www.gpo.gov/fdsys/pkg/WCPD-1993-01-11/...,search/pagedetails.action?collectionCode=CPD&b...,WCPD-1993-01-11-Pg9,1993,"Monday, January 4, 1993"


In [60]:
df['date'][100]

'Tuesday, January 26, 1993'

In [58]:
np.isnan(np.array(df['date']))

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [77]:
for yr in range(1993,2017):
    df = pd.read_csv(datafilepath+'all_cpd_'+str(yr)+'.csv', encoding = 'iso-8859-15')
    # scapedDocs = listdir('./'+str(yr))
    prez = []
    for j in range(len(df['year'])):
        #print(j)
        start_diff = [days_between(df['date'][j],x) for x in terms['Start Date']]
        prez.append(terms['President'][start_diff.index(min([x for x in start_diff if x > 0])) - 1])
    df['President'] = prez
    df.to_csv(datafilepath+'all_cpd_'+str(yr)+'.csv')

In [75]:
df.ix[368]

Unnamed: 0                                                  368
title         29 WCPD 532 - Remarks and an Exchange With Rep...
subtitle      Russia-U.S. Summit in Vancouver, Canada. Satur...
pdf           https://www.gpo.gov/fdsys/pkg/WCPD-1993-04-12/...
txt           https://www.gpo.gov/fdsys/pkg/WCPD-1993-04-12/...
more          search/pagedetails.action?collectionCode=CPD&b...
fileName                                  WCPD-1993-04-12-Pg532
year                                                       1993
date                                                          S
Name: 368, dtype: object

In [None]:
# extract speaker from date
for i in range(len(files)):    
    df = pd.read_csv('C:/Users/brian/Documents/UVA/Text Mining/Final Project/metadata/with_date_' + files[i])
    prez = []
    for j in range(len(df['year'])):
        start_diff = [days_between(df['date'][j],x) for x in terms['Start Date']]
        prez.append(terms['President'][start_diff.index(min([x for x in start_diff if x > 0])) - 1])
    df['President'] = prez