# Getting the list of counties in California and links to quick facts from Wikipedia

In [1]:
%matplotlib inline

# Web scraping
import requests
from bs4 import BeautifulSoup

# Data handling
# import pandas as pd
import numpy as np
# import scipy as sp

# Graphing capabilities
import matplotlib.pyplot as plt
# import seaborn as sns

import json
import pickle
import time

### Get the list of counties and respective links

In [2]:
# Wikipedia page that lists the counties of California; it is downloaded and
# parsed in the cells below.
url = 'https://en.wikipedia.org/wiki/List_of_counties_in_California'
# baseurl = 'http://www.californiabreathing.org'

In [3]:
# Download the county list page.
# Only network/HTTP failures are caught, and they are re-raised after the
# message: carrying on would leave `response` undefined (or stale from an
# earlier run) and make the next cell fail in a confusing way.
try:
    response = requests.get(url, timeout=30)
    # requests.get() does not raise on 4xx/5xx by itself; make those errors too
    response.raise_for_status()
except requests.exceptions.RequestException:
    print('Failed to get url page.')
    raise
else:
    print('Successfully acquired page.')

Successfully acquired page.


In [4]:
# Parse the downloaded HTML using the lxml parser
soup = BeautifulSoup(response.text, 'lxml')
# Alternative parser choice, kept for reference:
# soup = BeautifulSoup(response.text, "html.parser")

Find table with list of counties

In [24]:
# Collect every table whose class attribute is exactly "wikitable sortable".
# The full result list is kept in `tables` so the count printed below refers
# to a name that actually exists in a fresh kernel.
tables = soup.find_all('table', attrs={'class': 'wikitable sortable'})
print('Found %d table(s)' % len(tables))

if not tables:
    raise ValueError('No table with class "wikitable sortable" found on the page')

# The county list is the first matching table
table = tables[0]

Found 1 table(s)


Get headers

In [50]:
# Gather all header cells of the county table and show them with their
# position, so the interesting columns can be picked by index.
ths = table.find_all('th')

for position, header_cell in enumerate(ths):
    print('%d -  %s' % (position, header_cell.get_text()))

0 -  County
1 -  FIPS County Code
[3]
2 -  County seat
[4]
3 -  Established
[4]
4 -  Formed from
5 -  Etymology
[5]
6 -  Population
[6]
7 -  Area
[4]
8 -  Map


Choosing which columns we want

In [51]:
# Positions of the header cells to keep, as numbered in the listing above:
# 0 = County, 1 = FIPS County Code, 6 = Population, 7 = Area
cols= [0,1,6,7]

In [56]:
# Build the list of column names for the selected columns.
# Only the first line of each header cell is kept; the listing above shows
# that some cells carry a footnote marker such as "[3]" on a second line.
header = []
for col in cols:
    first_line = ths[col].get_text().split('\n')[0]
    header.append(first_line)
    print(first_line)

County
FIPS County Code
Population
Area


In [130]:
# All rows of the county table; the first one holds the column headers,
# so it is left out of the data rows.
all_rows = table.find_all('tr')
trs = all_rows[1:]
Nrows = len(trs)
print('Found %d rows' % Nrows)

Found 58 rows


In [161]:
def _clean_cell(col, text):
    """Clean the raw text of one cell of the Wikipedia county table.

    col  -- index of the column the cell was taken from
            (0 is the county name, 1 is the FIPS county code)
    text -- the cell text as returned by ``get_text()``

    Returns the cleaned text.
    """
    if col == 0:  # County: remove county text
        ind = text.lower().find('county')
        # find() returns -1 when "county" is absent; slicing with -1 would
        # silently chop the last character of the name, so only cut on a hit.
        if ind >= 0:
            text = text[:ind]
        text = text.strip().title()
    if col == 1:  # FIPS: prefix the county code with '06'
        text = '06' + text.strip()
    if u"\u2660" in text:
        # Certain cells contain a U+2660 character before the actual value
        # (presumably a sort key precedes it); keep only what follows and
        # drop the thousands separators.
        text = text.split(u"\u2660")[1].replace(',', '')
        ind = text.find('sq')
        if ind >= 0:  # keep only the part before the "sq..." unit text
            text = text[:ind].strip()
    return text


# NOTE: `table` is rebound here from the parsed HTML table to a plain list of
# cleaned rows; the following cells index it as table[i][j].
table = []
links = []
for i in range(Nrows):
    tds = trs[i].find_all('td')

    # Find link to census page in FIPS column. The last three characters of
    # the href are dropped so that the link ends at the FIPS code (compare
    # with the printed list of links).
    census_anchor = tds[1].find_all('a')[0]
    links.append(census_anchor.get('href')[:-3])

    row = [_clean_cell(col, tds[col].get_text()) for col in cols]

    print(row)
    table.append(row)

[u'Alameda', u'06001', u'1638215', u'738']
[u'Alpine', u'06003', u'1110', u'739']
[u'Amador', u'06005', u'37001', u'606']
[u'Butte', u'06007', u'225411', u'1640']
[u'Calaveras', u'06009', u'44828', u'1020']
[u'Colusa', u'06011', u'21482', u'1151']
[u'Contra Costa', u'06013', u'1126745', u'720']
[u'Del Norte', u'06015', u'27254', u'1008']
[u'El Dorado', u'06017', u'184452', u'1712']
[u'Fresno', u'06019', u'974861', u'5963']
[u'Glenn', u'06021', u'28017', u'1315']
[u'Humboldt', u'06023', u'135727', u'3573']
[u'Imperial', u'06025', u'180191', u'4175']
[u'Inyo', u'06027', u'18260', u'10192']
[u'Kern', u'06029', u'882176', u'8142']
[u'Kings', u'06031', u'150965', u'1390']
[u'Lake', u'06033', u'64591', u'1258']
[u'Lassen', u'06035', u'31345', u'4558']
[u'Los Angeles', u'06037', u'10170292', u'4060']
[u'Madera', u'06039', u'154998', u'2138']
[u'Marin', u'06041', u'261221', u'520']
[u'Mariposa', u'06043', u'17531', u'1451']
[u'Mendocino', u'06045', u'87649', u'3509']
[u'Merced', u'06047', u'26

In [165]:
# List of links: one census page URL per county, in table order
print links

['http://www.census.gov/quickfacts/table/PST045215/06001', 'http://www.census.gov/quickfacts/table/PST045215/06003', 'http://www.census.gov/quickfacts/table/PST045215/06005', 'http://www.census.gov/quickfacts/table/PST045215/06007', 'http://www.census.gov/quickfacts/table/PST045215/06009', 'http://www.census.gov/quickfacts/table/PST045215/06011', 'http://www.census.gov/quickfacts/table/PST045215/06013', 'http://www.census.gov/quickfacts/table/PST045215/06015', 'http://www.census.gov/quickfacts/table/PST045215/06017', 'http://www.census.gov/quickfacts/table/PST045215/06019', 'http://www.census.gov/quickfacts/table/PST045215/06021', 'http://www.census.gov/quickfacts/table/PST045215/06023', 'http://www.census.gov/quickfacts/table/PST045215/06025', 'http://www.census.gov/quickfacts/table/PST045215/06027', 'http://www.census.gov/quickfacts/table/PST045215/06029', 'http://www.census.gov/quickfacts/table/PST045215/06031', 'http://www.census.gov/quickfacts/table/PST045215/06033', 'http://www.c

Check names of counties

In [189]:
# Load the reference county names and overwrite any scraped name that differs.
# Assumes county_names.json holds a list of names in the same order as the
# rows of `table` -- TODO confirm against the file.
# The file is opened in a `with` block so the handle is always closed.
with open('county_names.json', 'r') as names_file:
    county_names = json.load(names_file)

for i in range(Nrows):
    if county_names[i] != table[i][0]:
        print('Does not match: %s != %s' % (county_names[i], table[i][0]))
        print('Correcting...')
        table[i][0] = county_names[i]

Save information of counties in file

In [184]:
# Write the header line followed by one comma-separated line per county.
# The `with` block closes the file even if one of the writes raises.
with open('county_info.csv', 'w') as fout:
    fout.write(','.join(header) + '\n')

    for i in range(Nrows):
        fout.write(','.join(table[i]) + '\n')

## Get info from the census pages for each county

In [192]:
# From here on the number of rows is the number of reference county names.
Nrows = len(county_names)

# Show each county next to the census page that will be scraped for it
for i, county in enumerate(county_names):
    print('%s   %s' % (county, links[i]))

Alameda   http://www.census.gov/quickfacts/table/PST045215/06001
Alpine   http://www.census.gov/quickfacts/table/PST045215/06003
Amador   http://www.census.gov/quickfacts/table/PST045215/06005
Butte   http://www.census.gov/quickfacts/table/PST045215/06007
Calaveras   http://www.census.gov/quickfacts/table/PST045215/06009
Colusa   http://www.census.gov/quickfacts/table/PST045215/06011
Contra Costa   http://www.census.gov/quickfacts/table/PST045215/06013
Del Norte   http://www.census.gov/quickfacts/table/PST045215/06015
El Dorado   http://www.census.gov/quickfacts/table/PST045215/06017
Fresno   http://www.census.gov/quickfacts/table/PST045215/06019
Glenn   http://www.census.gov/quickfacts/table/PST045215/06021
Humboldt   http://www.census.gov/quickfacts/table/PST045215/06023
Imperial   http://www.census.gov/quickfacts/table/PST045215/06025
Inyo   http://www.census.gov/quickfacts/table/PST045215/06027
Kern   http://www.census.gov/quickfacts/table/PST045215/06029
Kings   http://www.census.

### Now we need to go to each county page and scrape the data

In [373]:
# Download the census page of the first county to explore its layout.
first_link = links[0]
try:
    response = requests.get(first_link, timeout=30)
    # requests.get() does not raise on 4xx/5xx by itself; make those errors too
    response.raise_for_status()
except requests.exceptions.RequestException:
    # Report the URL that was actually requested, then stop: parsing a stale
    # `response` from an earlier cell would give misleading results.
    print('Failed to get url page %s' % first_link)
    raise

soup = BeautifulSoup(response.text, 'lxml')

# The statistics are read from the first table found on the page
tables = soup.find_all('table')
print('Found %d table(s)' % len(tables))
table = tables[0]
trs = table.find_all('tr')

Found 1 table(s)


In [374]:
# Print the label (first cell) of every census table row from index 3 on,
# together with its row index. Rows that contain header cells keep the empty
# placeholder and therefore show up as "[]".
for t, tr in enumerate(trs[3:], 3):
    text = []
    if not tr.find_all('th'):  # only consider rows that do not have header info
        text = tr.find_all('td')[0].get_text().strip()
        # Drop a leading "i" marker followed by a space or newline
        if text.startswith(('i ', 'i\n')):
            text = text[2:].strip()
        # Drop a three-character trailing tag ending in "a)", "b)" or "c)"
        # (presumably "(a)", "(b)", "(c)")
        if text.endswith(('a)', 'b)', 'c)')):
            text = text[:-3].strip()

    print('%d -  %s' % (t, text))

3 -  Population estimates, July 1, 2016,  (V2016)
4 -  Population estimates, July 1, 2015,  (V2015)
5 -  Population estimates base, April 1, 2010,  (V2016)
6 -  Population estimates base, April 1, 2010,  (V2015)
7 -  Population, percent change - April 1, 2010 (estimates base) to July 1, 2016,  (V2016)
8 -  Population, percent change - April 1, 2010 (estimates base) to July 1, 2015,  (V2015)
9 -  Population, Census, April 1, 2010
10 -  []
11 -  Persons under 5 years, percent, July 1, 2015,  (V2015)
12 -  Persons under 5 years, percent, April 1, 2010
13 -  Persons under 18 years, percent, July 1, 2015,  (V2015)
14 -  Persons under 18 years, percent, April 1, 2010
15 -  Persons 65 years and over, percent,  July 1, 2015,  (V2015)
16 -  Persons 65 years and over, percent, April 1, 2010
17 -  Female persons, percent,  July 1, 2015,  (V2015)
18 -  Female persons, percent, April 1, 2010
19 -  []
20 -  White alone, percent, July 1, 2015,  (V2015)
21 -  White alone, percent, April 1, 2010
22 -  

Columns to pick from census data

In [375]:
# Indices into `trs` (the rows of the census table) of the statistics to keep,
# chosen from the numbered listing above. Note that this reuses the name `cols`.
cols= [4,11,13,15,17,20,22,24,26,28,30,32,34,38,50,54,55,58,71,73,88,89]

In [389]:
# Collect the label of each selected census row to use as a column name.
# Commas are removed from the labels; rows containing header cells contribute
# the empty placeholder instead of a label.
header = []
for t in cols:
    tr = trs[t]
    text = []
    if not tr.find_all('th'):  # only consider rows that do not have header info
        text = tr.find_all('td')[0].get_text().strip().replace(',', '')
        # Drop a leading "i" marker followed by a space or newline
        if text.startswith(('i ', 'i\n')):
            text = text[2:].strip()
        # Drop a three-character trailing tag ending in "a)", "b)" or "c)"
        # (presumably "(a)", "(b)", "(c)")
        if text.endswith(('a)', 'b)', 'c)')):
            text = text[:-3].strip()

    header.append(text)

In [390]:
# Display the scraped labels for inspection
header

[u'Population estimates July 1 2015  (V2015)',
 u'Persons under 5 years percent July 1 2015  (V2015)',
 u'Persons under 18 years percent July 1 2015  (V2015)',
 u'Persons 65 years and over percent  July 1 2015  (V2015)',
 u'Female persons percent  July 1 2015  (V2015)',
 u'White alone percent July 1 2015  (V2015)',
 u'Black or African American alone percent July 1 2015  (V2015)',
 u'American Indian and Alaska Native alone percent July 1 2015  (V2015)',
 u'Asian alone percent July 1 2015  (V2015)',
 u'Native Hawaiian and Other Pacific Islander alone percent July 1 2015  (V2015)',
 u'Two or More Races percent July 1 2015  (V2015)',
 u'Hispanic or Latino percent July 1 2015  (V2015)',
 u'White alone not Hispanic or Latino percent July 1 2015  (V2015)',
 u'Foreign born persons percent 2011-2015',
 u'Persons per household 2011-2015',
 u'High school graduate or higher percent of persons age 25 years+ 2011-2015',
 u"Bachelor's degree or higher percent of persons age 25 years+ 2011-2015",
 u'P

In [391]:
# Hand-written column names that replace the scraped labels.
# There is one entry per index in `cols`, and the order must match `cols`
# because the data rows are built by iterating over `cols`.
header=[u'Population',
 u'Persons under 5 years percent',
 u'Persons under 18 years percent',
 u'Persons 65 years and over percent',
 u'Female persons percent',
 u'White alone percent',
 u'Black or African American alone percent',
 u'American Indian and Alaska Native alone percent',
 u'Asian alone percent',
 u'Native Hawaiian and Other Pacific Islander alone percent',
 u'Two or More Races percent',
 u'Hispanic or Latino percent',
 u'White alone  not Hispanic or Latino percent',
 u'Foreign born persons percent',
 u'Persons per household',
 u'High school graduate or higher percent of persons age 25 years+',
 u"Bachelor's degree or higher percent of persons age 25 years+",
 u'Persons  without health insurance  under age 65 years percent',
 u'Median household income (in 2015 dollars)',
 u'Persons in poverty percent',
 u'Population per square mile',
 u'Land area in square miles']

In [379]:
# Scrape the selected statistics from the census page of every county.
# `data` ends up with one row per county, holding one value per index in `cols`.
data = []

for i in range(Nrows):
    try:
        response = requests.get(links[i], timeout=30)
        # requests.get() does not raise on 4xx/5xx by itself; make those errors too
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Stop instead of continuing: reusing the previous county's `response`
        # would silently store that county's numbers under the wrong name.
        print('Failed to get url page %s' % links[i])
        raise

    soup = BeautifulSoup(response.text, 'lxml')

    # The statistics are read from the first table found on the page
    tables = soup.find_all('table')
    table = tables[0]
    trs = table.find_all('tr')

    row = []
    for t in cols:
        text = []
        if trs[t].find_all('th') == []:  # only consider rows that do not have header info
            # The value is taken from the second cell of the row; separators,
            # percent and dollar signs are removed from it.
            text = trs[t].find_all('td')[1].get_text()
            text = text.replace(',', '').replace('%', '').replace('&nbsp', '').replace('$', '').strip()

        row.append(text)

    data.append(row)

Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)
Found 1 table(s)


Save information of counties in file

In [392]:
# Write the header line, then one line per county: its name followed by the
# scraped values. The `with` block closes the file even if a write raises.
with open('county_census.csv', 'w') as fout:
    fout.write('County,' + ','.join(header) + '\n')

    for i in range(Nrows):
        fout.write(county_names[i] + ',' + ','.join(data[i]) + '\n')