# Getting the county asthma profile data for California

Get the asthma profile, together with demographic data, for every county in California, as shown at http://www.californiabreathing.org/asthma-data/county-asthma-profiles

Adapted from https://github.com/gkafka/Rehab4Rehab/blob/master/GetDataRehabCenters.ipynb

In [21]:
%matplotlib inline

# Web scraping
import requests
from bs4 import BeautifulSoup

# Data handling
import pandas as pd
import numpy as np
# import scipy as sp

# Graphing capabilities
import matplotlib.pyplot as plt
# import seaborn as sns

import json
# import matplotlib as mpl
# import string
import time

### Get the list of counties and respective pages

In [2]:
# Root of the site; the county links collected from the index page are
# relative and get appended to this.
baseurl = 'http://www.californiabreathing.org'
# Index page that lists every county asthma profile.
url = baseurl + '/asthma-data/county-asthma-profiles'

In [3]:
# Download the index page that lists every county profile.
# raise_for_status() turns HTTP error codes (404, 500, ...) into exceptions, so
# the success message is only printed when a usable page actually came back.
try:
    response = requests.get(url)
    response.raise_for_status()
except requests.exceptions.RequestException:
    print('Failed to get url page.')
    # Every following cell needs `response`; stop here instead of carrying on
    # with an undefined (or stale) value.
    raise
else:
    print('Successfully acquired page.')

Successfully acquired page.


In [4]:
# Parse the downloaded index page with the lxml backend.
# Alternative backend: BeautifulSoup(response.text, "html.parser")
soup = BeautifulSoup(response.text, features='lxml')

In [5]:
# Locate the <ul> element(s) whose class attribute marks the county profile list.
html_entries = soup.find_all('ul', class_='zoo-item-list zoo-list page-profiles')
print('Found %d unordered list(s)' % len(html_entries))

Found 1 unordered list(s)


In [6]:
# Every <li> of the first matching list is one county entry.
county_entries = html_entries[0].find_all('li')
N_counties = len(county_entries)
print('Found %d county entries' % N_counties)

Found 58 county entries


Find all counties and respective web pages

In [7]:
# The first <a> of each list item carries both the relative link ('href') and
# the county name ('title').
county_links = [entry.find_all('a')[0] for entry in county_entries]

# Relative URL of each county page, surrounding whitespace removed.
county_url = [link.get('href').strip() for link in county_links]

# County names, whitespace-trimmed and title-cased (first letter of every word
# upper case, the rest lower case).
county_name = [link.get('title').strip().title() for link in county_links]

In [8]:
# Show every county next to the absolute URL of its profile page.
for name, rel_url in zip(county_name, county_url):
    print('%s   %s%s' % (name.title(), baseurl, rel_url))

Alameda   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/alameda-county-asthma-profile
Alpine   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/alpine-county-asthma-profile
Amador   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/amador-county-asthma-profile
Butte   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/butte-county-asthma-profile
Calaveras   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/calaveras-county-asthma-profile
Colusa   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/colusa-county-asthma-profile
Contra Costa   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/contra-costa-county-asthma-profile
Del Norte   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/del-norte-county-asthma-profile
El Dorado   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/el-dorado-county-asthma-profile

## Now we need to go to each county page and scrape the data

### Get population data

In [29]:
# Scrape the population table of every county page into `population`:
#   population['header']   -> column names, 'population <row label>'
#   population[<county>]   -> the numbers, in the same order as the header
population = {'header': []}

for name, rel_url in zip(county_name, county_url):
    print(name)

    page_url = baseurl + rel_url
    try:
        response = requests.get(page_url)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Skip this county: carrying on would re-parse the previous county's
        # response and store its numbers under the wrong name.
        print('Failed to get url page %s' % page_url)
        continue

    soup = BeautifulSoup(response.text, 'lxml')

    # Find all tables containing data in the web page
    html_entries = soup.find_all('table', attrs={'class': 'datatable'})
    if not html_entries:
        print('No data tables found for %s' % name)
        continue

    table = html_entries[0]  # population data is in first table
    table_id = table.get('id')  # None when the table has no id attribute
    if table_id is not None:
        print('ID: %s' % table_id)

    # Ignore the first row: it is the table header
    rows = table.find_all('tr')[1:]

    # Get the column names once, from the first county that is processed
    if not population['header']:
        population['header'] = [
            'population ' + row.find_all('th')[0].get_text().lower()
            for row in rows
        ]

    # Get the data for this county
    values = []
    for j, row in enumerate(rows):
        # First value cell of the row, without thousands commas
        # (e.g. 10,000 -> 10000).
        text = row.find_all('td')[0].get_text().replace(',', '')
        # NOTE: this used to be eval(text). eval() on text fetched from the
        # network can execute arbitrary code, so the text is parsed strictly
        # as a number instead.
        try:
            val = int(text)
        except ValueError:
            val = float(text)
        print('%s: %d' % (population['header'][j], val))
        values.append(val)
    population[name] = values

    print('')

### Get ethnicity data

In [29]:
# Scrape the ethnicity table of every county page into `ethnicity`:
#   ethnicity['header']   -> column names, 'ethnicity <row label>'
#   ethnicity[<county>]   -> the values, in the same order as the header
#
# This cell used to be a verbatim copy of the population cell: it read table 0
# and overwrote `population`. The table id listing further down shows the
# ethnicity table at index 1 of the 'datatable' tables.
# NOTE(review): assumes the ethnicity table has the same layout as the
# population table (row label in <th>, first <td> numeric) -- TODO confirm.
ethnicity = {'header': []}

for name, rel_url in zip(county_name, county_url):
    print(name)

    page_url = baseurl + rel_url
    try:
        response = requests.get(page_url)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Skip this county: carrying on would re-parse the previous county's
        # response and store its numbers under the wrong name.
        print('Failed to get url page %s' % page_url)
        continue

    soup = BeautifulSoup(response.text, 'lxml')

    # Find all tables containing data in the web page
    html_entries = soup.find_all('table', attrs={'class': 'datatable'})
    if len(html_entries) < 2:
        print('No ethnicity table found for %s' % name)
        continue

    table = html_entries[1]  # ethnicity data is in second table
    table_id = table.get('id')  # None when the table has no id attribute
    if table_id is not None:
        print('ID: %s' % table_id)  # should read 'ethnicity'

    # Ignore the first row: it is the table header
    rows = table.find_all('tr')[1:]

    # Get the column names once, from the first county that is processed
    if not ethnicity['header']:
        ethnicity['header'] = [
            'ethnicity ' + row.find_all('th')[0].get_text().lower()
            for row in rows
        ]

    # Get the data for this county
    values = []
    for j, row in enumerate(rows):
        # First value cell of the row, without thousands commas
        # (e.g. 10,000 -> 10000).
        text = row.find_all('td')[0].get_text().replace(',', '')
        # NOTE: this used to be eval(text). eval() on text fetched from the
        # network can execute arbitrary code, so the text is parsed strictly
        # as a number instead.
        try:
            val = int(text)
        except ValueError:
            val = float(text)
        # %s rather than %d so that a fractional value is not shown truncated
        print('%s: %s' % (ethnicity['header'][j], val))
        values.append(val)
    ethnicity[name] = values

    print('')

In [14]:
# Print the id of every table currently held in `html_entries`, to see which
# index holds which dataset.
for t in html_entries:
    # Tag.get() returns the fallback when the attribute is missing, so no
    # catch-all `except:` is needed to handle tables without an id.
    print(t.get('id', 'No id found'))

population
ethnicity
prevalence
prevalence
riskfactors
No id found
No id found
No id found
numhospitalizations
No id found
No id found
disparities
hosprates
hospitals
edvisits


The tables of interest are at indices 0, 1, 4, ... of this list (0 = population, 1 = ethnicity, 4 = riskfactors, ...).