# Getting the county asthma profile data for California

Get the county asthma profile together with demographics data from all counties in California, as shown in http://www.californiabreathing.org/asthma-data/county-asthma-profiles

Adapted from https://github.com/gkafka/Rehab4Rehab/blob/master/GetDataRehabCenters.ipynb

In [1]:
%matplotlib inline

# Web scraping
import requests
from bs4 import BeautifulSoup

# Data handling
# import pandas as pd
import numpy as np
# import scipy as sp

# Graphing capabilities
import matplotlib.pyplot as plt
# import seaborn as sns

import json
# import time

### Get the list of counties and respective pages

In [129]:
# Site root; the per-county links scraped later are appended to it.
baseurl = 'http://www.californiabreathing.org'
# Index page that lists the asthma profile of every county.
url = baseurl + '/asthma-data/county-asthma-profiles'

In [130]:
# Download the index page that lists every county profile.
try:
    response = requests.get(url)
    # requests does not raise for HTTP error codes on its own; turn a
    # 4xx/5xx answer into an exception so it is not reported as a success.
    response.raise_for_status()
except requests.exceptions.RequestException:
    print('Failed to get url page.')
    # Re-raise: without a valid `response` the following cells cannot run.
    raise
else:
    print('Successfully acquired page.')

Successfully acquired page.


In [131]:
# Parse the downloaded HTML text into a BeautifulSoup tree using the lxml parser.
soup = BeautifulSoup(response.text, 'lxml')
# Alternative parser choice:
# soup = BeautifulSoup(response.text, "html.parser")

In [132]:
# Collect the <ul> element(s) whose class attribute marks the list of
# county profiles, and report how many were found.
html_entries = soup.find_all(
    'ul',
    attrs={'class': 'zoo-item-list zoo-list page-profiles'}
)
print('Found %d unordered list(s)' % len(html_entries))

Found 1 unordered list(s)


In [133]:
# Each <li> of the first matching list is treated as one county entry.
county_entries = html_entries[0].find_all('li')
N_counties = len(county_entries)
print('Found %d county entries' % N_counties)

Found 58 county entries


Find all counties and their respective web pages

In [134]:
# Take the first <a> of every list entry: its href is the (relative) page
# address of the county, its title attribute the county name.
county_links = [entry.find_all('a')[0] for entry in county_entries]

county_url = np.array([link.get('href').strip() for link in county_links])
# Strip surrounding whitespace and title-case the name
# (first letter of each word upper case, the rest lower case).
county_name = np.array(
    [link.get('title').strip().title() for link in county_links]
)

Making sure the counties are in alphabetical order

In [8]:
# Reorder names and addresses with the same permutation so that entry k of
# both arrays keeps describing the same county.
indsSort = np.argsort(county_name)
county_name, county_url = county_name[indsSort], county_url[indsSort]

In [9]:
# Save the sorted county names as a JSON list.
filename = 'county_names.json'
# The context manager closes the file even if json.dump raises.
with open(filename, "w") as outfile:
    json.dump(county_name.tolist(), outfile)

In [10]:
# Print every county next to the absolute address of its profile page.
# Walking both arrays in lockstep replaces the Python-2-only xrange index loop.
for name, path in zip(county_name, county_url):
    print('%s   %s%s' % (name.title(), baseurl, path))

Alameda   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/alameda-county-asthma-profile
Alpine   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/alpine-county-asthma-profile
Amador   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/amador-county-asthma-profile
Butte   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/butte-county-asthma-profile
Calaveras   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/calaveras-county-asthma-profile
Colusa   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/colusa-county-asthma-profile
Contra Costa   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/contra-costa-county-asthma-profile
Del Norte   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/del-norte-county-asthma-profile
El Dorado   http://www.californiabreathing.org/asthma-data/county-asthma-profiles/el-dorado-county-asthma-profil

## Now we need to go to each county page and scrape the data

### Get population data

Data will be saved in a csv file

In [36]:
# Output file for the per-county population data. The handle is left open
# on purpose: the header and the data rows are written by later cells.
filename = 'Population_AgeGroup_byCounty.csv'
fout = open(filename, 'w')

Write the header for the csv file

In [37]:
# CSV header: county name followed by one column per scraped table row.
# NOTE(review): the column order is assumed to match the row order of the
# population table on the county pages — confirm against the site.
line ='County,0-4,5-17,18-64,65+,Total'
fout.write(line+'\n')

In [38]:
# Scrape the population table of every county page and write one CSV row
# per county to `fout` (opened in an earlier cell). `fout` is closed when the
# loop ends, whether or not it completed.


def _parse_number(text):
    """Return `text` as an int, or as a float when it is not an integer.

    Replaces eval() so that text downloaded from the web is never executed.
    Raises ValueError when `text` is not a plain number.
    """
    text = text.strip()
    try:
        return int(text)
    except ValueError:
        return float(text)


dict_tmp = {}

# Population data is in the first table
ind = 0

try:
    for name, path in zip(county_name, county_url):
        print(name)
        page_url = baseurl + path

        try:
            response = requests.get(page_url)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Skip this county: carrying on would reuse the previous
            # county's `response` and file its numbers under this name.
            print('Failed to get url page %s' % page_url)
            print('')
            continue

        soup = BeautifulSoup(response.text, 'lxml')

        # Find all tables containing data in the web page
        tables = soup.find_all('table')
        table = tables[ind]

        # Show the table id, when it has one, as a sanity check.
        table_id = table.get('id')
        if table_id is not None:
            print('ID: %s' % table_id)

        # Kept from the original cell: only the county key is recorded.
        dict_tmp[name] = []

        fields = [name]
        for row in table.find_all('tr')[1:]:  # ignore first row: table header
            cell = row.find_all('td')[0]  # get the first table value
            # Drop the thousands separators, e.g. 10,000 -> 10000
            val = _parse_number(cell.get_text().replace(',', ''))
            print(val)
            fields.append(str(val))

        print('')
        fout.write(','.join(fields) + '\n')
finally:
    # Release the file handle even if a page could not be parsed.
    fout.close()

Alameda
ID: population
99911
248516
1022113
196707
1567248

Alpine
ID: population
35
210
683
219
1148

Amador
ID: population
1341
4466
22016
9011
36833

Butte
ID: population
11474
33791
140503
37584
223353

Calaveras
ID: population
1692
6109
26210
10992
45004

Colusa
ID: population
1590
4524
13132
2837
22083

Contra Costa
ID: population
62767
189096
679089
153863
1084815

Del Norte
ID: population
1511
4144
18464
4430
28549

El Dorado
ID: population
8414
29251
112898
31798
182360

Fresno
ID: population
79872
199933
585623
107296
972724

Glenn
ID: population
2041
5501
16985
4098
28626

Humboldt
ID: population
6806
19018
88330
21237
135392

Imperial
ID: population
15369
37095
112755
21524
186744

Inyo
ID: population
954
2635
11232
3834
18656

Kern
ID: population
73380
183155
540506
90089
887129

Kings
ID: population
12201
29268
99322
13938
154729

Lake
ID: population
3441
9284
39190
12968
64884

Lassen
ID: population
1406
4113
24387
4012
33918

Los Angeles
ID: population
670558
1651655
64

### Get ethnicity data

In [39]:
# Ethnicity data is in the second table of each county page
# (zero-based index 1 into the list of <table> elements found on the page).
ind= 1

Data will be saved in a csv file

In [40]:
# Output file for the per-county ethnicity data. The handle is left open
# on purpose: the header and the data rows are written by later cells.
filename = 'Ethnicity_byCounty.csv'
fout = open(filename, 'w')

Write the header for the csv file

In [41]:
# CSV header: county name followed by one column per scraped table row.
# NOTE(review): the column order is assumed to match the row order of the
# ethnicity table on the county pages — confirm against the site.
line ='County,Asian,Black,Hispanic,White,Other'
fout.write(line+'\n')

In [42]:
# Scrape the ethnicity table (table number `ind`, set in an earlier cell) of
# every county page and write one CSV row per county to `fout`. `fout` is
# closed when the loop ends, whether or not it completed.


def _parse_number(text):
    """Return `text` as an int, or as a float when it is not an integer.

    Replaces eval() so that text downloaded from the web is never executed.
    Raises ValueError when `text` is not a plain number.
    """
    text = text.strip()
    try:
        return int(text)
    except ValueError:
        return float(text)


dict_tmp = {}

try:
    for name, path in zip(county_name, county_url):
        print(name)
        page_url = baseurl + path

        try:
            response = requests.get(page_url)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Skip this county: carrying on would reuse the previous
            # county's `response` and file its numbers under this name.
            print('Failed to get url page %s' % page_url)
            print('')
            continue

        soup = BeautifulSoup(response.text, 'lxml')

        # Find all tables containing data in the web page
        tables = soup.find_all('table')
        table = tables[ind]

        # Show the table id, when it has one, as a sanity check.
        table_id = table.get('id')
        if table_id is not None:
            print('ID: %s' % table_id)

        # Kept from the original cell: only the county key is recorded.
        dict_tmp[name] = []

        fields = [name]
        for row in table.find_all('tr')[1:]:  # ignore first row: table header
            cell = row.find_all('td')[0]  # get the first table value
            val = _parse_number(cell.get_text())
            print(val)
            fields.append(str(val))

        print('')
        fout.write(','.join(fields) + '\n')
finally:
    # Release the file handle even if a page could not be parsed.
    fout.close()

Alameda
ID: ethnicity
27.5
12.1
23.8
32.4
4.2

Alpine
ID: ethnicity
0.0
0.0
6.4
73.8
19.8

Amador
ID: ethnicity
1.4
2.6
13.8
77.9
4.2

Butte
ID: ethnicity
4.9
1.4
15.6
73.1
5.0

Calaveras
ID: ethnicity
1.3
0.8
10.9
82.9
4.1

Colusa
ID: ethnicity
1.4
0.8
56.5
38.7
2.6

Contra Costa
ID: ethnicity
14.9
8.6
25.4
47.0
4.1

Del Norte
ID: ethnicity
3.3
4.3
18.4
63.5
10.5

El Dorado
ID: ethnicity
4.0
0.7
13.2
78.4
3.7

Fresno
ID: ethnicity
9.6
4.9
51.7
31.4
2.5

Glenn
ID: ethnicity
2.5
0.7
39.8
53.6
3.4

Humboldt
ID: ethnicity
2.7
1.1
10.6
76.0
9.6

Imperial
ID: ethnicity
0.9
3.2
80.7
13.7
1.5

Inyo
ID: ethnicity
1.4
0.6
20.8
64.7
12.6

Kern
ID: ethnicity
3.7
5.4
50.4
38.0
2.6

Kings
ID: ethnicity
3.5
7.8
50.3
35.2
3.3

Lake
ID: ethnicity
1.1
1.8
18.1
73.1
5.8

Lassen
ID: ethnicity
1.5
9.1
16.3
67.5
5.7

Los Angeles
ID: ethnicity
13.9
7.9
49.5
26.5
2.2

Madera
ID: ethnicity
1.7
3.2
56.2
36.0
2.7

Marin
ID: ethnicity
5.9
2.8
17.0
71.0
3.2

Mariposa
ID: ethnicity
1.2
0.6
10.1
82.3
5.7

Mendocino

### Get risk factors data

In [135]:
# Risk factors data is in the table at zero-based index 4, i.e. the fifth
# <table> element found on each county page.
ind= 4

Data will be saved in a csv file

In [136]:
# Output file for the per-county risk factor data. The handle is left open
# on purpose: the header and the data rows are written by later cells.
filename = 'RiskFactors_byCounty.csv'
fout = open(filename, 'w')

Write the header for the csv file

In [137]:
# CSV header: county name followed by one column per scraped table row.
# NOTE(review): the column order is assumed to match the row order of the
# risk factors table on the county pages — confirm against the site.
line ='County,Adult smokers,Second-hand smoke,Adult obese,Federal poverty,Unemployment'
fout.write(line+'\n')

In [138]:
# Scrape the risk factors table (table number `ind`, set in an earlier cell)
# of every county page and write one CSV row per county to `fout`. Cells
# starting with '--' become empty CSV fields. `fout` is closed when the loop
# ends, whether or not it completed.


def _parse_number(text):
    """Return `text` as an int, or as a float when it is not an integer.

    Replaces eval() so that text downloaded from the web is never executed.
    Raises ValueError when `text` is not a plain number.
    """
    text = text.strip()
    try:
        return int(text)
    except ValueError:
        return float(text)


dict_tmp = {}

try:
    for name, path in zip(county_name, county_url):
        print(name)
        page_url = baseurl + path

        try:
            response = requests.get(page_url)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Skip this county: carrying on would reuse the previous
            # county's `response` and file its numbers under this name.
            print('Failed to get url page %s' % page_url)
            print('')
            continue

        soup = BeautifulSoup(response.text, 'lxml')

        # Find all tables containing data in the web page
        tables = soup.find_all('table')
        table = tables[ind]

        # Show the table id, when it has one, as a sanity check.
        table_id = table.get('id')
        if table_id is not None:
            print('ID: %s' % table_id)

        # Kept from the original cell: only the county key is recorded.
        dict_tmp[name] = []

        fields = [name]
        for row in table.find_all('tr')[1:]:  # ignore first row: table header
            cell = row.find_all('td')[0]  # get the first table value
            text = cell.get_text()
            if text[:2] == '--':  # no value: put empty cell
                val = ''
            else:
                try:
                    # Keep what precedes an opening parenthesis, if any.
                    val = _parse_number(text.split('(')[0])
                except ValueError:
                    # Otherwise the number is taken to be the first
                    # space-separated token of the cell.
                    val = _parse_number(text.split(' ')[0])
            print(val)
            fields.append(str(val))

        print('')
        fout.write(','.join(fields) + '\n')
finally:
    # Release the file handle even if a page could not be parsed.
    fout.close()

Alameda
ID: riskfactors
11.0
7.4
21.0
10.8
10.7

Alpine
ID: riskfactors
16.9
8.0
18.7

14.0

Amador
ID: riskfactors
16.9
8.0
18.7
8.3
11.7

Butte
ID: riskfactors
16.7
14.5
23.8
18.9
12.5

Calaveras
ID: riskfactors
16.9
8.0
18.7
8.6
14.1

Colusa
ID: riskfactors
15.1
3.9
38.2
13.3
18.3

Contra Costa
ID: riskfactors
10.6
6.2
24.0
9.4
10.3

Del Norte
ID: riskfactors
15.2
11.5
31.4
20.2
12.2

El Dorado
ID: riskfactors
11.5
4.2
22.9
7.6
11.3

Fresno
ID: riskfactors
13.9
6.5
30.0
21.6
15.1

Glenn
ID: riskfactors
15.1
3.9
38.2
17.7
14.6

Humboldt
ID: riskfactors
17.4
4.9
27.6
19.0
11.0

Imperial
ID: riskfactors
12.6
3.8
41.7
22.4
28.2

Inyo
ID: riskfactors
16.9
8.0
18.7

9.1

Kern
ID: riskfactors
18.1
5.5
33.2
20.6
14.4

Kings
ID: riskfactors
14.8
2.9
36.6
17.4
14.6

Lake
ID: riskfactors
26.2
8.9
26.4
19.4
15.6

Lassen
ID: riskfactors
15.2
11.5
31.4
15.8
12.9

Los Angeles
ID: riskfactors
13.2
6.7
24.7
15.4
11.6

Madera
ID: riskfactors
15.2
7.1
34.4
16.9
13.7

Marin
ID: riskfactors
5.5
4.6
13.9

### Get payment sources data

In [145]:
# Payment sources data is in the table at zero-based index 7, i.e. the
# eighth <table> element found on a county page.
ind= 7

Data will be saved in a csv file

In [146]:
# Output file for the per-county payment source data. The handle is left
# open on purpose: the header and the data rows are written by later cells.
filename = 'PaymentSources_byCounty.csv'
fout = open(filename, 'w')

Write the header for the csv file

In [147]:
# CSV header: county name followed by one column per scraped table row.
# NOTE(review): the column order is assumed to match the row order of the
# payment sources table on the county pages — confirm against the site.
line ='County,Medicare,Medi-Cal,Private,Other'
fout.write(line+'\n')

In [148]:
# Scrape the payment sources table (table number `ind`, set in an earlier
# cell) of every county page and write one CSV row per county to `fout`.
# Cells starting with '--' become empty CSV fields. `fout` is closed when the
# loop ends, whether or not it completed.


def _parse_number(text):
    """Return `text` as an int, or as a float when it is not an integer.

    Replaces eval() so that text downloaded from the web is never executed.
    Raises ValueError when `text` is not a plain number.
    """
    text = text.strip()
    try:
        return int(text)
    except ValueError:
        return float(text)


dict_tmp = {}

# Per-county correction of the table position.
table_shift = {'San Diego': -1}  # San Diego has one missing table

try:
    for name, path in zip(county_name, county_url):
        print(name)
        page_url = baseurl + path

        try:
            response = requests.get(page_url)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Skip this county: carrying on would reuse the previous
            # county's `response` and file its numbers under this name.
            print('Failed to get url page %s' % page_url)
            print('')
            continue

        soup = BeautifulSoup(response.text, 'lxml')

        # Find all tables containing data in the web page
        tables = soup.find_all('table')
        table = tables[ind + table_shift.get(name, 0)]

        # Show the table id, when it has one, as a sanity check.
        table_id = table.get('id')
        if table_id is not None:
            print('ID: %s' % table_id)

        # Kept from the original cell: only the county key is recorded.
        dict_tmp[name] = []

        fields = [name]
        for row in table.find_all('tr')[1:]:  # ignore first row: table header
            cell = row.find_all('td')[0]  # get the first table value
            text = cell.get_text()
            if text[:2] == '--':  # no value: put empty cell
                val = ''
            else:
                # Keep only what precedes the percent sign.
                val = _parse_number(text.split('%')[0])
            print(val)
            fields.append(str(val))

        print('')
        fout.write(','.join(fields) + '\n')
finally:
    # Release the file handle even if a page could not be parsed.
    fout.close()

Alameda
14.97
46.92
23.22
14.9

Alpine





Amador
28.19
23.35
40.97
7.49

Butte
18.1
56.85
16.97
8.08

Calaveras
22.82
45.63
22.82
8.74

Colusa
14.13
54.35
21.74
9.78

Contra Costa
19.28
37.27
32.04
11.41

Del Norte
9.23
70.0
12.31
8.46

El Dorado
14.59
30.02
38.81
16.58

Fresno
8.87
69.13
15.51
6.49

Glenn
20.18
49.54
15.6
14.68

Humboldt
19.54
51.1
15.23
14.13

Imperial
11.55
43.5
38.48
6.47

Inyo
16.3
47.83
29.35
6.52

Kern
11.04
64.86
14.97
9.12

Kings
9.36
61.58
18.48
10.57

Lake
21.78
53.53
13.07
11.62

Lassen
18.45
39.81
24.27
17.48

Los Angeles
10.97
50.06
26.86
12.1

Madera
9.18
68.78
16.63
5.41

Marin
21.93
34.37
33.9
9.8

Mariposa
40.0
16.67
36.67
6.67

Mendocino
12.9
17.63
59.57
9.89

Merced
48.46
33.6
12.37
5.57

Modoc
10.0
53.33
21.67
15.0

Mono
6.12
44.9
24.49
24.49

Monterey
10.75
64.13
14.46
10.66

Napa
18.15
35.38
33.69
12.77

Nevada
19.24
44.17
23.58
13.01

Orange
13.1
39.74
37.3
9.87

Placer
18.81
18.81
52.36
10.01

Plumas
22.73
39.39
30.3
7.58

Riverside
11.15
41.

### Get hospitalization charges data

In [158]:
# Hospitalization charges data is in the table at zero-based index 9, i.e.
# the tenth <table> element found on a county page.
ind= 9

Data will be saved in a csv file

In [159]:
# Output file for the per-county hospitalization charges. The handle is left
# open on purpose: the header and the data rows are written by later cells.
filename = 'HospitalizationCharges_byCounty.csv'
fout = open(filename, 'w')

Write the header for the csv file

In [160]:
# CSV header: county name followed by one column per scraped table row.
# NOTE(review): the column order is assumed to match the row order of the
# hospitalization charges table on the county pages — confirm against the site.
line ='County,Children,Adults,All'
fout.write(line+'\n')

In [161]:
# Scrape the hospitalization charges table (table number `ind`, set in an
# earlier cell) of every county page and write one CSV row per county to
# `fout`. Cells starting with '--' become empty CSV fields. `fout` is closed
# when the loop ends, whether or not it completed.


def _parse_number(text):
    """Return `text` as an int, or as a float when it is not an integer.

    Replaces eval() so that text downloaded from the web is never executed.
    Raises ValueError when `text` is not a plain number.
    """
    text = text.strip()
    try:
        return int(text)
    except ValueError:
        return float(text)


dict_tmp = {}

# Per-county correction of the table position.
table_shift = {
    'Alameda': 1,     # Alameda has one extra table
    'San Diego': -1,  # San Diego has one missing table
}

try:
    for name, path in zip(county_name, county_url):
        print(name)
        page_url = baseurl + path

        try:
            response = requests.get(page_url)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Skip this county: carrying on would reuse the previous
            # county's `response` and file its numbers under this name.
            print('Failed to get url page %s' % page_url)
            print('')
            continue

        soup = BeautifulSoup(response.text, 'lxml')

        # Find all tables containing data in the web page
        tables = soup.find_all('table')
        table = tables[ind + table_shift.get(name, 0)]

        # Show the table id, when it has one, as a sanity check.
        table_id = table.get('id')
        if table_id is not None:
            print('ID: %s' % table_id)

        # Kept from the original cell: only the county key is recorded.
        dict_tmp[name] = []

        fields = [name]
        for row in table.find_all('tr')[1:]:  # ignore first row: table header
            cell = row.find_all('td')[0]  # get the first table value
            text = cell.get_text()
            if text[:2] == '--':  # no value: put empty cell
                val = ''
            else:
                # Drop the thousands separators and the dollar sign.
                val = _parse_number(text.replace(',', '').replace('$', ''))
            print(val)
            fields.append(str(val))

        print('')
        fout.write(','.join(fields) + '\n')
finally:
    # Release the file handle even if a page could not be parsed.
    fout.close()

Alameda
23248.52
56622.29
41610.21

Alpine

Amador

39202.9
39202.9

Butte
24192.41
42561.53
35598.07

Calaveras

48807.72
45070.72

Colusa

23169.27
25524.46

Contra Costa
27723.19
54252.93
45783.82

Del Norte

32679.33
31990.11

El Dorado
22273.67
47367.81
40523.95

Fresno
20957.16
32153.66
26143.21

Glenn
20957.16
37677.0
36059.94

Humboldt
14269.6
45861.84
40820.52

Imperial
19060.77
35955.01
26535.19

Inyo
13522.33
27682.2
22372.25

Kern
20590.24
48617.97
39402.33

Kings
26144.38
21896.54
23279.56

Lake
25953.67
40137.03
36733.02

Lassen
12551.45
20799.37
17775.13

Los Angeles
23254.06
42843.4
36535.98

Madera
30507.6
30542.65
30519.95

Marin
41868.31
64327.3
56384.49

Mariposa
9828.0
23651.71
21923.75

Mendocino
18206.24
28218.6
23618.32

Merced
18404.43
51170.2
38991.77

Modoc

8379.63
8379.63

Mono
15195.0
41523.2
37135.17

Monterey
38019.69
68419.86
55409.09

Napa
21961.91
60705.43
52348.98

Nevada
19350.56
41554.97
37517.81

Orange
30498.09
43378.85
39241.96

Placer
23873.24
