# Dependencies

In [1]:
import pandas
from keys import census_key
import requests

Read in population data

In [2]:
population_data = pandas.read_csv('population_data.csv')

Filter out unnecessary columns

In [3]:
new_pop_data = population_data[['GEO.display-label','respop72013']]
new_pop_data

Unnamed: 0,GEO.display-label,respop72013
0,Geography,Population Estimate (as of July 1) - 2013
1,United States,316128839
2,"Alameda County, California",1578891
3,"Alpine County, California",1159
4,"Amador County, California",36519
5,"Butte County, California",222090
6,"Calaveras County, California",44515
7,"Colusa County, California",21358
8,"Contra Costa County, California",1094205
9,"Del Norte County, California",27873


Clean county names and create a new column of names

In [4]:
county_list = []
for county in new_pop_data['GEO.display-label']:
    try:
        county_list.append(county.split(', ')[0])
    except:
        county_list.append(county)
new_pop_data['County Name'] = county_list
new_pop_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,GEO.display-label,respop72013,County Name
0,Geography,Population Estimate (as of July 1) - 2013,Geography
1,United States,316128839,United States
2,"Alameda County, California",1578891,Alameda County
3,"Alpine County, California",1159,Alpine County
4,"Amador County, California",36519,Amador County


Read in diversity data

In [5]:
diversity_data = pandas.read_csv('diversityindex.csv')

Create a list that contains the state and county part of each location

In [6]:
state_lst = []
county_lst2 = []
for row in diversity_data['Location']:
    try:
        state_lst.append(row.split(', ')[1])
        county_lst2.append(row.split(', ')[0])
    except:
        state_lst.append(row)
        county_lst2.append(row)

Use the newly created list to add a state column to the data table

In [7]:
diversity_data['State'] = state_lst
diversity_data['County Name'] = county_lst2

In [8]:
ca_diversity = diversity_data.loc[diversity_data['State']=='CA']

In [9]:
combined_df = pandas.merge(ca_diversity, new_pop_data, on = 'County Name', how = 'left')

In [13]:
combined_df = combined_df[['County Name', 'Black or African American alone, percent, 2013', 'American Indian and Alaska Native alone, percent, 2013', 'Asian alone, percent, 2013','Native Hawaiian and Other Pacific Islander alone, percent,', 'Two or More Races, percent, 2013', 'Hispanic or Latino, percent, 2013', 'White alone, not Hispanic or Latino, percent, 2013', 'respop72013']]
combined_df.columns

Index(['County Name', 'Black or African American alone, percent, 2013',
       'American Indian and Alaska Native alone, percent, 2013',
       'Asian alone, percent, 2013',
       'Native Hawaiian and Other Pacific Islander alone, percent,',
       'Two or More Races, percent, 2013', 'Hispanic or Latino, percent, 2013',
       'White alone, not Hispanic or Latino, percent, 2013', 'respop72013'],
      dtype='object')

In [16]:
combined_df = combined_df.rename(columns = {'Black or African American alone, percent, 2013': '% Black or African American', 'American Indian and Alaska Native alone, percent, 2013':'% Native American or Alaska Native', 'Asian alone, percent, 2013':'% Asian', 'Native Hawaiian and Other Pacific Islander alone, percent,': '% Pacific Islander', 'Two or More Races, percent, 2013': '% Two or more', 'Hispanic or Latino, percent, 2013': '% Hispanic or Latino', 'White alone, not Hispanic or Latino, percent, 2013': '% White', 'respop72013':'Population' })

In [25]:
combined_df['Population'] = combined_df['Population'].astype(float)
combined_df.head()

Unnamed: 0,County Name,% Black or African American,% Native American or Alaska Native,% Asian,% Pacific Islander,% Two or more,% Hispanic or Latino,% White,Population,# Black or African American
0,Alameda County,12.4,1.2,28.2,1.0,5.2,22.7,33.2,1578891.0,195782.0
1,Solano County,14.9,1.3,15.4,1.0,6.7,25.2,40.0,424788.0,63293.0
2,Santa Clara County,2.9,1.4,34.1,0.5,4.0,26.8,33.9,1862041.0,53999.0
3,San Mateo County,3.0,0.9,26.9,1.6,4.4,25.4,41.1,747373.0,22421.0
4,Sacramento County,10.8,1.6,15.4,1.2,5.9,22.3,47.3,1462131.0,157910.0


In [24]:
combined_df['# Black or African American'] = round(combined_df['% Black or African American']/100*combined_df['Population'],0)