In [1]:
import numpy as np
import pandas as pd

## 2017-2018 school vaccination data
- Numbers in the original document have been surpressed for <=1%, <=2%, <=5% and >=95%,>=98%,>=99%. Original file has errors with <=,  >=, and --* symbols when importing directly from .xlsx
- The 3 raw datafiles for childcare, kindergarden, and 7th grade has been compiled manually into a csv called `raw17_18_combined.csv` with <= and >= removed and replaced with the corresponding numbers

In [2]:
# Import raw combined 2017-2018 school vaccination data
pertusis_1718 = pd.read_csv("Raw_Data/raw17_18_combined.csv",encoding = "ISO-8859-1")

In [3]:
pertusis_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63.0,.,98,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47.0,.,95,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44.0,.,95,childcare,Y
3,13417441,ALAMEDA,PRIVATE,ALAMEDA,SUGAR AND SPICE,,.,.,childcare,N
4,6151211,ALAMEDA,PRIVATE,ALAMEDA,MCKINNEY CHRISTIAN ACADEMY,,.,.,kindergarten,N


In [4]:
#Clean up missing values and set to Nan
pertusis_1718.n=pertusis_1718.n.replace('.', np.nan)
pertusis_1718.pct=pertusis_1718.pct.replace('.',np.nan)

# uppercase all counties and cities
pertusis_1718.COUNTY=pertusis_1718['COUNTY'].str.upper()
pertusis_1718.CITY=pertusis_1718['CITY'].str.upper()

In [5]:
pertusis_1718.dtypes

FACILITY_NUMBER         int64
COUNTY                 object
pub_priv_headstart     object
CITY                   object
FACILITY_NAME          object
ENROLLMENT            float64
n                      object
pct                    object
vac_info_type          object
REPORTED               object
dtype: object

In [6]:
# Change n and percent to floats
pertusis_1718[['n', 'pct']] = pertusis_1718[['n', 'pct']].astype('float64')

In [7]:
# Check conversion
pertusis_1718.dtypes

FACILITY_NUMBER         int64
COUNTY                 object
pub_priv_headstart     object
CITY                   object
FACILITY_NAME          object
ENROLLMENT            float64
n                     float64
pct                   float64
vac_info_type          object
REPORTED               object
dtype: object

In [8]:
pertusis_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63.0,,98.0,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47.0,,95.0,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44.0,,95.0,childcare,Y
3,13417441,ALAMEDA,PRIVATE,ALAMEDA,SUGAR AND SPICE,,,,childcare,N
4,6151211,ALAMEDA,PRIVATE,ALAMEDA,MCKINNEY CHRISTIAN ACADEMY,,,,kindergarten,N


In [9]:
sum(pertusis_1718.REPORTED=='N')

1005

In [10]:
sum(pertusis_1718.REPORTED=='Y')

18304

Subset dataset to those that reported

In [11]:
# Take subset that have numbers
vac_1718=pertusis_1718[pertusis_1718.REPORTED=='Y']

In [12]:
vac_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63.0,,98.0,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47.0,,95.0,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44.0,,95.0,childcare,Y
6,13415928,ALAMEDA,PRIVATE,ALAMEDA,BRIGHT HORIZONS AT GARNER,109.0,,99.0,childcare,Y
7,13419403,ALAMEDA,PRIVATE,ALAMEDA,PETER PAN SCHOOL,100.0,,99.0,childcare,Y


Numbers in the original document have been surpressed for <=1%, <=2%, <=5% and >=95%,>=98%,>=99% so will impute values by assuming that the percentage is equal to whatever value listed and multiply by the number of students enrolled at the school. 

In [13]:
# Calculate n from enrollment and approximate percentage
vac_1718.loc[:,('n')]=vac_1718.n.fillna(vac_1718.ENROLLMENT*vac_1718.pct/100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [14]:
vac_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63.0,61.74,98.0,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47.0,44.65,95.0,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44.0,41.8,95.0,childcare,Y
6,13415928,ALAMEDA,PRIVATE,ALAMEDA,BRIGHT HORIZONS AT GARNER,109.0,107.91,99.0,childcare,Y
7,13419403,ALAMEDA,PRIVATE,ALAMEDA,PETER PAN SCHOOL,100.0,99.0,99.0,childcare,Y


In [15]:
# Change to integers 
vac_1718.loc[:,('ENROLLMENT','n','pct')]=vac_1718.loc[:,('ENROLLMENT','n','pct')].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
vac_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63,61,98,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47,44,95,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44,41,95,childcare,Y
6,13415928,ALAMEDA,PRIVATE,ALAMEDA,BRIGHT HORIZONS AT GARNER,109,107,99,childcare,Y
7,13419403,ALAMEDA,PRIVATE,ALAMEDA,PETER PAN SCHOOL,100,99,99,childcare,Y


In [17]:
# Check County names
sorted(list(vac_1718.COUNTY.unique()))

['ALAMEDA',
 'AMADOR',
 'BUTTE',
 'CALAVERAS',
 'COLUSA',
 'CONTRA COSTA',
 'DEL NORTE',
 'EL DORADO',
 'FRESNO',
 'GLENN',
 'HUMBOLDT',
 'IMPERIAL',
 'INYO',
 'KERN',
 'KINGS',
 'LAKE',
 'LASSEN',
 'LOS ANGELES',
 'MADERA',
 'MARIN',
 'MARIPOSA',
 'MENDOCINO',
 'MERCED',
 'MODOC',
 'MONO',
 'MONTEREY',
 'NAPA',
 'NEVADA',
 'ORANGE',
 'PLACER',
 'PLUMAS',
 'RIVERSIDE',
 'SACRAMENTO',
 'SAN BENITO',
 'SAN BERNARDINO',
 'SAN DIEGO',
 'SAN FRANCISCO',
 'SAN JOAQUIN',
 'SAN LUIS OBISPO',
 'SAN MATEO',
 'SANTA BARBARA',
 'SANTA CLARA',
 'SANTA CRUZ',
 'SHASTA',
 'SIERRA',
 'SISKIYOU',
 'SOLANO',
 'SONOMA',
 'STANISLAUS',
 'SUTTER',
 'TEHAMA',
 'TRINITY',
 'TULARE',
 'TUOLUMNE',
 'VENTURA',
 'YOLO',
 'YUBA']

In [18]:
#Check city names
sorted(list(vac_1718.CITY.unique()))

['ACAMPO',
 'ACTON',
 'ACTON, CA',
 'ADELANTO',
 'AGOURA',
 'AGOURA HILLS',
 'AGUANGA',
 'AHWAHNEE',
 'ALAMEDA',
 'ALAMO',
 'ALBANY',
 'ALHAMBRA',
 'ALISO VIEJO',
 'ALPAUGH',
 'ALPINE',
 'ALTA LOMA',
 'ALTADENA',
 'ALTURAS',
 'ALVISO',
 'AMERICAN CANYON',
 'ANAHEIM',
 'ANAHEIM HILLS',
 'ANAHEIM,',
 'ANANHEIM',
 'ANDERSON',
 'ANGELS CAMP',
 'ANTELOPE',
 'ANTIOCH',
 'ANZA',
 'APPLE VALLEY',
 'APTOS',
 'ARBOGA',
 'ARBUCKLE',
 'ARCADIA',
 'ARCATA',
 'ARLETA',
 'ARMONA',
 'ARNOLD',
 'AROMAS',
 'ARROYO GRANDE',
 'ARTESIA',
 'ARVIN',
 'ATASCADERO',
 'ATHERTON',
 'ATWATER',
 'AUBERRY',
 'AUBURN',
 'AVALON',
 'AVENAL',
 'AVERY',
 'AZUSA',
 'BAKERSFIELD',
 'BALDWIN PARK',
 'BALLICO',
 'BANNING',
 'BANTA',
 'BARSTOW',
 'BAY POINT',
 'BAYSIDE',
 'BEALE AFB',
 'BEALE AIR FORCE BASE',
 'BEAUMONT',
 'BELL',
 'BELL GARDENS',
 'BELLA VISTA',
 'BELLFLOWER',
 'BELMONT',
 'BELVEDERE',
 'BENICA',
 'BENICIA',
 'BERKELEY',
 'BERMUDA DUNES',
 'BEVERLY HILLS',
 'BIG BEAR CITY',
 'BIG BEAR LAKE',
 'BIGGS',
 'BI

In [19]:
# Fix City Typos
vac_1718.loc[vac_1718.CITY=='ACTON, CA', 'CITY']='ACTON'
vac_1718.loc[vac_1718.CITY=='AGOURA', 'CITY']='AGOURA HILLS'
vac_1718.loc[vac_1718.CITY=='ANAHEIM,', 'CITY']='ANAHEIM'
vac_1718.loc[vac_1718.CITY=='ANANHEIM,', 'CITY']='ANAHEIM'
vac_1718.loc[vac_1718.CITY=='CA', 'CITY']='NAPA'
vac_1718.loc[vac_1718.CITY=='BEALE AIR FORCE BASE', 'CITY']='BEALE AFB' # to match census name
vac_1718.loc[vac_1718.CITY=='BENICA', 'CITY']='BENICIA'
vac_1718.loc[vac_1718.CITY=='CARDIFF', 'CITY']='CARDIFF BY THE SEA'
vac_1718.loc[vac_1718.CITY=='CARMEL', 'CITY']='CARMEL VALLEY'
vac_1718.loc[vac_1718.CITY=='CHINO,', 'CITY']='CHINO'
vac_1718.loc[vac_1718.CITY=='CHULAR', 'CITY']='CHUALAR'
vac_1718.loc[vac_1718.CITY=='CITY OF COMMERCE', 'CITY']='COMMERCE'
vac_1718.loc[vac_1718.CITY=='CITY OF INDUSTRY', 'CITY']='INDUSTRY'
vac_1718.loc[vac_1718.CITY=='CUDAHAY', 'CITY']='CUDAHY'
vac_1718.loc[vac_1718.CITY=='E. NICOLAUS', 'CITY']='EAST NICOLAUS'
vac_1718.loc[vac_1718.CITY=='E. RANCHO DOMINGUEZ', 'CITY']='EAST RANCHO DOMINGUEZ'
vac_1718.loc[vac_1718.CITY=='E. WHITTIER', 'CITY']='EAST WHITTIER'
vac_1718.loc[vac_1718.CITY=='EL SEGUNDO,', 'CITY']='EL SEGUNDO'
vac_1718.loc[vac_1718.CITY=='FAIRIFELD', 'CITY']='FAIRFIELD'
vac_1718.loc[vac_1718.CITY=='FREMOTN', 'CITY']='FREMONT'
vac_1718.loc[vac_1718.CITY=='FT. IRWIN', 'CITY']='FORT IRWIN'
vac_1718.loc[vac_1718.CITY=='GREENFILED', 'CITY']='GREENFIELD'
vac_1718.loc[vac_1718.CITY=='HUNTINGTON', 'CITY']='HUNTINGTON PARK'
vac_1718.loc[vac_1718.CITY=='JAMUAL', 'CITY']='JAMUL'
vac_1718.loc[vac_1718.CITY=='LA', 'CITY']='LOS ANGELES'
vac_1718.loc[vac_1718.CITY=='LA CANADA', 'CITY']='LA CANADA FLINTRIDGE'
vac_1718.loc[vac_1718.CITY=='LAKE VIEW TERRANCE', 'CITY']='LAKE VIEW TERRACE'
vac_1718.loc[vac_1718.CITY=='LAKEVIEW TERRACE', 'CITY']='LAKE VIEW TERRACE'
vac_1718.loc[vac_1718.CITY=='LANCASTER,', 'CITY']='LANCASTER'
vac_1718.loc[vac_1718.CITY=='LAVERNE', 'CITY']='LA VERNE'
vac_1718.loc[vac_1718.CITY=='MC KINLEYVILLE', 'CITY']='MCKINLEYVILLE'
vac_1718.loc[vac_1718.CITY=='MONTROSE', 'CITY']='LA CRESCENTA-MONTROSE'
vac_1718.loc[vac_1718.CITY=='MT. SHASTA', 'CITY']='MOUNT SHASTA'
vac_1718.loc[vac_1718.CITY=='N. HOLLYWOOD', 'CITY']='NORTH HOLLYWOOD'
vac_1718.loc[vac_1718.CITY=='N.A.S. LEMOORE', 'CITY']='LEMOORE STATION' # match census
vac_1718.loc[vac_1718.CITY=='NAS LEMOORE', 'CITY']='LEMOORE STATION'
vac_1718.loc[vac_1718.CITY=='NEWPORT BEACH,', 'CITY']='NEWPORT BEACH'
vac_1718.loc[vac_1718.CITY=='PACOMIA', 'CITY']='PACOIMA'
vac_1718.loc[vac_1718.CITY=='PANORAM ', 'CITY']='PANORAMA CITY'
vac_1718.loc[vac_1718.CITY=='PT. REYES STATION', 'CITY']='POINT REYES STATION'
vac_1718.loc[vac_1718.CITY=='RANCHO SANTA MARGARI', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RANCHO SAN MARGARITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RANCHO STA MARGAITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RANCHO STA MARGARITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RCHO STA MARG', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RANCHOS PALOS VERDES', 'CITY']='RANCHO PALOS VERDES'
vac_1718.loc[vac_1718.CITY=='RIVERSIDE,', 'CITY']='RIVERSIDE'
vac_1718.loc[vac_1718.CITY=='ROLLING HILLS ESTATE', 'CITY']='ROLLING HILLS ESTATES'
vac_1718.loc[vac_1718.CITY=='S. EL MONTE', 'CITY']='SOUTH EL MONTE'
vac_1718.loc[vac_1718.CITY=='S. LAKE TAHOE', 'CITY']='SOUTH LAKE TAHOE'
vac_1718.loc[vac_1718.CITY=='SAN BERARDINO', 'CITY']='SAN BERNARDINO'
vac_1718.loc[vac_1718.CITY=='SAN FRNCISCO', 'CITY']='SAN FRANCISCO'
vac_1718.loc[vac_1718.CITY=='SAN JOSE,', 'CITY']='SAN JOSE'
vac_1718.loc[vac_1718.CITY=='SANTA BARARA', 'CITY']='SANTA BARBARA'
vac_1718.loc[vac_1718.CITY=='SHASTA LAKE CITY', 'CITY']='SHASTA LAKE'
vac_1718.loc[vac_1718.CITY=='SILVERADO CANYON', 'CITY']='SILVERADO'
vac_1718.loc[vac_1718.CITY=='SOUTH LATE TAHOE', 'CITY']='SOUTH LAKE TAHOE'
vac_1718.loc[vac_1718.CITY=='SPRECKLES', 'CITY']='SPRECKELS'
vac_1718.loc[vac_1718.CITY=='STEVENSONS RANCH', 'CITY']='STEVENSON RANCH'
vac_1718.loc[vac_1718.CITY=='SUISUN', 'CITY']='SUISUN CITY'
vac_1718.loc[vac_1718.CITY=='SUNNYALE', 'CITY']='SUNNYVALE'
vac_1718.loc[vac_1718.CITY=='SUPELVEDA', 'CITY']='SEPULVEDA'
vac_1718.loc[vac_1718.CITY=='TURLOCK,', 'CITY']='TURLOCK'
vac_1718.loc[vac_1718.CITY=='UPPERLAKE', 'CITY']='UPPER LAKE'
vac_1718.loc[vac_1718.CITY=='VANDENBERG AIR FORCE BASE', 'CITY']='VANDENBERG AFB' # match census
vac_1718.loc[vac_1718.CITY=='W. SACRAMENTO', 'CITY']='WEST SACRAMENTO'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
#Double check city names
sorted(list(vac_1718.CITY.unique()))

['ACAMPO',
 'ACTON',
 'ADELANTO',
 'AGOURA HILLS',
 'AGUANGA',
 'AHWAHNEE',
 'ALAMEDA',
 'ALAMO',
 'ALBANY',
 'ALHAMBRA',
 'ALISO VIEJO',
 'ALPAUGH',
 'ALPINE',
 'ALTA LOMA',
 'ALTADENA',
 'ALTURAS',
 'ALVISO',
 'AMERICAN CANYON',
 'ANAHEIM',
 'ANAHEIM HILLS',
 'ANANHEIM',
 'ANDERSON',
 'ANGELS CAMP',
 'ANTELOPE',
 'ANTIOCH',
 'ANZA',
 'APPLE VALLEY',
 'APTOS',
 'ARBOGA',
 'ARBUCKLE',
 'ARCADIA',
 'ARCATA',
 'ARLETA',
 'ARMONA',
 'ARNOLD',
 'AROMAS',
 'ARROYO GRANDE',
 'ARTESIA',
 'ARVIN',
 'ATASCADERO',
 'ATHERTON',
 'ATWATER',
 'AUBERRY',
 'AUBURN',
 'AVALON',
 'AVENAL',
 'AVERY',
 'AZUSA',
 'BAKERSFIELD',
 'BALDWIN PARK',
 'BALLICO',
 'BANNING',
 'BANTA',
 'BARSTOW',
 'BAY POINT',
 'BAYSIDE',
 'BEALE AFB',
 'BEAUMONT',
 'BELL',
 'BELL GARDENS',
 'BELLA VISTA',
 'BELLFLOWER',
 'BELMONT',
 'BELVEDERE',
 'BENICIA',
 'BERKELEY',
 'BERMUDA DUNES',
 'BEVERLY HILLS',
 'BIG BEAR CITY',
 'BIG BEAR LAKE',
 'BIGGS',
 'BISHOP',
 'BLOOMINGTON',
 'BLUE LAKE',
 'BLYTHE',
 'BOLINAS',
 'BONITA',
 'B

In [21]:
#Get count of students vaccinated by county,city and grade
vac_n=vac_1718.groupby(by=['COUNTY','CITY','vac_info_type'])['n'].sum()
vac_n

COUNTY   CITY             vac_info_type
ALAMEDA  ALAMEDA          7th grade         931
                          childcare        1231
                          kindergarten      986
         ALBANY           7th grade         255
                          childcare         249
                          kindergarten      307
         BERKELEY         7th grade         908
                          childcare        2346
                          kindergarten      860
         CASTRO VALLEY    7th grade         776
                          childcare         797
                          kindergarten      864
         DUBLIN           7th grade         939
                          childcare        1275
                          kindergarten     1150
         EMERYVILLE       7th grade          50
                          childcare         238
                          kindergarten       70
         FREMONT          7th grade        2855
                          childcare        4388


In [22]:
#Export count of students vaccinated by county,city and grade
vac_n.to_csv("vac_n_table1718.CSV")

In [23]:
#Get average percentage of students vaccinated by county,city and grade
vac_pct=vac_1718.groupby(by=['COUNTY','CITY','vac_info_type'])['pct'].mean()
vac_pct

COUNTY   CITY             vac_info_type
ALAMEDA  ALAMEDA          7th grade        94.900000
                          childcare        96.000000
                          kindergarten     96.384615
         ALBANY           7th grade        98.000000
                          childcare        95.571429
                          kindergarten     97.666667
         BERKELEY         7th grade        96.750000
                          childcare        92.750000
                          kindergarten     91.333333
         CASTRO VALLEY    7th grade        97.000000
                          childcare        89.384615
                          kindergarten     97.416667
         DUBLIN           7th grade        97.200000
                          childcare        92.857143
                          kindergarten     97.800000
         EMERYVILLE       7th grade        96.000000
                          childcare        96.000000
                          kindergarten     98.000000
      

In [24]:
#Export average percentage of students vaccinated by county,city and grade
vac_pct.to_csv("vac_pct_table1718.CSV")

In [25]:
#Export full cleaned data
vac_1718.to_csv("clean1718.CSV", index=False)

## Census Data Cleaning

### Demographics

In [26]:
# Import and clean census data for basic demographics age gender race
census16 = pd.read_csv("Raw_Data/Census/ACS_16_demographic.csv",encoding = "ISO-8859-1")

In [27]:
census16.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC01_VC04,HC02_VC04,HC03_VC04,...,HC03_VC108,HC04_VC108,HC01_VC109,HC02_VC109,HC03_VC109,HC04_VC109,HC01_VC110,HC02_VC110,HC03_VC110,HC04_VC110
0,Id,Id2,Geography,Estimate; SEX AND AGE - Total population,Margin of Error; SEX AND AGE - Total population,Percent; SEX AND AGE - Total population,Percent Margin of Error; SEX AND AGE - Total p...,Estimate; SEX AND AGE - Total population - Male,Margin of Error; SEX AND AGE - Total populatio...,Percent; SEX AND AGE - Total population - Male,...,"Percent; CITIZEN, VOTING AGE POPULATION - Citi...","Percent Margin of Error; CITIZEN, VOTING AGE P...","Estimate; CITIZEN, VOTING AGE POPULATION - Cit...","Margin of Error; CITIZEN, VOTING AGE POPULATIO...","Percent; CITIZEN, VOTING AGE POPULATION - Citi...","Percent Margin of Error; CITIZEN, VOTING AGE P...","Estimate; CITIZEN, VOTING AGE POPULATION - Cit...","Margin of Error; CITIZEN, VOTING AGE POPULATIO...","Percent; CITIZEN, VOTING AGE POPULATION - Citi...","Percent Margin of Error; CITIZEN, VOTING AGE P..."
1,1600000US0600135,0600135,"Acalanes Ridge CDP, California",1000,338,1000,(X),409,146,40.9,...,695,(X),279,101,40.1,7.1,416,194,59.9,7.1
2,1600000US0600156,0600156,"Acampo CDP, California",466,398,466,(X),291,277,62.4,...,466,(X),291,277,62.4,13.7,175,137,37.6,13.7
3,1600000US0600212,0600212,"Acton CDP, California",7170,555,7170,(X),3776,318,52.7,...,5777,(X),3020,279,52.3,2.2,2757,199,47.7,2.2
4,1600000US0600296,0600296,"Adelanto city, California",32311,35,32311,(X),16350,651,50.6,...,16322,(X),8399,706,51.5,2.4,7923,539,48.5,2.4


In [28]:
census16=census16[1:]

In [29]:
census16 = census16[['GEO.id','GEO.id2','GEO.display-label',
                     'HC01_VC03','HC03_VC03',
                     'HC01_VC04','HC03_VC04',
                     'HC01_VC05','HC03_VC05',
                     'HC01_VC08','HC03_VC08',
                     'HC01_VC09','HC03_VC09',
                     'HC01_VC10','HC03_VC10',
                     'HC01_VC11','HC03_VC11',
                     'HC01_VC12','HC03_VC12',
                     'HC01_VC13','HC03_VC13',
                     'HC01_VC14','HC03_VC14',
                     'HC01_VC15','HC03_VC15',
                     'HC01_VC16','HC03_VC16',
                     'HC01_VC17','HC03_VC17',
                     'HC01_VC18','HC03_VC18',
                     'HC01_VC19','HC03_VC19',
                     'HC01_VC20','HC03_VC20',
                     'HC01_VC23','HC03_VC23',
                     'HC01_VC88','HC03_VC88',
                     'HC01_VC94','HC03_VC94',
                     'HC01_VC95','HC03_VC95',
                     'HC01_VC96','HC03_VC96',
                     'HC01_VC97','HC03_VC97',
                     'HC01_VC98','HC03_VC98',
                     'HC01_VC99','HC03_VC99',
                     'HC01_VC100','HC03_VC100',
                     'HC01_VC101','HC03_VC101',
                     'HC01_VC102','HC03_VC102'                 
        ]]

In [30]:
census16.dtypes

GEO.id               object
GEO.id2              object
GEO.display-label    object
HC01_VC03            object
HC03_VC03            object
HC01_VC04            object
HC03_VC04            object
HC01_VC05            object
HC03_VC05            object
HC01_VC08            object
HC03_VC08            object
HC01_VC09            object
HC03_VC09            object
HC01_VC10            object
HC03_VC10            object
HC01_VC11            object
HC03_VC11            object
HC01_VC12            object
HC03_VC12            object
HC01_VC13            object
HC03_VC13            object
HC01_VC14            object
HC03_VC14            object
HC01_VC15            object
HC03_VC15            object
HC01_VC16            object
HC03_VC16            object
HC01_VC17            object
HC03_VC17            object
HC01_VC18            object
HC03_VC18            object
HC01_VC19            object
HC03_VC19            object
HC01_VC20            object
HC03_VC20            object
HC01_VC23           

In [31]:
#Clean up missing values and set to Nan
census16.replace('-', np.nan)

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC03_VC03,HC01_VC04,HC03_VC04,HC01_VC05,HC03_VC05,HC01_VC08,...,HC01_VC98,HC03_VC98,HC01_VC99,HC03_VC99,HC01_VC100,HC03_VC100,HC01_VC101,HC03_VC101,HC01_VC102,HC03_VC102
1,1600000US0600135,0600135,"Acalanes Ridge CDP, California",1000,1000,409,40.9,591,59.1,33,...,0,0.0,0,0.0,161,16.1,0,0.0,161,16.1
2,1600000US0600156,0600156,"Acampo CDP, California",466,466,291,62.4,175,37.6,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,1600000US0600212,0600212,"Acton CDP, California",7170,7170,3776,52.7,3394,47.3,350,...,0,0.0,56,0.8,99,1.4,0,0.0,99,1.4
4,1600000US0600296,0600296,"Adelanto city, California",32311,32311,16350,50.6,15961,49.4,3148,...,0,0.0,47,0.1,1007,3.1,10,0.0,997,3.1
5,1600000US0600310,0600310,"Adin CDP, California",155,155,59,38.1,96,61.9,7,...,0,0.0,0,0.0,5,3.2,0,0.0,5,3.2
6,1600000US0600394,0600394,"Agoura Hills city, California",20689,20689,10132,49.0,10557,51.0,910,...,0,0.0,90,0.4,462,2.2,25,0.1,437,2.1
7,1600000US0600450,0600450,"Agua Dulce CDP, California",3541,3541,1814,51.2,1727,48.8,208,...,0,0.0,0,0.0,252,7.1,0,0.0,252,7.1
8,1600000US0600464,0600464,"Aguanga CDP, California",673,673,306,45.5,367,54.5,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
9,1600000US0600478,0600478,"Ahwahnee CDP, California",2225,2225,1178,52.9,1047,47.1,101,...,0,0.0,0,0.0,13,0.6,0,0.0,13,0.6
10,1600000US0600535,0600535,"Airport CDP, California",1605,1605,892,55.6,713,44.4,260,...,0,0.0,0,0.0,6,0.4,0,0.0,6,0.4


In [32]:
census16[['HC01_VC03','HC01_VC04','HC01_VC05','HC01_VC08','HC01_VC09','HC01_VC10','HC01_VC11','HC01_VC12','HC01_VC13',
          'HC01_VC14','HC01_VC15','HC01_VC16','HC01_VC17','HC01_VC18','HC01_VC19','HC01_VC20','HC01_VC23','HC01_VC88',
          'HC01_VC94','HC01_VC95','HC01_VC96','HC01_VC97','HC01_VC98',
          'HC01_VC99','HC01_VC100','HC01_VC101','HC01_VC102']] = census16[['HC01_VC03','HC01_VC04','HC01_VC05','HC01_VC08','HC01_VC09','HC01_VC10','HC01_VC11','HC01_VC12','HC01_VC13',
          'HC01_VC14','HC01_VC15','HC01_VC16','HC01_VC17','HC01_VC18','HC01_VC19','HC01_VC20','HC01_VC23','HC01_VC88',
          'HC01_VC94','HC01_VC95','HC01_VC96','HC01_VC97','HC01_VC98',
          'HC01_VC99','HC01_VC100','HC01_VC101','HC01_VC102']].astype('int64')

ValueError: invalid literal for int() with base 10: '46.1'

In [None]:
# Check conversion
census16.dtypes

In [None]:
#combine other race, two or more races (HC01_VC99, HC01_VC100, HC01_VC101, HC01_VC102)
census16.other=census16['HC01_VC99']+census16['HC01_VC100']+census16['HC01_VC101']+census16['HC01_VC102']

In [None]:
census16.rename(columns={'HC01_VC03': 'tot_pop', 
                         'HC01_VC04': 'male',
                         'HC01_VC05': 'female',
                         'HC01_VC08':'under_5',
                         'HC01_VC09':'5_9',
                         'HC01_VC10':'10_14',
                         'HC01_VC11':'15_19',
                         'HC01_VC12':'20_24',
                         'HC01_VC13':'25_34',
                         'HC01_VC14':'35_44',
                         'HC01_VC15':'45_54',
                         'HC01_VC16':'55_59',
                         'HC01_VC17':'60_64',
                         'HC01_VC18':'65_74',
                         'HC01_VC19':'75_84',
                         'HC01_VC20':'85_over',
                         'HC01_VC23':'median_age',
                         'HC01_VC88':'hispanic_latino',
                         'HC01_VC89':'mexican',
                         'HC01_VC94':'white',
                         'HC01_VC95':'black',
                         'HC01_VC96':'aian',
                         'HC01_VC97':'asian',
                         'HC01_VC98':'nhopi',
                         
                         'HC03_VC03': 'tot_pop_pct', 
                         'HC03_VC04': 'male_pct',
                         'HC03_VC05': 'female_pct',
                         'HC03_VC08':'under_5_pct',
                         'HC03_VC09':'5_9_pct',
                         'HC03_VC10':'10_14_pct',
                         'HC03_VC11':'15_19_pct',
                         'HC03_VC12':'20_24_pct',
                         'HC03_VC13':'25_34_pct',
                         'HC03_VC14':'35_44_pct',
                         'HC03_VC15':'45_54_pct',
                         'HC03_VC16':'55_59_pct',
                         'HC03_VC17':'60_64_pct',
                         'HC03_VC18':'65_74_pct',
                         'HC03_VC19':'75_84_pct',
                         'HC03_VC20':'85_over_pct',
                         'HC03_VC23':'median_age_pct',
                         'HC03_VC88':'hispanic_latino_pct',
                         'HC03_VC89':'mexican_pct',
                         'HC03_VC94':'white_pct',
                         'HC03_VC95':'black_pct',
                         'HC03_VC96':'aian_pct',
                         'HC03_VC97':'asian_pct',
                         'HC03_VC98':'nhopi_pct'
                        }, 
                inplace=True)
# Other race categories
#                          'HC01_VC99':'other_only',
#                          'HC01_VC100':'more_two_races',
#                          'HC01_VC101':'more_two_races_other',
#                          'HC01_VC102':'more_two_races_other_exclude',


### Median Household Income and Health insurance Status

In [33]:
# Import and clean acs data for income and insurance status
acs16 = pd.read_csv("Raw_Data/Census/ACS_16_economic.csv",encoding = "ISO-8859-1")

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Import and clean acs data for income and insurance status
acs16 = pd.read_csv("Raw_Data/Census/ACS_16_economic.csv",encoding = "ISO-8859-1")