In [1]:
import numpy as np
import pandas as pd

## 2017-2018 school vaccination data
- Numbers in the original document have been surpressed for <=1%, <=2%, <=5% and >=95%,>=98%,>=99%. Original file has errors with <=,  >=, and --* symbols when importing directly from .xlsx
- The 3 raw datafiles for childcare, kindergarden, and 7th grade has been compiled manually into a csv called `raw17_18_combined.csv` with <= and >= removed and replaced with the corresponding numbers

In [2]:
# Import raw combined 2017-2018 school vaccination data
pertusis_1718 = pd.read_csv("vaxxfacts/raw_data/raw17_18_combined.csv",encoding = "ISO-8859-1")

In [3]:
pertusis_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63.0,.,98,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47.0,.,95,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44.0,.,95,childcare,Y
3,13417441,ALAMEDA,PRIVATE,ALAMEDA,SUGAR AND SPICE,,.,.,childcare,N
4,6151211,ALAMEDA,PRIVATE,ALAMEDA,MCKINNEY CHRISTIAN ACADEMY,,.,.,kindergarten,N


In [4]:
#Clean up missing values and set to Nan
pertusis_1718.n=pertusis_1718.n.replace('.', np.nan)
pertusis_1718.pct=pertusis_1718.pct.replace('.',np.nan)

# uppercase all counties and cities
pertusis_1718.COUNTY=pertusis_1718['COUNTY'].str.upper()
pertusis_1718.CITY=pertusis_1718['CITY'].str.upper()

In [5]:
pertusis_1718.CITY

0            ALAMEDA
1            ALAMEDA
2            ALAMEDA
3            ALAMEDA
4            ALAMEDA
5            ALAMEDA
6            ALAMEDA
7            ALAMEDA
8            ALAMEDA
9            ALAMEDA
10           ALAMEDA
11           ALAMEDA
12           ALAMEDA
13           ALAMEDA
14           ALAMEDA
15           ALAMEDA
16           ALAMEDA
17           ALAMEDA
18           ALAMEDA
19           ALAMEDA
20           ALAMEDA
21           ALAMEDA
22           ALAMEDA
23           ALAMEDA
24           ALAMEDA
25           ALAMEDA
26           ALAMEDA
27           ALAMEDA
28           ALAMEDA
29           ALAMEDA
            ...     
19279     MARYSVILLE
19280     MARYSVILLE
19281     MARYSVILLE
19282     MARYSVILLE
19283     MARYSVILLE
19284     MARYSVILLE
19285     MARYSVILLE
19286     MARYSVILLE
19287     MARYSVILLE
19288     MARYSVILLE
19289     MARYSVILLE
19290     OLIVEHURST
19291     OLIVEHURST
19292     OLIVEHURST
19293     OLIVEHURST
19294     OLIVEHURST
19295     OLI

In [6]:
pertusis_1718.dtypes

FACILITY_NUMBER         int64
COUNTY                 object
pub_priv_headstart     object
CITY                   object
FACILITY_NAME          object
ENROLLMENT            float64
n                      object
pct                    object
vac_info_type          object
REPORTED               object
dtype: object

In [7]:
# Change n and percent to floats
pertusis_1718[['n', 'pct']] = pertusis_1718[['n', 'pct']].astype('float64')

In [8]:
# Check conversion
pertusis_1718.dtypes

FACILITY_NUMBER         int64
COUNTY                 object
pub_priv_headstart     object
CITY                   object
FACILITY_NAME          object
ENROLLMENT            float64
n                     float64
pct                   float64
vac_info_type          object
REPORTED               object
dtype: object

In [9]:
pertusis_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63.0,,98.0,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47.0,,95.0,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44.0,,95.0,childcare,Y
3,13417441,ALAMEDA,PRIVATE,ALAMEDA,SUGAR AND SPICE,,,,childcare,N
4,6151211,ALAMEDA,PRIVATE,ALAMEDA,MCKINNEY CHRISTIAN ACADEMY,,,,kindergarten,N


In [10]:
sum(pertusis_1718.REPORTED=='N')

1005

In [11]:
sum(pertusis_1718.REPORTED=='Y')

18304

Subset dataset to those that reported

In [12]:
# Take subset that have numbers
vac_1718=pertusis_1718[pertusis_1718.REPORTED=='Y']

In [13]:
vac_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63.0,,98.0,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47.0,,95.0,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44.0,,95.0,childcare,Y
6,13415928,ALAMEDA,PRIVATE,ALAMEDA,BRIGHT HORIZONS AT GARNER,109.0,,99.0,childcare,Y
7,13419403,ALAMEDA,PRIVATE,ALAMEDA,PETER PAN SCHOOL,100.0,,99.0,childcare,Y


Numbers in the original document have been surpressed for <=1%, <=2%, <=5% and >=95%,>=98%,>=99% so will impute values by assuming that the percentage is equal to whatever value listed and multiply by the number of students enrolled at the school. 

In [14]:
# Calculate n from enrollment and approximate percentage
vac_1718.loc[:,('n')]=vac_1718.n.fillna(vac_1718.ENROLLMENT*vac_1718.pct/100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [15]:
vac_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63.0,61.74,98.0,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47.0,44.65,95.0,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44.0,41.8,95.0,childcare,Y
6,13415928,ALAMEDA,PRIVATE,ALAMEDA,BRIGHT HORIZONS AT GARNER,109.0,107.91,99.0,childcare,Y
7,13419403,ALAMEDA,PRIVATE,ALAMEDA,PETER PAN SCHOOL,100.0,99.0,99.0,childcare,Y


In [16]:
# Change to integers 
vac_1718.loc[:,('ENROLLMENT','n','pct')]=vac_1718.loc[:,('ENROLLMENT','n','pct')].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [17]:
vac_1718.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,10214883,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63,61,98,childcare,Y
1,13417425,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START . SUE MATHESON CENTER,47,44,95,childcare,Y
2,13420593,ALAMEDA,HEAD START,ALAMEDA,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44,41,95,childcare,Y
6,13415928,ALAMEDA,PRIVATE,ALAMEDA,BRIGHT HORIZONS AT GARNER,109,107,99,childcare,Y
7,13419403,ALAMEDA,PRIVATE,ALAMEDA,PETER PAN SCHOOL,100,99,99,childcare,Y


In [18]:
# Change print settings to see all
#np.set_printoptions(threshold=np.nan)
#change back to default printing length
np.set_printoptions(threshold=5)
# Check County names
sorted(list(vac_1718.COUNTY.unique()))

['ALAMEDA',
 'AMADOR',
 'BUTTE',
 'CALAVERAS',
 'COLUSA',
 'CONTRA COSTA',
 'DEL NORTE',
 'EL DORADO',
 'FRESNO',
 'GLENN',
 'HUMBOLDT',
 'IMPERIAL',
 'INYO',
 'KERN',
 'KINGS',
 'LAKE',
 'LASSEN',
 'LOS ANGELES',
 'MADERA',
 'MARIN',
 'MARIPOSA',
 'MENDOCINO',
 'MERCED',
 'MODOC',
 'MONO',
 'MONTEREY',
 'NAPA',
 'NEVADA',
 'ORANGE',
 'PLACER',
 'PLUMAS',
 'RIVERSIDE',
 'SACRAMENTO',
 'SAN BENITO',
 'SAN BERNARDINO',
 'SAN DIEGO',
 'SAN FRANCISCO',
 'SAN JOAQUIN',
 'SAN LUIS OBISPO',
 'SAN MATEO',
 'SANTA BARBARA',
 'SANTA CLARA',
 'SANTA CRUZ',
 'SHASTA',
 'SIERRA',
 'SISKIYOU',
 'SOLANO',
 'SONOMA',
 'STANISLAUS',
 'SUTTER',
 'TEHAMA',
 'TRINITY',
 'TULARE',
 'TUOLUMNE',
 'VENTURA',
 'YOLO',
 'YUBA']

In [19]:
#Check city names
sorted(list(vac_1718.CITY.unique()))

['ACAMPO',
 'ACTON',
 'ACTON, CA',
 'ADELANTO',
 'AGOURA',
 'AGOURA HILLS',
 'AGUANGA',
 'AHWAHNEE',
 'ALAMEDA',
 'ALAMO',
 'ALBANY',
 'ALHAMBRA',
 'ALISO VIEJO',
 'ALPAUGH',
 'ALPINE',
 'ALTA LOMA',
 'ALTADENA',
 'ALTURAS',
 'ALVISO',
 'AMERICAN CANYON',
 'ANAHEIM',
 'ANAHEIM HILLS',
 'ANAHEIM,',
 'ANANHEIM',
 'ANDERSON',
 'ANGELS CAMP',
 'ANTELOPE',
 'ANTIOCH',
 'ANZA',
 'APPLE VALLEY',
 'APTOS',
 'ARBOGA',
 'ARBUCKLE',
 'ARCADIA',
 'ARCATA',
 'ARLETA',
 'ARMONA',
 'ARNOLD',
 'AROMAS',
 'ARROYO GRANDE',
 'ARTESIA',
 'ARVIN',
 'ATASCADERO',
 'ATHERTON',
 'ATWATER',
 'AUBERRY',
 'AUBURN',
 'AVALON',
 'AVENAL',
 'AVERY',
 'AZUSA',
 'BAKERSFIELD',
 'BALDWIN PARK',
 'BALLICO',
 'BANNING',
 'BANTA',
 'BARSTOW',
 'BAY POINT',
 'BAYSIDE',
 'BEALE AFB',
 'BEALE AIR FORCE BASE',
 'BEAUMONT',
 'BELL',
 'BELL GARDENS',
 'BELLA VISTA',
 'BELLFLOWER',
 'BELMONT',
 'BELVEDERE',
 'BENICA',
 'BENICIA',
 'BERKELEY',
 'BERMUDA DUNES',
 'BEVERLY HILLS',
 'BIG BEAR CITY',
 'BIG BEAR LAKE',
 'BIGGS',
 'BI

In [20]:
# Fix City Typos
vac_1718.loc[vac_1718.CITY=='ACTON, CA', 'CITY']='ACTON'
vac_1718.loc[vac_1718.CITY=='AGOURA', 'CITY']='AGOURA HILLS'
vac_1718.loc[vac_1718.CITY=='ANAHEIM,', 'CITY']='ANAHEIM'
vac_1718.loc[vac_1718.CITY=='ANANHEIM,', 'CITY']='ANAHEIM'
vac_1718.loc[vac_1718.CITY=='CA', 'CITY']='NAPA'
vac_1718.loc[vac_1718.CITY=='BEALE AIR FORCE BASE', 'CITY']='BEALE AFB' # to match census name
vac_1718.loc[vac_1718.CITY=='BENICA', 'CITY']='BENICIA'
vac_1718.loc[vac_1718.CITY=='CARDIFF', 'CITY']='CARDIFF BY THE SEA'
vac_1718.loc[vac_1718.CITY=='CARMEL', 'CITY']='CARMEL VALLEY'
vac_1718.loc[vac_1718.CITY=='CHINO,', 'CITY']='CHINO'
vac_1718.loc[vac_1718.CITY=='CHULAR', 'CITY']='CHUALAR'
vac_1718.loc[vac_1718.CITY=='CITY OF COMMERCE', 'CITY']='COMMERCE'
vac_1718.loc[vac_1718.CITY=='CITY OF INDUSTRY', 'CITY']='INDUSTRY'
vac_1718.loc[vac_1718.CITY=='CUDAHAY', 'CITY']='CUDAHY'
vac_1718.loc[vac_1718.CITY=='E. NICOLAUS', 'CITY']='EAST NICOLAUS'
vac_1718.loc[vac_1718.CITY=='E. RANCHO DOMINGUEZ', 'CITY']='EAST RANCHO DOMINGUEZ'
vac_1718.loc[vac_1718.CITY=='E RANCHO DOMINGUEZ', 'CITY']='EAST RANCHO DOMINGUEZ'
vac_1718.loc[vac_1718.CITY=='E. WHITTIER', 'CITY']='EAST WHITTIER'
vac_1718.loc[vac_1718.CITY=='EL SEGUNDO,', 'CITY']='EL SEGUNDO'
vac_1718.loc[vac_1718.CITY=='FAIRIFELD', 'CITY']='FAIRFIELD'
vac_1718.loc[vac_1718.CITY=='FREMOTN', 'CITY']='FREMONT'
vac_1718.loc[vac_1718.CITY=='FT. IRWIN', 'CITY']='FORT IRWIN'
vac_1718.loc[vac_1718.CITY=='GREENFILED', 'CITY']='GREENFIELD'
vac_1718.loc[vac_1718.CITY=='HUNTINGTON', 'CITY']='HUNTINGTON PARK'
vac_1718.loc[vac_1718.CITY=='JAMUAL', 'CITY']='JAMUL'
vac_1718.loc[vac_1718.CITY=='LA', 'CITY']='LOS ANGELES'
vac_1718.loc[vac_1718.CITY=='LA CANADA', 'CITY']='LA CANADA FLINTRIDGE'
vac_1718.loc[vac_1718.CITY=='LA CAÃ\x91ADA', 'CITY']='LA CANADA FLINTRIDGE'
vac_1718.loc[vac_1718.CITY=='LA CAÃ\x83â\x80\x98ADA', 'CITY']='LA CANADA FLINTRIDGE'
vac_1718.loc[vac_1718.CITY=='LA CRESCENTA', 'CITY']='LA CRESCENTA-MONTROSE'
vac_1718.loc[vac_1718.CITY=='LAKE VIEW TERRANCE', 'CITY']='LAKE VIEW TERRACE'
vac_1718.loc[vac_1718.CITY=='LAKEVIEW TERRACE', 'CITY']='LAKE VIEW TERRACE'
vac_1718.loc[vac_1718.CITY=='LANCASTER,', 'CITY']='LANCASTER'
vac_1718.loc[vac_1718.CITY=='LAVERNE', 'CITY']='LA VERNE'
vac_1718.loc[vac_1718.CITY=='MC KINLEYVILLE', 'CITY']='MCKINLEYVILLE'
vac_1718.loc[vac_1718.CITY=='MONTROSE', 'CITY']='LA CRESCENTA-MONTROSE'
vac_1718.loc[vac_1718.CITY=='MT. SHASTA', 'CITY']='MOUNT SHASTA'
vac_1718.loc[vac_1718.CITY=='N. HOLLYWOOD', 'CITY']='NORTH HOLLYWOOD'
vac_1718.loc[vac_1718.CITY=='N.A.S. LEMOORE', 'CITY']='LEMOORE STATION' # match census
vac_1718.loc[vac_1718.CITY=='NAS LEMOORE', 'CITY']='LEMOORE STATION'
vac_1718.loc[vac_1718.CITY=='NEWPORT BEACH,', 'CITY']='NEWPORT BEACH'
vac_1718.loc[vac_1718.CITY=='PACOMIA', 'CITY']='PACOIMA'
vac_1718.loc[vac_1718.CITY=='PANORAM ', 'CITY']='PANORAMA CITY'
vac_1718.loc[vac_1718.CITY=='PANORAM CITY', 'CITY']='PANORAMA CITY'
vac_1718.loc[vac_1718.CITY=='PT. REYES STATION', 'CITY']='POINT REYES STATION'
vac_1718.loc[vac_1718.CITY=='RANCHO SANTA MARGARI', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RANCHO SAN MARGARITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RANCHO STA MARGAITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RANCHO STA MARGARITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RCHO STA MARG', 'CITY']='RANCHO SANTA MARGARITA'
vac_1718.loc[vac_1718.CITY=='RANCHOS PALOS VERDES', 'CITY']='RANCHO PALOS VERDES'
vac_1718.loc[vac_1718.CITY=='RIVERSIDE,', 'CITY']='RIVERSIDE'
vac_1718.loc[vac_1718.CITY=='ROLLING HILLS ESTATE', 'CITY']='ROLLING HILLS ESTATES'
vac_1718.loc[vac_1718.CITY=='S. EL MONTE', 'CITY']='SOUTH EL MONTE'
vac_1718.loc[vac_1718.CITY=='S. LAKE TAHOE', 'CITY']='SOUTH LAKE TAHOE'
vac_1718.loc[vac_1718.CITY=='S PASADENA', 'CITY']='SOUTH PASADENA'
vac_1718.loc[vac_1718.CITY=='SAN BERARDINO', 'CITY']='SAN BERNARDINO'
vac_1718.loc[vac_1718.CITY=='SAN FRNCISCO', 'CITY']='SAN FRANCISCO'
vac_1718.loc[vac_1718.CITY=='SAN JOSE,', 'CITY']='SAN JOSE'
vac_1718.loc[vac_1718.CITY=='SANTA BARARA', 'CITY']='SANTA BARBARA'
vac_1718.loc[vac_1718.CITY=='SHASTA LAKE CITY', 'CITY']='SHASTA LAKE'
vac_1718.loc[vac_1718.CITY=='SILVERADO CANYON', 'CITY']='SILVERADO'
vac_1718.loc[vac_1718.CITY=='SOUTH LATE TAHOE', 'CITY']='SOUTH LAKE TAHOE'
vac_1718.loc[vac_1718.CITY=='SPRECKLES', 'CITY']='SPRECKELS'
vac_1718.loc[vac_1718.CITY=='STEVENSONS RANCH', 'CITY']='STEVENSON RANCH'
vac_1718.loc[vac_1718.CITY=='SUISUN', 'CITY']='SUISUN CITY'
vac_1718.loc[vac_1718.CITY=='SUNNYALE', 'CITY']='SUNNYVALE'
vac_1718.loc[vac_1718.CITY=='SUPELVEDA', 'CITY']='SEPULVEDA'
vac_1718.loc[vac_1718.CITY=='TURLOCK,', 'CITY']='TURLOCK'
vac_1718.loc[vac_1718.CITY=='UPPERLAKE', 'CITY']='UPPER LAKE'
vac_1718.loc[vac_1718.CITY=='VANDENBERG AIR FORCE BASE', 'CITY']='VANDENBERG AFB' # match census
vac_1718.loc[vac_1718.CITY=='W. SACRAMENTO', 'CITY']='WEST SACRAMENTO'
vac_1718.loc[vac_1718.CITY=='WEST LOS ANGELES', 'CITY']='LOS ANGELES'

vac_1718.loc[vac_1718.COUNTY=='BERKELEY CITY', 'COUNTY']='ALAMEDA'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [21]:
# Change case to title case
vac_1718.COUNTY=vac_1718['COUNTY'].str.title()
vac_1718.CITY=vac_1718['CITY'].str.title()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [22]:
# RECODE NEIGHBORHOODS TO CITIES to match Census
vac_1718.loc[vac_1718.CITY=='Alta Loma', 'CITY']='Rancho Cucamonga'
vac_1718.loc[vac_1718.CITY=='Alviso', 'CITY']='San Jose'
vac_1718.loc[vac_1718.CITY=='Anaheim Hills', 'CITY']='Anaheim'
vac_1718.loc[vac_1718.CITY=='Angels Camp', 'CITY']='Angels'
vac_1718.loc[vac_1718.CITY=='Arleta', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Canoga Park', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Canyon Country', 'CITY']='Santa Clarita'
vac_1718.loc[vac_1718.CITY=='Capistrano Beach', 'CITY']='Dana Point'
vac_1718.loc[vac_1718.CITY=='Cardiff By The Sea', 'CITY']='Encinitas'
vac_1718.loc[vac_1718.CITY=='Carmel Valley', 'CITY']='Carmel Valley Village'
vac_1718.loc[vac_1718.CITY=='Chatsworth', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Corona Del Mar', 'CITY']='Newport Beach'
vac_1718.loc[vac_1718.CITY=='Edwards', 'CITY']='Edwards AFB'
vac_1718.loc[vac_1718.CITY=='Emerald Hills', 'CITY']='San Diego'
vac_1718.loc[vac_1718.CITY=='Encino', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Etiwanda', 'CITY']='Rancho Cucamonga'
vac_1718.loc[vac_1718.CITY=='Foothill Ranch', 'CITY']='Lake Forest'
vac_1718.loc[vac_1718.CITY=='Granada Hills', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Harbor City', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Highland Park', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Hollywood', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Idyllwild', 'CITY']='Idyllwild-Pine Cove'
vac_1718.loc[vac_1718.CITY=='La Jolla', 'CITY']='San Diego'
vac_1718.loc[vac_1718.CITY=='Lake Balboa', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Lake View Terrace', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Leucadia', 'CITY']='Encinitas'
vac_1718.loc[vac_1718.CITY=='Mar Vista', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Mcclellan', 'CITY']='North Highlands'
vac_1718.loc[vac_1718.CITY=='Mira Loma', 'CITY']='Jurupa Valley'
vac_1718.loc[vac_1718.CITY=='Murrietta', 'CITY']='Murrieta'
vac_1718.loc[vac_1718.CITY=='Newbury Park', 'CITY']='Thousand Oaks'
vac_1718.loc[vac_1718.CITY=='Newhall', 'CITY']='Santa Clarita'
vac_1718.loc[vac_1718.CITY=='North Hills', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='North Hollywood', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Northridge', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Olympic Valley', 'CITY']='Squaw Valley'
vac_1718.loc[vac_1718.CITY=='Pacific Palisades', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Pacoima', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Palos Verdes', 'CITY']='Palos Verdes Estates'
vac_1718.loc[vac_1718.CITY=='Palos Verdes Peninsula', 'CITY']='Palos Verdes Estates'
vac_1718.loc[vac_1718.CITY=='Panorama City', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Playa Del Rey', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Playa Vista', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Porter Ranch', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Quail Valley', 'CITY']='Menifee'
vac_1718.loc[vac_1718.CITY=='Reseda', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='San Pedro', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='San Ysidro', 'CITY']='San Diego'
vac_1718.loc[vac_1718.CITY=='Santa Catalina', 'CITY']='Avalon'
vac_1718.loc[vac_1718.CITY=='Saugus', 'CITY']='Santa Clarita'
vac_1718.loc[vac_1718.CITY=='Sepulveda', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Sherman Oaks', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Studio City', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Sun City', 'CITY']='Menifee'
vac_1718.loc[vac_1718.CITY=='Sun Valley', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Sunland', 'CITY']='Sunland-Tujunga'
vac_1718.loc[vac_1718.CITY=='Sunset Beach', 'CITY']='Huntington Beach'
vac_1718.loc[vac_1718.CITY=='Sylmar', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Royal Oaks', 'CITY']='Interlaken'
vac_1718.loc[vac_1718.CITY=='Tahoe City', 'CITY']='Sunnyside-Tahoe City'
vac_1718.loc[vac_1718.CITY=='Tarzana', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Tujunga', 'CITY']='Sunland-Tujunga'
vac_1718.loc[vac_1718.CITY=='Tuolumne', 'CITY']='Tuolumne City'
vac_1718.loc[vac_1718.CITY=='Twenty Nine Palms', 'CITY']='Twentynine Palms'
vac_1718.loc[vac_1718.CITY=='Valencia', 'CITY']='Santa Clarita'
vac_1718.loc[vac_1718.CITY=='Valley Glen', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Valley Village', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Van Nuys', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Venice', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Ventura', 'CITY']="San Buenaventura (Ventura)"
vac_1718.loc[vac_1718.CITY=='Walnut Creet', 'CITY']='Walnut Creek'
vac_1718.loc[vac_1718.CITY=='Walnut Valley', 'CITY']='Walnut'
vac_1718.loc[vac_1718.CITY=='West Hills', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Westchester', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Westminister', 'CITY']='Westminster'
vac_1718.loc[vac_1718.CITY=='Wilmington', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Winnetka', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Woodland Hills', 'CITY']='Los Angeles'

vac_1718.loc[vac_1718.CITY=='Angels City', 'CITY']='Angels'
vac_1718.loc[vac_1718.CITY=='Cardiff-By-The-Sea', 'CITY']='Encinitas'
vac_1718.loc[vac_1718.CITY=='Davis,', 'CITY']='Davis'
vac_1718.loc[vac_1718.CITY=='La Cañada', 'CITY']='La Canada Flintridge'
vac_1718.loc[vac_1718.CITY=='La Canada', 'CITY']='La Canada Flintridge'
vac_1718.loc[vac_1718.CITY=='Los Nietos', 'CITY']='West Whittier-Los Nietos'
vac_1718.loc[vac_1718.CITY=='Montclair,', 'CITY']='Montclair'
vac_1718.loc[vac_1718.CITY=='Newport Coast', 'CITY']='Newport Coast'
vac_1718.loc[vac_1718.CITY=='South San Francisoc', 'CITY']='South San Francisco'
vac_1718.loc[vac_1718.CITY=='Sunland-Tujunga', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Arcata Ca', 'CITY']='Arcata'
vac_1718.loc[vac_1718.CITY=='Berkeley Ave', 'CITY']='Berkeley'
vac_1718.loc[vac_1718.CITY=='Camp Pendelton', 'CITY']='Camp Pendleton North'
vac_1718.loc[vac_1718.CITY=='Carmicheal', 'CITY']='Carmichael'
vac_1718.loc[vac_1718.CITY=='Laguan Niguel', 'CITY']='Laguna Niguel'
vac_1718.loc[vac_1718.CITY=='Laguna Niguel Ste 3409', 'CITY']='Laguna Niguel'
vac_1718.loc[vac_1718.CITY=='Lapuente', 'CITY']='La Puente'
vac_1718.loc[vac_1718.CITY=='Las Flores', 'CITY']='Las Flores'
vac_1718.loc[vac_1718.CITY=='Mt. View', 'CITY']='Mountain View'
vac_1718.loc[vac_1718.CITY=='Mision Viejo', 'CITY']='Mission Viejo'
vac_1718.loc[vac_1718.CITY=='S Lake Tahoe', 'CITY']='South Lake Tahoe'
vac_1718.loc[vac_1718.CITY=='Saint Helena', 'CITY']='St. Helena'
vac_1718.loc[vac_1718.CITY=='Santa Clara Ca', 'CITY']='Santa Clara'
vac_1718.loc[vac_1718.CITY=='St Helena', 'CITY']='St. Helena'
vac_1718.loc[vac_1718.CITY=='St Helena', 'CITY']='St. Helena'
vac_1718.loc[vac_1718.CITY=='Stantan', 'CITY']='Stanton'
vac_1718.loc[vac_1718.CITY=='Stinson', 'CITY']='Stinson Beach'

vac_1718.loc[vac_1718.CITY=='Vandenberg Afb', 'CITY']='Vandenberg AFB'
vac_1718.loc[vac_1718.CITY=='Travis Afb', 'CITY']='Travis AFB'
vac_1718.loc[vac_1718.CITY=='Beale Afb', 'CITY']='Beale AFB'

vac_1718.loc[vac_1718.CITY=='"Avalon, Catalina Isl"', 'CITY']='Avalon'
vac_1718.loc[vac_1718.CITY=='"Beale Afb, Ca"', 'CITY']='Beale AFB'
vac_1718.loc[vac_1718.CITY=='"Menifee, Ca"', 'CITY']='Menifee'
vac_1718.loc[vac_1718.CITY=='"Palm Springs, Ca"', 'CITY']='Palm Springs'
vac_1718.loc[vac_1718.CITY=='Aliso Viejo, Ca.', 'CITY']='Aliso Viejo'
vac_1718.loc[vac_1718.CITY=='Avalon, Catalina Isl', 'CITY']='Avalon'
vac_1718.loc[vac_1718.CITY=='Beale Afb, Ca', 'CITY']='Beale AFB'
vac_1718.loc[vac_1718.CITY=='Camarilla', 'CITY']='Camarillo'
vac_1718.loc[vac_1718.CITY=='Camp Pendleton', 'CITY']='Camp Pendleton North'
vac_1718.loc[vac_1718.CITY=='Capinteria', 'CITY']='Carpinteria'
vac_1718.loc[vac_1718.CITY=='Carpenteria', 'CITY']='Carpinteria'
vac_1718.loc[vac_1718.CITY=='Castic', 'CITY']='Castaic'
vac_1718.loc[vac_1718.CITY=='Clear Lake', 'CITY']='Clearlake'
vac_1718.loc[vac_1718.CITY=='Coto De Caza', 'CITY']='Coto de Caza'
vac_1718.loc[vac_1718.CITY=='E. Palo Alto', 'CITY']='East Palo Alto'
vac_1718.loc[vac_1718.CITY=='El Sobrante', 'CITY']='El Sobrante (Contra Costa County)'
vac_1718.loc[vac_1718.CITY=='Fairfield Ca', 'CITY']='Fairfield'
vac_1718.loc[vac_1718.CITY=='Firebaugh Ca', 'CITY']='Firebaugh'
vac_1718.loc[vac_1718.CITY=='Fort Dick', 'CITY']='Fort Dick'
vac_1718.loc[vac_1718.CITY=='Ft Irwin', 'CITY']='Fort Irwin'
vac_1718.loc[vac_1718.CITY=='Grover City', 'CITY']='Grover Beach'
vac_1718.loc[vac_1718.CITY=='Hoopa', 'CITY']='Hoopa'
vac_1718.loc[vac_1718.CITY=='Hunington Beach', 'CITY']='Huntington Beach'
vac_1718.loc[vac_1718.CITY=='Huntington Beach', 'CITY']='Huntington Beach'
vac_1718.loc[vac_1718.CITY=="King'S Beach", 'CITY']='Kings Beach'
vac_1718.loc[vac_1718.CITY=='Ladrea Ranch', 'CITY']='Ladera Ranch'
vac_1718.loc[vac_1718.CITY=='Lancaster, Ca', 'CITY']='Lancaster'
vac_1718.loc[vac_1718.CITY=='Las Flores', 'CITY']='Las Flores'
vac_1718.loc[vac_1718.CITY=='Lemore Nas', 'CITY']='Lemoore'
vac_1718.loc[vac_1718.CITY=='Long Beach', 'CITY']='Long Beach'
vac_1718.loc[vac_1718.CITY=='Los Alto Hills', 'CITY']='Los Altos Hills'
vac_1718.loc[vac_1718.CITY=='Los Angeles, Ca', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Los Flores', 'CITY']='Las Flores (Orange County)'
vac_1718.loc[vac_1718.CITY=='Marina Del Rey', 'CITY']='Marina del Rey'
vac_1718.loc[vac_1718.CITY=='Mcarthur', 'CITY']='McArthur'
vac_1718.loc[vac_1718.CITY=='Mccloud', 'CITY']='McCloud'
vac_1718.loc[vac_1718.CITY=='Mcfarland', 'CITY']='McFarland'
vac_1718.loc[vac_1718.CITY=='Mckinleyville', 'CITY']='McKinleyville'
vac_1718.loc[vac_1718.CITY=='Mckittrick', 'CITY']='McKittrick'
vac_1718.loc[vac_1718.CITY=='Menifee, Ca', 'CITY']='Menifee'
vac_1718.loc[vac_1718.CITY=='Mountain Veiw', 'CITY']='Mountain View'
vac_1718.loc[vac_1718.CITY=='Mountian View', 'CITY']='Mountain View'
vac_1718.loc[vac_1718.CITY=='Mt Shasta', 'CITY']='Mount Shasta'
vac_1718.loc[vac_1718.CITY=='Newport Coast', 'CITY']='Newport Coast'
vac_1718.loc[vac_1718.CITY=='No. Hollywood', 'CITY']='North Hollywood'
vac_1718.loc[vac_1718.CITY=='Northrige', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Onatrio', 'CITY']='Ontario'
vac_1718.loc[vac_1718.CITY=='Pala', 'CITY']='Pala'
vac_1718.loc[vac_1718.CITY=='Palm Springs, Ca', 'CITY']='Palm Springs'
vac_1718.loc[vac_1718.CITY=='Palmdale,', 'CITY']='Palmdale'
vac_1718.loc[vac_1718.CITY=='Palos Verdes Peninsu', 'CITY']='Palos Verdes Estates'
vac_1718.loc[vac_1718.CITY=='Poplar', 'CITY']='Poplar-Cotton Center'
vac_1718.loc[vac_1718.CITY=='Presidio Of Sf', 'CITY']='San Francisco'
vac_1718.loc[vac_1718.CITY=='Rancho Bernardo', 'CITY']='San Diego'
vac_1718.loc[vac_1718.CITY=='Rolling Hills Est.', 'CITY']='Rolling Hills Estates'
vac_1718.loc[vac_1718.CITY=='Sacramento, Ca', 'CITY']='Sacramento'
vac_1718.loc[vac_1718.CITY=='San Bernadino', 'CITY']='San Bernadino'
vac_1718.loc[vac_1718.CITY=='San Miguel', 'CITY']='San Miguel'
vac_1718.loc[vac_1718.CITY=='Santa Barbra', 'CITY']='Santa Barbara'
vac_1718.loc[vac_1718.CITY=='Simi Valley', 'CITY']='Simi Valley'
vac_1718.loc[vac_1718.CITY=='So. Lake Tahoe', 'CITY']='South Lake Tahoe'
vac_1718.loc[vac_1718.CITY=='So. San Francisco', 'CITY']='South San Francisco'
vac_1718.loc[vac_1718.CITY=='Spring Valley', 'CITY']='Spring Valley'
vac_1718.loc[vac_1718.CITY=='Stevenson', 'CITY']='Stevenson Ranch'
vac_1718.loc[vac_1718.CITY=='Thousand Oak', 'CITY']='Thousand Oaks'
vac_1718.loc[vac_1718.CITY=='Tranquility', 'CITY']='Tranquillity'
vac_1718.loc[vac_1718.CITY=='Travis AFB', 'CITY']='Fairfield'
vac_1718.loc[vac_1718.CITY=='W. Hollywood', 'CITY']='West Hollywood'
vac_1718.loc[vac_1718.CITY=='West Los Angeles', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Westlake', 'CITY']='Westlake Village'
vac_1718.loc[vac_1718.CITY=='Westminter', 'CITY']='Westminster'
vac_1718.loc[vac_1718.CITY=='Yermo', 'CITY']='Yermo'

vac_1718.loc[vac_1718.CITY=='El Sobrante', 'CITY']='El Sobrante (Contra Costa County)'
vac_1718.loc[vac_1718.CITY=='Las Flores', 'CITY']='Las Flores (Orange County)'
vac_1718.loc[vac_1718.CITY=='Los Nietos', 'CITY']='West Whittier-Los Nietos'
vac_1718.loc[vac_1718.CITY=='West Whittier', 'CITY']='West Whittier-Los Nietos'
vac_1718.loc[vac_1718.CITY=='Paso Robles', 'CITY']='El Paso de Robles (Paso Robles)'
vac_1718.loc[vac_1718.CITY=='El Paso de Robles', 'CITY']='El Paso de Robles (Paso Robles)'
vac_1718.loc[vac_1718.CITY=='Piñon Hills', 'CITY']='Pinon Hills'
vac_1718.loc[vac_1718.CITY=='San Miguel', 'CITY']='San Miguel (San Luis Obispo County)'
vac_1718.loc[vac_1718.CITY=='Spring Valley', 'CITY']='Spring Valley (San Diego County)'

# Fix more typos
vac_1718.loc[vac_1718.CITY=='Ananheim', 'CITY']='Anaheim'
vac_1718.loc[vac_1718.CITY=='Costa Mesa,', 'CITY']='Costa Mesa'
vac_1718.loc[vac_1718.CITY=='Paso Robles', 'CITY']='El Paso De Robles (Paso Robles)'
vac_1718.loc[vac_1718.CITY=='El Paso de Robles', 'CITY']='El Paso De Robles (Paso Robles)'
vac_1718.loc[vac_1718.CITY=='Chasworth', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Huntington  Beach', 'CITY']='Huntington Beach'
vac_1718.loc[vac_1718.CITY=='Long  Beach', 'CITY']='Long Beach'
vac_1718.loc[vac_1718.CITY=='Mt Baldy', 'CITY']='Mount Baldy'
vac_1718.loc[vac_1718.CITY=='Mt. Baldy', 'CITY']='Mount Baldy'
vac_1718.loc[vac_1718.CITY=='North Hollywood', 'CITY']='Los Angeles'
vac_1718.loc[vac_1718.CITY=='Oneals', 'CITY']="O'Neals"
vac_1718.loc[vac_1718.CITY=='Canyon County', 'CITY']='Santa Clarita'
vac_1718.loc[vac_1718.CITY=='Coaresgold', 'CITY']='Coarsegold'
vac_1718.loc[vac_1718.CITY=='El Toro', 'CITY']='Lake Forest'
vac_1718.loc[vac_1718.CITY=='Herber', 'CITY']='Heber'
vac_1718.loc[vac_1718.CITY=='Hespera', 'CITY']='Hesperia'
vac_1718.loc[vac_1718.CITY=='Los Oso', 'CITY']='Los Osos'
vac_1718.loc[vac_1718.CITY=='Newport Coast', 'CITY']='Newport Beach'
vac_1718.loc[vac_1718.CITY=='Pinedale', 'CITY']='Fresno'
vac_1718.loc[vac_1718.CITY=='Rancho Tehema', 'CITY']='Rancho Tehama Reserve'
vac_1718.loc[vac_1718.CITY=='Redding ', 'CITY']='Redding'
vac_1718.loc[vac_1718.CITY=='Riversidty', 'CITY']='Riverside'
vac_1718.loc[vac_1718.CITY=='San Bernadino', 'CITY']='San Bernardino'
vac_1718.loc[vac_1718.CITY=='Simi  Valley', 'CITY']='Simi Valley'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [23]:
vac_1718.rename(columns={'COUNTY': 'county',
                         'CITY': 'city',
                         'FACILITY_NUMBER': 'facility_num',
                         'FACILITY_NAME':'facility_name',
                         'pub_priv_headstart':'is_public',
                         'ENROLLMENT': 'enrollment',
                         'REPORTED': 'reported'
                        }, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [24]:
vac_1718.loc[vac_1718.vac_info_type=='7th grade', 'vac_info_type']='7thGradeData'
vac_1718.loc[vac_1718.vac_info_type=='childcare', 'vac_info_type']='ChildCareData'
vac_1718.loc[vac_1718.vac_info_type=='kindergarten', 'vac_info_type']='KindergartenData'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [25]:
#Get count of students vaccinated by county,city and grade
vac_n=vac_1718.groupby(by=['county','city','vac_info_type'], as_index=False)['n'].sum()
vac_n

Unnamed: 0,county,city,vac_info_type,n
0,Alameda,Alameda,7thGradeData,931
1,Alameda,Alameda,ChildCareData,1231
2,Alameda,Alameda,KindergartenData,986
3,Alameda,Albany,7thGradeData,255
4,Alameda,Albany,ChildCareData,249
5,Alameda,Albany,KindergartenData,307
6,Alameda,Berkeley,7thGradeData,908
7,Alameda,Berkeley,ChildCareData,2346
8,Alameda,Berkeley,KindergartenData,860
9,Alameda,Castro Valley,7thGradeData,776


In [26]:
#Export count of students vaccinated by county,city and grade
vac_n.to_csv("vac_n_table1718.CSV")

In [27]:
#Get average percentage of students vaccinated by county,city and grade
vac_pct=vac_1718.groupby(by=['county','city','vac_info_type'], as_index=False)['pct'].mean()
vac_pct

Unnamed: 0,county,city,vac_info_type,pct
0,Alameda,Alameda,7thGradeData,94.900000
1,Alameda,Alameda,ChildCareData,96.000000
2,Alameda,Alameda,KindergartenData,96.384615
3,Alameda,Albany,7thGradeData,98.000000
4,Alameda,Albany,ChildCareData,95.571429
5,Alameda,Albany,KindergartenData,97.666667
6,Alameda,Berkeley,7thGradeData,96.750000
7,Alameda,Berkeley,ChildCareData,92.750000
8,Alameda,Berkeley,KindergartenData,91.333333
9,Alameda,Castro Valley,7thGradeData,97.000000


In [28]:
#Export average percentage of students vaccinated by county,city and grade
vac_pct.to_csv("vac_pct_table1718.CSV")

In [29]:
# create grouping by county for comparison to outbreak incidence rates
#Get average percentage of students vaccinated by county and grade
vac_county_pct=vac_1718.groupby(by=['county','vac_info_type'], as_index=False)['pct'].mean()
vac_county_pct

Unnamed: 0,county,vac_info_type,pct
0,Alameda,7thGradeData,96.408759
1,Alameda,ChildCareData,95.118881
2,Alameda,KindergartenData,96.327138
3,Amador,7thGradeData,97.666667
4,Amador,ChildCareData,91.666667
5,Amador,KindergartenData,94.500000
6,Butte,7thGradeData,93.952381
7,Butte,ChildCareData,93.714286
8,Butte,KindergartenData,94.435897
9,Calaveras,7thGradeData,81.250000


In [30]:
#Export average percentage of students vaccinated by county,and grade
vac_county_pct.to_csv("vac_pct_county_table1718.CSV")

In [31]:
vac_1718.head()

Unnamed: 0,facility_num,county,is_public,city,facility_name,enrollment,n,pct,vac_info_type,reported
0,10214883,Alameda,HEAD START,Alameda,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63,61,98,ChildCareData,Y
1,13417425,Alameda,HEAD START,Alameda,ALAMEDA HEAD START . SUE MATHESON CENTER,47,44,95,ChildCareData,Y
2,13420593,Alameda,HEAD START,Alameda,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44,41,95,ChildCareData,Y
6,13415928,Alameda,PRIVATE,Alameda,BRIGHT HORIZONS AT GARNER,109,107,99,ChildCareData,Y
7,13419403,Alameda,PRIVATE,Alameda,PETER PAN SCHOOL,100,99,99,ChildCareData,Y


In [32]:
#Export full cleaned data
vac_1718.to_csv("clean1718.csv", index=False)

## Census Data Cleaning

### Demographics

In [33]:
# Import and clean census data for basic demographics age gender race
census16 = pd.read_csv("vaxxfacts/raw_data/Census/ACS_16_demographic.csv",encoding = "ISO-8859-1",header=[0],na_values=['-'],)

In [34]:
census16.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC01_VC04,HC02_VC04,HC03_VC04,...,HC03_VC108,HC04_VC108,HC01_VC109,HC02_VC109,HC03_VC109,HC04_VC109,HC01_VC110,HC02_VC110,HC03_VC110,HC04_VC110
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",1000,338,1000,(X),409,146,40.9,...,695,(X),279,101,40.1,7.1,416,194,59.9,7.1
1,1600000US0600156,600156,"Acampo CDP, California",466,398,466,(X),291,277,62.4,...,466,(X),291,277,62.4,13.7,175,137,37.6,13.7
2,1600000US0600212,600212,"Acton CDP, California",7170,555,7170,(X),3776,318,52.7,...,5777,(X),3020,279,52.3,2.2,2757,199,47.7,2.2
3,1600000US0600296,600296,"Adelanto city, California",32311,35,32311,(X),16350,651,50.6,...,16322,(X),8399,706,51.5,2.4,7923,539,48.5,2.4
4,1600000US0600310,600310,"Adin CDP, California",155,91,155,(X),59,40,38.1,...,115,(X),52,33,45.2,15.8,63,35,54.8,15.8


In [35]:
demographic16 = census16[['GEO.id','GEO.id2','GEO.display-label',
                     'HC01_VC03',
                     'HC01_VC04','HC03_VC04',
                     'HC01_VC05','HC03_VC05',
                     'HC01_VC08','HC03_VC08',
                     'HC01_VC09','HC03_VC09',
                     'HC01_VC10','HC03_VC10',
                     'HC01_VC11','HC03_VC11',
                     'HC01_VC12','HC03_VC12',
                     'HC01_VC13','HC03_VC13',
                     'HC01_VC14','HC03_VC14',
                     'HC01_VC15','HC03_VC15',
                     'HC01_VC16','HC03_VC16',
                     'HC01_VC17','HC03_VC17',
                     'HC01_VC18','HC03_VC18',
                     'HC01_VC19','HC03_VC19',
                     'HC01_VC20','HC03_VC20',
                     'HC01_VC23','HC03_VC23',
                     'HC01_VC88','HC03_VC88',
                     'HC01_VC94','HC03_VC94',
                     'HC01_VC95','HC03_VC95',
                     'HC01_VC96','HC03_VC96',
                     'HC01_VC97','HC03_VC97',
                     'HC01_VC98','HC03_VC98',
                     'HC01_VC99','HC03_VC99',
                     'HC01_VC100','HC03_VC100',
                     'HC01_VC101','HC03_VC101',
                     'HC01_VC102','HC03_VC102'                 
        ]]

In [36]:
demographic16.dtypes

GEO.id                object
GEO.id2                int64
GEO.display-label     object
HC01_VC03              int64
HC01_VC04              int64
HC03_VC04            float64
HC01_VC05              int64
HC03_VC05            float64
HC01_VC08              int64
HC03_VC08            float64
HC01_VC09              int64
HC03_VC09            float64
HC01_VC10              int64
HC03_VC10            float64
HC01_VC11              int64
HC03_VC11            float64
HC01_VC12              int64
HC03_VC12            float64
HC01_VC13              int64
HC03_VC13            float64
HC01_VC14              int64
HC03_VC14            float64
HC01_VC15              int64
HC03_VC15            float64
HC01_VC16              int64
HC03_VC16            float64
HC01_VC17              int64
HC03_VC17            float64
HC01_VC18              int64
HC03_VC18            float64
HC01_VC19              int64
HC03_VC19            float64
HC01_VC20              int64
HC03_VC20            float64
HC01_VC23     

In [37]:
# demographic16[['HC01_VC03','HC01_VC04','HC01_VC05','HC01_VC08','HC01_VC09','HC01_VC10','HC01_VC11','HC01_VC12','HC01_VC13',
#           'HC01_VC14','HC01_VC15','HC01_VC16','HC01_VC17','HC01_VC18','HC01_VC19','HC01_VC20','HC01_VC23','HC01_VC88',
#           'HC01_VC94','HC01_VC95','HC01_VC96','HC01_VC97','HC01_VC98',
#           'HC01_VC99','HC01_VC100','HC01_VC101','HC01_VC102']] = demographic16[['HC01_VC03','HC01_VC04','HC01_VC05','HC01_VC08','HC01_VC09','HC01_VC10','HC01_VC11','HC01_VC12','HC01_VC13',
#           'HC01_VC14','HC01_VC15','HC01_VC16','HC01_VC17','HC01_VC18','HC01_VC19','HC01_VC20','HC01_VC23','HC01_VC88',
#           'HC01_VC94','HC01_VC95','HC01_VC96','HC01_VC97','HC01_VC98',
#           'HC01_VC99','HC01_VC100','HC01_VC101','HC01_VC102']].astype('float64')

In [38]:
#combine other race, two or more races (HC01_VC99, HC01_VC100, HC01_VC101, HC01_VC102)
demographic16['other']=pd.Series(census16['HC01_VC99']+census16['HC01_VC100']+census16['HC01_VC101']+census16['HC01_VC102'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [39]:
demographic16['other_pct']=pd.Series(census16['HC03_VC99']+census16['HC03_VC100']+census16['HC03_VC101']+census16['HC03_VC102'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [40]:
demographic16.rename(columns={'GEO.display-label': 'city',
                         'HC01_VC03': 'tot_pop', 
                         'HC01_VC04': 'male',
                         'HC01_VC05': 'female',
                         'HC01_VC08':'under_5',
                         'HC01_VC09':'5_9',
                         'HC01_VC10':'10_14',
                         'HC01_VC11':'15_19',
                         'HC01_VC12':'20_24',
                         'HC01_VC13':'25_34',
                         'HC01_VC14':'35_44',
                         'HC01_VC15':'45_54',
                         'HC01_VC16':'55_59',
                         'HC01_VC17':'60_64',
                         'HC01_VC18':'65_74',
                         'HC01_VC19':'75_84',
                         'HC01_VC20':'85_over',
                         'HC01_VC23':'median_age',
                         'HC01_VC88':'hispanic_latino',
                         'HC01_VC89':'mexican',
                         'HC01_VC94':'white',
                         'HC01_VC95':'black',
                         'HC01_VC96':'aian',
                         'HC01_VC97':'asian',
                         'HC01_VC98':'nhopi',

                         'HC03_VC04': 'male_pct',
                         'HC03_VC05': 'female_pct',
                         'HC03_VC08':'under_5_pct',
                         'HC03_VC09':'5_9_pct',
                         'HC03_VC10':'10_14_pct',
                         'HC03_VC11':'15_19_pct',
                         'HC03_VC12':'20_24_pct',
                         'HC03_VC13':'25_34_pct',
                         'HC03_VC14':'35_44_pct',
                         'HC03_VC15':'45_54_pct',
                         'HC03_VC16':'55_59_pct',
                         'HC03_VC17':'60_64_pct',
                         'HC03_VC18':'65_74_pct',
                         'HC03_VC19':'75_84_pct',
                         'HC03_VC20':'85_over_pct',
                         'HC03_VC23':'median_age_pct',
                         'HC03_VC88':'hispanic_latino_pct',
                         'HC03_VC89':'mexican_pct',
                         'HC03_VC94':'white_pct',
                         'HC03_VC95':'black_pct',
                         'HC03_VC96':'aian_pct',
                         'HC03_VC97':'asian_pct',
                         'HC03_VC98':'nhopi_pct'
                        }, 
                inplace=True)
# Other race categories
#                          'HC01_VC99':'other_only',
#                          'HC01_VC100':'more_two_races',
#                          'HC01_VC101':'more_two_races_other',
#                          'HC01_VC102':'more_two_races_other_exclude',


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [41]:
demographic16.dtypes

GEO.id                  object
GEO.id2                  int64
city                    object
tot_pop                  int64
male                     int64
male_pct               float64
female                   int64
female_pct             float64
under_5                  int64
under_5_pct            float64
5_9                      int64
5_9_pct                float64
10_14                    int64
10_14_pct              float64
15_19                    int64
15_19_pct              float64
20_24                    int64
20_24_pct              float64
25_34                    int64
25_34_pct              float64
35_44                    int64
35_44_pct              float64
45_54                    int64
45_54_pct              float64
55_59                    int64
55_59_pct              float64
60_64                    int64
60_64_pct              float64
65_74                    int64
65_74_pct              float64
75_84                    int64
75_84_pct              float64
85_over 

In [42]:
demographic16['city']=demographic16.city.replace(' city, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
demographic16['city']=demographic16.city.replace(' CDP, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
demographic16['city']=demographic16.city.replace(' town, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [45]:
## Uppercase cities
demographic16['city']=demographic16['city'].str.title()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [46]:
demographic16=demographic16.drop(columns=['HC01_VC99','HC03_VC99','HC01_VC100','HC03_VC100','HC01_VC101','HC03_VC101','HC01_VC102','HC03_VC102'])
demographic16.head()

Unnamed: 0,GEO.id,GEO.id2,city,tot_pop,male,male_pct,female,female_pct,under_5,under_5_pct,...,black,black_pct,aian,aian_pct,asian,asian_pct,nhopi,nhopi_pct,other,other_pct
0,1600000US0600135,600135,Acalanes Ridge,1000,409,40.9,591,59.1,33,3.3,...,0,0.0,0,0.0,209,20.9,0,0.0,322,32.2
1,1600000US0600156,600156,Acampo,466,291,62.4,175,37.6,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,1600000US0600212,600212,Acton,7170,3776,52.7,3394,47.3,350,4.9,...,120,1.7,15,0.2,67,0.9,0,0.0,254,3.6
3,1600000US0600296,600296,Adelanto,32311,16350,50.6,15961,49.4,3148,9.7,...,5810,18.0,12,0.0,497,1.5,0,0.0,2061,6.3
4,1600000US0600310,600310,Adin,155,59,38.1,96,61.9,7,4.5,...,0,0.0,0,0.0,0,0.0,0,0.0,10,6.4


### Median Household Income and Health insurance Status

Median Household Income has categories 2,500-, 250,000+ which was changed to 2500 and 250000. 

In [47]:
# Import and clean acs data for income and insurance status
acs16 = pd.read_csv("vaxxfacts/raw_data/Census/ACS_16_economic.csv", encoding = "ISO-8859-1", header=[0], na_values=['-']) 

In [48]:
acs16.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC01_VC04,HC02_VC04,HC03_VC04,...,HC03_VC178,HC04_VC178,HC01_VC179,HC02_VC179,HC03_VC179,HC04_VC179,HC01_VC180,HC02_VC180,HC03_VC180,HC04_VC180
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",747,288,747,(X),497,296,66.5,...,0.0,26.1,(X),(X),0.0,3.5,(X),(X),0.0,26.9
1,1600000US0600156,600156,"Acampo CDP, California",466,398,466,(X),368,359,79.0,...,0.0,27.6,(X),(X),0.0,9.3,(X),(X),0.0,21.6
2,1600000US0600212,600212,"Acton CDP, California",6098,398,6098,(X),3917,339,64.2,...,5.2,3.6,(X),(X),1.6,1.2,(X),(X),39.7,11.7
3,1600000US0600296,600296,"Adelanto city, California",21534,628,21534,(X),9910,597,46.0,...,21.3,6.9,(X),(X),41.5,5.6,(X),(X),49.4,5.9
4,1600000US0600310,600310,"Adin CDP, California",123,63,123,(X),51,36,41.5,...,32.5,43.5,(X),(X),0.0,28.3,(X),(X),34.0,32.3


In [49]:
acs16=acs16[['GEO.id','GEO.id2','GEO.display-label','HC01_VC85','HC01_VC131','HC03_VC131',
                'HC01_VC132','HC03_VC132','HC01_VC133','HC03_VC133','HC01_VC134','HC03_VC134']]
income16=acs16

In [50]:
income16.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC85,HC01_VC131,HC03_VC131,HC01_VC132,HC03_VC132,HC01_VC133,HC03_VC133,HC01_VC134,HC03_VC134
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,"Acampo CDP, California",155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,"Acton CDP, California",91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,"Adelanto city, California",29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,"Adin CDP, California",55625.0,150,96.8,111,71.6,62,40.0,5,3.2


In [51]:
income16.dtypes

GEO.id                object
GEO.id2                int64
GEO.display-label     object
HC01_VC85            float64
HC01_VC131             int64
HC03_VC131           float64
HC01_VC132             int64
HC03_VC132           float64
HC01_VC133             int64
HC03_VC133           float64
HC01_VC134             int64
HC03_VC134           float64
dtype: object

In [52]:
income16.rename(columns={'GEO.display-label': 'city',
                         'HC01_VC85': 'median_income',
                         'HC01_VC131': 'insurance',
                         'HC01_VC132': 'private_insure',
                         'HC01_VC133': 'public_insure',
                         'HC01_VC134': 'no_insurance',

                         'HC03_VC131': 'insurance_pct',
                         'HC03_VC132': 'private_insure_pct',
                         'HC03_VC133': 'public_insure_pct',
                         'HC03_VC134': 'no_insurance_pct'  
                        }, 
                inplace=True)

In [53]:
income16.dtypes

GEO.id                 object
GEO.id2                 int64
city                   object
median_income         float64
insurance               int64
insurance_pct         float64
private_insure          int64
private_insure_pct    float64
public_insure           int64
public_insure_pct     float64
no_insurance            int64
no_insurance_pct      float64
dtype: object

In [54]:
income16['city']=income16.city.replace(' CDP, California', '', regex=True)

In [55]:
income16['city']=income16.city.replace(' city, California', '', regex=True)

In [56]:
income16['city']=income16.city.replace(' town, California', '', regex=True)

In [57]:
income16.head()

Unnamed: 0,GEO.id,GEO.id2,city,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,1600000US0600135,600135,Acalanes Ridge,187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,Acampo,155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,Acton,91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,Adelanto,29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,Adin,55625.0,150,96.8,111,71.6,62,40.0,5,3.2


## Join Demographics and income data

In [58]:
pop_data=demographic16.merge(income16, left_on='GEO.id', right_on='GEO.id', how='outer')
pop_data.head()

Unnamed: 0,GEO.id,GEO.id2_x,city_x,tot_pop,male,male_pct,female,female_pct,under_5,under_5_pct,...,city_y,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,1600000US0600135,600135,Acalanes Ridge,1000,409,40.9,591,59.1,33,3.3,...,Acalanes Ridge,187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,Acampo,466,291,62.4,175,37.6,0,0.0,...,Acampo,155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,Acton,7170,3776,52.7,3394,47.3,350,4.9,...,Acton,91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,Adelanto,32311,16350,50.6,15961,49.4,3148,9.7,...,Adelanto,29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,Adin,155,59,38.1,96,61.9,7,4.5,...,Adin,55625.0,150,96.8,111,71.6,62,40.0,5,3.2


In [59]:
list(pop_data.columns.values)

['GEO.id',
 'GEO.id2_x',
 'city_x',
 'tot_pop',
 'male',
 'male_pct',
 'female',
 'female_pct',
 'under_5',
 'under_5_pct',
 '5_9',
 '5_9_pct',
 '10_14',
 '10_14_pct',
 '15_19',
 '15_19_pct',
 '20_24',
 '20_24_pct',
 '25_34',
 '25_34_pct',
 '35_44',
 '35_44_pct',
 '45_54',
 '45_54_pct',
 '55_59',
 '55_59_pct',
 '60_64',
 '60_64_pct',
 '65_74',
 '65_74_pct',
 '75_84',
 '75_84_pct',
 '85_over',
 '85_over_pct',
 'median_age',
 'median_age_pct',
 'hispanic_latino',
 'hispanic_latino_pct',
 'white',
 'white_pct',
 'black',
 'black_pct',
 'aian',
 'aian_pct',
 'asian',
 'asian_pct',
 'nhopi',
 'nhopi_pct',
 'other',
 'other_pct',
 'GEO.id2_y',
 'city_y',
 'median_income',
 'insurance',
 'insurance_pct',
 'private_insure',
 'private_insure_pct',
 'public_insure',
 'public_insure_pct',
 'no_insurance',
 'no_insurance_pct']

In [60]:
pop_data=pop_data.drop(columns=['GEO.id2_y','city_y'])

In [61]:
pop_data=pop_data.rename(columns={'GEO.id': 'geoid',
                         'GEO.id2_x': 'geoid2',
                         'city_x': 'city'})

In [62]:
# Fix City naming to be similar for special characters
pop_data.loc[pop_data.city=='La Cañada Flintridge', 'city']='La Canada Flintridge'
pop_data.loc[pop_data.city=='Piñon Hills', 'city']='Pinon Hills'

# Change census cities names that are present in multiple places to match vaccine data city
pop_data.loc[pop_data.city=='El Sobrante Cdp (Contra Costa County), California', 'city']='El Sobrante (Contra Costa County)'
pop_data.loc[pop_data.city=='Edwards Afb', 'city']='Edwards AFB'
pop_data.loc[pop_data.city=='Las Flores Cdp (Orange County), California', 'city']='Las Flores (Orange County)'
pop_data.loc[pop_data.city=='West Whittier-Los Nietos CDP, California', 'city']='Los Nietos'
pop_data.loc[pop_data.city=="El Paso de Robles (Paso Robles)", 'city']='Paso Robles'
pop_data.loc[pop_data.city=='San Miguel Cdp (San Luis Obispo County), California', 'city']='San Miguel'
pop_data.loc[pop_data.city=='Spring Valley Cdp (San Diego County), California', 'city']='Spring Valley'

In [63]:
pop_data.head()

Unnamed: 0,geoid,geoid2,city,tot_pop,male,male_pct,female,female_pct,under_5,under_5_pct,...,other_pct,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,1600000US0600135,600135,Acalanes Ridge,1000,409,40.9,591,59.1,33,3.3,...,32.2,187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,Acampo,466,291,62.4,175,37.6,0,0.0,...,0.0,155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,Acton,7170,3776,52.7,3394,47.3,350,4.9,...,3.6,91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,Adelanto,32311,16350,50.6,15961,49.4,3148,9.7,...,6.3,29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,Adin,155,59,38.1,96,61.9,7,4.5,...,6.4,55625.0,150,96.8,111,71.6,62,40.0,5,3.2


In [64]:
# Exported merged census data
pop_data.to_csv("pop_data16.csv", index=False)

## Join Vaccine data to Census data

Join needs unique cities in the vaccine data to use as a key. Will need to figure out how to weight the different grade levels based on the census population numbers for age. 

In [65]:
vac_pop_1718=vac_1718.merge(pop_data, left_on='city', right_on='city', how='left')
vac_pop_1718

Unnamed: 0,facility_num,county,is_public,city,facility_name,enrollment,n,pct,vac_info_type,reported,...,other_pct,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,10214883,Alameda,HEAD START,Alameda,ALAMEDA HEAD START . ANGELA AGUILAR CENTER,63,61,98,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
1,13417425,Alameda,HEAD START,Alameda,ALAMEDA HEAD START . SUE MATHESON CENTER,47,44,95,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
2,13420593,Alameda,HEAD START,Alameda,ALAMEDA HEAD START. COLLEGE OF ALAMEDA HEAD START,44,41,95,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
3,13415928,Alameda,PRIVATE,Alameda,BRIGHT HORIZONS AT GARNER,109,107,99,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
4,13419403,Alameda,PRIVATE,Alameda,PETER PAN SCHOOL,100,99,99,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
5,13418175,Alameda,PRIVATE,Alameda,FUZZY CATERPILLAR,72,70,98,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
6,13419405,Alameda,PRIVATE,Alameda,PETER PAN ACADEMY,50,49,98,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
7,13420949,Alameda,PRIVATE,Alameda,RISING STAR MONTESSORI SCHOOL,75,73,98,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
8,13418967,Alameda,PRIVATE,Alameda,SMALL SIZE BIG MIND PRESCHOOL & INFANT CENTER,55,53,98,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
9,13411388,Alameda,PRIVATE,Alameda,ABC PRESCHOOL,30,28,95,ChildCareData,Y,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3


In [66]:
#Export full cleaned data
vac_pop_1718.to_csv("cleanjoin1718.csv", index=False)

In [67]:
vac_pop_1718.dtypes

facility_num             int64
county                  object
is_public               object
city                    object
facility_name           object
enrollment               int64
n                        int64
pct                      int64
vac_info_type           object
reported                object
geoid                   object
geoid2                 float64
tot_pop                float64
male                   float64
male_pct               float64
female                 float64
female_pct             float64
under_5                float64
under_5_pct            float64
5_9                    float64
5_9_pct                float64
10_14                  float64
10_14_pct              float64
15_19                  float64
15_19_pct              float64
20_24                  float64
20_24_pct              float64
25_34                  float64
25_34_pct              float64
35_44                  float64
                        ...   
65_74_pct              float64
75_84   