In [32]:
import numpy as np
import pandas as pd

## 2015-2016 school vaccination data
- The 3 raw datafiles for childcare, kindergarden, and 7th grade has been compiled manually into a csv called `raw15_16_combined.csv` with <= and >= removed and replaced with the corresponding numbers

In [33]:
# Import raw combined 2015-2016 school vaccination data
pertusis_1516 = pd.read_csv("Raw_Data/raw15_16_combined.csv",encoding = "ISO-8859-1")

In [34]:
pertusis_1516.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,13417683,ALAMEDA,PRIVATE,HAYWARD,A JOYFUL NOISE LEARNING CENTER,26.0,26.0,100.0,childcare,Y
1,13420589,ALAMEDA,PUBLIC,ALAMEDA,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99.0,93.0,93.94,childcare,Y
2,13422032,ALAMEDA,,SAN LEANDRO,AB'S PRESCHOOL AND DAYCARE,,,,childcare,N
3,13419449,ALAMEDA,PRIVATE,FREMONT,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45.0,42.0,93.33,childcare,Y
4,13417471,ALAMEDA,PRIVATE,FREMONT,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25.0,25.0,100.0,childcare,Y


In [35]:
#Clean up missing values and set to Nan
pertusis_1516.n=pertusis_1516.n.replace('.', np.nan)
pertusis_1516.pct=pertusis_1516.pct.replace('.',np.nan)

# uppercase all counties and cities
pertusis_1516.COUNTY=pertusis_1516['COUNTY'].str.upper()
pertusis_1516.CITY=pertusis_1516['CITY'].str.upper()

In [36]:
pertusis_1516.CITY

0                HAYWARD
1                ALAMEDA
2            SAN LEANDRO
3                FREMONT
4                FREMONT
5                ALAMEDA
6          CASTRO VALLEY
7                OAKLAND
8                OAKLAND
9                OAKLAND
10               OAKLAND
11            UNION CITY
12           SAN LORENZO
13           SAN LORENZO
14            PLEASANTON
15               OAKLAND
16               ALAMEDA
17               ALAMEDA
18               ALAMEDA
19                ALBANY
20                ALBANY
21                ALBANY
22         CASTRO VALLEY
23               HAYWARD
24               OAKLAND
25               FREMONT
26                ALBANY
27               ALAMEDA
28            EMERYVILLE
29               OAKLAND
              ...       
21493            ESPARTO
21494    WEST SACRAMENTO
21495    WEST SACRAMENTO
21496    WEST SACRAMENTO
21497    WEST SACRAMENTO
21498    WEST SACRAMENTO
21499    WEST SACRAMENTO
21500    WEST SACRAMENTO
21501    WEST SACRAMENTO


In [37]:
pertusis_1516.dtypes

FACILITY_NUMBER         int64
COUNTY                 object
pub_priv_headstart     object
CITY                   object
FACILITY_NAME          object
ENROLLMENT            float64
n                     float64
pct                   float64
vac_info_type          object
REPORTED               object
dtype: object

In [38]:
# Change n and percent to floats
pertusis_1516[['n', 'pct']] = pertusis_1516[['n', 'pct']].astype('float64')

In [39]:
# Check conversion
pertusis_1516.dtypes

FACILITY_NUMBER         int64
COUNTY                 object
pub_priv_headstart     object
CITY                   object
FACILITY_NAME          object
ENROLLMENT            float64
n                     float64
pct                   float64
vac_info_type          object
REPORTED               object
dtype: object

In [40]:
pertusis_1516.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,13417683,ALAMEDA,PRIVATE,HAYWARD,A JOYFUL NOISE LEARNING CENTER,26.0,26.0,100.0,childcare,Y
1,13420589,ALAMEDA,PUBLIC,ALAMEDA,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99.0,93.0,93.94,childcare,Y
2,13422032,ALAMEDA,,SAN LEANDRO,AB'S PRESCHOOL AND DAYCARE,,,,childcare,N
3,13419449,ALAMEDA,PRIVATE,FREMONT,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45.0,42.0,93.33,childcare,Y
4,13417471,ALAMEDA,PRIVATE,FREMONT,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25.0,25.0,100.0,childcare,Y


In [41]:
sum(pertusis_1516.REPORTED=='N')

1823

In [42]:
sum(pertusis_1516.REPORTED=='Y')

19700

Subset dataset to those that reported

In [43]:
# Take subset that have numbers
vac_1516=pertusis_1516[pertusis_1516.REPORTED=='Y']

In [44]:
vac_1516.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,13417683,ALAMEDA,PRIVATE,HAYWARD,A JOYFUL NOISE LEARNING CENTER,26.0,26.0,100.0,childcare,Y
1,13420589,ALAMEDA,PUBLIC,ALAMEDA,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99.0,93.0,93.94,childcare,Y
3,13419449,ALAMEDA,PRIVATE,FREMONT,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45.0,42.0,93.33,childcare,Y
4,13417471,ALAMEDA,PRIVATE,FREMONT,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25.0,25.0,100.0,childcare,Y
5,13411388,ALAMEDA,PRIVATE,ALAMEDA,ABC PRESCHOOL,35.0,34.0,97.14,childcare,Y


Numbers in the original document have been surpressed for <=1%, <=2%, <=5% and >=95%,>=98%,>=99% so will impute values by assuming that the percentage is equal to whatever value listed and multiply by the number of students enrolled at the school. 

In [45]:
# Calculate n from enrollment and approximate percentage
vac_1516.loc[:,('n')]=vac_1516.n.fillna(vac_1516.ENROLLMENT*vac_1516.pct/100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [46]:
vac_1516.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,13417683,ALAMEDA,PRIVATE,HAYWARD,A JOYFUL NOISE LEARNING CENTER,26.0,26.0,100.0,childcare,Y
1,13420589,ALAMEDA,PUBLIC,ALAMEDA,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99.0,93.0,93.94,childcare,Y
3,13419449,ALAMEDA,PRIVATE,FREMONT,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45.0,42.0,93.33,childcare,Y
4,13417471,ALAMEDA,PRIVATE,FREMONT,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25.0,25.0,100.0,childcare,Y
5,13411388,ALAMEDA,PRIVATE,ALAMEDA,ABC PRESCHOOL,35.0,34.0,97.14,childcare,Y


In [47]:
# Change to integers 
vac_1516.loc[:,('ENROLLMENT','n','pct')]=vac_1516.loc[:,('ENROLLMENT','n','pct')].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [48]:
vac_1516.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,13417683,ALAMEDA,PRIVATE,HAYWARD,A JOYFUL NOISE LEARNING CENTER,26,26,100,childcare,Y
1,13420589,ALAMEDA,PUBLIC,ALAMEDA,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99,93,93,childcare,Y
3,13419449,ALAMEDA,PRIVATE,FREMONT,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45,42,93,childcare,Y
4,13417471,ALAMEDA,PRIVATE,FREMONT,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25,25,100,childcare,Y
5,13411388,ALAMEDA,PRIVATE,ALAMEDA,ABC PRESCHOOL,35,34,97,childcare,Y


In [49]:
# Change print settings to see all
#np.set_printoptions(threshold=np.nan)
#change back to default printing length
np.set_printoptions(threshold=5)
# Check County names
sorted(list(vac_1516.COUNTY.unique()))

['ALAMEDA',
 'AMADOR',
 'BERKELEY CITY',
 'BUTTE',
 'CALAVERAS',
 'COLUSA',
 'CONTRA COSTA',
 'DEL NORTE',
 'EL DORADO',
 'FRESNO',
 'GLENN',
 'HUMBOLDT',
 'IMPERIAL',
 'INYO',
 'KERN',
 'KINGS',
 'LAKE',
 'LASSEN',
 'LOS ANGELES',
 'MADERA',
 'MARIN',
 'MARIPOSA',
 'MENDOCINO',
 'MERCED',
 'MODOC',
 'MONO',
 'MONTEREY',
 'NAPA',
 'NEVADA',
 'ORANGE',
 'PLACER',
 'PLUMAS',
 'RIVERSIDE',
 'SACRAMENTO',
 'SAN BENITO',
 'SAN BERNARDINO',
 'SAN DIEGO',
 'SAN FRANCISCO',
 'SAN JOAQUIN',
 'SAN LUIS OBISPO',
 'SAN MATEO',
 'SANTA BARBARA',
 'SANTA CLARA',
 'SANTA CRUZ',
 'SHASTA',
 'SIERRA',
 'SISKIYOU',
 'SOLANO',
 'SONOMA',
 'STANISLAUS',
 'SUTTER',
 'TEHAMA',
 'TRINITY',
 'TULARE',
 'TUOLUMNE',
 'VENTURA',
 'YOLO',
 'YUBA']

In [50]:
#Check city names
sorted(list(vac_1516.CITY.unique()))

['ACAMPO',
 'ACTON',
 'ADELANTO',
 'AGOURA',
 'AGOURA HILLS',
 'AGUA DULCE',
 'AGUANGA',
 'AHWAHNEE',
 'ALAMEDA',
 'ALAMO',
 'ALBANY',
 'ALHAMBRA',
 'ALISO VIEJO',
 'ALLENSWORTH',
 'ALPAUGH',
 'ALPINE',
 'ALTA',
 'ALTA LOMA',
 'ALTADENA',
 'ALTURAS',
 'ALVISO',
 'AMERICAN CANYON',
 'ANAHEIM',
 'ANAHEIM HILLS',
 'ANDERSON',
 'ANGELS CAMP',
 'ANGWIN',
 'ANTELOPE',
 'ANTIOCH',
 'ANZA',
 'APPLE VALLEY',
 'APTOS',
 'ARBOGA',
 'ARBUCKLE',
 'ARCADIA',
 'ARCATA',
 'ARCATA CA',
 'ARLETA',
 'ARMONA',
 'ARNOLD',
 'AROMAS',
 'ARROYO GRANDE',
 'ARTESIA',
 'ARVIN',
 'ATASCADERO',
 'ATHERTON',
 'ATWATER',
 'AUBERRY',
 'AUBURN',
 'AVALON',
 'AVENAL',
 'AVERY',
 'AZUSA',
 'BAKER',
 'BAKERSFIELD',
 'BALDWIN PARK',
 'BALDY MESA',
 'BALLICO',
 'BANGOR',
 'BANNING',
 'BANTA',
 'BARSTOW',
 'BAY POINT',
 'BAYSIDE',
 'BEALE AFB',
 'BEALE AIR FORCE BASE',
 'BEAUMONT',
 'BELL',
 'BELL GARDENS',
 'BELLA VISTA',
 'BELLFLOWER',
 'BELMONT',
 'BELVEDERE',
 'BEN LOMOND',
 'BENICIA',
 'BERKELEY',
 'BERKELEY AVE',
 'BE

In [51]:
# Fix City Typos
vac_1516.loc[vac_1516.CITY=='ACTON, CA', 'CITY']='ACTON'
vac_1516.loc[vac_1516.CITY=='AGOURA', 'CITY']='AGOURA HILLS'
vac_1516.loc[vac_1516.CITY=='ANAHEIM,', 'CITY']='ANAHEIM'
vac_1516.loc[vac_1516.CITY=='ANANHEIM', 'CITY']='ANAHEIM'
vac_1516.loc[vac_1516.CITY=='CA', 'CITY']='NAPA'
vac_1516.loc[vac_1516.CITY=='BEALE AIR FORCE BASE', 'CITY']='BEALE AFB' # to match census name
vac_1516.loc[vac_1516.CITY=='BENICA', 'CITY']='BENICIA'
vac_1516.loc[vac_1516.CITY=='CARDIFF', 'CITY']='CARDIFF BY THE SEA'
vac_1516.loc[vac_1516.CITY=='CARMEL', 'CITY']='CARMEL VALLEY'
vac_1516.loc[vac_1516.CITY=='CHINO,', 'CITY']='CHINO'
vac_1516.loc[vac_1516.CITY=='CHULAR', 'CITY']='CHUALAR'
vac_1516.loc[vac_1516.CITY=='CITY OF COMMERCE', 'CITY']='COMMERCE'
vac_1516.loc[vac_1516.CITY=='COSTA MESA,', 'CITY']='COSTA MESA'
vac_1516.loc[vac_1516.CITY=='CITY OF INDUSTRY', 'CITY']='INDUSTRY'
vac_1516.loc[vac_1516.CITY=='CUDAHAY', 'CITY']='CUDAHY'
vac_1516.loc[vac_1516.CITY=='E. NICOLAUS', 'CITY']='EAST NICOLAUS'
vac_1516.loc[vac_1516.CITY=='E. RANCHO DOMINGUEZ', 'CITY']='EAST RANCHO DOMINGUEZ'
vac_1516.loc[vac_1516.CITY=='E. WHITTIER', 'CITY']='EAST WHITTIER'
vac_1516.loc[vac_1516.CITY=='EL SEGUNDO,', 'CITY']='EL SEGUNDO'
vac_1516.loc[vac_1516.CITY=='FAIRIFELD', 'CITY']='FAIRFIELD'
vac_1516.loc[vac_1516.CITY=='FREMOTN', 'CITY']='FREMONT'
vac_1516.loc[vac_1516.CITY=='FT. IRWIN', 'CITY']='FORT IRWIN'
vac_1516.loc[vac_1516.CITY=='GREENFILED', 'CITY']='GREENFIELD'
vac_1516.loc[vac_1516.CITY=='HUNTINGTON', 'CITY']='HUNTINGTON PARK'
vac_1516.loc[vac_1516.CITY=='JAMUAL', 'CITY']='JAMUL'
vac_1516.loc[vac_1516.CITY=='LA', 'CITY']='LOS ANGELES'
vac_1516.loc[vac_1516.CITY=='LA CANADA', 'CITY']='LA CANADA FLINTRIDGE'
vac_1516.loc[vac_1516.CITY=='LA CRESCENTA', 'CITY']='LA CRESCENTA-MONTROSE'
vac_1516.loc[vac_1516.CITY=='LAKE VIEW TERRANCE', 'CITY']='LAKE VIEW TERRACE'
vac_1516.loc[vac_1516.CITY=='LAKEVIEW TERRACE', 'CITY']='LAKE VIEW TERRACE'
vac_1516.loc[vac_1516.CITY=='LANCASTER,', 'CITY']='LANCASTER'
vac_1516.loc[vac_1516.CITY=='LAVERNE', 'CITY']='LA VERNE'
vac_1516.loc[vac_1516.CITY=='MC KINLEYVILLE', 'CITY']='MCKINLEYVILLE'
vac_1516.loc[vac_1516.CITY=='MONTROSE', 'CITY']='LA CRESCENTA-MONTROSE'
vac_1516.loc[vac_1516.CITY=='MT. SHASTA', 'CITY']='MOUNT SHASTA'
vac_1516.loc[vac_1516.CITY=='N. HOLLYWOOD', 'CITY']='NORTH HOLLYWOOD'
vac_1516.loc[vac_1516.CITY=='N.A.S. LEMOORE', 'CITY']='LEMOORE STATION' # match census
vac_1516.loc[vac_1516.CITY=='NAS LEMOORE', 'CITY']='LEMOORE STATION'
vac_1516.loc[vac_1516.CITY=='NEWPORT BEACH,', 'CITY']='NEWPORT BEACH'
vac_1516.loc[vac_1516.CITY=='PACOMIA', 'CITY']='PACOIMA'
vac_1516.loc[vac_1516.CITY=='PANORAM ', 'CITY']='PANORAMA CITY'
vac_1516.loc[vac_1516.CITY=='PANORAM CITY', 'CITY']='PANORAMA CITY'
vac_1516.loc[vac_1516.CITY=='PT. REYES STATION', 'CITY']='POINT REYES STATION'
vac_1516.loc[vac_1516.CITY=='RANCHO SANTA MARGARI', 'CITY']='RANCHO SANTA MARGARITA'
vac_1516.loc[vac_1516.CITY=='RANCHO SAN MARGARITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1516.loc[vac_1516.CITY=='RANCHO STA MARGAITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1516.loc[vac_1516.CITY=='RANCHO STA MARGARITA', 'CITY']='RANCHO SANTA MARGARITA'
vac_1516.loc[vac_1516.CITY=='RCHO STA MARG', 'CITY']='RANCHO SANTA MARGARITA'
vac_1516.loc[vac_1516.CITY=='RANCHOS PALOS VERDES', 'CITY']='RANCHO PALOS VERDES'
vac_1516.loc[vac_1516.CITY=='RIVERSIDE,', 'CITY']='RIVERSIDE'
vac_1516.loc[vac_1516.CITY=='ROLLING HILLS ESTATE', 'CITY']='ROLLING HILLS ESTATES'
vac_1516.loc[vac_1516.CITY=='S. EL MONTE', 'CITY']='SOUTH EL MONTE'
vac_1516.loc[vac_1516.CITY=='S. LAKE TAHOE', 'CITY']='SOUTH LAKE TAHOE'
vac_1516.loc[vac_1516.CITY=='SAN BERARDINO', 'CITY']='SAN BERNARDINO'
vac_1516.loc[vac_1516.CITY=='SAN FRNCISCO', 'CITY']='SAN FRANCISCO'
vac_1516.loc[vac_1516.CITY=='SAN JOSE,', 'CITY']='SAN JOSE'
vac_1516.loc[vac_1516.CITY=='SANTA BARARA', 'CITY']='SANTA BARBARA'
vac_1516.loc[vac_1516.CITY=='SHASTA LAKE CITY', 'CITY']='SHASTA LAKE'
vac_1516.loc[vac_1516.CITY=='SILVERADO CANYON', 'CITY']='SILVERADO'
vac_1516.loc[vac_1516.CITY=='SOUTH LATE TAHOE', 'CITY']='SOUTH LAKE TAHOE'
vac_1516.loc[vac_1516.CITY=='SPRECKLES', 'CITY']='SPRECKELS'
vac_1516.loc[vac_1516.CITY=='STEVENSONS RANCH', 'CITY']='STEVENSON RANCH'
vac_1516.loc[vac_1516.CITY=='SUISUN', 'CITY']='SUISUN CITY'
vac_1516.loc[vac_1516.CITY=='SUNNYALE', 'CITY']='SUNNYVALE'
vac_1516.loc[vac_1516.CITY=='SUPELVEDA', 'CITY']='SEPULVEDA'
vac_1516.loc[vac_1516.CITY=='TURLOCK,', 'CITY']='TURLOCK'
vac_1516.loc[vac_1516.CITY=='UPPERLAKE', 'CITY']='UPPER LAKE'
vac_1516.loc[vac_1516.CITY=='VANDENBERG AIR FORCE BASE', 'CITY']='VANDENBERG AFB' # match census
vac_1516.loc[vac_1516.CITY=='W. SACRAMENTO', 'CITY']='WEST SACRAMENTO'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [52]:
# Change case to title case
vac_1516.COUNTY=vac_1516['COUNTY'].str.title()
vac_1516.CITY=vac_1516['CITY'].str.title()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [53]:
# RECODE NEIGHBORHOODS TO CITIES to match Census
vac_1516.loc[vac_1516.CITY=='Alta Loma', 'CITY']='Rancho Cucamonga'
vac_1516.loc[vac_1516.CITY=='Alviso', 'CITY']='San Jose'
vac_1516.loc[vac_1516.CITY=='Anaheim Hills', 'CITY']='Anaheim'
vac_1516.loc[vac_1516.CITY=='Angels Camp', 'CITY']='Angels City'
vac_1516.loc[vac_1516.CITY=='Arleta', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Canoga Park', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Canyon Country', 'CITY']='Santa Clarita'
vac_1516.loc[vac_1516.CITY=='Capistrano Beach', 'CITY']='Dana Point'
vac_1516.loc[vac_1516.CITY=='Cardiff By The Sea', 'CITY']='Encinitas'
vac_1516.loc[vac_1516.CITY=='Carmel Valley', 'CITY']='Carmel Valley Village'
vac_1516.loc[vac_1516.CITY=='Chatsworth', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Corona Del Mar', 'CITY']='Newport Beach'
vac_1516.loc[vac_1516.CITY=='Edwards', 'CITY']='Edwards AFB'
vac_1516.loc[vac_1516.CITY=='Emerald Hills', 'CITY']='San Diego'
vac_1516.loc[vac_1516.CITY=='Encino', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Etiwanda', 'CITY']='Rancho Cucamonga'
vac_1516.loc[vac_1516.CITY=='Foothill Ranch', 'CITY']='Lake Forest'
vac_1516.loc[vac_1516.CITY=='Granada Hills', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Harbor City', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Highland Park', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Hollywood', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Idyllwild', 'CITY']='Idyllwild-Pine Cove'
vac_1516.loc[vac_1516.CITY=='La Jolla', 'CITY']='San Diego'
vac_1516.loc[vac_1516.CITY=='Lake Balboa', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Lake View Terrace', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Leucadia', 'CITY']='Encinitas'
vac_1516.loc[vac_1516.CITY=='Mar Vista', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Mcclellan', 'CITY']='North Highlands'
vac_1516.loc[vac_1516.CITY=='Mira Loma', 'CITY']='Jurupa Valley'
vac_1516.loc[vac_1516.CITY=='Murrietta', 'CITY']='Murrieta'
vac_1516.loc[vac_1516.CITY=='Newbury Park', 'CITY']='Thousand Oaks'
vac_1516.loc[vac_1516.CITY=='Newhall', 'CITY']='Santa Clarita'
vac_1516.loc[vac_1516.CITY=='North Hills', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='North Hollywood', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Northridge', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Olympic Valley', 'CITY']='Squaw Valley'
vac_1516.loc[vac_1516.CITY=='Pacific Palisades', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Pacoima', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Palos Verdes', 'CITY']='Palos Verdes Estates'
vac_1516.loc[vac_1516.CITY=='Palos Verdes Peninsula', 'CITY']='Palos Verdes Estates'
vac_1516.loc[vac_1516.CITY=='Panorama City', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Playa Del Rey', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Playa Vista', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Porter Ranch', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Quail Valley', 'CITY']='Menifee'
vac_1516.loc[vac_1516.CITY=='Reseda', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='San Pedro', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='San Ysidro', 'CITY']='San Diego'
vac_1516.loc[vac_1516.CITY=='Santa Catalina', 'CITY']='Avalon'
vac_1516.loc[vac_1516.CITY=='Saugus', 'CITY']='Santa Clarita'
vac_1516.loc[vac_1516.CITY=='Sepulveda', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Sherman Oaks', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Studio City', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Sun City', 'CITY']='Menifee'
vac_1516.loc[vac_1516.CITY=='Sun Valley', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Sunland', 'CITY']='Sunland-Tujunga'
vac_1516.loc[vac_1516.CITY=='Sunset Beach', 'CITY']='Huntington Beach'
vac_1516.loc[vac_1516.CITY=='Sylmar', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Royal Oaks', 'CITY']='Interlaken'
vac_1516.loc[vac_1516.CITY=='Tahoe City', 'CITY']='Sunnyside–Tahoe City'
vac_1516.loc[vac_1516.CITY=='Tarzana', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Tujunga', 'CITY']='Sunland-Tujunga'
vac_1516.loc[vac_1516.CITY=='Tuolumne', 'CITY']='Tuolumne City'
vac_1516.loc[vac_1516.CITY=='Twenty Nine Palms', 'CITY']='Twentynine Palms'
vac_1516.loc[vac_1516.CITY=='Valencia', 'CITY']='Santa Clarita'
vac_1516.loc[vac_1516.CITY=='Valley Glen', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Valley Village', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Van Nuys', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Venice', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Ventura', 'CITY']="San Buenaventura (Ventura)"
vac_1516.loc[vac_1516.CITY=='Walnut Creet', 'CITY']='Walnut Creek'
vac_1516.loc[vac_1516.CITY=='Walnut Valley', 'CITY']='Walnut'
vac_1516.loc[vac_1516.CITY=='West Hills', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Westchester', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Westminister', 'CITY']='Westminster'
vac_1516.loc[vac_1516.CITY=='Wilmington', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Winnetka', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Woodland Hills', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Angels City', 'CITY']='Angels city'
vac_1516.loc[vac_1516.CITY=='Cardiff-By-The-Sea', 'CITY']='Encinitas city'
vac_1516.loc[vac_1516.CITY=='Davis,', 'CITY']='Davis city'
vac_1516.loc[vac_1516.CITY=='La CaÃ±ada', 'CITY']='La Canada Flintridge'
vac_1516.loc[vac_1516.CITY=='Los Nietos', 'CITY']='West Whittier-Los Nietos CDP'
vac_1516.loc[vac_1516.CITY=='Montclair,', 'CITY']='Montclair city'
vac_1516.loc[vac_1516.CITY=='Newport Coast', 'CITY']='Newport Coast CDP'
vac_1516.loc[vac_1516.CITY=='Paso Robles', 'CITY']='El Paso de Robles (Paso Robles)'
vac_1516.loc[vac_1516.CITY=='South San Francisoc', 'CITY']='South San Francisco city
vac_1516.loc[vac_1516.CITY=='Sunland-Tujunga', 'CITY']='Los Angeles'
vac_1516.loc[vac_1516.CITY=='Arcata Ca', 'CITY']='Arcata'
vac_1516.loc[vac_1516.CITY=='Berkeley Ave', 'CITY']='Berkeley'
vac_1516.loc[vac_1516.CITY=='Camp Pendelton', 'CITY']='Camp Pendleton North CDP'
vac_1516.loc[vac_1516.CITY=='Carmicheal', 'CITY']='Carmichael CPD'
vac_1516.loc[vac_1516.CITY=='Laguan Niguel', 'CITY']='Laguna Niguel city'
vac_1516.loc[vac_1516.CITY=='Laguna Niguel Ste 3409', 'CITY']='Laguna Niguel city'
vac_1516.loc[vac_1516.CITY=='Lapuente', 'CITY']='La Puente city'
vac_1516.loc[vac_1516.CITY=='Las Flores', 'CITY']='Las Flores CDP''
vac_1516.loc[vac_1516.CITY=='Mt. View', 'CITY']='Mountain View city'
vac_1516.loc[vac_1516.CITY=='Mision Viejo', 'CITY']='Mission Viejo city'
vac_1516.loc[vac_1516.CITY=='S Lake Tahoe', 'CITY']='South Lake Tahoe city'
vac_1516.loc[vac_1516.CITY=='Saint Helena', 'CITY']='St. Helena city'
vac_1516.loc[vac_1516.CITY=='Santa Clara Ca', 'CITY']='Santa Clara city'
vac_1516.loc[vac_1516.CITY=='St Helena', 'CITY']='St Helena city'
vac_1516.loc[vac_1516.CITY=='Stantan', 'CITY']='Stanton city'
vac_1516.loc[vac_1516.CITY=='Stinson', 'CITY']='Stinson Beach CDP'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [54]:
#Get count of students vaccinated by county,city and grade
vac_n=vac_1516.groupby(by=['COUNTY','CITY','vac_info_type'], as_index=False)['n'].sum()
vac_n

Unnamed: 0,COUNTY,CITY,vac_info_type,n
0,Alameda,Alameda,7th grade,752
1,Alameda,Alameda,childcare,1098
2,Alameda,Alameda,kindergarten,974
3,Alameda,Albany,7th grade,284
4,Alameda,Albany,childcare,248
5,Alameda,Albany,kindergarten,300
6,Alameda,Berkeley,7th grade,933
7,Alameda,Berkeley,kindergarten,903
8,Alameda,Berkeley Ave,childcare,61
9,Alameda,Castro Valley,7th grade,804


In [55]:
#Export count of students vaccinated by county,city and grade
vac_n.to_csv("vac_n_table1516.CSV")

In [56]:
#Get average percentage of students vaccinated by county,city and grade
vac_pct=vac_1516.groupby(by=['COUNTY','CITY','vac_info_type'], as_index=False)['pct'].mean()
vac_pct

Unnamed: 0,COUNTY,CITY,vac_info_type,pct
0,Alameda,Alameda,7th grade,97.666667
1,Alameda,Alameda,childcare,95.125000
2,Alameda,Alameda,kindergarten,97.266667
3,Alameda,Albany,7th grade,94.000000
4,Alameda,Albany,childcare,95.750000
5,Alameda,Albany,kindergarten,96.333333
6,Alameda,Berkeley,7th grade,97.555556
7,Alameda,Berkeley,kindergarten,85.842105
8,Alameda,Berkeley Ave,childcare,92.000000
9,Alameda,Castro Valley,7th grade,96.250000


In [57]:
#Export average percentage of students vaccinated by county,city and grade
vac_pct.to_csv("vac_pct_table1516.CSV")

In [58]:
vac_1516.head()

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED
0,13417683,Alameda,PRIVATE,Hayward,A JOYFUL NOISE LEARNING CENTER,26,26,100,childcare,Y
1,13420589,Alameda,PUBLIC,Alameda,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99,93,93,childcare,Y
3,13419449,Alameda,PRIVATE,Fremont,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45,42,93,childcare,Y
4,13417471,Alameda,PRIVATE,Fremont,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25,25,100,childcare,Y
5,13411388,Alameda,PRIVATE,Alameda,ABC PRESCHOOL,35,34,97,childcare,Y


In [59]:
#Export full cleaned data
vac_1516.to_csv("clean1516.csv", index=False)

## Census Data Cleaning

### Demographics

In [60]:
# Import and clean census data for basic demographics age gender race
census16 = pd.read_csv("Raw_Data/Census/ACS_16_demographic.csv",encoding = "ISO-8859-1",header=[0],na_values=['-'],)

In [61]:
census16.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC01_VC04,HC02_VC04,HC03_VC04,...,HC03_VC108,HC04_VC108,HC01_VC109,HC02_VC109,HC03_VC109,HC04_VC109,HC01_VC110,HC02_VC110,HC03_VC110,HC04_VC110
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",1000,338,1000,(X),409,146,40.9,...,695,(X),279,101,40.1,7.1,416,194,59.9,7.1
1,1600000US0600156,600156,"Acampo CDP, California",466,398,466,(X),291,277,62.4,...,466,(X),291,277,62.4,13.7,175,137,37.6,13.7
2,1600000US0600212,600212,"Acton CDP, California",7170,555,7170,(X),3776,318,52.7,...,5777,(X),3020,279,52.3,2.2,2757,199,47.7,2.2
3,1600000US0600296,600296,"Adelanto city, California",32311,35,32311,(X),16350,651,50.6,...,16322,(X),8399,706,51.5,2.4,7923,539,48.5,2.4
4,1600000US0600310,600310,"Adin CDP, California",155,91,155,(X),59,40,38.1,...,115,(X),52,33,45.2,15.8,63,35,54.8,15.8


In [62]:
demographic16 = census16[['GEO.id','GEO.id2','GEO.display-label',
                     'HC01_VC03',
                     'HC01_VC04','HC03_VC04',
                     'HC01_VC05','HC03_VC05',
                     'HC01_VC08','HC03_VC08',
                     'HC01_VC09','HC03_VC09',
                     'HC01_VC10','HC03_VC10',
                     'HC01_VC11','HC03_VC11',
                     'HC01_VC12','HC03_VC12',
                     'HC01_VC13','HC03_VC13',
                     'HC01_VC14','HC03_VC14',
                     'HC01_VC15','HC03_VC15',
                     'HC01_VC16','HC03_VC16',
                     'HC01_VC17','HC03_VC17',
                     'HC01_VC18','HC03_VC18',
                     'HC01_VC19','HC03_VC19',
                     'HC01_VC20','HC03_VC20',
                     'HC01_VC23','HC03_VC23',
                     'HC01_VC88','HC03_VC88',
                     'HC01_VC94','HC03_VC94',
                     'HC01_VC95','HC03_VC95',
                     'HC01_VC96','HC03_VC96',
                     'HC01_VC97','HC03_VC97',
                     'HC01_VC98','HC03_VC98',
                     'HC01_VC99','HC03_VC99',
                     'HC01_VC100','HC03_VC100',
                     'HC01_VC101','HC03_VC101',
                     'HC01_VC102','HC03_VC102'                 
        ]]

In [63]:
demographic16.dtypes

GEO.id                object
GEO.id2                int64
GEO.display-label     object
HC01_VC03              int64
HC01_VC04              int64
HC03_VC04            float64
HC01_VC05              int64
HC03_VC05            float64
HC01_VC08              int64
HC03_VC08            float64
HC01_VC09              int64
HC03_VC09            float64
HC01_VC10              int64
HC03_VC10            float64
HC01_VC11              int64
HC03_VC11            float64
HC01_VC12              int64
HC03_VC12            float64
HC01_VC13              int64
HC03_VC13            float64
HC01_VC14              int64
HC03_VC14            float64
HC01_VC15              int64
HC03_VC15            float64
HC01_VC16              int64
HC03_VC16            float64
HC01_VC17              int64
HC03_VC17            float64
HC01_VC18              int64
HC03_VC18            float64
HC01_VC19              int64
HC03_VC19            float64
HC01_VC20              int64
HC03_VC20            float64
HC01_VC23     

In [64]:
#combine other race, two or more races (HC01_VC99, HC01_VC100, HC01_VC101, HC01_VC102)
demographic16['other']=pd.Series(census16['HC01_VC99']+census16['HC01_VC100']+census16['HC01_VC101']+census16['HC01_VC102'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [65]:
demographic16['other_pct']=pd.Series(census16['HC03_VC99']+census16['HC03_VC100']+census16['HC03_VC101']+census16['HC03_VC102'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [66]:
demographic16.rename(columns={'GEO.display-label': 'city',
                         'HC01_VC03': 'tot_pop', 
                         'HC01_VC04': 'male',
                         'HC01_VC05': 'female',
                         'HC01_VC08':'under_5',
                         'HC01_VC09':'5_9',
                         'HC01_VC10':'10_14',
                         'HC01_VC11':'15_19',
                         'HC01_VC12':'20_24',
                         'HC01_VC13':'25_34',
                         'HC01_VC14':'35_44',
                         'HC01_VC15':'45_54',
                         'HC01_VC16':'55_59',
                         'HC01_VC17':'60_64',
                         'HC01_VC18':'65_74',
                         'HC01_VC19':'75_84',
                         'HC01_VC20':'85_over',
                         'HC01_VC23':'median_age',
                         'HC01_VC88':'hispanic_latino',
                         'HC01_VC89':'mexican',
                         'HC01_VC94':'white',
                         'HC01_VC95':'black',
                         'HC01_VC96':'aian',
                         'HC01_VC97':'asian',
                         'HC01_VC98':'nhopi',

                         'HC03_VC04': 'male_pct',
                         'HC03_VC05': 'female_pct',
                         'HC03_VC08':'under_5_pct',
                         'HC03_VC09':'5_9_pct',
                         'HC03_VC10':'10_14_pct',
                         'HC03_VC11':'15_19_pct',
                         'HC03_VC12':'20_24_pct',
                         'HC03_VC13':'25_34_pct',
                         'HC03_VC14':'35_44_pct',
                         'HC03_VC15':'45_54_pct',
                         'HC03_VC16':'55_59_pct',
                         'HC03_VC17':'60_64_pct',
                         'HC03_VC18':'65_74_pct',
                         'HC03_VC19':'75_84_pct',
                         'HC03_VC20':'85_over_pct',
                         'HC03_VC23':'median_age_pct',
                         'HC03_VC88':'hispanic_latino_pct',
                         'HC03_VC89':'mexican_pct',
                         'HC03_VC94':'white_pct',
                         'HC03_VC95':'black_pct',
                         'HC03_VC96':'aian_pct',
                         'HC03_VC97':'asian_pct',
                         'HC03_VC98':'nhopi_pct'
                        }, 
                inplace=True)
# Other race categories
#                          'HC01_VC99':'other_only',
#                          'HC01_VC100':'more_two_races',
#                          'HC01_VC101':'more_two_races_other',
#                          'HC01_VC102':'more_two_races_other_exclude',


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [67]:
demographic16.dtypes

GEO.id                  object
GEO.id2                  int64
city                    object
tot_pop                  int64
male                     int64
male_pct               float64
female                   int64
female_pct             float64
under_5                  int64
under_5_pct            float64
5_9                      int64
5_9_pct                float64
10_14                    int64
10_14_pct              float64
15_19                    int64
15_19_pct              float64
20_24                    int64
20_24_pct              float64
25_34                    int64
25_34_pct              float64
35_44                    int64
35_44_pct              float64
45_54                    int64
45_54_pct              float64
55_59                    int64
55_59_pct              float64
60_64                    int64
60_64_pct              float64
65_74                    int64
65_74_pct              float64
75_84                    int64
75_84_pct              float64
85_over 

In [68]:
demographic16['city']=demographic16.city.replace(' city, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [69]:
demographic16['city']=demographic16.city.replace(' CDP, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [70]:
demographic16['city']=demographic16.city.replace(' town, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [71]:
## Uppercase cities
demographic16['city']=demographic16['city'].str.title()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [72]:
demographic16=demographic16.drop(columns=['HC01_VC99','HC03_VC99','HC01_VC100','HC03_VC100','HC01_VC101','HC03_VC101','HC01_VC102','HC03_VC102'])
demographic16.head()

Unnamed: 0,GEO.id,GEO.id2,city,tot_pop,male,male_pct,female,female_pct,under_5,under_5_pct,...,black,black_pct,aian,aian_pct,asian,asian_pct,nhopi,nhopi_pct,other,other_pct
0,1600000US0600135,600135,Acalanes Ridge,1000,409,40.9,591,59.1,33,3.3,...,0,0.0,0,0.0,209,20.9,0,0.0,322,32.2
1,1600000US0600156,600156,Acampo,466,291,62.4,175,37.6,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,1600000US0600212,600212,Acton,7170,3776,52.7,3394,47.3,350,4.9,...,120,1.7,15,0.2,67,0.9,0,0.0,254,3.6
3,1600000US0600296,600296,Adelanto,32311,16350,50.6,15961,49.4,3148,9.7,...,5810,18.0,12,0.0,497,1.5,0,0.0,2061,6.3
4,1600000US0600310,600310,Adin,155,59,38.1,96,61.9,7,4.5,...,0,0.0,0,0.0,0,0.0,0,0.0,10,6.4


### Median Household Income and Health insurance Status

Median Household Income has categories 2,500-, 250,000+ which was changed to 2500 and 250000. 

In [73]:
# Import and clean acs data for income and insurance status
acs16 = pd.read_csv("Raw_Data/Census/ACS_16_economic.csv", encoding = "ISO-8859-1", header=[0], na_values=['-']) 

In [74]:
acs16.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC01_VC04,HC02_VC04,HC03_VC04,...,HC03_VC178,HC04_VC178,HC01_VC179,HC02_VC179,HC03_VC179,HC04_VC179,HC01_VC180,HC02_VC180,HC03_VC180,HC04_VC180
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",747,288,747,(X),497,296,66.5,...,0.0,26.1,(X),(X),0.0,3.5,(X),(X),0.0,26.9
1,1600000US0600156,600156,"Acampo CDP, California",466,398,466,(X),368,359,79.0,...,0.0,27.6,(X),(X),0.0,9.3,(X),(X),0.0,21.6
2,1600000US0600212,600212,"Acton CDP, California",6098,398,6098,(X),3917,339,64.2,...,5.2,3.6,(X),(X),1.6,1.2,(X),(X),39.7,11.7
3,1600000US0600296,600296,"Adelanto city, California",21534,628,21534,(X),9910,597,46.0,...,21.3,6.9,(X),(X),41.5,5.6,(X),(X),49.4,5.9
4,1600000US0600310,600310,"Adin CDP, California",123,63,123,(X),51,36,41.5,...,32.5,43.5,(X),(X),0.0,28.3,(X),(X),34.0,32.3


In [75]:
acs16=acs16[['GEO.id','GEO.id2','GEO.display-label','HC01_VC85','HC01_VC131','HC03_VC131',
                'HC01_VC132','HC03_VC132','HC01_VC133','HC03_VC133','HC01_VC134','HC03_VC134']]
income16=acs16

In [76]:
income16.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC85,HC01_VC131,HC03_VC131,HC01_VC132,HC03_VC132,HC01_VC133,HC03_VC133,HC01_VC134,HC03_VC134
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,"Acampo CDP, California",155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,"Acton CDP, California",91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,"Adelanto city, California",29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,"Adin CDP, California",55625.0,150,96.8,111,71.6,62,40.0,5,3.2


In [77]:
income16.dtypes

GEO.id                object
GEO.id2                int64
GEO.display-label     object
HC01_VC85            float64
HC01_VC131             int64
HC03_VC131           float64
HC01_VC132             int64
HC03_VC132           float64
HC01_VC133             int64
HC03_VC133           float64
HC01_VC134             int64
HC03_VC134           float64
dtype: object

In [78]:
income16.rename(columns={'GEO.display-label': 'city',
                         'HC01_VC85': 'median_income',
                         'HC01_VC131': 'insurance',
                         'HC01_VC132': 'private_insure',
                         'HC01_VC133': 'public_insure',
                         'HC01_VC134': 'no_insurance',

                         'HC03_VC131': 'insurance_pct',
                         'HC03_VC132': 'private_insure_pct',
                         'HC03_VC133': 'public_insure_pct',
                         'HC03_VC134': 'no_insurance_pct'  
                        }, 
                inplace=True)

In [79]:
income16.dtypes

GEO.id                 object
GEO.id2                 int64
city                   object
median_income         float64
insurance               int64
insurance_pct         float64
private_insure          int64
private_insure_pct    float64
public_insure           int64
public_insure_pct     float64
no_insurance            int64
no_insurance_pct      float64
dtype: object

In [80]:
income16['city']=income16.city.replace(' CDP, California', '', regex=True)

In [81]:
income16['city']=income16.city.replace(' city, California', '', regex=True)

In [82]:
income16['city']=income16.city.replace(' town, California', '', regex=True)

In [83]:
income16.head()

Unnamed: 0,GEO.id,GEO.id2,city,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,1600000US0600135,600135,Acalanes Ridge,187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,Acampo,155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,Acton,91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,Adelanto,29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,Adin,55625.0,150,96.8,111,71.6,62,40.0,5,3.2


## Join Demographics and income data

In [84]:
pop_data=demographic16.merge(income16, left_on='GEO.id', right_on='GEO.id', how='outer')
pop_data.head()

Unnamed: 0,GEO.id,GEO.id2_x,city_x,tot_pop,male,male_pct,female,female_pct,under_5,under_5_pct,...,city_y,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,1600000US0600135,600135,Acalanes Ridge,1000,409,40.9,591,59.1,33,3.3,...,Acalanes Ridge,187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,Acampo,466,291,62.4,175,37.6,0,0.0,...,Acampo,155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,Acton,7170,3776,52.7,3394,47.3,350,4.9,...,Acton,91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,Adelanto,32311,16350,50.6,15961,49.4,3148,9.7,...,Adelanto,29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,Adin,155,59,38.1,96,61.9,7,4.5,...,Adin,55625.0,150,96.8,111,71.6,62,40.0,5,3.2


In [85]:
list(pop_data.columns.values)

['GEO.id',
 'GEO.id2_x',
 'city_x',
 'tot_pop',
 'male',
 'male_pct',
 'female',
 'female_pct',
 'under_5',
 'under_5_pct',
 '5_9',
 '5_9_pct',
 '10_14',
 '10_14_pct',
 '15_19',
 '15_19_pct',
 '20_24',
 '20_24_pct',
 '25_34',
 '25_34_pct',
 '35_44',
 '35_44_pct',
 '45_54',
 '45_54_pct',
 '55_59',
 '55_59_pct',
 '60_64',
 '60_64_pct',
 '65_74',
 '65_74_pct',
 '75_84',
 '75_84_pct',
 '85_over',
 '85_over_pct',
 'median_age',
 'median_age_pct',
 'hispanic_latino',
 'hispanic_latino_pct',
 'white',
 'white_pct',
 'black',
 'black_pct',
 'aian',
 'aian_pct',
 'asian',
 'asian_pct',
 'nhopi',
 'nhopi_pct',
 'other',
 'other_pct',
 'GEO.id2_y',
 'city_y',
 'median_income',
 'insurance',
 'insurance_pct',
 'private_insure',
 'private_insure_pct',
 'public_insure',
 'public_insure_pct',
 'no_insurance',
 'no_insurance_pct']

In [86]:
pop_data=pop_data.drop(columns=['GEO.id2_y','city_y'])

In [87]:
pop_data=pop_data.rename(columns={'GEO.id': 'geoid',
                         'GEO.id2_x': 'geoid2',
                         'city_x': 'city'})

In [88]:
# Fix City naming to be similar for special characters
pop_data.loc[pop_data.city=='La Cañada Flintridge', 'city']='La Canada Flintridge'

# Change census cities names that are present in multiple places to match vaccine data city
pop_data.loc[pop_data.city=='El Sobrante Cdp (Contra Costa County), California', 'city']='El Sobrante'
pop_data.loc[pop_data.city=='Edwards Afb', 'city']='Edwards AFB'
pop_data.loc[pop_data.city=='Las Flores Cdp (Orange County), California', 'city']='Los Flores'
pop_data.loc[pop_data.city=='West Whittier-Los Nietos CDP, California', 'city']='Los Nietos'
pop_data.loc[pop_data.city=="El Paso de Robles (Paso Robles)", 'city']='Paso Robles'
pop_data.loc[pop_data.city=='Piñon Hills', 'city']='Pinon Hills'
pop_data.loc[pop_data.city=='San Miguel Cdp (San Luis Obispo County), California', 'city']='San Miguel'
pop_data.loc[pop_data.city=='Spring Valley Cdp (San Diego County), California', 'city']='Spring Valley'



In [89]:
pop_data.head()

Unnamed: 0,geoid,geoid2,city,tot_pop,male,male_pct,female,female_pct,under_5,under_5_pct,...,other_pct,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,1600000US0600135,600135,Acalanes Ridge,1000,409,40.9,591,59.1,33,3.3,...,32.2,187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,Acampo,466,291,62.4,175,37.6,0,0.0,...,0.0,155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,Acton,7170,3776,52.7,3394,47.3,350,4.9,...,3.6,91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,Adelanto,32311,16350,50.6,15961,49.4,3148,9.7,...,6.3,29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,Adin,155,59,38.1,96,61.9,7,4.5,...,6.4,55625.0,150,96.8,111,71.6,62,40.0,5,3.2


In [90]:
# Exported merged census data
pop_data.to_csv("pop_data16.csv", index=False)

## Join Vaccine data to Census data

Join needs unique cities in the vaccine data to use as a key. Will need to figure out how to weight the different grade levels based on the census population numbers for age. 

In [91]:
vac_pop_1516=vac_1516.merge(pop_data, left_on='CITY', right_on='city', how='outer')
vac_pop_1516

Unnamed: 0,FACILITY_NUMBER,COUNTY,pub_priv_headstart,CITY,FACILITY_NAME,ENROLLMENT,n,pct,vac_info_type,REPORTED,...,other_pct,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,13417683.0,Alameda,PRIVATE,Hayward,A JOYFUL NOISE LEARNING CENTER,26.0,26.0,100.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
1,13419419.0,Alameda,PRIVATE,Hayward,"ARC OF ALAMEDA CO. FIRST STEP CHILDREN'S CTR.,...",17.0,16.0,94.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
2,13420960.0,Alameda,HEAD START,Hayward,CFCS - JOHN MUIR HEAD START SITE,71.0,71.0,100.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
3,10212000.0,Alameda,HEAD START,Hayward,CFCS HEAD START - GLASSBROOK,40.0,40.0,100.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
4,13418959.0,Alameda,HEAD START,Hayward,CFCS HEAD START - SUNSET,47.0,46.0,97.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
5,13418950.0,Alameda,HEAD START,Hayward,CFCS HEAD START CENTER - BIDWELL,37.0,37.0,100.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
6,13420577.0,Alameda,HEAD START,Hayward,CFCS- LORENZO MANOR HEAD START,38.0,38.0,100.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
7,13411745.0,Alameda,PUBLIC,Hayward,"CHABOT COLLEGE, CHILDREN'S CENTER",59.0,58.0,98.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
8,13414452.0,Alameda,PRIVATE,Hayward,CHERUBIM'S CHILDREN'S CENTER,11.0,11.0,100.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5
9,13420978.0,Alameda,PRIVATE,Hayward,CHILDRENS CHOICE EDUCARE,31.0,29.0,93.0,childcare,Y,...,7.5,68138.0,135952.0,88.5,96388.0,62.8,52149.0,34.0,17588.0,11.5


In [92]:
#Export full cleaned data
vac_pop_1516.to_csv("cleanjoin1516.csv", index=False)

In [93]:
vac_pop_1516.dtypes

FACILITY_NUMBER        float64
COUNTY                  object
pub_priv_headstart      object
CITY                    object
FACILITY_NAME           object
ENROLLMENT             float64
n                      float64
pct                    float64
vac_info_type           object
REPORTED                object
geoid                   object
geoid2                 float64
city                    object
tot_pop                float64
male                   float64
male_pct               float64
female                 float64
female_pct             float64
under_5                float64
under_5_pct            float64
5_9                    float64
5_9_pct                float64
10_14                  float64
10_14_pct              float64
15_19                  float64
15_19_pct              float64
20_24                  float64
20_24_pct              float64
25_34                  float64
25_34_pct              float64
                        ...   
65_74_pct              float64
75_84   