In [1]:
import numpy as np
import pandas as pd

## 2015-2016 school vaccination data

In [2]:
# Import raw combined school vaccination data
pertusis = pd.read_csv("vaxxfacts/raw_data/school_vaccination/2015-2016/2015-2016_merged.csv",encoding = "ISO-8859-1")
pertusis=pertusis.drop(columns=['Unnamed: 0'])

In [3]:
pertusis.head()

Unnamed: 0,facility_num,county,is_public,city,facility_name,enrollment,n,pct,reported,vac_info_type
0,13420589,ALAMEDA,PUBLIC,ALAMEDA,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99.0,93.0,93.94,Y,ChildCareData
1,13422032,ALAMEDA,,SAN LEANDRO,AB'S PRESCHOOL AND DAYCARE,,,,N,ChildCareData
2,13419449,ALAMEDA,PRIVATE,FREMONT,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45.0,42.0,93.33,Y,ChildCareData
3,13417471,ALAMEDA,PRIVATE,FREMONT,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25.0,25.0,100.0,Y,ChildCareData
4,13411388,ALAMEDA,PRIVATE,ALAMEDA,ABC PRESCHOOL,35.0,34.0,97.14,Y,ChildCareData


In [4]:
pertusis.dtypes

facility_num       int64
county            object
is_public         object
city              object
facility_name     object
enrollment       float64
n                float64
pct              float64
reported          object
vac_info_type     object
dtype: object

In [5]:
sum(pertusis.reported=='N')

1822

In [6]:
sum(pertusis.reported=='Y')

19698

In [7]:
pertusis[pertusis.facility_num==7777777]

Unnamed: 0,facility_num,county,is_public,city,facility_name,enrollment,n,pct,reported,vac_info_type


In [8]:
# Delete invalid test data 
pertusis=pertusis[pertusis.facility_num != 7777777]

In [9]:
pertusis[pertusis.facility_num==7777777]


Unnamed: 0,facility_num,county,is_public,city,facility_name,enrollment,n,pct,reported,vac_info_type


Subset dataset to those that reported

In [10]:
# Take subset that have numbers
vac_data=pertusis[pertusis.reported=='Y']

In [11]:
vac_data.head()

Unnamed: 0,facility_num,county,is_public,city,facility_name,enrollment,n,pct,reported,vac_info_type
0,13420589,ALAMEDA,PUBLIC,ALAMEDA,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99.0,93.0,93.94,Y,ChildCareData
2,13419449,ALAMEDA,PRIVATE,FREMONT,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45.0,42.0,93.33,Y,ChildCareData
3,13417471,ALAMEDA,PRIVATE,FREMONT,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25.0,25.0,100.0,Y,ChildCareData
4,13411388,ALAMEDA,PRIVATE,ALAMEDA,ABC PRESCHOOL,35.0,34.0,97.14,Y,ChildCareData
5,13421345,ALAMEDA,PRIVATE,CASTRO VALLEY,ABC PRESCHOOL & DAYCARE,48.0,43.0,89.58,Y,ChildCareData


Numbers in the original document have been surpressed for <=1%, <=2%, <=5% and >=95%,>=98%,>=99% so will impute values by assuming that the percentage is equal to whatever value listed and multiply by the number of students enrolled at the school. 

In [12]:
# Change print settings to see all
#np.set_printoptions(threshold=np.nan)
#change back to default printing length
#np.set_printoptions(threshold=5)
# Check County names
sorted(list(vac_data.county.unique()))

['ALAMEDA',
 'AMADOR',
 'BERKELEY CITY',
 'BUTTE',
 'CALAVERAS',
 'COLUSA',
 'CONTRA COSTA',
 'DEL NORTE',
 'EL DORADO',
 'FRESNO',
 'GLENN',
 'HUMBOLDT',
 'IMPERIAL',
 'INYO',
 'KERN',
 'KINGS',
 'LAKE',
 'LASSEN',
 'LOS ANGELES',
 'MADERA',
 'MARIN',
 'MARIPOSA',
 'MENDOCINO',
 'MERCED',
 'MODOC',
 'MONO',
 'MONTEREY',
 'NAPA',
 'NEVADA',
 'ORANGE',
 'PLACER',
 'PLUMAS',
 'RIVERSIDE',
 'SACRAMENTO',
 'SAN BENITO',
 'SAN BERNARDINO',
 'SAN DIEGO',
 'SAN FRANCISCO',
 'SAN JOAQUIN',
 'SAN LUIS OBISPO',
 'SAN MATEO',
 'SANTA BARBARA',
 'SANTA CLARA',
 'SANTA CRUZ',
 'SHASTA',
 'SIERRA',
 'SISKIYOU',
 'SOLANO',
 'SONOMA',
 'STANISLAUS',
 'SUTTER',
 'TEHAMA',
 'TRINITY',
 'TULARE',
 'TUOLUMNE',
 'VENTURA',
 'YOLO',
 'YUBA']

In [13]:
#Check city names
sorted(list(vac_data.city.unique()))

['ACAMPO',
 'ACTON',
 'ADELANTO',
 'AGOURA',
 'AGOURA HILLS',
 'AGUA DULCE',
 'AGUANGA',
 'AHWAHNEE',
 'ALAMEDA',
 'ALAMO',
 'ALBANY',
 'ALHAMBRA',
 'ALISO VIEJO',
 'ALLENSWORTH',
 'ALPAUGH',
 'ALPINE',
 'ALTA',
 'ALTA LOMA',
 'ALTADENA',
 'ALTURAS',
 'ALVISO',
 'AMERICAN CANYON',
 'ANAHEIM',
 'ANAHEIM HILLS',
 'ANDERSON',
 'ANGELS CAMP',
 'ANGWIN',
 'ANTELOPE',
 'ANTIOCH',
 'ANZA',
 'APPLE VALLEY',
 'APTOS',
 'ARBOGA',
 'ARBUCKLE',
 'ARCADIA',
 'ARCATA',
 'ARCATA CA',
 'ARLETA',
 'ARMONA',
 'ARNOLD',
 'AROMAS',
 'ARROYO GRANDE',
 'ARTESIA',
 'ARVIN',
 'ATASCADERO',
 'ATHERTON',
 'ATWATER',
 'AUBERRY',
 'AUBURN',
 'AVALON',
 'AVENAL',
 'AVERY',
 'AZUSA',
 'BAKER',
 'BAKERSFIELD',
 'BALDWIN PARK',
 'BALDY MESA',
 'BALLICO',
 'BANGOR',
 'BANNING',
 'BANTA',
 'BARSTOW',
 'BAY POINT',
 'BAYSIDE',
 'BEALE AFB',
 'BEALE AIR FORCE BASE',
 'BEAUMONT',
 'BELL',
 'BELL GARDENS',
 'BELLA VISTA',
 'BELLFLOWER',
 'BELMONT',
 'BELVEDERE',
 'BEN LOMOND',
 'BENICIA',
 'BERKELEY',
 'BERKELEY AVE',
 'BE

In [14]:
# Fix City Typos
vac_data.loc[vac_data.city=='ACTON, CA', 'city']='ACTON'
vac_data.loc[vac_data.city=='AGOURA', 'city']='AGOURA HILLS'
vac_data.loc[vac_data.city=='ANAHEIM,', 'city']='ANAHEIM'
vac_data.loc[vac_data.city=='ANANHEIM,', 'city']='ANAHEIM'
vac_data.loc[vac_data.city=='CA', 'city']='NAPA'
vac_data.loc[vac_data.city=='BEALE AIR FORCE BASE', 'city']='BEALE AFB' # to match census name
vac_data.loc[vac_data.city=='BENICA', 'city']='BENICIA'
vac_data.loc[vac_data.city=='CARDIFF', 'city']='CARDIFF BY THE SEA'
vac_data.loc[vac_data.city=='CARMEL', 'city']='CARMEL VALLEY'
vac_data.loc[vac_data.city=='CHINO,', 'city']='CHINO'
vac_data.loc[vac_data.city=='CHULAR', 'city']='CHUALAR'
vac_data.loc[vac_data.city=='CITY OF COMMERCE', 'city']='COMMERCE'
vac_data.loc[vac_data.city=='CITY OF INDUSTRY', 'city']='INDUSTRY'
vac_data.loc[vac_data.city=='CUDAHAY', 'city']='CUDAHY'
vac_data.loc[vac_data.city=='E. NICOLAUS', 'city']='EAST NICOLAUS'
vac_data.loc[vac_data.city=='E. RANCHO DOMINGUEZ', 'city']='EAST RANCHO DOMINGUEZ'
vac_data.loc[vac_data.city=='E RANCHO DOMINGUEZ', 'city']='EAST RANCHO DOMINGUEZ'
vac_data.loc[vac_data.city=='E. WHITTIER', 'city']='EAST WHITTIER'
vac_data.loc[vac_data.city=='EL SEGUNDO,', 'city']='EL SEGUNDO'
vac_data.loc[vac_data.city=='FAIRIFELD', 'city']='FAIRFIELD'
vac_data.loc[vac_data.city=='FREMOTN', 'city']='FREMONT'
vac_data.loc[vac_data.city=='FT. IRWIN', 'city']='FORT IRWIN'
vac_data.loc[vac_data.city=='GREENFILED', 'city']='GREENFIELD'
vac_data.loc[vac_data.city=='HUNTINGTON', 'city']='HUNTINGTON PARK'
vac_data.loc[vac_data.city=='JAMUAL', 'city']='JAMUL'
vac_data.loc[vac_data.city=='LA', 'city']='LOS ANGELES'
vac_data.loc[vac_data.city=='LA CANADA', 'city']='LA CANADA FLINTRIDGE'
vac_data.loc[vac_data.city=='LA CAÃ\x91ADA', 'city']='LA CANADA FLINTRIDGE'
vac_data.loc[vac_data.city=='LA CAÃ\x83â\x80\x98ADA', 'city']='LA CANADA FLINTRIDGE'
vac_data.loc[vac_data.city=='LA CRESCENTA', 'city']='LA CRESCENTA-MONTROSE'
vac_data.loc[vac_data.city=='LAKE VIEW TERRANCE', 'city']='LAKE VIEW TERRACE'
vac_data.loc[vac_data.city=='LAKEVIEW TERRACE', 'city']='LAKE VIEW TERRACE'
vac_data.loc[vac_data.city=='LANCASTER,', 'city']='LANCASTER'
vac_data.loc[vac_data.city=='LAVERNE', 'city']='LA VERNE'
vac_data.loc[vac_data.city=='MC KINLEYVILLE', 'city']='MCKINLEYVILLE'
vac_data.loc[vac_data.city=='MONTROSE', 'city']='LA CRESCENTA-MONTROSE'
vac_data.loc[vac_data.city=='MT. SHASTA', 'city']='MOUNT SHASTA'
vac_data.loc[vac_data.city=='N. HOLLYWOOD', 'city']='NORTH HOLLYWOOD'
vac_data.loc[vac_data.city=='N.A.S. LEMOORE', 'city']='LEMOORE STATION' # match census
vac_data.loc[vac_data.city=='NAS LEMOORE', 'city']='LEMOORE STATION'
vac_data.loc[vac_data.city=='NEWPORT BEACH,', 'city']='NEWPORT BEACH'
vac_data.loc[vac_data.city=='PACOMIA', 'city']='PACOIMA'
vac_data.loc[vac_data.city=='PANORAM ', 'city']='PANORAMA CITY'
vac_data.loc[vac_data.city=='PANORAM CITY', 'city']='PANORAMA CITY'
vac_data.loc[vac_data.city=='PT. REYES STATION', 'city']='POINT REYES STATION'
vac_data.loc[vac_data.city=='RANCHO SANTA MARGARI', 'city']='RANCHO SANTA MARGARITA'
vac_data.loc[vac_data.city=='RANCHO SAN MARGARITA', 'city']='RANCHO SANTA MARGARITA'
vac_data.loc[vac_data.city=='RANCHO STA MARGAITA', 'city']='RANCHO SANTA MARGARITA'
vac_data.loc[vac_data.city=='RANCHO STA MARGARITA', 'city']='RANCHO SANTA MARGARITA'
vac_data.loc[vac_data.city=='RCHO STA MARG', 'city']='RANCHO SANTA MARGARITA'
vac_data.loc[vac_data.city=='RANCHOS PALOS VERDES', 'city']='RANCHO PALOS VERDES'
vac_data.loc[vac_data.city=='RIVERSIDE,', 'city']='RIVERSIDE'
vac_data.loc[vac_data.city=='ROLLING HILLS ESTATE', 'city']='ROLLING HILLS ESTATES'
vac_data.loc[vac_data.city=='S. EL MONTE', 'city']='SOUTH EL MONTE'
vac_data.loc[vac_data.city=='S. LAKE TAHOE', 'city']='SOUTH LAKE TAHOE'
vac_data.loc[vac_data.city=='S PASADENA', 'city']='SOUTH PASADENA'
vac_data.loc[vac_data.city=='SAN BERARDINO', 'city']='SAN BERNARDINO'
vac_data.loc[vac_data.city=='SAN FRNCISCO', 'city']='SAN FRANCISCO'
vac_data.loc[vac_data.city=='SAN JOSE,', 'city']='SAN JOSE'
vac_data.loc[vac_data.city=='SANTA BARARA', 'city']='SANTA BARBARA'
vac_data.loc[vac_data.city=='SHASTA LAKE CITY', 'city']='SHASTA LAKE'
vac_data.loc[vac_data.city=='SILVERADO CANYON', 'city']='SILVERADO'
vac_data.loc[vac_data.city=='SOUTH LATE TAHOE', 'city']='SOUTH LAKE TAHOE'
vac_data.loc[vac_data.city=='SPRECKLES', 'city']='SPRECKELS'
vac_data.loc[vac_data.city=='STEVENSONS RANCH', 'city']='STEVENSON RANCH'
vac_data.loc[vac_data.city=='SUISUN', 'city']='SUISUN CITY'
vac_data.loc[vac_data.city=='SUNNYALE', 'city']='SUNNYVALE'
vac_data.loc[vac_data.city=='SUPELVEDA', 'city']='SEPULVEDA'
vac_data.loc[vac_data.city=='TURLOCK,', 'city']='TURLOCK'
vac_data.loc[vac_data.city=='UPPERLAKE', 'city']='UPPER LAKE'
vac_data.loc[vac_data.city=='VANDENBERG AIR FORCE BASE', 'city']='VANDENBERG AFB' # match census
vac_data.loc[vac_data.city=='W. SACRAMENTO', 'city']='WEST SACRAMENTO'
vac_data.loc[vac_data.city=='WEST LOS ANGELES', 'city']='LOS ANGELES'

vac_data.loc[vac_data.county=='BERKELEY CITY', 'county']='ALAMEDA'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [15]:
# Change case to title case
vac_data.county=vac_data['county'].str.title()
vac_data.city=vac_data['city'].str.title()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [16]:
# RECODE NEIGHBORHOODS TO CITIES to match Census
vac_data.loc[vac_data.city=='Alta Loma', 'city']='Rancho Cucamonga'
vac_data.loc[vac_data.city=='Alviso', 'city']='San Jose'
vac_data.loc[vac_data.city=='Anaheim Hills', 'city']='Anaheim'
vac_data.loc[vac_data.city=='Angels Camp', 'city']='Angels'
vac_data.loc[vac_data.city=='Arleta', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Canoga Park', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Canyon Country', 'city']='Santa Clarita'
vac_data.loc[vac_data.city=='Capistrano Beach', 'city']='Dana Point'
vac_data.loc[vac_data.city=='Cardiff By The Sea', 'city']='Encinitas'
vac_data.loc[vac_data.city=='Carmel Valley', 'city']='Carmel Valley Village'
vac_data.loc[vac_data.city=='Chatsworth', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Corona Del Mar', 'city']='Newport Beach'
vac_data.loc[vac_data.city=='Edwards', 'city']='Edwards AFB'
vac_data.loc[vac_data.city=='Emerald Hills', 'city']='San Diego'
vac_data.loc[vac_data.city=='Encino', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Etiwanda', 'city']='Rancho Cucamonga'
vac_data.loc[vac_data.city=='Foothill Ranch', 'city']='Lake Forest'
vac_data.loc[vac_data.city=='Granada Hills', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Harbor City', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Highland Park', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Hollywood', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Idyllwild', 'city']='Idyllwild-Pine Cove'
vac_data.loc[vac_data.city=='La Jolla', 'city']='San Diego'
vac_data.loc[vac_data.city=='Lake Balboa', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Lake View Terrace', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Leucadia', 'city']='Encinitas'
vac_data.loc[vac_data.city=='Mar Vista', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Mcclellan', 'city']='North Highlands'
vac_data.loc[vac_data.city=='Mira Loma', 'city']='Jurupa Valley'
vac_data.loc[vac_data.city=='Murrietta', 'city']='Murrieta'
vac_data.loc[vac_data.city=='Newbury Park', 'city']='Thousand Oaks'
vac_data.loc[vac_data.city=='Newhall', 'city']='Santa Clarita'
vac_data.loc[vac_data.city=='North Hills', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='North Hollywood', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Northridge', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Olympic Valley', 'city']='Squaw Valley'
vac_data.loc[vac_data.city=='Pacific Palisades', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Pacoima', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Palos Verdes', 'city']='Palos Verdes Estates'
vac_data.loc[vac_data.city=='Palos Verdes Peninsula', 'city']='Palos Verdes Estates'
vac_data.loc[vac_data.city=='Panorama City', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Playa Del Rey', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Playa Vista', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Porter Ranch', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Quail Valley', 'city']='Menifee'
vac_data.loc[vac_data.city=='Reseda', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='San Pedro', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='San Ysidro', 'city']='San Diego'
vac_data.loc[vac_data.city=='Santa Catalina', 'city']='Avalon'
vac_data.loc[vac_data.city=='Saugus', 'city']='Santa Clarita'
vac_data.loc[vac_data.city=='Sepulveda', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Sherman Oaks', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Studio City', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Sun City', 'city']='Menifee'
vac_data.loc[vac_data.city=='Sun Valley', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Sunland', 'city']='Sunland-Tujunga'
vac_data.loc[vac_data.city=='Sunset Beach', 'city']='Huntington Beach'
vac_data.loc[vac_data.city=='Sylmar', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Royal Oaks', 'city']='Interlaken'
vac_data.loc[vac_data.city=='Tahoe City', 'city']='Sunnyside-Tahoe City'
vac_data.loc[vac_data.city=='Tarzana', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Tujunga', 'city']='Sunland-Tujunga'
vac_data.loc[vac_data.city=='Tuolumne', 'city']='Tuolumne City'
vac_data.loc[vac_data.city=='Twenty Nine Palms', 'city']='Twentynine Palms'
vac_data.loc[vac_data.city=='Valencia', 'city']='Santa Clarita'
vac_data.loc[vac_data.city=='Valley Glen', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Valley Village', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Van Nuys', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Venice', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Ventura', 'city']="San Buenaventura (Ventura)"
vac_data.loc[vac_data.city=='Walnut Creet', 'city']='Walnut Creek'
vac_data.loc[vac_data.city=='Walnut Valley', 'city']='Walnut'
vac_data.loc[vac_data.city=='West Hills', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Westchester', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Westminister', 'city']='Westminster'
vac_data.loc[vac_data.city=='Wilmington', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Winnetka', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Woodland Hills', 'city']='Los Angeles'

vac_data.loc[vac_data.city=='Angels City', 'city']='Angels'
vac_data.loc[vac_data.city=='Cardiff-By-The-Sea', 'city']='Encinitas'
vac_data.loc[vac_data.city=='Davis,', 'city']='Davis'
vac_data.loc[vac_data.city=='La Cañada', 'city']='La Canada Flintridge'
vac_data.loc[vac_data.city=='La Canada', 'city']='La Canada Flintridge'
vac_data.loc[vac_data.city=='Los Nietos', 'city']='West Whittier-Los Nietos'
vac_data.loc[vac_data.city=='Montclair,', 'city']='Montclair'
vac_data.loc[vac_data.city=='Newport Coast', 'city']='Newport Coast'
vac_data.loc[vac_data.city=='South San Francisoc', 'city']='South San Francisco'
vac_data.loc[vac_data.city=='Sunland-Tujunga', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Arcata Ca', 'city']='Arcata'
vac_data.loc[vac_data.city=='Berkeley Ave', 'city']='Berkeley'
vac_data.loc[vac_data.city=='Camp Pendelton', 'city']='Camp Pendleton North'
vac_data.loc[vac_data.city=='Carmicheal', 'city']='Carmichael'
vac_data.loc[vac_data.city=='Laguan Niguel', 'city']='Laguna Niguel'
vac_data.loc[vac_data.city=='Laguna Niguel Ste 3409', 'city']='Laguna Niguel'
vac_data.loc[vac_data.city=='Lapuente', 'city']='La Puente'
vac_data.loc[vac_data.city=='Las Flores', 'city']='Las Flores'
vac_data.loc[vac_data.city=='Mt. View', 'city']='Mountain View'
vac_data.loc[vac_data.city=='Mision Viejo', 'city']='Mission Viejo'
vac_data.loc[vac_data.city=='S Lake Tahoe', 'city']='South Lake Tahoe'
vac_data.loc[vac_data.city=='Saint Helena', 'city']='St. Helena'
vac_data.loc[vac_data.city=='Santa Clara Ca', 'city']='Santa Clara'
vac_data.loc[vac_data.city=='St Helena', 'city']='St. Helena'
vac_data.loc[vac_data.city=='St Helena', 'city']='St. Helena'
vac_data.loc[vac_data.city=='Stantan', 'city']='Stanton'
vac_data.loc[vac_data.city=='Stinson', 'city']='Stinson Beach'

vac_data.loc[vac_data.city=='Vandenberg Afb', 'city']='Vandenberg AFB'
vac_data.loc[vac_data.city=='Travis Afb', 'city']='Travis AFB'
vac_data.loc[vac_data.city=='Beale Afb', 'city']='Beale AFB'

vac_data.loc[vac_data.city=='"Avalon, Catalina Isl"', 'city']='Avalon'
vac_data.loc[vac_data.city=='"Beale Afb, Ca"', 'city']='Beale AFB'
vac_data.loc[vac_data.city=='"Menifee, Ca"', 'city']='Menifee'
vac_data.loc[vac_data.city=='"Palm Springs, Ca"', 'city']='Palm Springs'
vac_data.loc[vac_data.city=='Aliso Viejo, Ca.', 'city']='Aliso Viejo'
vac_data.loc[vac_data.city=='Avalon, Catalina Isl', 'city']='Avalon'
vac_data.loc[vac_data.city=='Beale Afb, Ca', 'city']='Beale AFB'
vac_data.loc[vac_data.city=='Camarilla', 'city']='Camarillo'
vac_data.loc[vac_data.city=='Camp Pendleton', 'city']='Camp Pendleton North'
vac_data.loc[vac_data.city=='Capinteria', 'city']='Carpinteria'
vac_data.loc[vac_data.city=='Carpenteria', 'city']='Carpinteria'
vac_data.loc[vac_data.city=='Castic', 'city']='Castaic'
vac_data.loc[vac_data.city=='Clear Lake', 'city']='Clearlake'
vac_data.loc[vac_data.city=='Coto De Caza', 'city']='Coto de Caza'
vac_data.loc[vac_data.city=='E. Palo Alto', 'city']='East Palo Alto'
vac_data.loc[vac_data.city=='El Sobrante', 'city']='El Sobrante (Contra Costa County)'
vac_data.loc[vac_data.city=='Fairfield Ca', 'city']='Fairfield'
vac_data.loc[vac_data.city=='Firebaugh Ca', 'city']='Firebaugh'
vac_data.loc[vac_data.city=='Fort Dick', 'city']='Fort Dick'
vac_data.loc[vac_data.city=='Ft Irwin', 'city']='Fort Irwin'
vac_data.loc[vac_data.city=='Grover City', 'city']='Grover Beach'
vac_data.loc[vac_data.city=='Hoopa', 'city']='Hoopa'
vac_data.loc[vac_data.city=='Hunington Beach', 'city']='Huntington Beach'
vac_data.loc[vac_data.city=='Huntington Beach', 'city']='Huntington Beach'
vac_data.loc[vac_data.city=="King'S Beach", 'city']='Kings Beach'
vac_data.loc[vac_data.city=='Ladrea Ranch', 'city']='Ladera Ranch'
vac_data.loc[vac_data.city=='Lancaster, Ca', 'city']='Lancaster'
vac_data.loc[vac_data.city=='Las Flores', 'city']='Las Flores'
vac_data.loc[vac_data.city=='Lemore Nas', 'city']='Lemoore'
vac_data.loc[vac_data.city=='Long Beach', 'city']='Long Beach'
vac_data.loc[vac_data.city=='Los Alto Hills', 'city']='Los Altos Hills'
vac_data.loc[vac_data.city=='Los Angeles, Ca', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Los Flores', 'city']='Las Flores (Orange County)'
vac_data.loc[vac_data.city=='Marina Del Rey', 'city']='Marina del Rey'
vac_data.loc[vac_data.city=='Mcarthur', 'city']='McArthur'
vac_data.loc[vac_data.city=='Mccloud', 'city']='McCloud'
vac_data.loc[vac_data.city=='Mcfarland', 'city']='McFarland'
vac_data.loc[vac_data.city=='Mckinleyville', 'city']='McKinleyville'
vac_data.loc[vac_data.city=='Mckittrick', 'city']='McKittrick'
vac_data.loc[vac_data.city=='Menifee, Ca', 'city']='Menifee'
vac_data.loc[vac_data.city=='Mountain Veiw', 'city']='Mountain View'
vac_data.loc[vac_data.city=='Mountian View', 'city']='Mountain View'
vac_data.loc[vac_data.city=='Mt Shasta', 'city']='Mount Shasta'
vac_data.loc[vac_data.city=='Newport Coast', 'city']='Newport Coast'
vac_data.loc[vac_data.city=='No. Hollywood', 'city']='North Hollywood'
vac_data.loc[vac_data.city=='Northrige', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Onatrio', 'city']='Ontario'
vac_data.loc[vac_data.city=='Pala', 'city']='Pala'
vac_data.loc[vac_data.city=='Palm Springs, Ca', 'city']='Palm Springs'
vac_data.loc[vac_data.city=='Palmdale,', 'city']='Palmdale'
vac_data.loc[vac_data.city=='Palos Verdes Peninsu', 'city']='Palos Verdes Estates'
vac_data.loc[vac_data.city=='Poplar', 'city']='Poplar-Cotton Center'
vac_data.loc[vac_data.city=='Presidio Of Sf', 'city']='San Francisco'
vac_data.loc[vac_data.city=='Rancho Bernardo', 'city']='San Diego'
vac_data.loc[vac_data.city=='Rolling Hills Est.', 'city']='Rolling Hills Estates'
vac_data.loc[vac_data.city=='Sacramento, Ca', 'city']='Sacramento'
vac_data.loc[vac_data.city=='San Bernadino', 'city']='San Bernadino'
vac_data.loc[vac_data.city=='San Miguel', 'city']='San Miguel'
vac_data.loc[vac_data.city=='Santa Barbra', 'city']='Santa Barbara'
vac_data.loc[vac_data.city=='Simi Valley', 'city']='Simi Valley'
vac_data.loc[vac_data.city=='So. Lake Tahoe', 'city']='South Lake Tahoe'
vac_data.loc[vac_data.city=='So. San Francisco', 'city']='South San Francisco'
vac_data.loc[vac_data.city=='Spring Valley', 'city']='Spring Valley'
vac_data.loc[vac_data.city=='Stevenson', 'city']='Stevenson Ranch'
vac_data.loc[vac_data.city=='Thousand Oak', 'city']='Thousand Oaks'
vac_data.loc[vac_data.city=='Tranquility', 'city']='Tranquillity'
vac_data.loc[vac_data.city=='Travis AFB', 'city']='Fairfield'
vac_data.loc[vac_data.city=='W. Hollywood', 'city']='West Hollywood'
vac_data.loc[vac_data.city=='West Los Angeles', 'city']='Los Angeles'
vac_data.loc[vac_data.city=='Westlake', 'city']='Westlake Village'
vac_data.loc[vac_data.city=='Westminter', 'city']='Westminster'
vac_data.loc[vac_data.city=='Yermo', 'city']='Yermo'

vac_data.loc[vac_data.city=='El Sobrante', 'city']='El Sobrante (Contra Costa County)'
vac_data.loc[vac_data.city=='Las Flores', 'city']='Las Flores (Orange County)'
vac_data.loc[vac_data.city=='Los Nietos', 'city']='West Whittier-Los Nietos'
vac_data.loc[vac_data.city=='West Whittier', 'city']='West Whittier-Los Nietos'
vac_data.loc[vac_data.city=='Paso Robles', 'city']='El Paso de Robles (Paso Robles)'
vac_data.loc[vac_data.city=='El Paso de Robles', 'city']='El Paso de Robles (Paso Robles)'
vac_data.loc[vac_data.city=='Piñon Hills', 'city']='Pinon Hills'
vac_data.loc[vac_data.city=='San Miguel', 'city']='San Miguel (San Luis Obispo County)'
vac_data.loc[vac_data.city=='Spring Valley', 'city']='Spring Valley (San Diego County)'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [17]:
#Get count of students vaccinated by county,city and grade
vac_n=vac_data.groupby(by=['county','city','vac_info_type'], as_index=False)['n'].sum()
vac_n


Unnamed: 0,county,city,vac_info_type,n
0,Alameda,Alameda,7thGradeData,752.0
1,Alameda,Alameda,ChildCareData,1098.0
2,Alameda,Alameda,KindergartenData,974.0
3,Alameda,Albany,7thGradeData,284.0
4,Alameda,Albany,ChildCareData,248.0
5,Alameda,Albany,KindergartenData,300.0
6,Alameda,Berkeley,7thGradeData,933.0
7,Alameda,Berkeley,ChildCareData,2333.0
8,Alameda,Berkeley,KindergartenData,903.0
9,Alameda,Castro Valley,7thGradeData,804.0


In [18]:
#Export count of students vaccinated by county,city and grade
vac_n.to_csv("vac_n_table1516.csv", index=False)

In [19]:
#Get average percentage of students vaccinated by county,city and grade
vac_pct=vac_data.groupby(by=['county','city','vac_info_type'], as_index=False)['pct'].mean()
vac_pct

Unnamed: 0,county,city,vac_info_type,pct
0,Alameda,Alameda,7thGradeData,98.003333
1,Alameda,Alameda,ChildCareData,95.387083
2,Alameda,Alameda,KindergartenData,97.730667
3,Alameda,Albany,7thGradeData,94.350000
4,Alameda,Albany,ChildCareData,96.023750
5,Alameda,Albany,KindergartenData,96.843333
6,Alameda,Berkeley,7thGradeData,98.002222
7,Alameda,Berkeley,ChildCareData,92.385094
8,Alameda,Berkeley,KindergartenData,86.262105
9,Alameda,Castro Valley,7thGradeData,96.687500


In [20]:
#Export average percentage of students vaccinated by county,city and grade
vac_pct.to_csv("vac_pct_table1516.csv", index=False)

In [21]:
# create grouping by county for comparison to outbreak incidence rates
#Get average percentage of students vaccinated by county and grade
vac_county_pct=vac_data.groupby(by=['county','vac_info_type'], as_index=False)['pct'].mean()
vac_county_pct

Unnamed: 0,county,vac_info_type,pct
0,Alameda,7thGradeData,97.486806
1,Alameda,ChildCareData,96.480760
2,Alameda,KindergartenData,96.384242
3,Amador,7thGradeData,94.377500
4,Amador,ChildCareData,88.974167
5,Amador,KindergartenData,91.078333
6,Butte,7thGradeData,94.832187
7,Butte,ChildCareData,91.520417
8,Butte,KindergartenData,89.876596
9,Calaveras,7thGradeData,88.987500


In [22]:
#Export average percentage of students vaccinated by county,and grade
vac_county_pct.to_csv("vac_pct_county_table1516.CSV")

In [23]:
#Export full cleaned data
vac_data.to_csv("clean1516.csv", index=False)

## Census Data Cleaning

### Demographics

In [24]:
# Import and clean census data for basic demographics age gender race
census = pd.read_csv("vaxxfacts/raw_data/Census/ACS_16_demographic.csv",encoding = "ISO-8859-1",header=[0],na_values=['-'],)

In [25]:
census.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC01_VC04,HC02_VC04,HC03_VC04,...,HC03_VC108,HC04_VC108,HC01_VC109,HC02_VC109,HC03_VC109,HC04_VC109,HC01_VC110,HC02_VC110,HC03_VC110,HC04_VC110
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",1000,338,1000,(X),409,146,40.9,...,695,(X),279,101,40.1,7.1,416,194,59.9,7.1
1,1600000US0600156,600156,"Acampo CDP, California",466,398,466,(X),291,277,62.4,...,466,(X),291,277,62.4,13.7,175,137,37.6,13.7
2,1600000US0600212,600212,"Acton CDP, California",7170,555,7170,(X),3776,318,52.7,...,5777,(X),3020,279,52.3,2.2,2757,199,47.7,2.2
3,1600000US0600296,600296,"Adelanto city, California",32311,35,32311,(X),16350,651,50.6,...,16322,(X),8399,706,51.5,2.4,7923,539,48.5,2.4
4,1600000US0600310,600310,"Adin CDP, California",155,91,155,(X),59,40,38.1,...,115,(X),52,33,45.2,15.8,63,35,54.8,15.8


In [26]:
demographic = census[['GEO.id','GEO.id2','GEO.display-label',
                     'HC01_VC03',
                     'HC01_VC04','HC03_VC04',
                     'HC01_VC05','HC03_VC05',
                     'HC01_VC08','HC03_VC08',
                     'HC01_VC09','HC03_VC09',
                     'HC01_VC10','HC03_VC10',
                     'HC01_VC11','HC03_VC11',
                     'HC01_VC12','HC03_VC12',
                     'HC01_VC13','HC03_VC13',
                     'HC01_VC14','HC03_VC14',
                     'HC01_VC15','HC03_VC15',
                     'HC01_VC16','HC03_VC16',
                     'HC01_VC17','HC03_VC17',
                     'HC01_VC18','HC03_VC18',
                     'HC01_VC19','HC03_VC19',
                     'HC01_VC20','HC03_VC20',
                     'HC01_VC23','HC03_VC23',
                     'HC01_VC88','HC03_VC88',
                     'HC01_VC94','HC03_VC94',
                     'HC01_VC95','HC03_VC95',
                     'HC01_VC96','HC03_VC96',
                     'HC01_VC97','HC03_VC97',
                     'HC01_VC98','HC03_VC98',
                     'HC01_VC99','HC03_VC99',
                     'HC01_VC100','HC03_VC100',
                     'HC01_VC101','HC03_VC101',
                     'HC01_VC102','HC03_VC102'                 
        ]]

In [27]:
demographic.dtypes

GEO.id                object
GEO.id2                int64
GEO.display-label     object
HC01_VC03              int64
HC01_VC04              int64
HC03_VC04            float64
HC01_VC05              int64
HC03_VC05            float64
HC01_VC08              int64
HC03_VC08            float64
HC01_VC09              int64
HC03_VC09            float64
HC01_VC10              int64
HC03_VC10            float64
HC01_VC11              int64
HC03_VC11            float64
HC01_VC12              int64
HC03_VC12            float64
HC01_VC13              int64
HC03_VC13            float64
HC01_VC14              int64
HC03_VC14            float64
HC01_VC15              int64
HC03_VC15            float64
HC01_VC16              int64
HC03_VC16            float64
HC01_VC17              int64
HC03_VC17            float64
HC01_VC18              int64
HC03_VC18            float64
HC01_VC19              int64
HC03_VC19            float64
HC01_VC20              int64
HC03_VC20            float64
HC01_VC23     

In [28]:
#combine other race, two or more races (HC01_VC99, HC01_VC100, HC01_VC101, HC01_VC102)
demographic['other']=pd.Series(census['HC01_VC99']+census['HC01_VC100']+census['HC01_VC101']+census['HC01_VC102'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
demographic['other_pct']=pd.Series(census['HC03_VC99']+census['HC03_VC100']+census['HC03_VC101']+census['HC03_VC102'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
demographic.rename(columns={'GEO.display-label': 'city',
                         'HC01_VC03': 'tot_pop', 
                         'HC01_VC04': 'male',
                         'HC01_VC05': 'female',
                         'HC01_VC08':'under_5',
                         'HC01_VC09':'5_9',
                         'HC01_VC10':'10_14',
                         'HC01_VC11':'15_19',
                         'HC01_VC12':'20_24',
                         'HC01_VC13':'25_34',
                         'HC01_VC14':'35_44',
                         'HC01_VC15':'45_54',
                         'HC01_VC16':'55_59',
                         'HC01_VC17':'60_64',
                         'HC01_VC18':'65_74',
                         'HC01_VC19':'75_84',
                         'HC01_VC20':'85_over',
                         'HC01_VC23':'median_age',
                         'HC01_VC88':'hispanic_latino',
                         'HC01_VC89':'mexican',
                         'HC01_VC94':'white',
                         'HC01_VC95':'black',
                         'HC01_VC96':'aian',
                         'HC01_VC97':'asian',
                         'HC01_VC98':'nhopi',

                         'HC03_VC04': 'male_pct',
                         'HC03_VC05': 'female_pct',
                         'HC03_VC08':'under_5_pct',
                         'HC03_VC09':'5_9_pct',
                         'HC03_VC10':'10_14_pct',
                         'HC03_VC11':'15_19_pct',
                         'HC03_VC12':'20_24_pct',
                         'HC03_VC13':'25_34_pct',
                         'HC03_VC14':'35_44_pct',
                         'HC03_VC15':'45_54_pct',
                         'HC03_VC16':'55_59_pct',
                         'HC03_VC17':'60_64_pct',
                         'HC03_VC18':'65_74_pct',
                         'HC03_VC19':'75_84_pct',
                         'HC03_VC20':'85_over_pct',
                         'HC03_VC23':'median_age_pct',
                         'HC03_VC88':'hispanic_latino_pct',
                         'HC03_VC89':'mexican_pct',
                         'HC03_VC94':'white_pct',
                         'HC03_VC95':'black_pct',
                         'HC03_VC96':'aian_pct',
                         'HC03_VC97':'asian_pct',
                         'HC03_VC98':'nhopi_pct'
                        }, 
                inplace=True)

# AIAN is american Indian Alaskan Native
# NHOPI is Native Hawaiian and Other Pacific Islander

# Other race categories
#                          'HC01_VC99':'other_only',
#                          'HC01_VC100':'more_two_races',
#                          'HC01_VC101':'more_two_races_other',
#                          'HC01_VC102':'more_two_races_other_exclude',


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [31]:
demographic.dtypes

GEO.id                  object
GEO.id2                  int64
city                    object
tot_pop                  int64
male                     int64
male_pct               float64
female                   int64
female_pct             float64
under_5                  int64
under_5_pct            float64
5_9                      int64
5_9_pct                float64
10_14                    int64
10_14_pct              float64
15_19                    int64
15_19_pct              float64
20_24                    int64
20_24_pct              float64
25_34                    int64
25_34_pct              float64
35_44                    int64
35_44_pct              float64
45_54                    int64
45_54_pct              float64
55_59                    int64
55_59_pct              float64
60_64                    int64
60_64_pct              float64
65_74                    int64
65_74_pct              float64
75_84                    int64
75_84_pct              float64
85_over 

In [32]:
demographic['city']=demographic.city.replace(' city, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
demographic['city']=demographic.city.replace(' CDP, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [34]:
demographic['city']=demographic.city.replace(' town, California', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [35]:
demographic=demographic.drop(columns=['HC01_VC99','HC03_VC99','HC01_VC100','HC03_VC100','HC01_VC101','HC03_VC101','HC01_VC102','HC03_VC102'])
demographic

Unnamed: 0,GEO.id,GEO.id2,city,tot_pop,male,male_pct,female,female_pct,under_5,under_5_pct,...,black,black_pct,aian,aian_pct,asian,asian_pct,nhopi,nhopi_pct,other,other_pct
0,1600000US0600135,600135,Acalanes Ridge,1000,409,40.9,591,59.1,33,3.3,...,0,0.0,0,0.0,209,20.9,0,0.0,322,32.2
1,1600000US0600156,600156,Acampo,466,291,62.4,175,37.6,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,1600000US0600212,600212,Acton,7170,3776,52.7,3394,47.3,350,4.9,...,120,1.7,15,0.2,67,0.9,0,0.0,254,3.6
3,1600000US0600296,600296,Adelanto,32311,16350,50.6,15961,49.4,3148,9.7,...,5810,18.0,12,0.0,497,1.5,0,0.0,2061,6.3
4,1600000US0600310,600310,Adin,155,59,38.1,96,61.9,7,4.5,...,0,0.0,0,0.0,0,0.0,0,0.0,10,6.4
5,1600000US0600394,600394,Agoura Hills,20689,10132,49.0,10557,51.0,910,4.4,...,313,1.5,10,0.0,1302,6.3,0,0.0,1014,4.8
6,1600000US0600450,600450,Agua Dulce,3541,1814,51.2,1727,48.8,208,5.9,...,8,0.2,0,0.0,58,1.6,0,0.0,504,14.2
7,1600000US0600464,600464,Aguanga,673,306,45.5,367,54.5,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
8,1600000US0600478,600478,Ahwahnee,2225,1178,52.9,1047,47.1,101,4.5,...,0,0.0,32,1.4,0,0.0,0,0.0,26,1.2
9,1600000US0600535,600535,Airport,1605,892,55.6,713,44.4,260,16.2,...,42,2.6,0,0.0,0,0.0,0,0.0,12,0.8


### Median Household Income and Health insurance Status

Median Household Income has categories 2,500-, 250,000+ which was changed to 2500 and 250000. 

In [36]:
# Import and clean acs data for income and insurance status
acs = pd.read_csv("vaxxfacts/raw_data/Census/ACS_16_economic.csv", encoding = "ISO-8859-1", header=[0], na_values=['-']) 

In [37]:
acs.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC03,HC02_VC03,HC03_VC03,HC04_VC03,HC01_VC04,HC02_VC04,HC03_VC04,...,HC03_VC178,HC04_VC178,HC01_VC179,HC02_VC179,HC03_VC179,HC04_VC179,HC01_VC180,HC02_VC180,HC03_VC180,HC04_VC180
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",747,288,747,(X),497,296,66.5,...,0.0,26.1,(X),(X),0.0,3.5,(X),(X),0.0,26.9
1,1600000US0600156,600156,"Acampo CDP, California",466,398,466,(X),368,359,79.0,...,0.0,27.6,(X),(X),0.0,9.3,(X),(X),0.0,21.6
2,1600000US0600212,600212,"Acton CDP, California",6098,398,6098,(X),3917,339,64.2,...,5.2,3.6,(X),(X),1.6,1.2,(X),(X),39.7,11.7
3,1600000US0600296,600296,"Adelanto city, California",21534,628,21534,(X),9910,597,46.0,...,21.3,6.9,(X),(X),41.5,5.6,(X),(X),49.4,5.9
4,1600000US0600310,600310,"Adin CDP, California",123,63,123,(X),51,36,41.5,...,32.5,43.5,(X),(X),0.0,28.3,(X),(X),34.0,32.3


In [38]:
acs=acs[['GEO.id','GEO.id2','GEO.display-label','HC01_VC85','HC01_VC131','HC03_VC131',
                'HC01_VC132','HC03_VC132','HC01_VC133','HC03_VC133','HC01_VC134','HC03_VC134']]
income=acs

In [39]:
income.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC01_VC85,HC01_VC131,HC03_VC131,HC01_VC132,HC03_VC132,HC01_VC133,HC03_VC133,HC01_VC134,HC03_VC134
0,1600000US0600135,600135,"Acalanes Ridge CDP, California",187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,"Acampo CDP, California",155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,"Acton CDP, California",91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,"Adelanto city, California",29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,"Adin CDP, California",55625.0,150,96.8,111,71.6,62,40.0,5,3.2


In [40]:
income.dtypes

GEO.id                object
GEO.id2                int64
GEO.display-label     object
HC01_VC85            float64
HC01_VC131             int64
HC03_VC131           float64
HC01_VC132             int64
HC03_VC132           float64
HC01_VC133             int64
HC03_VC133           float64
HC01_VC134             int64
HC03_VC134           float64
dtype: object

In [41]:
income.rename(columns={'GEO.display-label': 'city',
                         'HC01_VC85': 'median_income',
                         'HC01_VC131': 'insurance',
                         'HC01_VC132': 'private_insure',
                         'HC01_VC133': 'public_insure',
                         'HC01_VC134': 'no_insurance',

                         'HC03_VC131': 'insurance_pct',
                         'HC03_VC132': 'private_insure_pct',
                         'HC03_VC133': 'public_insure_pct',
                         'HC03_VC134': 'no_insurance_pct'  
                        }, 
                inplace=True)

In [42]:
income.dtypes

GEO.id                 object
GEO.id2                 int64
city                   object
median_income         float64
insurance               int64
insurance_pct         float64
private_insure          int64
private_insure_pct    float64
public_insure           int64
public_insure_pct     float64
no_insurance            int64
no_insurance_pct      float64
dtype: object

In [43]:
income['city']=income.city.replace(' CDP, California', '', regex=True)

In [44]:
income['city']=income.city.replace(' city, California', '', regex=True)

In [45]:
income['city']=income.city.replace(' town, California', '', regex=True)

In [46]:
income.head()

Unnamed: 0,GEO.id,GEO.id2,city,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,1600000US0600135,600135,Acalanes Ridge,187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,Acampo,155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,Acton,91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,Adelanto,29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,Adin,55625.0,150,96.8,111,71.6,62,40.0,5,3.2


## Join Demographics and income data

In [47]:
pop_data=demographic.merge(income, left_on='GEO.id', right_on='GEO.id', how='outer')
pop_data

Unnamed: 0,GEO.id,GEO.id2_x,city_x,tot_pop,male,male_pct,female,female_pct,under_5,under_5_pct,...,city_y,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,1600000US0600135,600135,Acalanes Ridge,1000,409,40.9,591,59.1,33,3.3,...,Acalanes Ridge,187604.0,988,98.8,952,95.2,149,14.9,12,1.2
1,1600000US0600156,600156,Acampo,466,291,62.4,175,37.6,0,0.0,...,Acampo,155385.0,451,96.8,451,96.8,98,21.0,15,3.2
2,1600000US0600212,600212,Acton,7170,3776,52.7,3394,47.3,350,4.9,...,Acton,91168.0,6582,92.0,5798,81.0,1660,23.2,575,8.0
3,1600000US0600296,600296,Adelanto,32311,16350,50.6,15961,49.4,3148,9.7,...,Adelanto,29647.0,26528,86.7,9306,30.4,18268,59.7,4068,13.3
4,1600000US0600310,600310,Adin,155,59,38.1,96,61.9,7,4.5,...,Adin,55625.0,150,96.8,111,71.6,62,40.0,5,3.2
5,1600000US0600394,600394,Agoura Hills,20689,10132,49.0,10557,51.0,910,4.4,...,Agoura Hills,114313.0,19814,96.2,17704,85.9,4068,19.7,787,3.8
6,1600000US0600450,600450,Agua Dulce,3541,1814,51.2,1727,48.8,208,5.9,...,Agua Dulce,104861.0,3421,96.6,3063,86.5,935,26.4,120,3.4
7,1600000US0600464,600464,Aguanga,673,306,45.5,367,54.5,0,0.0,...,Aguanga,43750.0,501,74.4,265,39.4,298,44.3,172,25.6
8,1600000US0600478,600478,Ahwahnee,2225,1178,52.9,1047,47.1,101,4.5,...,Ahwahnee,64049.0,2062,93.2,1432,64.7,1148,51.9,151,6.8
9,1600000US0600535,600535,Airport,1605,892,55.6,713,44.4,260,16.2,...,Airport,29868.0,1298,80.9,591,36.8,741,46.2,307,19.1


In [48]:
list(pop_data.columns.values)

['GEO.id',
 'GEO.id2_x',
 'city_x',
 'tot_pop',
 'male',
 'male_pct',
 'female',
 'female_pct',
 'under_5',
 'under_5_pct',
 '5_9',
 '5_9_pct',
 '10_14',
 '10_14_pct',
 '15_19',
 '15_19_pct',
 '20_24',
 '20_24_pct',
 '25_34',
 '25_34_pct',
 '35_44',
 '35_44_pct',
 '45_54',
 '45_54_pct',
 '55_59',
 '55_59_pct',
 '60_64',
 '60_64_pct',
 '65_74',
 '65_74_pct',
 '75_84',
 '75_84_pct',
 '85_over',
 '85_over_pct',
 'median_age',
 'median_age_pct',
 'hispanic_latino',
 'hispanic_latino_pct',
 'white',
 'white_pct',
 'black',
 'black_pct',
 'aian',
 'aian_pct',
 'asian',
 'asian_pct',
 'nhopi',
 'nhopi_pct',
 'other',
 'other_pct',
 'GEO.id2_y',
 'city_y',
 'median_income',
 'insurance',
 'insurance_pct',
 'private_insure',
 'private_insure_pct',
 'public_insure',
 'public_insure_pct',
 'no_insurance',
 'no_insurance_pct']

In [49]:
pop_data=pop_data.drop(columns=['GEO.id2_y','city_y'])

In [50]:
pop_data=pop_data.rename(columns={'GEO.id': 'geoid',
                         'GEO.id2_x': 'geoid2',
                         'city_x': 'city'})

In [51]:
# Fix City naming to be similar for special characters
pop_data.loc[pop_data.city=='La Cañada Flintridge', 'city']='La Canada Flintridge'
pop_data.loc[pop_data.city=='Piñon Hills', 'city']='Pinon Hills'

# Change census cities names that are present in multiple places to match vaccine data city
pop_data.loc[pop_data.city=='El Sobrante Cdp (Contra Costa County), California', 'city']='El Sobrante (Contra Costa County)'
pop_data.loc[pop_data.city=='Edwards Afb', 'city']='Edwards AFB'
pop_data.loc[pop_data.city=='Las Flores Cdp (Orange County), California', 'city']='Las Flores (Orange County)'
pop_data.loc[pop_data.city=='West Whittier-Los Nietos CDP, California', 'city']='Los Nietos'
pop_data.loc[pop_data.city=="El Paso de Robles (Paso Robles)", 'city']='Paso Robles'
pop_data.loc[pop_data.city=='San Miguel Cdp (San Luis Obispo County), California', 'city']='San Miguel'
pop_data.loc[pop_data.city=='Spring Valley Cdp (San Diego County), California', 'city']='Spring Valley'

In [52]:
# Exported merged census data
pop_data.to_csv("pop_data16.csv", index=False)

## Join Vaccine data to Census data

Join needs unique cities in the vaccine data to use as a key. Will need to figure out how to weight the different grade levels based on the census population numbers for age. 

In [53]:
vac_pop=vac_data.merge(pop_data, left_on='city', right_on='city', how='left')
vac_pop

Unnamed: 0,facility_num,county,is_public,city,facility_name,enrollment,n,pct,reported,vac_info_type,...,other_pct,median_income,insurance,insurance_pct,private_insure,private_insure_pct,public_insure,public_insure_pct,no_insurance,no_insurance_pct
0,13420589,Alameda,PUBLIC,Alameda,A.U.S.D.- WOODSTOCK CHILD DEVELOPMENT CENTER,99.0,93.0,93.94,Y,ChildCareData,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
1,13419449,Alameda,PRIVATE,Fremont,ABC MAGIC MOMENTS PRESCHOOL - IRVINGTON,45.0,42.0,93.33,Y,ChildCareData,...,8.2,111613.0,215747.0,94.9,179932.0,79.2,51603.0,22.7,11477.0,5.1
2,13417471,Alameda,PRIVATE,Fremont,"ABC MAGIC MOMENTS,INC. PRESCHOOL CHILDCARE",25.0,25.0,100.00,Y,ChildCareData,...,8.2,111613.0,215747.0,94.9,179932.0,79.2,51603.0,22.7,11477.0,5.1
3,13411388,Alameda,PRIVATE,Alameda,ABC PRESCHOOL,35.0,34.0,97.14,Y,ChildCareData,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
4,13421345,Alameda,PRIVATE,Castro Valley,ABC PRESCHOOL & DAYCARE,48.0,43.0,89.58,Y,ChildCareData,...,9.1,87204.0,58630.0,94.6,49138.0,79.3,16649.0,26.9,3325.0,5.4
5,13420975,Alameda,PRIVATE,Oakland,ACADEMIA DE MI ABUELA,37.0,23.0,62.16,Y,ChildCareData,...,9.8,57778.0,359638.0,87.7,245913.0,60.0,150210.0,36.6,50464.0,12.3
6,13415846,Alameda,PRIVATE,Oakland,ACTS FULL GOSPEL CHRISTIAN ACADEMY,33.0,33.0,100.00,Y,ChildCareData,...,9.8,57778.0,359638.0,87.7,245913.0,60.0,150210.0,36.6,50464.0,12.3
7,13417566,Alameda,PRIVATE,Oakland,ADVANCE DAY CARE CENTER,68.0,68.0,100.00,Y,ChildCareData,...,9.8,57778.0,359638.0,87.7,245913.0,60.0,150210.0,36.6,50464.0,12.3
8,10214883,Alameda,HEAD START,Alameda,ALAMEDA HEAD START - ANGELA AGUILAR CENTER,66.0,63.0,95.45,Y,ChildCareData,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3
9,13417425,Alameda,HEAD START,Alameda,ALAMEDA HEAD START - SUE MATHESON CENTER,41.0,39.0,95.12,Y,ChildCareData,...,12.5,83048.0,70366.0,92.7,58794.0,77.5,20088.0,26.5,5528.0,7.3


In [54]:
#Export full cleaned data
vac_pop.to_csv("cleanjoin1516.csv", index=False)

In [55]:
vac_pop.dtypes

facility_num             int64
county                  object
is_public               object
city                    object
facility_name           object
enrollment             float64
n                      float64
pct                    float64
reported                object
vac_info_type           object
geoid                   object
geoid2                 float64
tot_pop                float64
male                   float64
male_pct               float64
female                 float64
female_pct             float64
under_5                float64
under_5_pct            float64
5_9                    float64
5_9_pct                float64
10_14                  float64
10_14_pct              float64
15_19                  float64
15_19_pct              float64
20_24                  float64
20_24_pct              float64
25_34                  float64
25_34_pct              float64
35_44                  float64
                        ...   
65_74_pct              float64
75_84   