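# ZIP-code-level COVID-19 case data processing (California only)  
Data source: Los Angeles Times (`datadesk/california-coronavirus-data`, file `latimes-place-totals.csv`)  
Output: `output/city_data.csv`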

In [1]:
import pandas as pd
import time
# Reference point for the total-running-time cell at the end of the notebook.
start_time = time.time()

# Place-level cumulative case counts published by the LA Times data desk.
city_df = pd.read_csv(
    "https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv"
)

# Correcting a mistake in longitude: force every x value to be negative.
city_df["x"] = -city_df["x"].abs()

# Rows without coordinates get 0 so later steps can recognise them.
city_df[["x", "y"]] = city_df[["x", "y"]].fillna(0)
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y
0,2020-03-16,Los Angeles,37,Alhambra,2,,-118.135504,34.083961
1,2020-03-16,Los Angeles,37,Arcadia,1,,-118.037297,34.134186
2,2020-03-16,Los Angeles,37,Beverly Hills,1,,-118.402109,34.078543
3,2020-03-16,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150
4,2020-03-16,Los Angeles,37,Carson,1,,-118.255878,33.837391
...,...,...,...,...,...,...,...,...
42990,2020-05-30,Stanislaus,99,unincorporated / districts,113,,0.000000,0.000000
42991,2020-05-30,Yolo,113,Davis,24,,-121.738056,38.553889
42992,2020-05-30,Yolo,113,West Sacramento,72,,-121.530278,38.580556
42993,2020-05-30,Yolo,113,Winters and unincorporated,18,,-121.970833,38.525000


`uszipcode` is a new Python package for looking up ZIP codes.  
It can find the ZIP code closest to a given pair of coordinates.  
https://pypi.org/project/uszipcode/

In [2]:
from uszipcode import SearchEngine
# One shared search engine, reused by every lookup in the notebook.
# NOTE(review): the `simple_zipcode` argument appears to be specific to older
# uszipcode releases - confirm the pinned version before upgrading.
search = SearchEngine(simple_zipcode=True)

# Smoke test: closest ZIP code within 30 miles of a sample coordinate.
result = search.by_coordinates(
    lat=38.678611,
    lng=-121.773333,
    radius=30,
    returns=1,
)
result[0]

SimpleZipcode(zipcode='95776', zipcode_type='Standard', major_city='Woodland', post_office_city='Woodland, CA', common_city_list=['Woodland'], county='Yolo County', state='CA', lat=38.7, lng=-121.7, timezone='Pacific', radius_in_miles=11.0, area_code_list=['530'], population=21902, population_density=237.0, land_area_in_sqmi=92.43, water_area_in_sqmi=1.12, housing_units=6824, occupied_housing_units=6464, median_home_value=248000, median_household_income=61599, bounds_west=-121.781382, bounds_east=-121.629202, bounds_north=38.803028, bounds_south=38.553296)

Select the unique places and look up a ZIP code for each one in a new table

In [3]:
# One row per distinct (county, place, x, y) combination found in the raw data.
place_columns = ["county", "place", "x", "y"]
city_data_df = city_df[place_columns].drop_duplicates()

def search_zip(df):
    """Return the ZIP code for one row of the unique-places table.

    Parameters
    ----------
    df : row object
        A single row (as passed by ``DataFrame.apply(..., axis=1)``) exposing
        ``place``, ``x`` (longitude) and ``y`` (latitude).

    Returns
    -------
    str or int
        The first five characters of ``place`` when they are numeric,
        otherwise the ZIP code of the closest match within 30 miles of the
        coordinates. The integer ``0`` is returned as a placeholder when a
        coordinate is 0 (missing) or when the search finds nothing.
    """
    # str() keeps a non-string place (e.g. NaN) from crashing the slice below.
    place = str(df.place)

    # If place name contains zip code, returns the zip code directly
    if place[0:5].isnumeric():
        return place[0:5]

    # A coordinate of 0 marks a missing value, so there is nothing to search.
    if df.y == 0 or df.x == 0:
        return 0

    result = search.by_coordinates(df.y, df.x, radius=30, returns=1)

    # No ZIP code inside the radius: use the same placeholder as for missing
    # coordinates instead of raising IndexError on result[0].
    if not result:
        return 0
    return result[0].zipcode

# Look up a ZIP code for every unique place, then renumber the rows from 0.
city_data_df = (
    city_data_df
    .assign(zip=lambda places: places.apply(search_zip, axis=1))
    .reset_index(drop=True)
)

# The fresh 0..n-1 index doubles as the identifier of each place.
city_data_df = city_data_df.assign(city_id=city_data_df.index)
city_data_df

Unnamed: 0,county,place,x,y,zip,city_id
0,Los Angeles,Alhambra,-118.135504,34.083961,91803,0
1,Los Angeles,Arcadia,-118.037297,34.134186,91006,1
2,Los Angeles,Beverly Hills,-118.402109,34.078543,90212,2
3,Los Angeles,Boyle Heights,-118.205330,34.038150,90033,3
4,Los Angeles,Carson,-118.255878,33.837391,90745,4
...,...,...,...,...,...,...
998,Marin,Strawberry,-122.508804,37.897780,94920,998
999,Marin,Tamalpais-Homestead Valley,-122.531200,37.876045,94941,999
1000,Marin,Tiburon,-122.456141,37.873572,94920,1000
1001,Marin,Tomales,-122.904946,38.246540,94940,1001


In [4]:
# Attach `zip` and `city_id` to every daily record by matching on the columns
# that identify a place.
place_keys = ["county", "place", "x", "y"]
city_df = city_df.merge(city_data_df, how="inner", on=place_keys)
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,zip,city_id
0,2020-03-16,Los Angeles,37,Alhambra,2,,-118.135504,34.083961,91803,0
1,2020-03-17,Los Angeles,37,Alhambra,1,,-118.135504,34.083961,91803,0
2,2020-03-18,Los Angeles,37,Alhambra,1,,-118.135504,34.083961,91803,0
3,2020-03-19,Los Angeles,37,Alhambra,2,,-118.135504,34.083961,91803,0
4,2020-03-20,Los Angeles,37,Alhambra,3,,-118.135504,34.083961,91803,0
...,...,...,...,...,...,...,...,...,...,...
42990,2020-05-30,Marin,41,Tiburon,1,9 or fewer,-122.456141,37.873572,94920,1000
42991,2020-05-29,Marin,41,Tomales,1,9 or fewer,-122.904946,38.246540,94940,1001
42992,2020-05-30,Marin,41,Tomales,1,9 or fewer,-122.904946,38.246540,94940,1001
42993,2020-05-29,Marin,41,Woodacre,1,9 or fewer,-122.645405,38.012655,94930,1002


Compute daily new cases, the day-over-day growth rate, its 7-day simple moving average (SMA7), and the ratio of the rate to its SMA7 for each city

In [5]:
import numpy as np


def add_growth_metrics(city_rows):
    """Return a copy of one city's rows with the growth columns appended.

    Parameters
    ----------
    city_rows : pandas.DataFrame
        All rows of a single ``city_id``, in the order they appear in
        ``city_df``. Must contain a ``confirmed_cases`` column.

    Returns
    -------
    pandas.DataFrame
        The input rows plus, in this order:
        ``daily_cases`` (difference to the previous row),
        ``rate`` (relative change to the previous row),
        ``rate7day`` (7-row rolling mean of ``rate``) and
        ``ratio7day`` (``rate`` divided by ``rate7day``).
        Missing values in the new columns and infinities anywhere in the
        frame are replaced by 0.
    """
    cases = city_rows["confirmed_cases"]
    rate = cases.pct_change().fillna(0)
    rate7day = rate.rolling(window=7).mean().fillna(0)

    # assign() works on a copy, so no chained-assignment warning is triggered
    # and the global pandas warning option does not need to be touched.
    enriched = city_rows.assign(
        daily_cases=cases.diff().fillna(0),
        rate=rate,
        rate7day=rate7day,
        ratio7day=(rate / rate7day).fillna(0),
    )

    # Divisions by zero produce +/-inf; report them as 0.
    return enriched.replace([np.inf, -np.inf], 0)


# sort=False walks the cities in order of first appearance. Collecting the
# pieces first and concatenating once avoids copying the growing result on
# every iteration, and does not depend on city_id 0 being present.
per_city_frames = [
    add_growth_metrics(city_rows)
    for _, city_rows in city_df.groupby("city_id", sort=False)
]

# pd.concat rejects an empty list, so an empty input still yields a frame
# that has the four metric columns.
new_df = pd.concat(per_city_frames) if per_city_frames else add_growth_metrics(city_df)
new_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,zip,city_id,daily_cases,rate,rate7day,ratio7day
0,2020-03-16,Los Angeles,37,Alhambra,2,,-118.135504,34.083961,91803,0,0.0,0.0,0.0,0.0
1,2020-03-17,Los Angeles,37,Alhambra,1,,-118.135504,34.083961,91803,0,-1.0,-0.5,0.0,0.0
2,2020-03-18,Los Angeles,37,Alhambra,1,,-118.135504,34.083961,91803,0,0.0,0.0,0.0,0.0
3,2020-03-19,Los Angeles,37,Alhambra,2,,-118.135504,34.083961,91803,0,1.0,1.0,0.0,0.0
4,2020-03-20,Los Angeles,37,Alhambra,3,,-118.135504,34.083961,91803,0,1.0,0.5,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42990,2020-05-30,Marin,41,Tiburon,1,9 or fewer,-122.456141,37.873572,94920,1000,0.0,0.0,0.0,0.0
42991,2020-05-29,Marin,41,Tomales,1,9 or fewer,-122.904946,38.246540,94940,1001,0.0,0.0,0.0,0.0
42992,2020-05-30,Marin,41,Tomales,1,9 or fewer,-122.904946,38.246540,94940,1001,0.0,0.0,0.0,0.0
42993,2020-05-29,Marin,41,Woodacre,1,9 or fewer,-122.645405,38.012655,94930,1002,0.0,0.0,0.0,0.0


In [6]:
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no error if it is already there).
os.makedirs("output", exist_ok=True)

# Row labels are dropped; only the data columns are written.
new_df.to_csv(os.path.join("output", "city_data.csv"), index=False)

Total running time (s)

In [7]:
# Seconds elapsed since `start_time` was recorded in the first code cell.
time.time() - start_time

20.174999952316284