# Zip code level COVID cases data processing for CA only  
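Data source: Los Angeles Times place-level case totals (`latimes-place-totals.csv` from the datadesk/california-coronavirus-data repository)  
Output: `output/city_data.csv`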

In [21]:
import pandas as pd
import time
from datetime import datetime

# Wall-clock reference point; the final cell reports the time elapsed since here.
start_time = time.time()

# Place-level case totals from the LA Times data repository.
city_df = pd.read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv")
city_df = city_df.assign(
    # Force every longitude to be negative (corrects rows with a wrong sign),
    # then use 0 as the placeholder for missing values.
    x=(city_df.x.abs() * -1).fillna(0),
    # Missing latitudes also become 0.
    y=city_df.y.fillna(0),
)
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y
0,2020-03-16,Los Angeles,37,Alhambra,2,,-118.135504,34.083961
1,2020-03-16,Los Angeles,37,Arcadia,1,,-118.037297,34.134186
2,2020-03-16,Los Angeles,37,Beverly Hills,1,,-118.402109,34.078543
3,2020-03-16,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150
4,2020-03-16,Los Angeles,37,Carson,1,,-118.255878,33.837391
...,...,...,...,...,...,...,...,...
46060,2020-06-03,Ventura,111,Ventura,72,,-119.293754,34.282366
46061,,Monterey,53,North County,47,,-121.669722,36.775833
46062,,Monterey,53,Peninsula and Big Sur Area,51,,-121.923333,36.555278
46063,,Monterey,53,Salinas Area,352,,-121.655556,36.677778


Keep only the rows dated on or after 2020-04-20

In [22]:
# Parse the text dates into timestamps; rows with no date end up as NaT.
city_df = city_df.assign(date1=pd.to_datetime(city_df.date, format="%Y-%m-%d"))
# NaT never satisfies the comparison, so undated rows are dropped here too.
on_or_after_cutoff = city_df.date1 >= '2020-4-20'
city_df = city_df[on_or_after_cutoff]
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,date1
13107,2020-04-20,Alameda,1,Alameda,29,,-122.274444,37.756111,2020-04-20
13108,2020-04-20,Alameda,1,Albany,1,9 or fewer,-122.297778,37.886944,2020-04-20
13109,2020-04-20,Alameda,1,Berkeley,42,,-122.272778,37.871667,2020-04-20
13110,2020-04-20,Alameda,1,Dublin,21,,-121.935833,37.702222,2020-04-20
13111,2020-04-20,Alameda,1,Emeryville,1,9 or fewer,-122.285278,37.831389,2020-04-20
...,...,...,...,...,...,...,...,...,...
46056,2020-06-03,Ventura,111,Santa Paula,121,,-119.061497,34.353092,2020-06-03
46057,2020-06-03,Ventura,111,Simi Valley,276,,-118.750410,34.271008,2020-06-03
46058,2020-06-03,Ventura,111,Somis,4,,-118.994592,34.263681,2020-06-03
46059,2020-06-03,Ventura,111,Thousand Oaks,138,,-118.839304,34.172478,2020-06-03


The Python package `uszipcode` is used to look up zip codes.
It can find the zip code closest to a pair of coordinates (latitude, longitude).
https://pypi.org/project/uszipcode/

In [23]:
from uszipcode import SearchEngine

# Look-up engine reused by the later cells to turn coordinates into zip codes.
search = SearchEngine(simple_zipcode=True)

# Sample query: ask for the single best match within the search radius of a
# known point (latitude first, longitude second).
sample_lat, sample_lng = 38.678611, -121.773333
result = search.by_coordinates(sample_lat, sample_lng, radius=30, returns=1)
result[0]

SimpleZipcode(zipcode='95776', zipcode_type='Standard', major_city='Woodland', post_office_city='Woodland, CA', common_city_list=['Woodland'], county='Yolo County', state='CA', lat=38.7, lng=-121.7, timezone='Pacific', radius_in_miles=11.0, area_code_list=['530'], population=21902, population_density=237.0, land_area_in_sqmi=92.43, water_area_in_sqmi=1.12, housing_units=6824, occupied_housing_units=6464, median_home_value=248000, median_household_income=61599, bounds_west=-121.781382, bounds_east=-121.629202, bounds_north=38.803028, bounds_south=38.553296)

Build a table of the unique places (county, place name, coordinates) and look up a zip code for each one.
Keep only the places whose zip code is listed in the file `Zip_Codes__LA_County_.txt`.

In [24]:
# One row per distinct (county, place, x, y) combination. Selecting the columns
# directly replaces the former empty-DataFrame placeholder (which was
# overwritten immediately) and the column-wise concat of four Series.
city_data_df = city_df[['county', 'place', 'x', 'y']].drop_duplicates()

def search_zip(df):
    """Return the zip code for one row of the unique-places table.

    Parameters
    ----------
    df : row-like object
        A single row (as passed by ``DataFrame.apply(..., axis=1)``) exposing
        ``place`` (name), ``x`` (longitude) and ``y`` (latitude).

    Returns
    -------
    str or int
        The zip code as a string, or the integer ``0`` when none can be
        determined: the latitude is the 0 placeholder, or the coordinate
        lookup finds nothing within the search radius.
    """
    place = df.place
    # If place name contains zip code, returns the zip code directly.
    # The isinstance guard keeps a missing (NaN) place name from raising.
    if isinstance(place, str) and place[0:5].isnumeric():
        return place[0:5]
    # A latitude of 0 is the placeholder for missing coordinates.
    if df.y == 0:
        return 0
    result = search.by_coordinates(df.y, df.x, radius=30, returns=1)
    # An empty result list means no zip code lies within the radius.
    if not result:
        return 0
    return result[0].zipcode


# Resolve a zip code for every unique place (one lookup per row).
city_data_df['zip'] = city_data_df.apply(search_zip, axis=1)

# The reference file holds one zip code per line.
with open("Zip_Codes__LA_County_.txt") as zip_file:
    zip_codes_list = zip_file.read().splitlines()

# Keep only the places whose zip code appears in the reference list, then
# renumber the rows so the fresh 0..n-1 index can serve as the city id.
in_reference_list = city_data_df.zip.isin(zip_codes_list)
city_data_df = city_data_df[in_reference_list].reset_index(drop=True)
city_data_df['city_id'] = city_data_df.index
city_data_df

Unnamed: 0,county,place,x,y,zip,city_id
0,Los Angeles,Adams-Normandie,-118.302397,34.029122,90018,0
1,Los Angeles,Alsace,-118.362349,34.029571,90016,1
2,Los Angeles,Angelino Heights,-118.251986,34.068331,90012,2
3,Los Angeles,Athens Village,-118.273697,33.919783,90061,3
4,Los Angeles,Athens-Westmont,-118.302483,33.934033,90047,4
...,...,...,...,...,...,...
97,Los Angeles,Willowbrook,-118.251505,33.916576,90059,97
98,Los Angeles,Wilshire Center,-118.295338,34.068376,90010,98
99,Los Angeles,Sycamore Square,-118.343701,34.058007,90019,99
100,Los Angeles,Vernon,-118.210583,34.001902,90058,100


In [25]:
# Attach zip and city_id to every dated row. Because the join is an inner
# join, rows for places that were filtered out of city_data_df are dropped.
city_df = city_df.merge(city_data_df, how="inner", on=['county', 'place', 'x', 'y'])
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,date1,zip,city_id
0,2020-04-20,Los Angeles,37,Adams-Normandie,16,,-118.302397,34.029122,2020-04-20,90018,0
1,2020-04-21,Los Angeles,37,Adams-Normandie,17,,-118.302397,34.029122,2020-04-21,90018,0
2,2020-04-22,Los Angeles,37,Adams-Normandie,19,,-118.302397,34.029122,2020-04-22,90018,0
3,2020-04-23,Los Angeles,37,Adams-Normandie,20,,-118.302397,34.029122,2020-04-23,90018,0
4,2020-04-24,Los Angeles,37,Adams-Normandie,22,,-118.302397,34.029122,2020-04-24,90018,0
...,...,...,...,...,...,...,...,...,...,...,...
4525,2020-05-30,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,2020-05-30,90094,101
4526,2020-05-31,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,2020-05-31,90094,101
4527,2020-06-01,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,2020-06-01,90094,101
4528,2020-06-02,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,2020-06-02,90094,101


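For each city, compute the daily new cases, the daily growth rate of confirmed cases, the 7-day simple moving average of that rate (`rate7day`), and the ratio of the rate to its 7-day average (`ratio7day`)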

In [26]:
import numpy as np

# Suppress warnings (kept so the notebook-wide pandas option is unchanged;
# the code below works on explicit copies and does not rely on it).
pd.options.mode.chained_assignment = None


def _add_growth_metrics(place_df):
    """Return a copy of one city's rows with the derived metric columns added.

    Added columns:
      daily_cases - row-to-row difference of confirmed_cases (first row 0)
      rate        - row-to-row fractional change of confirmed_cases
      rate7day    - mean of ``rate`` over a 7-row rolling window
      ratio7day   - ``rate`` divided by ``rate7day``
    Undefined (NaN) and infinite results are reported as 0.
    """
    place_df = place_df.copy()
    cases = place_df['confirmed_cases']
    place_df['daily_cases'] = cases.diff().fillna(0)
    # A jump from 0 cases gives an infinite rate. Zero it *before* averaging:
    # any rolling window that contains an infinity has no finite mean.
    rate = cases.pct_change().replace([np.inf, -np.inf], 0).fillna(0)
    place_df['rate'] = rate
    # The first 6 rows of a city have no full window and are reported as 0.
    place_df['rate7day'] = rate.rolling(window=7).mean().fillna(0)
    # Division by a zero average yields inf (or NaN for 0/0); report both as 0.
    ratio = place_df['rate'] / place_df['rate7day']
    place_df['ratio7day'] = ratio.replace([np.inf, -np.inf], 0).fillna(0)
    return place_df


city_id = city_df.city_id.unique()

# Build every per-city frame first and concatenate once. This no longer
# depends on the first city id being 0 and avoids re-copying the accumulated
# frame on every iteration.
per_city_frames = [_add_growth_metrics(city_df[city_df.city_id == i]) for i in city_id]
if per_city_frames:
    new_df = pd.concat(per_city_frames)
else:
    # No cities at all: still produce an (empty) frame with the metric columns.
    new_df = _add_growth_metrics(city_df)
new_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,date1,zip,city_id,daily_cases,rate,rate7day,ratio7day
0,2020-04-20,Los Angeles,37,Adams-Normandie,16,,-118.302397,34.029122,2020-04-20,90018,0,0.0,0.000000,0.0,0.0
1,2020-04-21,Los Angeles,37,Adams-Normandie,17,,-118.302397,34.029122,2020-04-21,90018,0,1.0,0.062500,0.0,0.0
2,2020-04-22,Los Angeles,37,Adams-Normandie,19,,-118.302397,34.029122,2020-04-22,90018,0,2.0,0.117647,0.0,0.0
3,2020-04-23,Los Angeles,37,Adams-Normandie,20,,-118.302397,34.029122,2020-04-23,90018,0,1.0,0.052632,0.0,0.0
4,2020-04-24,Los Angeles,37,Adams-Normandie,22,,-118.302397,34.029122,2020-04-24,90018,0,2.0,0.100000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4525,2020-05-30,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,2020-05-30,90094,101,0.0,0.000000,0.0,0.0
4526,2020-05-31,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,2020-05-31,90094,101,0.0,0.000000,0.0,0.0
4527,2020-06-01,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,2020-06-01,90094,101,0.0,0.000000,0.0,0.0
4528,2020-06-02,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,2020-06-02,90094,101,0.0,0.000000,0.0,0.0


In [27]:
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)
new_df.to_csv(os.path.join(output_dir, "city_data.csv"), index=False)

Total running time (s)

In [28]:
# Seconds elapsed since start_time was recorded in the first code cell.
elapsed_seconds = time.time() - start_time
elapsed_seconds

5.943000078201294