# Zip code level COVID cases data processing for CA only  
Data from the Los Angeles Times  
Outputs `output/city_data.csv`

In [12]:
import pandas as pd
import time
# Record when processing starts so the last cell can report the total run time
start_time = time.time()

# Place-level case totals, read straight from the LA Times data repository
PLACE_TOTALS_URL = "https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv"
city_df = pd.read_csv(PLACE_TOTALS_URL)

# Correct a longitude mistake in the source by forcing every x value negative
city_df["x"] = -city_df["x"].abs()
# Missing coordinates are filled with 0
city_df[["x", "y"]] = city_df[["x", "y"]].fillna(0)
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y
0,2020-03-16,Los Angeles,37,Alhambra,2,,-118.135504,34.083961
1,2020-03-16,Los Angeles,37,Arcadia,1,,-118.037297,34.134186
2,2020-03-16,Los Angeles,37,Beverly Hills,1,,-118.402109,34.078543
3,2020-03-16,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150
4,2020-03-16,Los Angeles,37,Carson,1,,-118.255878,33.837391
...,...,...,...,...,...,...,...,...
43631,2020-05-31,Stanislaus,99,unincorporated / districts,116,,0.000000,0.000000
43632,2020-05-31,Yolo,113,Davis,24,,-121.738056,38.553889
43633,2020-05-31,Yolo,113,West Sacramento,73,,-121.530278,38.580556
43634,2020-05-31,Yolo,113,Winters and unincorporated,18,,-121.970833,38.525000


`uszipcode` is a Python package for looking up US zip codes.
It can find the zip code for a given pair of coordinates.
https://pypi.org/project/uszipcode/

In [13]:
from uszipcode import SearchEngine
# Zip code search engine, reused by later cells through the name `search`
search = SearchEngine(simple_zipcode=True)

# Demo lookup for one sample coordinate; returns=1 asks for a single match
sample_lat, sample_lng = 38.678611, -121.773333
result = search.by_coordinates(sample_lat, sample_lng, radius=30, returns=1)
result[0]

SimpleZipcode(zipcode='95776', zipcode_type='Standard', major_city='Woodland', post_office_city='Woodland, CA', common_city_list=['Woodland'], county='Yolo County', state='CA', lat=38.7, lng=-121.7, timezone='Pacific', radius_in_miles=11.0, area_code_list=['530'], population=21902, population_density=237.0, land_area_in_sqmi=92.43, water_area_in_sqmi=1.12, housing_units=6824, occupied_housing_units=6464, median_home_value=248000, median_household_income=61599, bounds_west=-121.781382, bounds_east=-121.629202, bounds_north=38.803028, bounds_south=38.553296)

Select the unique places and look up a zip code for each one in a new table.
Keep only the places whose zip code is listed in the file Zip_Codes__LA_County_.txt

In [14]:
# One row per distinct (county, place, x, y) combination found in city_df
city_data_df = city_df[['county', 'place', 'x', 'y']].drop_duplicates()

def search_zip(df, radius=30):
    """Return the zip code for a single place row.

    Parameters
    ----------
    df : row-like object
        One row (despite the name) exposing ``place``, ``x`` and ``y``
        attributes, e.g. as passed by ``DataFrame.apply(..., axis=1)``.
    radius : number, optional
        Search radius forwarded to ``search.by_coordinates``. Defaults to 30,
        the value that used to be hard-coded.

    Returns
    -------
    str or int
        The first five characters of ``place`` when they are numeric;
        otherwise the ``zipcode`` of the first match returned by the
        module-level ``search`` engine for ``(y, x)``. ``0`` is returned when
        ``y`` is 0 or when the search yields no match.
    """
    # str() keeps a non-string place (e.g. NaN for a missing name) from
    # raising on the slice below; real strings pass through unchanged
    place_prefix = str(df.place)[0:5]
    # If the place name starts with a zip code, return that zip code directly
    if place_prefix.isnumeric():
        return place_prefix
    # y == 0 is treated as "no coordinates": nothing to look up
    if df.y == 0:
        return 0
    result = search.by_coordinates(df.y, df.x, radius=radius, returns=1)
    # An empty result used to raise IndexError on result[0]; fall back to the
    # same 0 sentinel that is used for rows without coordinates
    if not result:
        return 0
    return result[0].zipcode


# Look up a zip code for every row of the place table
city_data_df['zip'] = city_data_df.apply(search_zip, axis=1)

# Read the allowed zip codes, one per line of the text file
with open("Zip_Codes__LA_County_.txt") as zip_file:
    zip_codes_list = zip_file.read().splitlines()

# Keep only rows whose zip is in that list, then renumber the rows from 0
has_listed_zip = city_data_df['zip'].isin(zip_codes_list)
city_data_df = city_data_df[has_listed_zip].reset_index(drop=True)

# The fresh 0..n-1 index doubles as the identifier of each place
city_data_df['city_id'] = city_data_df.index
city_data_df

Unnamed: 0,county,place,x,y,zip,city_id
0,Los Angeles,Boyle Heights,-118.205330,34.038150,90033,0
1,Los Angeles,Koreatown,-118.304346,34.054588,90010,1
2,Los Angeles,Melrose,-118.334586,34.082723,90038,2
3,Los Angeles,Silver Lake,-118.270242,34.095701,90029,3
4,Los Angeles,West Adams,-118.307847,34.035612,90018,4
...,...,...,...,...,...,...
98,Los Angeles,Exposition,-118.349778,34.026624,90016,98
99,Los Angeles,University Hills,-118.168371,34.068093,90032,99
100,Los Angeles,Sycamore Square,-118.343701,34.058007,90019,100
101,Los Angeles,Vernon,-118.210583,34.001902,90058,101


In [15]:
# Attach the extra city_data_df columns to each city_df row; the inner join
# drops rows whose (county, place, x, y) has no match in city_data_df
city_df = city_df.merge(city_data_df, how="inner", on=['county', 'place', 'x', 'y'])
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,zip,city_id
0,2020-03-16,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0
1,2020-03-17,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0
2,2020-03-18,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0
3,2020-03-19,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0
4,2020-03-20,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0
...,...,...,...,...,...,...,...,...,...,...
6799,2020-05-31,Los Angeles,37,Vernon,3,,-118.210583,34.001902,90058,101
6800,2020-05-28,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,90094,102
6801,2020-05-29,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,90094,102
6802,2020-05-30,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,90094,102


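Compute, for each city, the daily new cases, the day-over-day growth rate, the 7-day simple moving average (SMA7) of that rate, and the ratio of the rate to its SMA7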

In [16]:
import numpy as np

def _add_growth_metrics(place_df):
    """Return a new frame for one city with daily growth columns added.

    Added columns:
      daily_cases - row-to-row difference of confirmed_cases
      rate        - row-to-row relative change of confirmed_cases
      rate7day    - mean of rate over a 7-row rolling window
      ratio7day   - rate divided by rate7day
    Missing values are filled with 0 and +/-inf is replaced by 0.
    """
    # diff()/pct_change() compare consecutive rows, so order them by date.
    # The sort is stable and returns a new frame, so already-ordered input is
    # unchanged and the assignments below never touch a slice of city_df
    # (no need to silence pandas' chained-assignment warning globally).
    # Assumes date sorts chronologically as-is (e.g. ISO strings) - TODO confirm
    place_df = place_df.sort_values('date', kind='mergesort')
    place_df['daily_cases'] = place_df['confirmed_cases'].diff().fillna(0)
    place_df['rate'] = place_df['confirmed_cases'].pct_change().fillna(0)
    # NOTE(review): an infinite rate (growth from 0 cases) enters this rolling
    # mean before it is zeroed below - confirm that is intended
    place_df['rate7day'] = place_df['rate'].rolling(window=7).mean().fillna(0)
    place_df['ratio7day'] = (place_df.rate / place_df.rate7day).fillna(0)
    return place_df.replace([np.inf, -np.inf], 0)


city_id = city_df.city_id.unique()

# Build every per-city frame first and concatenate once. This avoids the
# quadratic concat-in-a-loop and no longer relies on the first id being 0
# to initialise new_df.
new_df = pd.concat([_add_growth_metrics(city_df[city_df.city_id == i]) for i in city_id])
new_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,zip,city_id,daily_cases,rate,rate7day,ratio7day
0,2020-03-16,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0,0.0,0.0,0.000000,0.0
1,2020-03-17,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0,0.0,0.0,0.000000,0.0
2,2020-03-18,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0,0.0,0.0,0.000000,0.0
3,2020-03-19,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0,0.0,0.0,0.000000,0.0
4,2020-03-20,Los Angeles,37,Boyle Heights,5,,-118.205330,34.038150,90033,0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6799,2020-05-31,Los Angeles,37,Vernon,3,,-118.210583,34.001902,90058,101,0.0,0.0,0.214286,0.0
6800,2020-05-28,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,90094,102,0.0,0.0,0.000000,0.0
6801,2020-05-29,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,90094,102,0.0,0.0,0.000000,0.0
6802,2020-05-30,Los Angeles,37,Unincorporated - Del Rey,1,,-118.412269,33.982584,90094,102,0.0,0.0,0.000000,0.0


In [17]:
import os
# Create the output folder first: to_csv fails if the directory is missing
os.makedirs("output", exist_ok=True)
# Write the per-city metrics without the DataFrame index column
new_df.to_csv(os.path.join("output", "city_data.csv"), index=False)

Total running time (s)

In [18]:
# Seconds elapsed since start_time was recorded
elapsed_seconds = time.time() - start_time
elapsed_seconds

6.591000080108643