# Zip code level COVID cases data processing for CA only  
Data from LA times  
Outputs city_data.csv

In [1]:
import pandas as pd
import time
from datetime import datetime

start_time = time.time()

city_df = pd.read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv")
city_df.x = city_df.x.abs() * -1  # Correcting a mistake in longitude
city_df.x = city_df.x.fillna(0)
city_df.y = city_df.y.fillna(0)
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y
0,2020-07-07,Amador,5.0,Ione,12,,-120.932778,38.352778
1,2020-07-07,Amador,5.0,Jackson,10,,-120.774167,38.348889
2,2020-07-07,Amador,5.0,Pine Grove,3,,-120.658889,38.413056
3,2020-07-07,Amador,5.0,Pioneer,2,,0.000000,0.000000
4,2020-07-07,Amador,5.0,Plymouth,2,,0.000000,0.000000
...,...,...,...,...,...,...,...,...
72244,2020-03-16,Los Angeles,37.0,West Adams,1,,-118.307847,34.035612
72245,2020-03-16,Los Angeles,37.0,West Hills,3,,-118.636070,34.211472
72246,2020-03-16,Los Angeles,37.0,West Hollywood,5,,-118.371765,34.088278
72247,2020-03-16,Los Angeles,37.0,West Vernon,1,,-118.300192,34.000114


Filter date after 4/20

In [2]:
city_df['date1'] = pd.to_datetime(city_df.date, format="%Y-%m-%d")
city_df = city_df[city_df.date1 >= '2020-4-20']
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,date1
0,2020-07-07,Amador,5.0,Ione,12,,-120.932778,38.352778,2020-07-07
1,2020-07-07,Amador,5.0,Jackson,10,,-120.774167,38.348889,2020-07-07
2,2020-07-07,Amador,5.0,Pine Grove,3,,-120.658889,38.413056,2020-07-07
3,2020-07-07,Amador,5.0,Pioneer,2,,0.000000,0.000000,2020-07-07
4,2020-07-07,Amador,5.0,Plymouth,2,,0.000000,0.000000,2020-07-07
...,...,...,...,...,...,...,...,...,...
59148,2020-04-20,Ventura,111.0,Westlake,8,,-118.824430,34.135589,2020-04-20
59149,2020-04-20,Yolo,113.0,Davis,16,,-121.738056,38.553889,2020-04-20
59150,2020-04-20,Yolo,113.0,West Sacramento,44,,-121.530278,38.580556,2020-04-20
59151,2020-04-20,Yolo,113.0,Winters and unincorporated,10,,0.000000,0.000000,2020-04-20


New python package for finding zipcode called uszipcode
Can find zipcode based on coordinates
https://pypi.org/project/uszipcode/

In [3]:
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)
result = search.by_coordinates(38.678611, -121.773333, radius=30, returns=1)
result[0]

SimpleZipcode(zipcode='95776', zipcode_type='Standard', major_city='Woodland', post_office_city='Woodland, CA', common_city_list=['Woodland'], county='Yolo County', state='CA', lat=38.7, lng=-121.7, timezone='Pacific', radius_in_miles=11.0, area_code_list=['530'], population=21902, population_density=237.0, land_area_in_sqmi=92.43, water_area_in_sqmi=1.12, housing_units=6824, occupied_housing_units=6464, median_home_value=248000, median_household_income=61599, bounds_west=-121.781382, bounds_east=-121.629202, bounds_north=38.803028, bounds_south=38.553296)

Select unique cities and look for zip code in a new table
Filter zip codes according to the file Zip_Codes__LA_County_.txt

In [4]:
city_data_df = pd.DataFrame()
city_data_df = pd.concat([city_df.county, city_df.place, city_df.x, city_df.y], axis=1)
city_data_df = city_data_df.drop_duplicates()

def search_zip(df):
    # If place name contains zip code, returns the zip code directly
    if df.place[0:5].isnumeric():
        return df.place[0:5]
    if df.y == 0:
        return 0
    result = search.by_coordinates(df.y, df.x, radius=30, returns=1)
    return result[0].zipcode


city_data_df['zip'] = city_data_df.apply(search_zip, axis=1)

with open("Zip_Codes__LA_County_.txt") as file:
    zip_codes_list = file.read().splitlines()
city_data_df = city_data_df[city_data_df.zip.isin(zip_codes_list)]

city_data_df = city_data_df.reset_index(drop=True)
city_data_df['city_id'] = city_data_df.index
city_data_df

Unnamed: 0,county,place,x,y,zip,city_id
0,Los Angeles,Adams-Normandie,-118.302397,34.029122,90018,0
1,Los Angeles,Alsace,-118.362349,34.029571,90016,1
2,Los Angeles,Angelino Heights,-118.251986,34.068331,90012,2
3,Los Angeles,Athens Village,-118.273697,33.919783,90061,3
4,Los Angeles,Athens-Westmont,-118.302483,33.934033,90047,4
...,...,...,...,...,...,...
98,Los Angeles,Westlake,-118.272224,34.057208,90057,98
99,Los Angeles,Westwood,-118.438137,34.065931,90024,99
100,Los Angeles,Wholesale District,-118.233106,34.041547,90013,100
101,Los Angeles,Willowbrook,-118.251505,33.916576,90059,101


In [5]:
city_df = pd.merge(city_df, city_data_df, "inner", on=['county', 'place', 'x', 'y'])
city_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,date1,zip,city_id
0,2020-07-07,Los Angeles,37.0,Adams-Normandie,116,,-118.302397,34.029122,2020-07-07,90018,0
1,2020-07-06,Los Angeles,37.0,Adams-Normandie,112,,-118.302397,34.029122,2020-07-06,90018,0
2,2020-07-02,Los Angeles,37.0,Adams-Normandie,105,,-118.302397,34.029122,2020-07-02,90018,0
3,2020-07-01,Los Angeles,37.0,Adams-Normandie,103,,-118.302397,34.029122,2020-07-01,90018,0
4,2020-06-30,Los Angeles,37.0,Adams-Normandie,101,,-118.302397,34.029122,2020-06-30,90018,0
...,...,...,...,...,...,...,...,...,...,...,...
7708,2020-04-24,Los Angeles,37.0,Wilshire Center,87,,-118.295338,34.068376,2020-04-24,90010,102
7709,2020-04-23,Los Angeles,37.0,Wilshire Center,80,,-118.295338,34.068376,2020-04-23,90010,102
7710,2020-04-22,Los Angeles,37.0,Wilshire Center,79,,-118.295338,34.068376,2020-04-22,90010,102
7711,2020-04-21,Los Angeles,37.0,Wilshire Center,71,,-118.295338,34.068376,2020-04-21,90010,102


Compute daily cases, rate and rate SMA7 of each city

In [6]:
import numpy as np

# Suppress warnings
pd.options.mode.chained_assignment = None

city_id = city_df.city_id.unique()

for i in city_id:
    temp_df = city_df[city_df.city_id == i]
    temp_df['daily_cases'] = temp_df['confirmed_cases'].diff().fillna(0)
    temp_df['rate'] = temp_df['confirmed_cases'].pct_change().fillna(0)
    temp_df['rate7day'] = temp_df['rate'].rolling(window=7).mean().fillna(0)
    temp_df['ratio7day'] = (temp_df.rate / temp_df.rate7day).fillna(0)
    temp_df = temp_df.replace(np.inf, 0)
    temp_df = temp_df.replace(-np.inf, 0)
    if i == 0:
        new_df = temp_df
    else:
        new_df = pd.concat([new_df, temp_df])
new_df

Unnamed: 0,date,county,fips,place,confirmed_cases,note,x,y,date1,zip,city_id,daily_cases,rate,rate7day,ratio7day
0,2020-07-07,Los Angeles,37.0,Adams-Normandie,116,,-118.302397,34.029122,2020-07-07,90018,0,0.0,0.000000,0.000000,0.000000
1,2020-07-06,Los Angeles,37.0,Adams-Normandie,112,,-118.302397,34.029122,2020-07-06,90018,0,-4.0,-0.034483,0.000000,0.000000
2,2020-07-02,Los Angeles,37.0,Adams-Normandie,105,,-118.302397,34.029122,2020-07-02,90018,0,-7.0,-0.062500,0.000000,0.000000
3,2020-07-01,Los Angeles,37.0,Adams-Normandie,103,,-118.302397,34.029122,2020-07-01,90018,0,-2.0,-0.019048,0.000000,0.000000
4,2020-06-30,Los Angeles,37.0,Adams-Normandie,101,,-118.302397,34.029122,2020-06-30,90018,0,-2.0,-0.019417,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7708,2020-04-24,Los Angeles,37.0,Wilshire Center,87,,-118.295338,34.068376,2020-04-24,90010,102,-3.0,-0.033333,-0.039790,0.837735
7709,2020-04-23,Los Angeles,37.0,Wilshire Center,80,,-118.295338,34.068376,2020-04-23,90010,102,-7.0,-0.080460,-0.036506,2.204031
7710,2020-04-22,Los Angeles,37.0,Wilshire Center,79,,-118.295338,34.068376,2020-04-22,90010,102,-1.0,-0.012500,-0.038291,0.326444
7711,2020-04-21,Los Angeles,37.0,Wilshire Center,71,,-118.295338,34.068376,2020-04-21,90010,102,-8.0,-0.101266,-0.047263,2.142580


In [7]:
import os
new_df.to_csv(os.path.join("output", "city_data.csv"), index=False)

Total Running time (s)

In [8]:
time.time() - start_time

7.662999868392944