# Import LA County Data from NY Times GitHub

NY Times COVID-19 data with rolling averages can be found at: https://github.com/nytimes/covid-19-data/tree/master/rolling-averages (as of 2022-04-01). The data is split into one .csv file per year. The raw files are found at:
- https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-counties-2020.csv
- https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-counties-2021.csv
- https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-counties-2022.csv

In [3]:
import urllib.request
import numpy as np
import pandas as pd
import logging
import os

## Download data from NY Times GitHub

In [9]:
# Map each year to the raw GitHub URL of its rolling-averages CSV.
# The files differ only in the year embedded in the file name, so the
# mapping is generated from a single template.
url_dict = {
    year: (
        'https://raw.githubusercontent.com/nytimes/covid-19-data/master/'
        f'rolling-averages/us-counties-{year}.csv'
    )
    for year in (2020, 2021, 2022)
}

In [7]:
log_dir = 'logs'
log_fname = 'log.log'

# basicConfig opens the log file as soon as it installs the file handler and
# raises FileNotFoundError if the directory is missing, so make sure it exists
# first (a no-op when the directory is already there).
os.makedirs(log_dir, exist_ok=True)

# Configure the root logger to write timestamped messages to logs/log.log.
# filemode='w' truncates the file when the handler is created.
logging.basicConfig(
    filename=os.path.join(log_dir, log_fname),
    format='%(asctime)s %(message)s',
    filemode='w'
)

# Use the root logger and lower its threshold to DEBUG so the info/error
# messages emitted by later cells are recorded.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [10]:
# Download each year's CSV into the working directory as us-counties-<year>.csv.
# Failures are logged per year and do not stop the remaining downloads.
for year, url in url_dict.items():
    output = f'us-counties-{year}.csv'
    # Download to a temporary name first: urlretrieve writes as it reads, so an
    # interrupted transfer would otherwise leave a truncated file at `output`
    # that the load step would read as if it were complete.
    partial = f'{output}.part'
    try:
        logger.info(f'Starting download of {year} data...')
        urllib.request.urlretrieve(url, partial)
        # Only a fully downloaded file replaces the final path.
        os.replace(partial, output)
        logger.info(f'{year} data saved.')
    except Exception as e:
        logger.error(f'Error downloading {year} data file: {str(e)}')
        # Discard whatever was written before the failure.
        if os.path.exists(partial):
            os.remove(partial)

## Load data

In [11]:
# Read each downloaded yearly CSV into its own DataFrame, keyed by year.
df_dict = dict(
    (yr, pd.read_csv(f'us-counties-{yr}.csv'))
    for yr in url_dict.keys()
)

In [13]:
# Preview the first rows of the 2020 frame to confirm it loaded with the expected columns.
df_dict[2020].head()

Unnamed: 0,date,geoid,county,state,cases,cases_avg,cases_avg_per_100k,deaths,deaths_avg,deaths_avg_per_100k
0,2020-01-21,USA-53061,Snohomish,Washington,1,0.14,0.02,0,0.0,0.0
1,2020-01-22,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
2,2020-01-23,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
3,2020-01-24,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
4,2020-01-24,USA-17031,Cook,Illinois,1,0.14,0.0,0,0.0,0.0


## Extract LA County data

In [14]:
# For every year, keep only the rows whose county is 'Los Angeles' and whose
# state is 'California'. Both columns are checked together.
la_df_dict = {
    yr: frame[frame['county'].eq('Los Angeles') & frame['state'].eq('California')]
    for yr, frame in df_dict.items()
}

In [15]:
# Spot-check that the 2020 subset contains only Los Angeles, California rows.
la_df_dict[2020].head()

Unnamed: 0,date,geoid,county,state,cases,cases_avg,cases_avg_per_100k,deaths,deaths_avg,deaths_avg_per_100k
11,2020-01-26,USA-06037,Los Angeles,California,1,0.14,0.0,0,0.0,0.0
16,2020-01-27,USA-06037,Los Angeles,California,0,0.14,0.0,0,0.0,0.0
21,2020-01-28,USA-06037,Los Angeles,California,0,0.14,0.0,0,0.0,0.0
26,2020-01-29,USA-06037,Los Angeles,California,0,0.14,0.0,0,0.0,0.0
31,2020-01-30,USA-06037,Los Angeles,California,0,0.14,0.0,0,0.0,0.0


## Combine into one DataFrame

In [23]:
# Stack the per-year LA County frames into one frame, in the dict's insertion
# order. ignore_index=True discards the per-file row labels and gives the
# result a fresh 0..n-1 index directly, instead of building a (year, row)
# MultiIndex and then dropping the auto-named 'level_0'/'level_1' columns.
la_df = pd.concat(list(la_df_dict.values()), ignore_index=True)

In [24]:
# Check the start of the combined frame: rows should begin with the earliest year and a 0-based index.
la_df.head()

Unnamed: 0,date,geoid,county,state,cases,cases_avg,cases_avg_per_100k,deaths,deaths_avg,deaths_avg_per_100k
0,2020-01-26,USA-06037,Los Angeles,California,1,0.14,0.0,0,0.0,0.0
1,2020-01-27,USA-06037,Los Angeles,California,0,0.14,0.0,0,0.0,0.0
2,2020-01-28,USA-06037,Los Angeles,California,0,0.14,0.0,0,0.0,0.0
3,2020-01-29,USA-06037,Los Angeles,California,0,0.14,0.0,0,0.0,0.0
4,2020-01-30,USA-06037,Los Angeles,California,0,0.14,0.0,0,0.0,0.0


In [25]:
# Check the end of the combined frame: the last rows should come from the most recent year.
la_df.tail()

Unnamed: 0,date,geoid,county,state,cases,cases_avg,cases_avg_per_100k,deaths,deaths_avg,deaths_avg_per_100k
791,2022-03-27,USA-06037,Los Angeles,California,0,956.29,9.53,0,18.86,0.19
792,2022-03-28,USA-06037,Los Angeles,California,1055,811.43,8.08,25,18.29,0.18
793,2022-03-29,USA-06037,Los Angeles,California,552,796.86,7.94,7,16.86,0.17
794,2022-03-30,USA-06037,Los Angeles,California,1051,825.43,8.22,29,17.14,0.17
795,2022-03-31,USA-06037,Los Angeles,California,500,772.0,7.69,14,15.43,0.15


## Export data to .csv for use downstream

In [26]:
# Write the combined LA County frame to the working directory; index=False leaves the row index out of the file.
la_df.to_csv('los_angeles_covid_cases.csv', index=False)