In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from uszipcode import Zipcode, SearchEngine

In [2]:
covid = pd.read_csv('../data/california_covid.csv')
earthquake = pd.read_csv('../data/earthquakes_CA.csv')
fire = pd.read_csv('../data/fire_data.csv')

In [3]:
covid.head(2)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incidence_Rate,Case-Fatality_Ratio
0,6001.0,Alameda,California,US,2020-10-17 04:24:12,37.646294,-121.892927,22408,439,0,21969.0,"Alameda, California, US",1340.729443,1.959122
1,6003.0,Alpine,California,US,2020-10-17 04:24:12,38.596786,-119.822359,3,0,0,3.0,"Alpine, California, US",265.721878,0.0


In [4]:
earthquake.head(2)

Unnamed: 0,Event ID,Name/Epicenter,Date,Lat,Lon,Mag
0,73472891,"10 km (6.25 mi) ESE of Willits, CA",2020-10-19,39.36,-123.25,3.4
1,73472896,"11 km (6.875 mi) ESE of Willits, CA",2020-10-19,39.37,-123.24,3.8


In [5]:
fire.head(2)

Unnamed: 0,Name,Final,Started,County,AcresBurned,PercentContained,Longitude,Latitude,Type,IsActive,CalFireIncident,Location
0,Apple Fire,False,2020-07-31T18:08:39Z,Riverside,0,0,-116.9617,33.99139,Wildfire,True,False,"off of Oak Glen Road and Apple Tree Lane, Nort..."
1,August Complex (includes Doe Fire),False,2020-08-16T20:37:26Z,"Mendocino, Humboldt, Trinity, Tehama, Glenn, L...",1032209,80,-122.673,39.776,Wildfire,True,True,"Mendocino, Humboldt, Trinity, Tehama, Glenn, ..."


The goal here is to merge dataframes based on county column as a common key. Covid data is clean and ready to go. Some fires extend to multiple counties. We are going to base our analysis on the county level, so we need to preserve the fire information for each county. Here, we are going to separate the county to individual counties while copying the fire data where the fire spans to multiple counties.

In [6]:
(fire['County'].iloc[0, ]).split(', ')

['Riverside']

In [7]:
(fire['County'].iloc[1, ]).split(', ')

['Mendocino', 'Humboldt', 'Trinity', 'Tehama', 'Glenn', 'Lake', 'Colusa']

In [8]:
# first change the `County` column from str to a list
fire['County'] = fire['County'].apply(lambda x: x.split(', '))

In [9]:
# modified from https://stackoverflow.com/questions/24029659/python-pandas-replicate-rows-in-dataframe
# making copies of the rows with more than one county listed. The number of copies correspond to the number of counties
reps = [len(county) if  len(county) > 1 else 1 for county in fire['County']]
fire = fire.loc[np.repeat(fire.index.values, reps)]

In [10]:
# reset index 
fire.reset_index(inplace=True, drop=True)

In [11]:
# reassign single counties to the `County` column

# index counter and an empty counties list
index = 0
counties = []

# loop to have an index track to not go out of index range
for indexer in range(len(fire)):
    while index <= indexer:
        
        # if there is only a single county name, append that county name to the list
        if len(fire.iloc[index, 3]) == 1:
            counties.append(fire.iloc[index, 3][0])
            index += 1
            
        # if there is more than one single county name, append each county name to the list
        else:
            for i in range(0, len(fire.iloc[index, 3])):
                # append each indiviual county name of the multi-county list
                counties.append(fire.iloc[index, 3][i])
            # set the index, so that it goes to the next 'unique' item
            index += len(fire.iloc[index, 3])

# reassign `County` column to this new list
fire['County'] = counties

The earthquake dataset does not include county names, so we are making a new column with the county names assigned from latitude and longitude using the python `uszipcode` library.

In [12]:
# make a function to retrive the county name from lat and long 
def county_name(lat, long):
    '''Takes in latitude and longitude and returns the county name of the coordinates'''
    search = SearchEngine()
    result = search.by_coordinates(lat, long)
    # get the county name
    county = result[0].county
    # this is in 'XXXX County', so fix the format so that it's only the county name without 'County' at the end
    county = county.split()
    # get everything but the last item (which is 'County')
    county = county[:-1]
    
    print(' '.join(county))
    # return a string
    return ' '.join(county)
    

In [13]:
county_name(39.36, -123.25)

Mendocino


'Mendocino'

In [14]:
len(earthquake)

200

In [16]:
len(counties)

224

In [71]:
list(map(county_name, earthquake['Lat'], earthquake['Lon']))

IndexError: list index out of range

In [15]:
counties = earthquake.apply(lambda x: county_name(x['Lat'], x['Lon']), axis = 1)

Mendocino
Mendocino
San Bernardino
Imperial
Monterey
Humboldt
San Bernardino
Mendocino
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Imperial
Santa Clara
Riverside
Los Angeles
Inyo
Inyo
Santa Barbara
Kern
Imperial
Mendocino
Kern
Inyo
Mendocino
Mendocino
Mendocino
Santa Clara


IndexError: list index out of range