In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  from pandas import Panel
  pd.set_option('display.max_colwidth', -1)


**Read California Fire Incidents CSV file into a dataframe**

In [2]:
df = pd.read_csv("California_Fire_Incidents.csv")
geolocator = geopy.Nominatim(user_agent='check_1') #My OpenMap username
reverse = RateLimiter(geolocator.reverse, min_delay_seconds = 1)

**Print the columns of the dataframe to select the ones that hold relevant information for the project. In other words, select a subset of columns from the dataframe to create a new dataframe.**

In [3]:
print(df.columns)
df = df[['AcresBurned','Active','ArchiveYear','CanonicalUrl','Counties','CountyIds','Extinguished','Latitude','Location','Longitude','Name','Started']]

Index(['AcresBurned', 'Active', 'AdminUnit', 'AirTankers', 'ArchiveYear',
       'CalFireIncident', 'CanonicalUrl', 'ConditionStatement',
       'ControlStatement', 'Counties', 'CountyIds', 'CrewsInvolved', 'Dozers',
       'Engines', 'Extinguished', 'Fatalities', 'Featured', 'Final',
       'FuelType', 'Helicopters', 'Injuries', 'Latitude', 'Location',
       'Longitude', 'MajorIncident', 'Name', 'PercentContained',
       'PersonnelInvolved', 'Public', 'SearchDescription', 'SearchKeywords',
       'Started', 'Status', 'StructuresDamaged', 'StructuresDestroyed',
       'StructuresEvacuated', 'StructuresThreatened', 'UniqueId', 'Updated',
       'WaterTenders'],
      dtype='object')


**We see that the dataframe has 1636 rows and 12 columns. Looking the information of the dataframe, we see that the most important columns are all non-null, which is good. We also see that the Extinguished and Started columns are objects, so we will need to cast these columns to datetime objects.**

In [4]:
print(df.columns)
print(df.shape)
print(df.info())
df.head()

Index(['AcresBurned', 'Active', 'ArchiveYear', 'CanonicalUrl', 'Counties',
       'CountyIds', 'Extinguished', 'Latitude', 'Location', 'Longitude',
       'Name', 'Started'],
      dtype='object')
(1636, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1636 entries, 0 to 1635
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   AcresBurned   1633 non-null   float64
 1   Active        1636 non-null   bool   
 2   ArchiveYear   1636 non-null   int64  
 3   CanonicalUrl  1636 non-null   object 
 4   Counties      1636 non-null   object 
 5   CountyIds     1636 non-null   object 
 6   Extinguished  1577 non-null   object 
 7   Latitude      1636 non-null   float64
 8   Location      1636 non-null   object 
 9   Longitude     1636 non-null   float64
 10  Name          1636 non-null   object 
 11  Started       1636 non-null   object 
dtypes: bool(1), float64(3), int64(1), object(7)
memory usage: 142.3+ KB
None


Unnamed: 0,AcresBurned,Active,ArchiveYear,CanonicalUrl,Counties,CountyIds,Extinguished,Latitude,Location,Longitude,Name,Started
0,257314.0,False,2013,/incidents/2013/8/17/rim-fire/,Tuolumne,55,2013-09-06T18:30:00Z,37.857,3 miles east of Groveland along Hwy 120,-120.086,Rim Fire,2013-08-17T15:25:00Z
1,30274.0,False,2013,/incidents/2013/5/30/powerhouse-fire/,Los Angeles,19,2013-06-08T18:30:00Z,34.585595,Angeles National Forest,-118.423176,Powerhouse Fire,2013-05-30T15:28:00Z
2,27531.0,False,2013,/incidents/2013/7/15/mountain-fire/,Riverside,33,2013-07-30T18:00:00Z,33.7095,Hwy 243 & Hwy 74 near Mountain Center,-116.72885,Mountain Fire,2013-07-15T13:43:00Z
3,27440.0,False,2013,/incidents/2013/8/10/american-fire/,Placer,31,2013-08-30T08:00:00Z,39.12,"Deadwood Ridge, northeast of Foresthill",-120.65,American Fire,2013-08-10T16:30:00Z
4,24251.0,False,2013,/incidents/2013/5/2/springs-fire/,Ventura,56,2013-05-11T06:30:00Z,0.0,"Southbound Highway 101 at Camarillo Springs Road, Camarillo",0.0,Springs Fire,2013-05-02T07:01:00Z


In [5]:
# df = df.dropna(axis = 1, how = 'any')
# df = df[df['Latitude'].notna()]
# df = df[df['Longitude'].notna()]

**Here, the geopy library is used to reverse search an address using lateral and longitudinal coordinates which will ultimately be used to obtain zip codes. These zip codes will be used as a primary key to join/merge our project datasets together. Since we were having issues with the reverse search when lateral and longitudinal coordinates were not available in the table, we added an except call to ignore the error. The following cell defines the function used to reverse search an address**

In [6]:
def get_zipcode(df, geolocator, lat_field, lon_field):
    try:
        location = geolocator.reverse((df[lat_field], df[lon_field]))
        return location.raw['address']
    except (AttributeError, KeyError, ValueError):
        return None

**The get_zipcode function is applied to the dataframe on a row by row basis where the output is new column called 'Location'. This new column 'Location' yieled a column of dictionaries, so we needed to find a way to expand this dictionary into columns of the dataframe.**

In [7]:
df['Loc'] = df.progress_apply(
    get_zipcode, axis=1, geolocator=geolocator, 
    lat_field='Latitude', lon_field='Longitude')

  return cls(*args)
100%|██████████████████████████████████████████████████████████████████████████████| 1636/1636 [13:34<00:00,  2.01it/s]


**The new column 'Location" was concatenated into the datframe, where each key of the dictionary became a column in the dataframe. We then needed to filter out the relevant columns of this new dataframe.**

In [8]:
df = pd.concat([df.drop(['Loc'], axis=1), df['Loc'].apply(pd.Series)], axis=1)

In [9]:
print(df.columns)
df.head()

Index(['AcresBurned', 'Active', 'ArchiveYear', 'CanonicalUrl', 'Counties',
       'CountyIds', 'Extinguished', 'Latitude', 'Location', 'Longitude',
       'Name', 'Started', 'county', 'state', 'country', 'country_code',
       'amenity', 'road', 'hamlet', 'postcode', 'man_made', 'house_number',
       'town', 'locality', 'tourism', 'village', 'city', 'quarter', 'leisure',
       'highway', 'suburb', 'residential', 'neighbourhood', 'natural',
       'industrial', 'province', 'region', 'landuse', 'farm', 'aeroway',
       'building', 'city_district', 'shop', 'emergency', 'commercial'],
      dtype='object')


Unnamed: 0,AcresBurned,Active,ArchiveYear,CanonicalUrl,Counties,CountyIds,Extinguished,Latitude,Location,Longitude,Name,Started,county,state,country,country_code,amenity,road,hamlet,postcode,man_made,house_number,town,locality,tourism,village,city,quarter,leisure,highway,suburb,residential,neighbourhood,natural,industrial,province,region,landuse,farm,aeroway,building,city_district,shop,emergency,commercial
0,257314.0,False,2013,/incidents/2013/8/17/rim-fire/,Tuolumne,55,2013-09-06T18:30:00Z,37.857,3 miles east of Groveland along Hwy 120,-120.086,Rim Fire,2013-08-17T15:25:00Z,Tuolumne County,California,United States,us,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,30274.0,False,2013,/incidents/2013/5/30/powerhouse-fire/,Los Angeles,19,2013-06-08T18:30:00Z,34.585595,Angeles National Forest,-118.423176,Powerhouse Fire,2013-05-30T15:28:00Z,Los Angeles County,California,United States,us,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,27531.0,False,2013,/incidents/2013/7/15/mountain-fire/,Riverside,33,2013-07-30T18:00:00Z,33.7095,Hwy 243 & Hwy 74 near Mountain Center,-116.72885,Mountain Fire,2013-07-15T13:43:00Z,Riverside County,California,United States,us,Calvary Chapel Mountain Center,Banning-Idyllwild Panoramic Highway,Mountain Center,92549.0,,,,,,,,,,,,,,,,,,,,,,,,,
3,27440.0,False,2013,/incidents/2013/8/10/american-fire/,Placer,31,2013-08-30T08:00:00Z,39.12,"Deadwood Ridge, northeast of Foresthill",-120.65,American Fire,2013-08-10T16:30:00Z,Placer County,California,United States,us,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,24251.0,False,2013,/incidents/2013/5/2/springs-fire/,Ventura,56,2013-05-11T06:30:00Z,0.0,"Southbound Highway 101 at Camarillo Springs Road, Camarillo",0.0,Springs Fire,2013-05-02T07:01:00Z,,,,,,,,,Soul Buoy,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
df = df[['AcresBurned','Active','ArchiveYear','CanonicalUrl','Counties','CountyIds','Extinguished','Latitude','Location','Longitude','Name','Started','country','country_code','postcode']]

**Here, the Extinguished and Started columns are casted to datetime columns. To clean the datetime columns some more, the time from the date&time datetime columns was removed using the dt.date. Since all of the datetime strings were of similar format, they were converted to datetime using to_datetime on the columns and then using the dt.date attribute to filter out just the date portion.**

In [11]:
df['Extinguished'] = pd.to_datetime(df['Extinguished']).dt.date
df['Started'] = pd.to_datetime(df['Started']).dt.date
df.head()

Unnamed: 0,AcresBurned,Active,ArchiveYear,CanonicalUrl,Counties,CountyIds,Extinguished,Latitude,Location,Longitude,Name,Started,country,country_code,postcode
0,257314.0,False,2013,/incidents/2013/8/17/rim-fire/,Tuolumne,55,2013-09-06,37.857,3 miles east of Groveland along Hwy 120,-120.086,Rim Fire,2013-08-17,United States,us,
1,30274.0,False,2013,/incidents/2013/5/30/powerhouse-fire/,Los Angeles,19,2013-06-08,34.585595,Angeles National Forest,-118.423176,Powerhouse Fire,2013-05-30,United States,us,
2,27531.0,False,2013,/incidents/2013/7/15/mountain-fire/,Riverside,33,2013-07-30,33.7095,Hwy 243 & Hwy 74 near Mountain Center,-116.72885,Mountain Fire,2013-07-15,United States,us,92549.0
3,27440.0,False,2013,/incidents/2013/8/10/american-fire/,Placer,31,2013-08-30,39.12,"Deadwood Ridge, northeast of Foresthill",-120.65,American Fire,2013-08-10,United States,us,
4,24251.0,False,2013,/incidents/2013/5/2/springs-fire/,Ventura,56,2013-05-11,0.0,"Southbound Highway 101 at Camarillo Springs Road, Camarillo",0.0,Springs Fire,2013-05-02,,,


**We see that only 750 of the 1636 rows in the postcode column are non-null. We will need to dig deeper into determining why not all rows returned a postcode.**

In [12]:
print(df.dtypes)

AcresBurned     float64
Active          bool   
ArchiveYear     int64  
CanonicalUrl    object 
Counties        object 
CountyIds       object 
Extinguished    object 
Latitude        float64
Location        object 
Longitude       float64
Name            object 
Started         object 
country         object 
country_code    object 
postcode        object 
dtype: object


In [13]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1636 entries, 0 to 1635
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   AcresBurned   1633 non-null   float64
 1   Active        1636 non-null   bool   
 2   ArchiveYear   1636 non-null   int64  
 3   CanonicalUrl  1636 non-null   object 
 4   Counties      1636 non-null   object 
 5   CountyIds     1636 non-null   object 
 6   Extinguished  1577 non-null   object 
 7   Latitude      1636 non-null   float64
 8   Location      1636 non-null   object 
 9   Longitude     1636 non-null   float64
 10  Name          1636 non-null   object 
 11  Started       1636 non-null   object 
 12  country       1474 non-null   object 
 13  country_code  1474 non-null   object 
 14  postcode      750 non-null    object 
dtypes: bool(1), float64(3), int64(1), object(10)
memory usage: 180.7+ KB
None


In [15]:
df.to_csv('CLEAN_CA_Fire_Incidents.csv')