In [0]:
# Generic Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Data based on data.gov.FEMADeclarations.3.15.19
femadf = pd.read_csv('fema_dataset.csv')
femadf.shape

(48555, 16)

In [3]:
femadf.head()

Unnamed: 0,Disaster Number,IH Program Declared,IA Program Declared,PA Program Declared,HM Program Declared,State,Declaration Date,Disaster Type,Incident Type,Title,Incident Begin Date,Incident End Date,Disaster Close Out Date,Place Code,Declared County/Area,Declaration Request Number
0,4419,Yes,No,Yes,Yes,AL,3/5/2019,DR,Tornado,"SEVERE STORMS, STRAIGHT-LINE WINDS, AND TORNADOES",3/3/2019,3/3/2019,,99081.0,Lee (County),19006
1,4418,No,No,Yes,Yes,WA,3/4/2019,DR,Severe Storm(s),"SEVERE WINTER STORMS, STRAIGHT-LINE WINDS, FLO...",12/10/2018,12/24/2018,,99009.0,Clallam (County),19005
2,4418,No,No,Yes,Yes,WA,3/4/2019,DR,Severe Storm(s),"SEVERE WINTER STORMS, STRAIGHT-LINE WINDS, FLO...",12/10/2018,12/24/2018,,99027.0,Grays Harbor (County),19005
3,4418,No,No,Yes,Yes,WA,3/4/2019,DR,Severe Storm(s),"SEVERE WINTER STORMS, STRAIGHT-LINE WINDS, FLO...",12/10/2018,12/24/2018,,99029.0,Island (County),19005
4,4418,No,No,Yes,Yes,WA,3/4/2019,DR,Severe Storm(s),"SEVERE WINTER STORMS, STRAIGHT-LINE WINDS, FLO...",12/10/2018,12/24/2018,,99031.0,Jefferson (County),19005


In [4]:
# Checking for NaNs
femadf.isnull().sum()

Disaster Number                  0
IH Program Declared              0
IA Program Declared              0
PA Program Declared              0
HM Program Declared              0
State                            0
Declaration Date                 0
Disaster Type                    0
Incident Type                    0
Title                            0
Incident Begin Date              0
Incident End Date              339
Disaster Close Out Date       9385
Place Code                     197
Declared County/Area           197
Declaration Request Number       0
dtype: int64

In [0]:
# Drop columns Incident End Date, Disaster Close Out Date to remove NaNs
# 'Place Code' has location information but is not consistent.
femadf.drop(columns=['Incident End Date', 'Disaster Close Out Date', 
                     'Place Code'], inplace=True)

# Considering NaNs in Declared County/Area as Statewide
femadf['Declared County/Area'].fillna('Statewide', inplace = True) 

In [6]:
# Rechecking if we have a clean data
femadf.isnull().sum()

Disaster Number               0
IH Program Declared           0
IA Program Declared           0
PA Program Declared           0
HM Program Declared           0
State                         0
Declaration Date              0
Disaster Type                 0
Incident Type                 0
Title                         0
Incident Begin Date           0
Declared County/Area          0
Declaration Request Number    0
dtype: int64

In [7]:
# Rechecking the shape
femadf.shape

(48555, 13)

In [8]:
# Load state to code dataset and update information in femadf
statedf = pd.read_csv('states_code.csv', index_col=1)
statedf.head()

Unnamed: 0_level_0,State
Abbreviation,Unnamed: 1_level_1
AL,Alabama
AK,Alaska
AZ,Arizona
AR,Arkansas
CA,California


In [0]:
# Get State name from Abbreviation
def getstatename(col):
  return statedf.loc[col]['State']

In [0]:
# Rename State column to StateCode
femadf.rename(columns={'State ':'StateCode'}, inplace=True)

# Update a new column state to match StateCode
femadf['State'] = \
  femadf['StateCode'].apply(getstatename)

In [11]:
femadf['Incident Type'].unique()

array(['Tornado', 'Severe Storm(s)', 'Flood', 'Hurricane', 'Earthquake',
       'Fire', 'Typhoon', 'Snow', 'Coastal Storm', 'Volcano',
       'Mud/Landslide', 'Severe Ice Storm', 'Dam/Levee Break',
       'Toxic Substances', 'Chemical', 'Other', 'Terrorist', 'Freezing',
       'Tsunami', 'Drought', 'Human Cause', 'Fishing Losses'],
      dtype=object)

In [12]:
# Load county data
url = 'https://raw.githubusercontent.com/1wheel/whitehouse-petitions/master/Gaz_counties_national.txt'
countydf = pd.read_csv(url, sep='\t', encoding='ISO-8859-1')

# Replace special characters.
countydf.NAME = countydf.NAME.apply(lambda x: x.replace('á', 'a'))
countydf.NAME = countydf.NAME.apply(lambda x: x.replace('í', 'i'))
countydf.NAME = countydf.NAME.apply(lambda x: x.replace('ñ', 'n'))
countydf.NAME = countydf.NAME.apply(lambda x: x.replace('ó', 'o'))
countydf.NAME = countydf.NAME.apply(lambda x: x.replace('ü', 'u'))

countydf.head()

Unnamed: 0,USPS,GEOID,ANSICODE,NAME,POP10,HU10,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
0,AL,1001,161526,Autauga County,54571,22135,1539582278,25775735,594.436,9.952,32.536382,-86.64449
1,AL,1003,161527,Baldwin County,182265,104061,4117521611,1133190229,1589.784,437.527,30.659218,-87.746067
2,AL,1005,161528,Barbour County,27457,11829,2291818968,50864716,884.876,19.639,31.87067,-85.405456
3,AL,1007,161529,Bibb County,22915,8981,1612480789,9289057,622.582,3.587,33.015893,-87.127148
4,AL,1009,161530,Blount County,57322,23887,1669961855,15157440,644.776,5.852,33.977448,-86.567246


In [0]:
# matching index of fema_dataset
def updatecountyinfo(row):
  statecode = row['StateCode']
  county = row['Declared County/Area']
  
  if county == 'Statewide':
    return county
    
  series = countydf[countydf['USPS'] == statecode]['NAME']
  county_words_split = county.split(" ")
  search_string = county_words_split[0]
  output = series[series.str.startswith(search_string, na=False)]
  
  if output.shape[0] == 1:
    return output.iloc[0]
  
  # Check for two words
  if len(county_words_split) > 1:
    search_string = county.split(" ")[0] + " " + county.split(" ")[1]
    output = series[series.str.startswith(search_string, na=False)]
    
    if output.shape[0] == 1:
      return output.iloc[0]
    elif output.shape[0] > 1:
      # More than one selection so choosing 1st.
      return output.iloc[0]
    else:
      return None
    
  if output.shape[0] > 1:
    # More than one selection so choosing 1st.
    return output.iloc[0]

  return None

In [0]:
femadf['Updated County Info'] = femadf.apply(updatecountyinfo, axis=1)

In [15]:
sample = femadf.sample(10)
sample.loc[:,['StateCode','Declared County/Area','State','Updated County Info']]

Unnamed: 0,StateCode,Declared County/Area,State,Updated County Info
43634,ND,McLean (County),North Dakota,McLean County
39858,KS,Wilson (County),Kansas,Wilson County
19863,KY,Nicholas (County),Kentucky,Nicholas County
34311,NC,Swain (County),North Carolina,Swain County
10001,MA,"Norfolk (County)(in PMSA 1120,1200,6060)",Massachusetts,Norfolk County
28015,OK,Rogers (County),Oklahoma,Rogers County
33629,NC,Guilford (County),North Carolina,Guilford County
32385,ND,Wells (County),North Dakota,Wells County
14556,MO,Miller (County),Missouri,Miller County
35112,CA,Sierra (County),California,Sierra County


In [16]:
femadf.isnull().sum()

Disaster Number                  0
IH Program Declared              0
IA Program Declared              0
PA Program Declared              0
HM Program Declared              0
StateCode                        0
Declaration Date                 0
Disaster Type                    0
Incident Type                    0
Title                            0
Incident Begin Date              0
Declared County/Area             0
Declaration Request Number       0
State                            0
Updated County Info           1005
dtype: int64

In [17]:
femadf.shape

(48555, 15)

In [18]:
# Dropping rows which has no mapped County names.
femadf.dropna(inplace=True)
femadf.shape

(47550, 15)

In [0]:
# Fetch County FIPS Code
def getcountycode(row):
  statecode = row['StateCode']
  county = row['Updated County Info']
  
  series = countydf[(countydf['USPS'] == statecode) & (countydf['NAME'] == county)]
  output = series['GEOID']
  
  if output.shape[0] == 1:
    return int(output.iloc[0])
    
  # Handling Statewide county code
  series = countydf[(countydf['USPS'] == statecode)]
  output = series['GEOID']
    
  if output.shape[0] < 1:
    return None
  
  stateFIPScode = output.iloc[0] // 1000
  
  return stateFIPScode * 1000
 

In [0]:
femadf['County FIPS Code'] = femadf.apply(getcountycode, axis=1)

In [21]:
femadf.isnull().sum()

Disaster Number                0
IH Program Declared            0
IA Program Declared            0
PA Program Declared            0
HM Program Declared            0
StateCode                      0
Declaration Date               0
Disaster Type                  0
Incident Type                  0
Title                          0
Incident Begin Date            0
Declared County/Area           0
Declaration Request Number     0
State                          0
Updated County Info            0
County FIPS Code              10
dtype: int64

In [22]:
femadf.shape

(47550, 16)

In [23]:
# Dropping rows which has no mapped County names.
femadf.dropna(inplace=True)
femadf.shape

(47540, 16)

In [26]:
femadf['County FIPS Code'] = femadf['County FIPS Code'].astype(int)

Disaster Number                int64
IH Program Declared           object
IA Program Declared           object
PA Program Declared           object
HM Program Declared           object
StateCode                     object
Declaration Date              object
Disaster Type                 object
Incident Type                 object
Title                         object
Incident Begin Date           object
Declared County/Area          object
Declaration Request Number     int64
State                         object
Updated County Info           object
County FIPS Code               int64
dtype: object