# Weather Disaster
* Deal with error-prone columns
* Drop unneeded columns
* Change casing
* Save to CSV

In [1]:
import pandas as pd

#### Pulling the list of natural disasters in the United States from Wikipedia

In [2]:
# Wikipedia article that lists natural disasters in the United States
url = "https://en.wikipedia.org/wiki/List_of_natural_disasters_in_the_United_States"
# pd.read_html parses the HTML tables found at the url and returns them
# as a list of DataFrames
tables = pd.read_html(url)
# The disaster table sits at list position 3 (the fourth table parsed).
# NOTE(review): this position depends on the page layout at scrape time --
# confirm it still points at the Year/Disaster/Death toll table.
disaster_table_position = 3
df = tables[disaster_table_position]
df.head()

Unnamed: 0,Year,Disaster,Death toll,Damage cost,Main article,Location,Notes
0,2018,Wildfire,88,"$16,500,000,000",Camp Fire,California,"The Camp Fire ravaged North California, destro..."
1,2017,Hurricane,5740,"$91,610,000,000",Hurricane Maria,Florida and Puerto Rico,After strengthening at a near record pace and ...
2,2017,Hurricane,134,"$64,760,000,000",Hurricane Irma,"Florida, South Carolina, Georgia, Puerto Rico",Hurricane Irma ravaged the northern Leeward Is...
3,2017,Hurricane,107,"$125,000,000,000",Hurricane Harvey,"Texas, Louisiana, Alabama",Hurricane Harvey began as a tropical storm in ...
4,2016,Wildfire,14,"$990,000,000",2016 Great Smoky Mountains wildfires,Tennessee,"Destroyed nearly 2,000 structures; burned near..."


#### Fill missing values: '0' for damage cost and 'Not Available' for notes, death toll, and main article

In [3]:
# Placeholder written into each column wherever the scraped value is missing.
# Damage cost gets the string '0'; the text columns get 'Not Available'.
placeholders = {
    'Death toll': 'Not Available',
    'Damage cost': '0',
    'Main article': 'Not Available',
    'Notes': 'Not Available',
}
for column_name, placeholder in placeholders.items():
    df[column_name] = df[column_name].fillna(placeholder)
df

Unnamed: 0,Year,Disaster,Death toll,Damage cost,Main article,Location,Notes
0,2018,Wildfire,88,"$16,500,000,000",Camp Fire,California,"The Camp Fire ravaged North California, destro..."
1,2017,Hurricane,5740,"$91,610,000,000",Hurricane Maria,Florida and Puerto Rico,After strengthening at a near record pace and ...
2,2017,Hurricane,134,"$64,760,000,000",Hurricane Irma,"Florida, South Carolina, Georgia, Puerto Rico",Hurricane Irma ravaged the northern Leeward Is...
3,2017,Hurricane,107,"$125,000,000,000",Hurricane Harvey,"Texas, Louisiana, Alabama",Hurricane Harvey began as a tropical storm in ...
4,2016,Wildfire,14,"$990,000,000",2016 Great Smoky Mountains wildfires,Tennessee,"Destroyed nearly 2,000 structures; burned near..."
...,...,...,...,...,...,...,...
98,1888,Blizzard,400,0,Great Blizzard of 1888,Northeast,Fatalities estimated
99,1888,Cold wave,Unknown,0,1888 Northwest Cold Wave,Northwest,Not Available
100,1871,Wildfire,"1,500–2,500",0,Peshtigo fire,Wisconsin,Deadliest firestorm in United States history
101,1862,Flood,Not Available,0,Great Flood of 1862,"California, Oregon, Utah, and the territories ...",An atmospheric river led to 43 days of rain st...


#### Number of non-null values in each column

In [4]:
# count() reports how many non-missing values each column holds, so a column
# with a lower number than the others still contains missing entries.
df.count()

Year            103
Disaster        103
Death toll      103
Damage cost     103
Main article    103
Location        100
Notes           103
dtype: int64

#### Dropping any rows that are missing values

In [5]:
# Remove every row that has a missing value in at least one column
# (axis and how are spelled out; they are the dropna defaults),
# then recount the non-missing values per column to confirm the result.
df = df.dropna(axis='index', how='any')
df.count()

Year            100
Disaster        100
Death toll      100
Damage cost     100
Main article    100
Location        100
Notes           100
dtype: int64

#### Looking at columns

In [6]:
# Display the column labels currently present in df
df.columns

Index(['Year', 'Disaster', 'Death toll', 'Damage cost', 'Main article',
       'Location', 'Notes'],
      dtype='object')

#### Filter out rows whose year contains an en dash (–), which gives an error when trying to convert the Year column to datetime

In [7]:
# Flag the rows whose 'Year' text contains an en dash ('–'); those entries
# cannot be converted to a single datetime in the later conversion cell.
# regex=False treats the dash as a literal character instead of a pattern.
has_en_dash = df['Year'].str.contains('–', regex=False)
# Keep the rows without a dash. .copy() makes the filtered result an
# independent DataFrame, so the column assignments in later cells
# (df['Year'] = ...) do not raise SettingWithCopyWarning.
df = df[~has_en_dash].copy()
df.count()

Year            97
Disaster        97
Death toll      97
Damage cost     97
Main article    97
Location        97
Notes           97
dtype: int64

#### Checking data types

In [8]:
# Show the dtype pandas assigned to each column
df.dtypes

Year            object
Disaster        object
Death toll      object
Damage cost     object
Main article    object
Location        object
Notes           object
dtype: object

#### Convert column Year to datetime

In [9]:
# Parse the 'Year' text into pandas timestamps, store them back in the same
# column, then show the dtypes to confirm 'Year' is now datetime64.
parsed_years = pd.to_datetime(df['Year'])
df['Year'] = parsed_years
df.dtypes

Year            datetime64[ns]
Disaster                object
Death toll              object
Damage cost             object
Main article            object
Location                object
Notes                   object
dtype: object

#### Extract year only

In [10]:
# Reduce each timestamp to its calendar year through the .dt accessor and
# overwrite the column with those year numbers.
# Note: this cell expects 'Year' to be datetime, so it cannot be re-run on
# its own once the column has been overwritten.
year_numbers = df['Year'].dt.year
df['Year'] = year_numbers
df.head()

Unnamed: 0,Year,Disaster,Death toll,Damage cost,Main article,Location,Notes
0,2018,Wildfire,88,"$16,500,000,000",Camp Fire,California,"The Camp Fire ravaged North California, destro..."
1,2017,Hurricane,5740,"$91,610,000,000",Hurricane Maria,Florida and Puerto Rico,After strengthening at a near record pace and ...
2,2017,Hurricane,134,"$64,760,000,000",Hurricane Irma,"Florida, South Carolina, Georgia, Puerto Rico",Hurricane Irma ravaged the northern Leeward Is...
3,2017,Hurricane,107,"$125,000,000,000",Hurricane Harvey,"Texas, Louisiana, Alabama",Hurricane Harvey began as a tropical storm in ...
4,2016,Wildfire,14,"$990,000,000",2016 Great Smoky Mountains wildfires,Tennessee,"Destroyed nearly 2,000 structures; burned near..."


#### lowercase/rename columns

In [11]:
# Map each scraped column label to its lowercase snake_case replacement
snake_case_names = {
    'Year': 'year',
    'Disaster': 'disaster',
    'Death toll': 'death_toll',
    'Damage cost': 'damage_cost',
    'Main article': 'main_article',
    'Location': 'location',
    'Notes': 'notes',
}
df = df.rename(columns=snake_case_names)
df.head()

Unnamed: 0,year,disaster,death_toll,damage_cost,main_article,location,notes
0,2018,Wildfire,88,"$16,500,000,000",Camp Fire,California,"The Camp Fire ravaged North California, destro..."
1,2017,Hurricane,5740,"$91,610,000,000",Hurricane Maria,Florida and Puerto Rico,After strengthening at a near record pace and ...
2,2017,Hurricane,134,"$64,760,000,000",Hurricane Irma,"Florida, South Carolina, Georgia, Puerto Rico",Hurricane Irma ravaged the northern Leeward Is...
3,2017,Hurricane,107,"$125,000,000,000",Hurricane Harvey,"Texas, Louisiana, Alabama",Hurricane Harvey began as a tropical storm in ...
4,2016,Wildfire,14,"$990,000,000",2016 Great Smoky Mountains wildfires,Tennessee,"Destroyed nearly 2,000 structures; burned near..."


#### Sort by year in ascending order

In [12]:
# Order the rows from the earliest year to the latest.
# reset_index(drop=True) renumbers the rows from 0 and discards the old index
# directly, instead of first turning it into an 'index' column and then
# dropping that column in a second step.
df = df.sort_values('year', ascending=True).reset_index(drop=True)
df.head()

Unnamed: 0,year,disaster,death_toll,damage_cost,main_article,location,notes
0,1862,Flood,Not Available,0,Great Flood of 1862,"California, Oregon, Utah, and the territories ...",An atmospheric river led to 43 days of rain st...
1,1871,Wildfire,"1,500–2,500",0,Peshtigo fire,Wisconsin,Deadliest firestorm in United States history
2,1888,Blizzard,400,0,Great Blizzard of 1888,Northeast,Fatalities estimated
3,1888,Cold wave,Unknown,0,1888 Northwest Cold Wave,Northwest,Not Available
4,1889,Flood,2209,$17 Million ($425 Million in 2012 dollars),Johnstown Flood,"Johnstown, Pennsylvania",A dam failure caused 20 million tons of water ...


#### Create a table for disaster

In [14]:
# Distinct disaster types, in order of first appearance in df
disaster_data = df['disaster'].unique()
# Wrap the distinct values in a single-column DataFrame named 'disaster'
disaster_df = pd.DataFrame({'disaster': disaster_data})
disaster_df.head()

Unnamed: 0,disaster
0,Flood
1,Wildfire
2,Blizzard
3,Cold wave
4,Hurricane


#### Exporting to a csv file

In [15]:
# Each frame paired with the CSV file it is written to.
# to_csv is called with its defaults, so the row index is written as the
# first (unnamed) column of each file.
csv_exports = [
    (df, '../data_transformed/weather_damage.csv'),
    (disaster_df, '../data_transformed/disaster.csv'),
]
for frame, destination in csv_exports:
    frame.to_csv(destination)