### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import config
import urllib.request
from datetime import datetime
import matplotlib.pyplot as plt
import pickle
from datetime import datetime
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Reading first dataset

In [2]:
df = pd.read_excel('hurricanes_damage.xls', 'ATD of ICAT')

In [3]:
df.head()

Unnamed: 0,ATCF_ID,name,basedamage,ATD,ND,lf_ISO_TIME,lf_wind,lf_pressure,lf_state,WPC,population,lf_lat,lf_lon
0,AL011900,Galveston,30000000,2826.090656,171510000000,1900-09-09 02:30:00,120,936.0,TX,886.652943,119724,29.1711,-95.2018
1,AL041901,Storm 4 in 1901,1000000,28.612073,830000000,1901-08-14 20:30:00,75,973.0,LA,941.570046,371192,29.266667,-89.633333
2,AL031903,Storm 3 in 1903,670000,1394.193738,7410000000,1903-09-11 23:00:00,75,976.0,FL,1054.905202,4556,26.1,-80.1
3,AL031903,Storm 3 in 1903,0,0.0,0,1903-09-13 21:00:00,80,988.0,FL,1054.905202,25018,29.7989,-85.4572
4,AL021904,Storm 2 in 1904,1000000,62.698162,640000000,1904-09-14 13:30:00,70,,SC,1025.246032,155567,33.2666,-79.275275


In [4]:
df.shape

(247, 13)

In [5]:
df.drop(columns=['ATD', 'ND', 'lf_state', 'population', 'WPC'], inplace=True)

In [6]:
df.columns

Index(['ATCF_ID', 'name', 'basedamage', 'lf_ISO_TIME', 'lf_wind',
       'lf_pressure', 'lf_lat', 'lf_lon'],
      dtype='object')

### Renaming columns

In [7]:
df.rename(columns={'basedamage': 'damage',
                  'lf_ISO_TIME': 'date',
                  'lf_wind': 'wind',
                  'lf_pressure': 'pressure',
                  'lf_lat': 'lat',
                  'lf_lon': 'long',
                  'ATCF_ID': 'storm_id'},
          inplace=True, errors='raise')

In [8]:
df.head()

Unnamed: 0,storm_id,name,damage,date,wind,pressure,lat,long
0,AL011900,Galveston,30000000,1900-09-09 02:30:00,120,936.0,29.1711,-95.2018
1,AL041901,Storm 4 in 1901,1000000,1901-08-14 20:30:00,75,973.0,29.266667,-89.633333
2,AL031903,Storm 3 in 1903,670000,1903-09-11 23:00:00,75,976.0,26.1,-80.1
3,AL031903,Storm 3 in 1903,0,1903-09-13 21:00:00,80,988.0,29.7989,-85.4572
4,AL021904,Storm 2 in 1904,1000000,1904-09-14 13:30:00,70,,33.2666,-79.275275


### Adding data on recent Hurricanes

In [9]:
new_data = pd.read_excel('new_data.xlsx')

In [10]:
new_data.head()

Unnamed: 0,name,date,lat,long,damage,wind,pressure,storm_id
0,Florence,2018-08-31,34.2,77.8,24000000000.0,130.0,937.0,AL062018
1,Michael,2018-10-06,30.0,85.5,25000000000.0,140.0,919.0,AL142018
2,Barry,2019-07-11,29.6,92.2,600000000.0,65.0,993.0,AL022019
3,Dorian,2019-08-24,14.0,60.9,1600000000.0,160.0,910.0,AL052019
4,Hanna,2020-07-23,26.8,97.3,1100000000.0,80.0,973.0,AL082020


### Concatenating datasets

In [11]:
df = pd.concat([df, new_data], axis = 0)

In [12]:
df['date'] = df['date'].dt.date

In [13]:
df.head()

Unnamed: 0,storm_id,name,damage,date,wind,pressure,lat,long
0,AL011900,Galveston,30000000.0,1900-09-09,120.0,936.0,29.1711,-95.2018
1,AL041901,Storm 4 in 1901,1000000.0,1901-08-14,75.0,973.0,29.266667,-89.633333
2,AL031903,Storm 3 in 1903,670000.0,1903-09-11,75.0,976.0,26.1,-80.1
3,AL031903,Storm 3 in 1903,0.0,1903-09-13,80.0,988.0,29.7989,-85.4572
4,AL021904,Storm 2 in 1904,1000000.0,1904-09-14,70.0,,33.2666,-79.275275


## New Data to get Duration of hurricane

In [14]:
hurricane_list = []
with open('hurdat2.txt') as fp:
    line = fp.readline()
    while line:
        if 'AL' in line:
            storm_id = line.split(',')[0].strip()
            storm_name = line.split(',')[1].strip()
        else:
            text = line.split(',')
            oldformat = text[0] + text[1]
            datetimeobject = datetime.strptime(oldformat,'%Y%m%d %H%M%S')
            date = datetimeobject.strftime('%m-%d-%Y %H%M%S')
            hurricane_list.append(
                {
                    "storm_id": storm_id, 
                    "name": storm_name,
                    "date": date
                }) 
        line = fp.readline()


In [15]:
# Create Dataframe and reorder columns
hdf = pd.DataFrame(hurricane_list)
hdf = hdf.loc[:, ["storm_id", "name","date"]]

In [16]:
hdf.shape

(53501, 3)

In [17]:
hdf.head()

Unnamed: 0,storm_id,name,date
0,AL011851,UNNAMED,06-25-1851 000000
1,AL011851,UNNAMED,06-25-1851 060000
2,AL011851,UNNAMED,06-25-1851 120000
3,AL011851,UNNAMED,06-25-1851 180000
4,AL011851,UNNAMED,06-25-1851 210000


In [18]:
hdf['date'] = pd.to_datetime(hdf['date'])

In [19]:
hdf['year'] = pd.DatetimeIndex(hdf['date']).year

### Getting min max date

In [20]:
new_df = hdf.groupby('storm_id')
new_df = new_df['date']
start_date = new_df.min()
end_date = new_df.max()
duration = end_date - start_date

### Merge Start Date, End Date, and Duration to one dataframe. 

In [21]:
new_df = pd.DataFrame({"start_date": start_date
                             ,"end_date": end_date
                             ,"duration" : duration
                            }).reset_index()

In [22]:
new_df.head()

Unnamed: 0,storm_id,start_date,end_date,duration
0,AL011851,1851-06-25 00:00:00,1851-06-28 00:00:00,3 days 00:00:00
1,AL011852,1852-08-19 00:00:00,1852-08-30 00:00:00,11 days 00:00:00
2,AL011853,1853-08-05 12:00:00,1853-08-05 12:00:00,0 days 00:00:00
3,AL011854,1854-06-25 00:00:00,1854-06-27 12:00:00,2 days 12:00:00
4,AL011855,1855-08-06 12:00:00,1855-08-06 12:00:00,0 days 00:00:00


In [23]:
df = pd.merge(new_df[['storm_id', 'duration']], df, on = 'storm_id')

In [24]:
df.head()

Unnamed: 0,storm_id,duration,name,damage,date,wind,pressure,lat,long
0,AL011900,19 days 18:00:00,Galveston,30000000.0,1900-09-09,120.0,936.0,29.1711,-95.2018
1,AL011918,6 days 18:00:00,Storm 1 in 1918,5000000.0,1918-08-06,105.0,955.0,29.8,-93.2
2,AL011926,11 days 06:00:00,Storm 1 in 1926,3051000.0,1926-07-28,90.0,967.0,28.564317,-80.53265
3,AL011929,3 days 06:00:00,Storm 1 in 1929,675000.0,1929-06-28,80.0,982.5,28.383333,-96.516667
4,AL011934,17 days 06:00:00,Storm 2 in 1934,2605000.0,1934-06-16,85.0,966.0,29.7,-91.7


### Reading new dataset to get rainfall

In [25]:
rain = pd.read_csv('rainfall.csv')

In [26]:
rain.head()

Unnamed: 0,Station,Total,Lat,Lon,Storm,Year
0,ALEXANDRIA,0.27,31.316667,-92.466667,Abby 1964,1964
1,ALEXANDRIA #2,0.17,31.316667,-92.45,Abby 1964,1964
2,ALEXANDRIA ESLER RGNL AP,0.42,31.4,-92.3,Abby 1964,1964
3,AMITE,1.35,30.716667,-90.533333,Abby 1964,1964
4,ANDREW,1.14,30.083333,-92.25,Abby 1964,1964


In [27]:
rain.shape

(778466, 6)

In [28]:
rain['name'] = rain['Storm'].str.split(' ').str[0]

In [29]:
rain.head()

Unnamed: 0,Station,Total,Lat,Lon,Storm,Year,name
0,ALEXANDRIA,0.27,31.316667,-92.466667,Abby 1964,1964,Abby
1,ALEXANDRIA #2,0.17,31.316667,-92.45,Abby 1964,1964,Abby
2,ALEXANDRIA ESLER RGNL AP,0.42,31.4,-92.3,Abby 1964,1964,Abby
3,AMITE,1.35,30.716667,-90.533333,Abby 1964,1964,Abby
4,ANDREW,1.14,30.083333,-92.25,Abby 1964,1964,Abby


### groupby to get max rainfall for each hurricane

In [30]:
rain_df = pd.DataFrame(rain.groupby(['name']).Total.max())

In [31]:
rain_df.head()

Unnamed: 0_level_0,Total
name,Unnamed: 1_level_1
AL011900,10.25
AL011901,12.51
AL011902,7.97
AL011906,8.33
AL011907,6.13


In [32]:
rain_df.reset_index(level=0, inplace = True)

In [33]:
rain_df.shape

(421, 2)

In [34]:
temp = pd.merge(df, rain_df, on = 'name')

In [35]:
temp.head()

Unnamed: 0,storm_id,duration,name,damage,date,wind,pressure,lat,long,Total
0,AL011959,5 days 12:00:00,Arlene,500000.0,1959-05-30,55.0,993.0,29.65,-91.6,15.26
1,AL021993,3 days 06:00:00,Arlene,22000000.0,1993-06-20,35.0,1001.166667,27.116667,-97.466667,15.26
2,AL011962,6 days 18:00:00,Alma,1000000.0,1962-08-28,65.0,990.666667,34.526417,-76.099667,10.38
3,AL011966,9 days 00:00:00,Alma,10050000.0,1966-06-09,80.0,979.666667,29.9098,-84.4982,10.38
4,AL011968,12 days 06:00:00,Abby,450000.0,1968-06-04,57.0,992.0,26.808383,-82.166817,14.65


In [36]:
temp.shape

(177, 10)

In [37]:
temp2 = pd.merge(df.drop(columns = ['name']), rain_df, left_on = 'storm_id', right_on = 'name', how = 'inner')

In [38]:
df = pd.concat([temp, temp2], ignore_index=True)

In [39]:
df.head()

Unnamed: 0,storm_id,duration,name,damage,date,wind,pressure,lat,long,Total
0,AL011959,5 days 12:00:00,Arlene,500000.0,1959-05-30,55.0,993.0,29.65,-91.6,15.26
1,AL021993,3 days 06:00:00,Arlene,22000000.0,1993-06-20,35.0,1001.166667,27.116667,-97.466667,15.26
2,AL011962,6 days 18:00:00,Alma,1000000.0,1962-08-28,65.0,990.666667,34.526417,-76.099667,10.38
3,AL011966,9 days 00:00:00,Alma,10050000.0,1966-06-09,80.0,979.666667,29.9098,-84.4982,10.38
4,AL011968,12 days 06:00:00,Abby,450000.0,1968-06-04,57.0,992.0,26.808383,-82.166817,14.65


In [40]:
df.shape

(237, 10)

In [41]:
df.rename(columns={'Total': 'rainfall'},
          inplace=True, errors='raise')

### Adding new columns

In [42]:
category = []
for index,row in df.iterrows():
    if row['wind'] <= 73:
        category.append(0)
    elif row['wind'] >= 74 and row['wind'] <= 95:
        category.append(1)
    elif row['wind'] >= 96 and row['wind'] <= 110:
        category.append(2)
    elif row['wind'] >= 111 and row['wind'] <= 129:
        category.append(3)
    elif row['wind'] >= 130 and row['wind'] <= 156:
        category.append(4)
    elif row['wind'] >= 157:
        category.append(5)


In [43]:
df['category'] = category

In [44]:
df.head()

Unnamed: 0,storm_id,duration,name,damage,date,wind,pressure,lat,long,rainfall,category
0,AL011959,5 days 12:00:00,Arlene,500000.0,1959-05-30,55.0,993.0,29.65,-91.6,15.26,0
1,AL021993,3 days 06:00:00,Arlene,22000000.0,1993-06-20,35.0,1001.166667,27.116667,-97.466667,15.26,0
2,AL011962,6 days 18:00:00,Alma,1000000.0,1962-08-28,65.0,990.666667,34.526417,-76.099667,10.38,0
3,AL011966,9 days 00:00:00,Alma,10050000.0,1966-06-09,80.0,979.666667,29.9098,-84.4982,10.38,1
4,AL011968,12 days 06:00:00,Abby,450000.0,1968-06-04,57.0,992.0,26.808383,-82.166817,14.65,0


In [45]:
df['duration'].loc[0].total_seconds()/86400

5.5

In [46]:
d = []
for index,row in df.iterrows():
    d.append(row['duration'].total_seconds()/86400)
df['duration'] = d

In [47]:
df.head()

Unnamed: 0,storm_id,duration,name,damage,date,wind,pressure,lat,long,rainfall,category
0,AL011959,5.5,Arlene,500000.0,1959-05-30,55.0,993.0,29.65,-91.6,15.26,0
1,AL021993,3.25,Arlene,22000000.0,1993-06-20,35.0,1001.166667,27.116667,-97.466667,15.26,0
2,AL011962,6.75,Alma,1000000.0,1962-08-28,65.0,990.666667,34.526417,-76.099667,10.38,0
3,AL011966,9.0,Alma,10050000.0,1966-06-09,80.0,979.666667,29.9098,-84.4982,10.38,1
4,AL011968,12.25,Abby,450000.0,1968-06-04,57.0,992.0,26.808383,-82.166817,14.65,0


### Extracting year from date

In [48]:
df['year'] = pd.DatetimeIndex(df['date']).year

In [49]:
df.head()

Unnamed: 0,storm_id,duration,name,damage,date,wind,pressure,lat,long,rainfall,category,year
0,AL011959,5.5,Arlene,500000.0,1959-05-30,55.0,993.0,29.65,-91.6,15.26,0,1959
1,AL021993,3.25,Arlene,22000000.0,1993-06-20,35.0,1001.166667,27.116667,-97.466667,15.26,0,1993
2,AL011962,6.75,Alma,1000000.0,1962-08-28,65.0,990.666667,34.526417,-76.099667,10.38,0,1962
3,AL011966,9.0,Alma,10050000.0,1966-06-09,80.0,979.666667,29.9098,-84.4982,10.38,1,1966
4,AL011968,12.25,Abby,450000.0,1968-06-04,57.0,992.0,26.808383,-82.166817,14.65,0,1968


### Extracting month from date

In [50]:
df['month'] = pd.DatetimeIndex(df['date']).month

In [51]:
df.head()

Unnamed: 0,storm_id,duration,name,damage,date,wind,pressure,lat,long,rainfall,category,year,month
0,AL011959,5.5,Arlene,500000.0,1959-05-30,55.0,993.0,29.65,-91.6,15.26,0,1959,5
1,AL021993,3.25,Arlene,22000000.0,1993-06-20,35.0,1001.166667,27.116667,-97.466667,15.26,0,1993,6
2,AL011962,6.75,Alma,1000000.0,1962-08-28,65.0,990.666667,34.526417,-76.099667,10.38,0,1962,8
3,AL011966,9.0,Alma,10050000.0,1966-06-09,80.0,979.666667,29.9098,-84.4982,10.38,1,1966,6
4,AL011968,12.25,Abby,450000.0,1968-06-04,57.0,992.0,26.808383,-82.166817,14.65,0,1968,6


### Function used to make API calls to get elevation from latitude and longitude

In [52]:
def elevation(lat, lng):
    api = config.API_KEY
    url = "https://maps.googleapis.com/maps/api/elevation/json"
    request = urllib.request.urlopen(url+"?locations="+str(lat)+","+str(lng)+"&key="+api)
    try:
        results = json.load(request).get('results')
        if 0 < len(results):
            elevation = results[0].get('elevation')
            return elevation
        else:
            print('HTTP GET Request failed.')
    except ValueError:
        print('JSON decode failed: '+str(request))

In [53]:
elevation(29.171100, -95.201800)

1.306691884994507

In [58]:
#df['elevation'] = ele_list

In [59]:
#df['elevation'].to_pickle('elevation.pkl')

In [54]:
df['elevation'] = pd.read_pickle("elevation.pkl")

### Filling nas in pressure with mode

In [56]:
df.pressure.fillna(df.pressure.mode(), inplace = True)

In [59]:
df.head()

Unnamed: 0,storm_id,duration,name,damage,date,wind,pressure,lat,long,rainfall,category,year,month,elevation
0,AL011959,5.5,Arlene,500000.0,1959-05-30,55.0,993.0,29.65,-91.6,15.26,0,1959,5,0.299893
1,AL021993,3.25,Arlene,22000000.0,1993-06-20,35.0,1001.166667,27.116667,-97.466667,15.26,0,1993,6,2.473105
2,AL011962,6.75,Alma,1000000.0,1962-08-28,65.0,990.666667,34.526417,-76.099667,10.38,0,1962,8,-39.299995
3,AL011966,9.0,Alma,10050000.0,1966-06-09,80.0,979.666667,29.9098,-84.4982,10.38,1,1966,6,-0.304802
4,AL011968,12.25,Abby,450000.0,1968-06-04,57.0,992.0,26.808383,-82.166817,14.65,0,1968,6,0.206002


In [60]:
df.to_pickle('df.pkl')

In [61]:
df.to_csv('data.csv', index = False)