In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from functools import partial
from decimal import Decimal

import warnings
warnings.filterwarnings('ignore')

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func, and_, or_

import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.ticker as ticker


from flask import Flask, jsonify

import time
from splinter import Browser
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import re

In [2]:
# df0_sig = pd.read_csv("raw_data/Significant_Volcanic_Eruption.csv")
# df0_sig.info()
# df0_sig.head()

In [3]:
# df1_sig = df0_sig[[
#     'Year', 'Month'
#     , 'Day', 'TSU'
#     , 'EQ', 'Name'
#     , 'Location', 'Country'
#     , 'Latitude', 'Longitude'
#     , 'Elevation', 'Type'
#     , 'Status', 'Time'
#     , 'VEI', 'Agent'
#     ]].fillna(0)

# df1_sig.info()
# df1_sig.head()

## Data Set 1: Volcanos with Confirmed Erruptions -- Smithsonian Institute Global Volcanism Program
http://volcano.si.edu/database/search_eruption_results.cfm
- confirmed erruptions database search results

In [4]:
df0_erupt_gvp = pd.read_excel("raw_data/GVP_Eruption_Results.xlsx", sheetname='Eruption List')
df0_erupt_gvp.info()
df0_erupt_gvp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 24 columns):
Global Volcanism Program - Volcanoes of the World 4.6.6    9841 non-null object
Unnamed: 1                                                 9841 non-null object
Unnamed: 2                                                 9841 non-null object
Unnamed: 3                                                 9841 non-null object
Downloaded on 09 Mar 2018 at 07:48 PM                      4390 non-null object
Unnamed: 5                                                 7564 non-null object
Unnamed: 6                                                 1102 non-null object
Unnamed: 7                                                 1569 non-null object
Unnamed: 8                                                 9840 non-null object
Unnamed: 9                                                 2049 non-null object
Unnamed: 10                                                9660 non-null object
Unnamed: 11  

Unnamed: 0,Global Volcanism Program - Volcanoes of the World 4.6.6,Unnamed: 1,Unnamed: 2,Unnamed: 3,Downloaded on 09 Mar 2018 at 07:48 PM,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,Volcano Number,Volcano Name,Eruption Number,Eruption Category,Area of Activity,VEI,VEI Modifier,Start Year Modifier,Start Year,Start Year Uncertainty,...,Evidence Method (dating),End Year Modifier,End Year,End Year Uncertainty,End Month,End Day Modifier,End Day,End Day Uncertainty,Latitude,Longitude
1,343100,San Miguel,22251,Confirmed Eruption,,,,,2018,,...,Historical Observations,,2018,,1,,15,,13.434,-88.269
2,273030,Mayon,22250,Confirmed Eruption,,,,,2018,,...,Historical Observations,>,2018,,1,,19,,13.257,123.685
3,251002,Kadovar,22246,Confirmed Eruption,,,,,2018,,...,Historical Observations,>,2018,,1,,19,,-3.608,144.588
4,272020,Kanlaon,22249,Confirmed Eruption,,,,,2017,,...,Historical Observations,,2017,,12,,9,,10.412,123.132


In [5]:
df1_erupt_gvp = df0_erupt_gvp.rename(
    columns = df0_erupt_gvp.iloc[0]).drop(
    df0_erupt_gvp.index[0]).reset_index(drop=True)

df1_erupt_gvp.info()
df1_erupt_gvp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9840 entries, 0 to 9839
Data columns (total 24 columns):
Volcano Number              9840 non-null object
Volcano Name                9840 non-null object
Eruption Number             9840 non-null object
Eruption Category           9840 non-null object
Area of Activity            4389 non-null object
VEI                         7563 non-null object
VEI Modifier                1101 non-null object
Start Year Modifier         1568 non-null object
Start Year                  9839 non-null object
Start Year Uncertainty      2048 non-null object
Start Month                 9659 non-null object
Start Day Modifier          387 non-null object
Start Day                   9658 non-null object
Start Day Uncertainty       708 non-null object
Evidence Method (dating)    9793 non-null object
End Year Modifier           89 non-null object
End Year                    3942 non-null object
End Year Uncertainty        6 non-null object
End Month         

Unnamed: 0,Volcano Number,Volcano Name,Eruption Number,Eruption Category,Area of Activity,VEI,VEI Modifier,Start Year Modifier,Start Year,Start Year Uncertainty,...,Evidence Method (dating),End Year Modifier,End Year,End Year Uncertainty,End Month,End Day Modifier,End Day,End Day Uncertainty,Latitude,Longitude
0,343100,San Miguel,22251,Confirmed Eruption,,,,,2018,,...,Historical Observations,,2018,,1,,15,,13.434,-88.269
1,273030,Mayon,22250,Confirmed Eruption,,,,,2018,,...,Historical Observations,>,2018,,1,,19,,13.257,123.685
2,251002,Kadovar,22246,Confirmed Eruption,,,,,2018,,...,Historical Observations,>,2018,,1,,19,,-3.608,144.588
3,272020,Kanlaon,22249,Confirmed Eruption,,,,,2017,,...,Historical Observations,,2017,,12,,9,,10.412,123.132
4,264020,Agung,22241,Confirmed Eruption,,,,,2017,,...,Historical Observations,>,2018,,1,,15,,-8.343,115.508


In [6]:
df1_erupt_gvp.Latitude[0]
type(df1_erupt_gvp.Latitude[0])

float

#### Descriptions of VEI found in the link below
https://www.ngdc.noaa.gov/nndc/DescribeField.jsp?dataset=102557&s=77&field_name=HAZ.VOLCANO_EVENT.VEI
 - filled NaN with 0 

In [7]:
df2_erupt_gvp = df1_erupt_gvp[[
    'Volcano Number'
    ,'Volcano Name'
   ,'VEI','Start Year'
   ,'Start Month','Start Day'
   ,'End Year','End Month','End Day'
   ,'Latitude','Longitude'
]].apply(
    partial(
        pd.to_numeric
        , errors='ignore'
    ))
df2_erupt_gvp.info()
df2_erupt_gvp

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9840 entries, 0 to 9839
Data columns (total 11 columns):
Volcano Number    9840 non-null int64
Volcano Name      9840 non-null object
VEI               7563 non-null float64
Start Year        9839 non-null float64
Start Month       9659 non-null float64
Start Day         9658 non-null float64
End Year          3942 non-null float64
End Month         3940 non-null float64
End Day           3939 non-null float64
Latitude          9840 non-null float64
Longitude         9840 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 845.7+ KB


Unnamed: 0,Volcano Number,Volcano Name,VEI,Start Year,Start Month,Start Day,End Year,End Month,End Day,Latitude,Longitude
0,343100,San Miguel,,2018.0,1.0,14.0,2018.0,1.0,15.0,13.434,-88.269
1,273030,Mayon,,2018.0,1.0,13.0,2018.0,1.0,19.0,13.257,123.685
2,251002,Kadovar,,2018.0,1.0,5.0,2018.0,1.0,19.0,-3.608,144.588
3,272020,Kanlaon,,2017.0,12.0,9.0,2017.0,12.0,9.0,10.412,123.132
4,264020,Agung,,2017.0,11.0,21.0,2018.0,1.0,15.0,-8.343,115.508
5,261230,Dempo,,2017.0,11.0,9.0,2017.0,11.0,9.0,-4.030,103.130
6,256010,Tinakula,3.0,2017.0,10.0,21.0,2017.0,10.0,26.0,-10.386,165.804
7,282090,Kirishimayama,,2017.0,10.0,11.0,2017.0,10.0,17.0,31.934,130.862
8,257030,Ambae,3.0,2017.0,9.0,6.0,2018.0,1.0,8.0,-15.400,167.830
9,353010,Fernandina,,2017.0,9.0,4.0,2017.0,9.0,6.0,-0.370,-91.550


In [8]:
df2_erupt_gvp.Latitude[0]
type(df2_erupt_gvp.Latitude[2])

numpy.float64

## Data Set 2: Volcanos With Metadata -- Smithsonian Institute
http://volcano.si.edu/database/search_volcano_results.cfm
- All volcanos

In [9]:
df0_vol_gvp = pd.read_excel("raw_data/GVP_Volcano_List_ALL.xlsx", sheetname='Volcano List')
df0_vol_gvp.info()
# df0_vol_meta.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12826 entries, 0 to 12825
Data columns (total 26 columns):
Global Volcanism Program - Volcanoes of the World 4.6.6    12826 non-null object
Unnamed: 1                                                 12826 non-null object
Unnamed: 2                                                 12825 non-null object
Downloaded on 09 Mar 2018 at 03:04 PM                      1445 non-null object
Unnamed: 4                                                 1445 non-null object
Unnamed: 5                                                 1445 non-null object
Unnamed: 6                                                 1445 non-null object
Unnamed: 7                                                 1445 non-null object
Unnamed: 8                                                 1445 non-null object
Unnamed: 9                                                 1445 non-null object
Unnamed: 10                                                1439 non-null object
Unnamed:

In [10]:
df1_vol_gvp = df0_vol_gvp.rename(
    columns=df0_vol_gvp.iloc[0]).drop(
    df0_vol_gvp.index[0]).reset_index(drop=True)

df1_vol_gvp.info()
df1_vol_gvp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12825 entries, 0 to 12824
Data columns (total 26 columns):
Volcano Number              12825 non-null object
Volcano Name                12825 non-null object
Primary Volcano Type        12824 non-null object
Last Eruption Year          1444 non-null object
Country                     1444 non-null object
Region                      1444 non-null object
Subregion                   1444 non-null object
Latitude                    1444 non-null object
Longitude                   1444 non-null object
Elevation                   1444 non-null object
Tectonic Settings           1438 non-null object
Evidence Category           1444 non-null object
Major Rock 1                1444 non-null object
Major Rock 2                1444 non-null object
Major Rock 3                1444 non-null object
Major Rock 4                1444 non-null object
Major Rock 5                1444 non-null object
Minor Rock 1                1444 non-null object
Minor 

Unnamed: 0,Volcano Number,Volcano Name,Primary Volcano Type,Last Eruption Year,Country,Region,Subregion,Latitude,Longitude,Elevation,...,Major Rock 5,Minor Rock 1,Minor Rock 2,Minor Rock 3,Minor Rock 4,Minor Rock 5,Population within 5 km,Population within 10 km,Population within 30 km,Population within 100 km
0,231160,"'Ares, Djebel el-","Synonym of Urais, Jabal",,,,,,,,...,,,,,,,,,,
1,231020,"'Awayridh, Harrat el-","Synonym of Uwayrid, Harrat",,,,,,,,...,,,,,,,,,,
2,231020,"'Uweirizh, Harrat el-","Synonym of Uwayrid, Harrat",,,,,,,,...,,,,,,,,,,
3,343050,"14 de Marzo, Cerro",Pyroclastic cone of San Salvador,,,,,,,,...,,,,,,,,,,
4,244010,A'ofa,Shield volcano of Ofu-Olosega,,,,,,,,...,,,,,,,,,,


In [11]:
df2_vol_gvp = df1_vol_gvp[[
    'Volcano Number'
    ,'Volcano Name'
    ,'Country','Region'
    ,'Latitude'
    ,'Longitude','Elevation'
    ,'Evidence Category'
    ,'Primary Volcano Type'
]].dropna(subset = ['Country']).reset_index(drop=True).apply(
    partial(
        pd.to_numeric
        , errors='ignore'))

df2_vol_gvp.info()
df2_vol_gvp

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444 entries, 0 to 1443
Data columns (total 9 columns):
Volcano Number          1444 non-null int64
Volcano Name            1444 non-null object
Country                 1444 non-null object
Region                  1444 non-null object
Latitude                1444 non-null float64
Longitude               1444 non-null float64
Elevation               1444 non-null int64
Evidence Category       1444 non-null object
Primary Volcano Type    1443 non-null object
dtypes: float64(2), int64(2), object(5)
memory usage: 101.6+ KB


Unnamed: 0,Volcano Number,Volcano Name,Country,Region,Latitude,Longitude,Elevation,Evidence Category,Primary Volcano Type
0,283001,Abu,Japan,"Japan, Taiwan, Marianas",34.500,131.600,641,Eruption Dated,Shield(s)
1,355096,Acamarachi,Chile,South America,-23.292,-67.618,6023,Evidence Credible,Stratovolcano
2,342080,Acatenango,Guatemala,México and Central America,14.501,-90.876,3976,Eruption Observed,Stratovolcano(es)
3,213004,Acigol-Nevsehir,Turkey,Mediterranean and Western Asia,38.537,34.621,1683,Eruption Dated,Caldera
4,321040,Adams,United States,Canada and Western USA,46.206,-121.490,3742,Eruption Dated,Stratovolcano
5,333050,Adams Seamount,Undersea Features,Hawaii and Pacific Ocean,-25.370,-129.270,-39,Eruption Dated,Submarine
6,283170,Adatarayama,Japan,"Japan, Taiwan, Marianas",37.647,140.281,1728,Eruption Observed,Stratovolcano(es)
7,221170,Adwa,Ethiopia,Africa and Red Sea,10.070,40.840,1733,Evidence Credible,Stratovolcano
8,221110,Afdera,Ethiopia,Africa and Red Sea,13.088,40.853,1250,Evidence Uncertain,Stratovolcano
9,284160,Agrigan,United States,"Japan, Taiwan, Marianas",18.770,145.670,965,Eruption Observed,Stratovolcano


In [12]:
type(df2_vol_gvp.Latitude[2])

numpy.float64

## Merge Data Sets 1 & 2

In [13]:
df_erupt_merge0 = pd.merge(
    df2_erupt_gvp, df2_vol_gvp, 
    how='left', 
    on=['Volcano Number',
        'Volcano Name',
        'Latitude',
        'Longitude'], 
    left_on=None, right_on=None,
    left_index=False, right_index=False, 
    sort=True, suffixes=('_x', '_y'), 
    copy=True, indicator=False)

df_erupt_merge0.info()
df_erupt_merge0.head()

#######################################################
#### This merge causese Lat/Long to be a floating point 
#### with more than numbers more than 3 decimals places
#######################################################

df_erupt_merge0.to_csv('data_check/EruptionData0_chk.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9840 entries, 0 to 9839
Data columns (total 16 columns):
Volcano Number          9840 non-null int64
Volcano Name            9840 non-null object
VEI                     7563 non-null float64
Start Year              9839 non-null float64
Start Month             9659 non-null float64
Start Day               9658 non-null float64
End Year                3942 non-null float64
End Month               3940 non-null float64
End Day                 3939 non-null float64
Latitude                9840 non-null float64
Longitude               9840 non-null float64
Country                 9840 non-null object
Region                  9840 non-null object
Elevation               9840 non-null int64
Evidence Category       9840 non-null object
Primary Volcano Type    9763 non-null object
dtypes: float64(9), int64(2), object(5)
memory usage: 1.3+ MB


In [14]:
for index, row in df_erupt_merge0.iterrows():
    df_erupt_merge0['Latitude'][index] = round(row['Latitude'],3)
    print('.', end='')

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [15]:
type(df_erupt_merge0['Latitude'][0])
df_erupt_merge0['Latitude'][0]

50.170000000000002

## Comprehensive list of Volcanos -- NOAA
https://www.ngdc.noaa.gov/nndc/struts/results?type_0=Like&query_0=&op_8=eq&v_8=&type_10=EXACT&query_10=None+Selected&le_2=&ge_3=&le_3=&ge_2=&op_5=eq&v_5=&op_6=eq&v_6=&op_7=eq&v_7=&t=102557&s=5&d=5

In [16]:
url = 'https://www.ngdc.noaa.gov/nndc/struts/results?type_0=Like&query_0=&op_8=eq&v_8=&type_10=EXACT&query_10=None+Selected&le_2=&ge_3=&le_3=&ge_2=&op_5=eq&v_5=&op_6=eq&v_6=&op_7=eq&v_7=&t=102557&s=5&d=5'
noaa_tablesv0 = pd.read_html(url)[1]

noaa_tablev1 = noaa_tablesv0.rename(
    columns = noaa_tablesv0.iloc[1]).drop(
    noaa_tablesv0.index[[0,1]]).reset_index(drop=True)

In [17]:
noaa_vol_table = noaa_tablev1[[
    'Volcano Name', 
    'Country', 'Region', 
    'Latitude', 'Longitude', 
    'Elev', 'Type', 'Status']].apply(
    partial(
        pd.to_numeric
        , errors='ignore'
    )).rename(columns = {
    'Elev':'Elevation',
    'Type':'Primary Volcano Type'}).drop_duplicates([
    'Volcano Name','Country','Region'])

noaa_vol_table.info()
noaa_vol_table


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1497 entries, 0 to 1570
Data columns (total 8 columns):
Volcano Name            1497 non-null object
Country                 1497 non-null object
Region                  1497 non-null object
Latitude                1497 non-null float64
Longitude               1497 non-null float64
Elevation               1489 non-null float64
Primary Volcano Type    1497 non-null object
Status                  1497 non-null object
dtypes: float64(3), object(5)
memory usage: 105.3+ KB


Unnamed: 0,Volcano Name,Country,Region,Latitude,Longitude,Elevation,Primary Volcano Type,Status
0,Abu,Japan,Honshu-Japan,34.500,131.600,571.0,Shield volcano,Holocene
1,Acamarachi,Chile,Chile-N,-23.300,-67.620,6046.0,Stratovolcano,Holocene
2,Acatenango,Guatemala,Guatemala,14.501,-90.876,3976.0,Stratovolcano,Historical
3,Acigol-Nevsehir,Turkey,Turkey,38.570,34.520,1689.0,Maar,Holocene
4,Adams,United States,US-Washington,46.206,-121.490,3742.0,Stratovolcano,Tephrochronology
5,Adatara,Japan,Honshu-Japan,37.620,140.280,1718.0,Stratovolcano,Historical
6,Adwa,Ethiopia,Africa-NE,10.070,40.840,1733.0,Stratovolcano,Holocene
7,Afdera,Ethiopia,Africa-NE,13.080,40.850,1295.0,Stratovolcano,Holocene
8,Agmagan-Karadag,Armenia,Armenia,40.275,44.750,3560.0,Volcanic field,Holocene
9,Agrigan,United States,Mariana Is-C Pacific,18.770,145.670,965.0,Stratovolcano,Historical


In [18]:
type(noaa_vol_table.Latitude[0])
noaa_vol_table.Latitude[0]

34.5

In [19]:
noaa_vol_table.info()
print('')
df_erupt_merge0.info()
noaa_vol_table.to_csv('data_check/noaa_vol_table1_chk.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1497 entries, 0 to 1570
Data columns (total 8 columns):
Volcano Name            1497 non-null object
Country                 1497 non-null object
Region                  1497 non-null object
Latitude                1497 non-null float64
Longitude               1497 non-null float64
Elevation               1489 non-null float64
Primary Volcano Type    1497 non-null object
Status                  1497 non-null object
dtypes: float64(3), object(5)
memory usage: 145.3+ KB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9840 entries, 0 to 9839
Data columns (total 16 columns):
Volcano Number          9840 non-null int64
Volcano Name            9840 non-null object
VEI                     7563 non-null float64
Start Year              9839 non-null float64
Start Month             9659 non-null float64
Start Day               9658 non-null float64
End Year                3942 non-null float64
End Month               3940 non-null float64
End 

## Merging Data Sets (1/2) + 3

In [20]:
df_erupt_merge2 = pd.merge(df_erupt_merge0[[
    'Volcano Name','VEI',
    'Start Year','Start Month','Start Day',
    'End Year','End Month','End Day',
    'Country','Region','Latitude',
    'Longitude','Elevation', 
    'Primary Volcano Type']], noaa_vol_table[[
    'Volcano Name','Country','Region',
    'Latitude', 'Longitude',
    'Elevation','Primary Volcano Type',
    'Status']], how='left', on=['Volcano Name','Latitude', 'Longitude'], left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)

df_erupt_merge2.info()
df_erupt_merge2

# s1 = 'Primary Volcano Type'
# s2 = 'Status'

# x = vol_table[s1].isnull().sum()
# y = vol_table[s2].isnull().sum()

# print(f'There are {x} unknown {s1} values.\n There are {y} unknown {s2} values')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9840 entries, 0 to 9839
Data columns (total 19 columns):
Volcano Name              9840 non-null object
VEI                       7563 non-null float64
Start Year                9839 non-null float64
Start Month               9659 non-null float64
Start Day                 9658 non-null float64
End Year                  3942 non-null float64
End Month                 3940 non-null float64
End Day                   3939 non-null float64
Country_x                 9840 non-null object
Region_x                  9840 non-null object
Latitude                  9840 non-null float64
Longitude                 9840 non-null float64
Elevation_x               9840 non-null int64
Primary Volcano Type_x    9763 non-null object
Country_y                 2691 non-null object
Region_y                  2691 non-null object
Elevation_y               2691 non-null float64
Primary Volcano Type_y    2691 non-null object
Status                    2691 non-nul

Unnamed: 0,Volcano Name,VEI,Start Year,Start Month,Start Day,End Year,End Month,End Day,Country_x,Region_x,Latitude,Longitude,Elevation_x,Primary Volcano Type_x,Country_y,Region_y,Elevation_y,Primary Volcano Type_y,Status
0,Abu,,-6850.0,0.0,0.0,,,,Japan,"Japan, Taiwan, Marianas",34.500,131.600,641,Shield(s),Japan,Honshu-Japan,571.0,Shield volcano,Holocene
1,Acatenango,1.0,1972.0,11.0,12.0,1972.0,12.0,16.0,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Guatemala,Guatemala,3976.0,Stratovolcano,Historical
2,Acatenango,2.0,1926.0,8.0,0.0,1927.0,5.0,19.0,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Guatemala,Guatemala,3976.0,Stratovolcano,Historical
3,Acatenango,3.0,1924.0,12.0,18.0,1925.0,6.0,7.0,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Guatemala,Guatemala,3976.0,Stratovolcano,Historical
4,Acatenango,,1450.0,0.0,0.0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Guatemala,Guatemala,3976.0,Stratovolcano,Historical
5,Acatenango,,90.0,0.0,0.0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Guatemala,Guatemala,3976.0,Stratovolcano,Historical
6,Acatenango,,-260.0,0.0,0.0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Guatemala,Guatemala,3976.0,Stratovolcano,Historical
7,Acatenango,,-370.0,0.0,0.0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Guatemala,Guatemala,3976.0,Stratovolcano,Historical
8,Acatenango,,-2710.0,0.0,0.0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Guatemala,Guatemala,3976.0,Stratovolcano,Historical
9,Acigol-Nevsehir,,-2080.0,0.0,0.0,,,,Turkey,Mediterranean and Western Asia,38.537,34.621,1683,Caldera,,,,,


In [21]:
df_erupt_merge2.to_csv('data_check/EruptionData1_chk.csv') 

In [22]:
df_erupt_merge3 = df_erupt_merge2.where((pd.notnull(df_erupt_merge2)), None)

for index, row in df_erupt_merge3.iterrows():
    if row['Country_x'] is None:
        print('.', end='')
        row['Country_x'] = row['Country_y']
        row['Primary Volcano Type_x'] = row['Primary Volcano Type_y']
        row['Region_x'] = row['Region_y']
        row['Elevation_x'] = row['Elevation_y']
    else:
        print('-', end = '')

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### Handling Start Date Tip
https://www.ngdc.noaa.gov/nndc/DescribeField.jsp?dataset=102557&s=77&field_name=HAZ.VOLCANO_EVENT.DAY

Day:
Valid values: 1-31 (where months apply)
The Date and Time are given in Universal Coordinated Time (also known as Greenwich Mean Time). The local date may be one day different.

In [23]:
df_erupt_merge4 = df_erupt_merge3.drop([
    'Country_y','Region_y',
    'Elevation_y',
    'Primary Volcano Type_y'], axis=1).rename(columns = {
    'Country_x' : 'Country',
    'Region_x' : 'Region',
    'Elevation_x' : 'Elevation',
    'Primary Volcano Type_x' : 'Primary Volcano Type'})
# .fillna({
#     'VEI':0,'Start Month':0,'Start Day':0,
#     'End Year':0,'End Month':0,'End Day':0})

df_erupt_merge4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9840 entries, 0 to 9839
Data columns (total 15 columns):
Volcano Name            9840 non-null object
VEI                     7563 non-null object
Start Year              9839 non-null object
Start Month             9659 non-null object
Start Day               9658 non-null object
End Year                3942 non-null object
End Month               3940 non-null object
End Day                 3939 non-null object
Country                 9840 non-null object
Region                  9840 non-null object
Latitude                9840 non-null object
Longitude               9840 non-null object
Elevation               9840 non-null int64
Primary Volcano Type    9763 non-null object
Status                  2691 non-null object
dtypes: int64(1), object(14)
memory usage: 1.2+ MB


In [24]:
# remove row with no start erruption date

for index, row in df_erupt_merge4.iterrows():
    if row['Start Year']is None:
        df_erupt_merge4.drop(index, inplace=True)
    else:
        print('.',end='')
df_erupt_merge4.info()

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [25]:
df_erupt_merge4.to_csv('data_check/EruptionData2_chk.csv', index=False)

In [26]:
df3_vol_gvp =  df2_vol_gvp[['Volcano Name','Country','Region','Latitude','Longitude','Elevation','Primary Volcano Type']].dropna(subset = ['Country']).reset_index(drop=True)
df3_vol_gvp.info()
df3_vol_gvp

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444 entries, 0 to 1443
Data columns (total 7 columns):
Volcano Name            1444 non-null object
Country                 1444 non-null object
Region                  1444 non-null object
Latitude                1444 non-null float64
Longitude               1444 non-null float64
Elevation               1444 non-null int64
Primary Volcano Type    1443 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 79.0+ KB


Unnamed: 0,Volcano Name,Country,Region,Latitude,Longitude,Elevation,Primary Volcano Type
0,Abu,Japan,"Japan, Taiwan, Marianas",34.500,131.600,641,Shield(s)
1,Acamarachi,Chile,South America,-23.292,-67.618,6023,Stratovolcano
2,Acatenango,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es)
3,Acigol-Nevsehir,Turkey,Mediterranean and Western Asia,38.537,34.621,1683,Caldera
4,Adams,United States,Canada and Western USA,46.206,-121.490,3742,Stratovolcano
5,Adams Seamount,Undersea Features,Hawaii and Pacific Ocean,-25.370,-129.270,-39,Submarine
6,Adatarayama,Japan,"Japan, Taiwan, Marianas",37.647,140.281,1728,Stratovolcano(es)
7,Adwa,Ethiopia,Africa and Red Sea,10.070,40.840,1733,Stratovolcano
8,Afdera,Ethiopia,Africa and Red Sea,13.088,40.853,1250,Stratovolcano
9,Agrigan,United States,"Japan, Taiwan, Marianas",18.770,145.670,965,Stratovolcano


## Merge by Name & Latitude (Floating Point Issue Resolved)

In [27]:
df_erupt_merge5 = pd.merge(df_erupt_merge4, df3_vol_gvp, how='left', on=['Volcano Name','Latitude'], left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)

df_erupt_merge5.info()
df_erupt_merge5

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9839 entries, 0 to 9838
Data columns (total 20 columns):
Volcano Name              9839 non-null object
VEI                       7563 non-null object
Start Year                9839 non-null object
Start Month               9659 non-null object
Start Day                 9658 non-null object
End Year                  3942 non-null object
End Month                 3940 non-null object
End Day                   3939 non-null object
Country_x                 9839 non-null object
Region_x                  9839 non-null object
Latitude                  9839 non-null object
Longitude_x               9839 non-null object
Elevation_x               9839 non-null int64
Primary Volcano Type_x    9762 non-null object
Status                    2690 non-null object
Country_y                 9839 non-null object
Region_y                  9839 non-null object
Longitude_y               9839 non-null float64
Elevation_y               9839 non-null int64
P

Unnamed: 0,Volcano Name,VEI,Start Year,Start Month,Start Day,End Year,End Month,End Day,Country_x,Region_x,Latitude,Longitude_x,Elevation_x,Primary Volcano Type_x,Status,Country_y,Region_y,Longitude_y,Elevation_y,Primary Volcano Type_y
0,Abu,,-6850,0,0,,,,Japan,"Japan, Taiwan, Marianas",34.5,131.6,641,Shield(s),Holocene,Japan,"Japan, Taiwan, Marianas",131.600,641,Shield(s)
1,Acatenango,1,1972,11,12,1972,12,16,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Historical,Guatemala,México and Central America,-90.876,3976,Stratovolcano(es)
2,Acatenango,2,1926,8,0,1927,5,19,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Historical,Guatemala,México and Central America,-90.876,3976,Stratovolcano(es)
3,Acatenango,3,1924,12,18,1925,6,7,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Historical,Guatemala,México and Central America,-90.876,3976,Stratovolcano(es)
4,Acatenango,,1450,0,0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Historical,Guatemala,México and Central America,-90.876,3976,Stratovolcano(es)
5,Acatenango,,90,0,0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Historical,Guatemala,México and Central America,-90.876,3976,Stratovolcano(es)
6,Acatenango,,-260,0,0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Historical,Guatemala,México and Central America,-90.876,3976,Stratovolcano(es)
7,Acatenango,,-370,0,0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Historical,Guatemala,México and Central America,-90.876,3976,Stratovolcano(es)
8,Acatenango,,-2710,0,0,,,,Guatemala,México and Central America,14.501,-90.876,3976,Stratovolcano(es),Historical,Guatemala,México and Central America,-90.876,3976,Stratovolcano(es)
9,Acigol-Nevsehir,,-2080,0,0,,,,Turkey,Mediterranean and Western Asia,38.537,34.621,1683,Caldera,,Turkey,Mediterranean and Western Asia,34.621,1683,Caldera


In [28]:
df_erupt_merge5.to_csv('data_check/EruptionData3_chk.csv')

In [29]:
df_erupt_merge6 = df_erupt_merge5.where((pd.notnull(df_erupt_merge5)), None)

for index, row in df_erupt_merge6.iterrows():
    if row['Country_x'] is None:
        print('.', end='')
        row['Country_x'] = row['Country_y']
        row['Region_x'] = row['Region_y']
        row['Elevation_x'] = row['Elevation_y']
        row['Primary Volcano Type_x'] = row['Primary Volcano Type_y']
    else:
        print('-', end = '')

df_erupt_merge7 = df_erupt_merge6.drop([
    'Longitude_y','Country_y',
    'Region_y','Elevation_y',
    'Primary Volcano Type_y'], axis=1).rename(columns = {
    'Country_x' : 'Country',
    'Region_x' : 'Region',
    'Longitude_x' : 'Longitude',
    'Elevation_x' : 'Elevation',
    'Primary Volcano Type_x' : 'Primary Volcano Type'})

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
df_erupt_merge7.to_csv('data_check/EruptionData4_chk.csv')

## Re-merge by Lat/Long

In [31]:
df_erupt_merge8 = pd.merge(df_erupt_merge7, df3_vol_gvp, how='left', on=['Latitude', 'Longitude'], left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)

df_erupt_merge8.info()
df_erupt_merge8

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9839 entries, 0 to 9838
Data columns (total 20 columns):
Volcano Name_x            9839 non-null object
VEI                       7563 non-null object
Start Year                9839 non-null object
Start Month               9659 non-null object
Start Day                 9658 non-null object
End Year                  3942 non-null object
End Month                 3940 non-null object
End Day                   3939 non-null object
Country_x                 9839 non-null object
Region_x                  9839 non-null object
Latitude                  9839 non-null object
Longitude                 9839 non-null object
Elevation_x               9839 non-null int64
Primary Volcano Type_x    9762 non-null object
Status                    2690 non-null object
Volcano Name_y            9839 non-null object
Country_y                 9839 non-null object
Region_y                  9839 non-null object
Elevation_y               9839 non-null int64
Pr

Unnamed: 0,Volcano Name_x,VEI,Start Year,Start Month,Start Day,End Year,End Month,End Day,Country_x,Region_x,Latitude,Longitude,Elevation_x,Primary Volcano Type_x,Status,Volcano Name_y,Country_y,Region_y,Elevation_y,Primary Volcano Type_y
0,Erebus,2,1972,12,16,2018,2,7,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
1,Erebus,1,1972,1,3,,,,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
2,Erebus,0,1963,11,16,,,,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
3,Erebus,2,1955,7,2,,,,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
4,Erebus,2,1947,2,0,,,,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
5,Erebus,2,1915,8,0,,,,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
6,Erebus,2,1915,3,22,,,,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
7,Erebus,2,1912,12,12,,,,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
8,Erebus,2,1911,10,0,,,,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano
9,Erebus,2,1911,4,0,1911,6,0,Antarctica,Antarctica,-77.53,167.17,3794,Stratovolcano,Historical,Erebus,Antarctica,Antarctica,3794,Stratovolcano


In [32]:
df_erupt_merge8.to_csv('data_check/EruptionData5_chk.csv')

In [33]:
df_erupt_merge9 = df_erupt_merge8.where((pd.notnull(df_erupt_merge8)), None)

for index, row in df_erupt_merge9.iterrows():
    if row['Country_x'] is None:
        print('.', end='')
        row['Country_x'] = row['Country_y']
        row['Region_x'] = row['Region_y']
        row['Elevation_x'] = row['Elevation_y']
        row['Primary Volcano Type_x'] = row['Primary Volcano Type_y']
    else:
        print('-', end = '')

df_erupt_final = df_erupt_merge9.drop([
    'Volcano Name_y',
    'Country_y','Region_y','Elevation_y',
    'Primary Volcano Type_y'], axis=1).rename(columns = {
    'Volcano Name_x':'Volcano Name',
    'Country_x' : 'Country',
    'Region_x' : 'Region',
    'Elevation_x' : 'Elevation',
    'Primary Volcano Type_x' : 'Primary Volcano Type'})

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [34]:
df_erupt_final.to_csv('finalized_data/clean_eruptions.csv', index=True, encoding = 'utf-8')

In [35]:
df_erupt_final.to_excel('finalized_data/clean_eruptions.xlsx', index=True)