## Libraries

### Note: You need to install below library in order for code to work
### pip install reverse_geocoder

In [1]:
# Import Pandas and NumPy
import pandas as pd
import numpy as np
import datetime as dt

# Import SQL Alchemy
from sqlalchemy import create_engine
import psycopg2

# Import Password
from config import password

# Import Reverse Geocoder library which takes a latitude / longitude coordinate and returns the nearest town/city
import reverse_geocoder as rg

In [94]:
# Earthquakes
# Save path to data set in a variable
Earthquakes = "../Resources/Earthquakes.csv"

# Use Pandas to read data
Earthquakes_df = pd.read_csv(Earthquakes)
Earthquakes_df.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2021-04-17T22:20:57.320Z,32.751,-115.829333,6.04,1.24,ml,17.0,72.0,0.1296,0.22,...,2021-04-17T22:24:25.471Z,"15km E of Ocotillo, CA",earthquake,0.62,2.82,0.224,16.0,automatic,ci,ci
1,2021-04-17T22:18:44.955Z,60.2994,-141.2456,3.0,1.5,ml,,,,0.59,...,2021-04-17T22:22:37.941Z,"119 km NW of Yakutat, Alaska",earthquake,,0.2,,,automatic,ak,ak
2,2021-04-17T22:13:31.910Z,19.182333,-155.393997,34.349998,1.85,md,38.0,166.0,,0.12,...,2021-04-17T22:16:49.020Z,"9 km ESE of Pāhala, Hawaii",earthquake,0.69,0.83,0.88,7.0,automatic,hv,hv
3,2021-04-17T22:07:23.157Z,69.0625,-146.4755,1.2,3.0,ml,,,,0.78,...,2021-04-17T22:20:57.579Z,"111 km NNW of Arctic Village, Alaska",earthquake,,0.6,,,automatic,ak,ak
4,2021-04-17T22:05:27.560Z,32.400833,-115.487333,1.01,1.77,ml,15.0,120.0,0.1553,0.28,...,2021-04-17T22:09:47.387Z,"22km SSE of Progreso, B.C., MX",earthquake,0.66,31.61,0.258,22.0,automatic,ci,ci


## Data Cleaning

In [95]:
# get list of all columns
Earthquakes_df.columns

Index(['time', 'latitude', 'longitude', 'depth', 'mag', 'magType', 'nst',
       'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'place', 'type',
       'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')

In [96]:
# check all columns with any missing/null values
Earthquakes_df.isna().sum()

time                  0
latitude              0
longitude             0
depth                 0
mag                   3
magType               3
nst                2486
gap                1527
dmin               3216
rms                   0
net                   0
id                    0
updated               0
place                 0
type                  0
horizontalError    2871
depthError            0
magError           2123
magNst             1798
status                0
locationSource        0
magSource             0
dtype: int64

In [97]:
# drop the unwanted columns
Earthquakes_df.drop(columns = ['horizontalError', 'depthError', 'magError','nst', 'gap', 'dmin', 'magNst', 'net'], inplace=True)

In [98]:
# Count no of records
Earthquakes_df['id'].count()

10328

In [99]:
# Drop any missing/null values
Earthquakes_df.dropna(inplace=True)

In [100]:
# Re-count no of records after drop
Earthquakes_df['id'].count()

10325

In [101]:
# Format the Time field to get the date in %Y-%m-%d %H:%M:%S format
Earthquakes_df['time'] = pd.to_datetime(Earthquakes_df['time'], format='%Y-%m-%d %H:%M:%S', errors='ignore').dt.tz_convert(None)
Earthquakes_df['time'] = Earthquakes_df['time'].dt.ceil(freq='s')

In [102]:
# Format the updated field to get the date in %Y-%m-%d %H:%M:%S format
Earthquakes_df['updated'] = pd.to_datetime(Earthquakes_df['updated'], format='%Y-%m-%d %H:%M:%S', errors='ignore').dt.tz_convert(None)
Earthquakes_df['updated'] = Earthquakes_df['updated'].dt.ceil(freq='s')

In [103]:
# Convert both lat and long columns into tuples
subset = Earthquakes_df[['latitude', 'longitude']]
tuples = [tuple(x) for x in subset.values]
#print(tuples)

In [104]:
# sample test
coordinates = (32.751, -115.8293333), (60.2994,-141.2456)
rg.search(coordinates)

[{'lat': '32.79311',
  'lon': '-115.69111',
  'name': 'Seeley',
  'admin1': 'California',
  'admin2': 'Imperial County',
  'cc': 'US'},
 {'lat': '63.33667',
  'lon': '-142.98556',
  'name': 'Tok',
  'admin1': 'Alaska',
  'admin2': 'Southeast Fairbanks Census Area',
  'cc': 'US'}]

In [105]:
# coordinates = (32.751, -115.8293333), (60.2994,-141.2456)
# Pass the tuples as an input to reverse_geocoder to get City, County, State and Country
geocode_df = pd.DataFrame(rg.search(tuples))
geocode_df.head()

Unnamed: 0,lat,lon,name,admin1,admin2,cc
0,32.79311,-115.69111,Seeley,California,Imperial County,US
1,63.33667,-142.98556,Tok,Alaska,Southeast Fairbanks Census Area,US
2,19.2025,-155.47694,Pahala,Hawaii,Hawaii County,US
3,70.25528,-148.33722,Prudhoe Bay,Alaska,North Slope Borough,US
4,32.58039,-115.58479,Progreso,Baja California,,MX


In [106]:
# Rename the columns
geocode_df.rename(columns = {"lat" : "latitude", "lon" : "longitude", "name" : "City", "admin1" : "State", "admin2" : "County", "cc" : "Country"}, inplace=True)
geocode_df.head()

Unnamed: 0,latitude,longitude,City,State,County,Country
0,32.79311,-115.69111,Seeley,California,Imperial County,US
1,63.33667,-142.98556,Tok,Alaska,Southeast Fairbanks Census Area,US
2,19.2025,-155.47694,Pahala,Hawaii,Hawaii County,US
3,70.25528,-148.33722,Prudhoe Bay,Alaska,North Slope Borough,US
4,32.58039,-115.58479,Progreso,Baja California,,MX


In [113]:
# verify the counts
Earthquakes_df['id'].count()

10325

In [114]:
# verify the counts
geocode_df['latitude'].count()

10325

In [125]:
# Merge both df's based on Index to get City, State, County and Country into Earthquakes_df
# Note: We could have merged based on Lat and Long but their float values are slighly off which retruns nothing after merge
merge_df = Earthquakes_df.merge(geocode_df, left_index=True, right_index=True)

In [126]:
# drop the unwanted columns
merge_df.drop(columns = ['latitude_y', 'longitude_y', 'magSource'], inplace=True)

In [127]:
# Rename the columns
merge_df.rename(columns = {"latitude_x" : "latitude", "longitude_x" : "longitude", "rms" : "rootMeanSquare", 'updated': 'lastUpdDatetime'}, inplace=True)
merge_df.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,rootMeanSquare,id,lastUpdDatetime,place,type,status,locationSource,City,State,County,Country
0,2021-04-17 22:20:58,32.751,-115.829333,6.04,1.24,ml,0.22,ci39854328,2021-04-17 22:24:26,"15km E of Ocotillo, CA",earthquake,automatic,ci,Seeley,California,Imperial County,US
1,2021-04-17 22:18:45,60.2994,-141.2456,3.0,1.5,ml,0.59,ak0214xbpmk2,2021-04-17 22:22:38,"119 km NW of Yakutat, Alaska",earthquake,automatic,ak,Tok,Alaska,Southeast Fairbanks Census Area,US
2,2021-04-17 22:13:32,19.182333,-155.393997,34.349998,1.85,md,0.12,hv72430407,2021-04-17 22:16:50,"9 km ESE of Pāhala, Hawaii",earthquake,automatic,hv,Pahala,Hawaii,Hawaii County,US
3,2021-04-17 22:07:24,69.0625,-146.4755,1.2,3.0,ml,0.78,ak0214xbn842,2021-04-17 22:20:58,"111 km NNW of Arctic Village, Alaska",earthquake,automatic,ak,Prudhoe Bay,Alaska,North Slope Borough,US
4,2021-04-17 22:05:28,32.400833,-115.487333,1.01,1.77,ml,0.28,ci39854320,2021-04-17 22:09:48,"22km SSE of Progreso, B.C., MX",earthquake,automatic,ci,Progreso,Baja California,,MX


In [137]:
# Replace white spaces in newly added fields with NaN
merge_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

In [139]:
# check all columns with any missing/null values
merge_df.isna().sum()

time                 0
latitude             0
longitude            0
depth                0
mag                  0
magType              0
rootMeanSquare       0
id                   0
lastUpdDatetime      0
place                0
type                 0
status               0
locationSource       0
City                 0
State               27
County             973
Country              0
dtype: int64

In [140]:
# Drop any missing/null values
merge_df.dropna(inplace=True)

In [141]:
# verify the counts
merge_df['id'].count()

9349

In [143]:
# Earthquake Df
Earthquake = pd.DataFrame(merge_df, columns = ['id', 'type', 'depth','mag', 'magType', 'rootMeanSquare', 'status'])
Earthquake.head()

Unnamed: 0,id,type,depth,mag,magType,rootMeanSquare,status
0,ci39854328,earthquake,6.04,1.24,ml,0.22,automatic
1,ak0214xbpmk2,earthquake,3.0,1.5,ml,0.59,automatic
2,hv72430407,earthquake,34.349998,1.85,md,0.12,automatic
3,ak0214xbn842,earthquake,1.2,3.0,ml,0.78,automatic
5,ak0214xbmcgy,earthquake,124.8,1.3,ml,0.43,automatic


In [144]:
# Location Df
Location = pd.DataFrame(merge_df, columns = ['id', 'place', 'latitude','longitude', 'locationSource', 'City', 'County', 'State', 'Country'])
Location.head()

Unnamed: 0,id,place,latitude,longitude,locationSource,City,County,State,Country
0,ci39854328,"15km E of Ocotillo, CA",32.751,-115.829333,ci,Seeley,Imperial County,California,US
1,ak0214xbpmk2,"119 km NW of Yakutat, Alaska",60.2994,-141.2456,ak,Tok,Southeast Fairbanks Census Area,Alaska,US
2,hv72430407,"9 km ESE of Pāhala, Hawaii",19.182333,-155.393997,hv,Pahala,Hawaii County,Hawaii,US
3,ak0214xbn842,"111 km NNW of Arctic Village, Alaska",69.0625,-146.4755,ak,Prudhoe Bay,North Slope Borough,Alaska,US
5,ak0214xbmcgy,"66 km ENE of Pedro Bay, Alaska",59.9636,-152.9668,ak,Anchor Point,Kenai Peninsula Borough,Alaska,US


In [145]:
# Time Df
Time = pd.DataFrame(merge_df, columns = ['id', 'time', 'lastUpdDatetime'])
Time.head()

Unnamed: 0,id,time,lastUpdDatetime
0,ci39854328,2021-04-17 22:20:58,2021-04-17 22:24:26
1,ak0214xbpmk2,2021-04-17 22:18:45,2021-04-17 22:22:38
2,hv72430407,2021-04-17 22:13:32,2021-04-17 22:16:50
3,ak0214xbn842,2021-04-17 22:07:24,2021-04-17 22:20:58
5,ak0214xbmcgy,2021-04-17 22:03:14,2021-04-17 22:06:45


## Database

In [151]:
# connect to Postgres
engine = create_engine(f"postgresql://postgres:{password}@localhost/Earthquakes_db")
conn = engine.connect()

### earthquake

In [152]:
# Insert data into earthquake table
Earthquake.to_sql(name='earthquake', con=engine, if_exists='append', index=False)

In [153]:
# earthquake - Query the data in Postgres
earthquake_table = pd.read_sql("SELECT * FROM earthquake", conn)
earthquake_table.head(5)

Unnamed: 0,id,type,depth,mag,magType,rootMeanSquare,status
0,ci39854328,earthquake,6.04,1.24,ml,0.22,automatic
1,ak0214xbpmk2,earthquake,3.0,1.5,ml,0.59,automatic
2,hv72430407,earthquake,34.349998,1.85,md,0.12,automatic
3,ak0214xbn842,earthquake,1.2,3.0,ml,0.78,automatic
4,ak0214xbmcgy,earthquake,124.8,1.3,ml,0.43,automatic


### location

In [154]:
# Insert data into location table
Location.to_sql(name='location', con=engine, if_exists='append', index=False)

In [155]:
# location - Query the data in Postgres
location_table = pd.read_sql("SELECT * FROM location", conn)
location_table.head(5)

Unnamed: 0,id,place,latitude,longitude,locationSource,City,County,State,Country
0,ci39854328,"15km E of Ocotillo, CA",32.751,-115.829333,ci,Seeley,Imperial County,California,US
1,ak0214xbpmk2,"119 km NW of Yakutat, Alaska",60.2994,-141.2456,ak,Tok,Southeast Fairbanks Census Area,Alaska,US
2,hv72430407,"9 km ESE of Pāhala, Hawaii",19.182333,-155.393997,hv,Pahala,Hawaii County,Hawaii,US
3,ak0214xbn842,"111 km NNW of Arctic Village, Alaska",69.0625,-146.4755,ak,Prudhoe Bay,North Slope Borough,Alaska,US
4,ak0214xbmcgy,"66 km ENE of Pedro Bay, Alaska",59.9636,-152.9668,ak,Anchor Point,Kenai Peninsula Borough,Alaska,US


### time

In [156]:
# Insert data into time table
Time.to_sql(name='time', con=engine, if_exists='append', index=False)

In [157]:
# time - Query the data in Postgres
time_table = pd.read_sql("SELECT * FROM time", conn)
time_table.head(5)

Unnamed: 0,id,time,lastUpdDatetime
0,ci39854328,2021-04-17 22:20:58,2021-04-17 22:24:26
1,ak0214xbpmk2,2021-04-17 22:18:45,2021-04-17 22:22:38
2,hv72430407,2021-04-17 22:13:32,2021-04-17 22:16:50
3,ak0214xbn842,2021-04-17 22:07:24,2021-04-17 22:20:58
4,ak0214xbmcgy,2021-04-17 22:03:14,2021-04-17 22:06:45
