### Data Exploration

In [1]:
# Load the raw earthquake CSV into a DataFrame and preview its first rows.
import pandas as pd

FILEPATH = '../Data/raw/earthquake_data.csv'
df = pd.read_csv(filepath_or_buffer=FILEPATH)
df.head()

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 7.0 - 18 km SW of Malango, Solomon Islands",7.0,22-11-2022 02:03,8,7,green,1,768,us,117,0.509,17.0,mww,14.0,-9.7963,159.596,"Malango, Solomon Islands",Oceania,Solomon Islands
1,"M 6.9 - 204 km SW of Bengkulu, Indonesia",6.9,18-11-2022 13:37,4,4,green,0,735,us,99,2.229,34.0,mww,25.0,-4.9559,100.738,"Bengkulu, Indonesia",,
2,M 7.0 -,7.0,12-11-2022 07:09,3,3,green,1,755,us,147,3.125,18.0,mww,579.0,-20.0508,-178.346,,Oceania,Fiji
3,"M 7.3 - 205 km ESE of Neiafu, Tonga",7.3,11-11-2022 10:48,5,5,green,1,833,us,149,1.865,21.0,mww,37.0,-19.2918,-172.129,"Neiafu, Tonga",,
4,M 6.6 -,6.6,09-11-2022 10:14,0,2,green,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,,,


Magnitude:
This measures the energy released at the source of the earthquake. It’s a logarithmic scale, so each whole number increase represents a tenfold increase in measured amplitude and roughly 31.6 times more energy release.

CDI (Community Determined Intensity):
This is a measure of the earthquake’s intensity based on reports from people who felt the earthquake. It reflects the effects of the earthquake as experienced by individuals and is part of the “Did You Feel It?” program.

MMI (Modified Mercalli Intensity):
This scale measures the intensity of shaking produced by an earthquake at a specific location. It ranges from I (not felt) to XII (total destruction), and is based on observed effects on people, buildings, and the Earth’s surface.

SIG (Significance):
This is a calculated value that combines magnitude, felt reports, and estimated impact to provide an overall significance score for the earthquake. Higher values indicate more significant events.

NET (Network):
This indicates the seismic network that detected and reported the earthquake. Different networks may cover different regions and use various technologies for detection.

NST (Number of Stations):
This represents the number of seismic stations that contributed data to the earthquake’s location and magnitude determination. More stations generally lead to more accurate measurements.

DMIN (Minimum Distance):
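This is the horizontal distance from the earthquake’s epicenter to the nearest seismic station that recorded the event, expressed in degrees (1 degree is roughly 111 km). It indicates how close the earthquake was to the recording instruments; in general, the smaller this value, the more reliable the calculated depth.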

GAP:
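This is the largest azimuthal gap, in degrees, between azimuthally adjacent seismic stations around the earthquake. Smaller gaps indicate better coverage and more reliable location estimates; gaps above 180 degrees typically come with large location and depth uncertainties.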

In [2]:
# Name the label column and the feature columns used in this notebook.
LABEL_tsunami = ['tsunami']
FEATURES = [
    'magnitude', 'cdi', 'mmi', 'sig', 'net', 'nst',
    'dmin', 'gap', 'magType', 'depth', 'latitude', 'longitude',
]

# Keep only the features followed by the label; every other column is dropped.
selected_columns = FEATURES + LABEL_tsunami
df = df.loc[:, selected_columns]
df.head()


Unnamed: 0,magnitude,cdi,mmi,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,tsunami
0,7.0,8,7,768,us,117,0.509,17.0,mww,14.0,-9.7963,159.596,1
1,6.9,4,4,735,us,99,2.229,34.0,mww,25.0,-4.9559,100.738,0
2,7.0,3,3,755,us,147,3.125,18.0,mww,579.0,-20.0508,-178.346,1
3,7.3,5,5,833,us,149,1.865,21.0,mww,37.0,-19.2918,-172.129,1
4,6.6,0,2,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,1


In [3]:
# Summary statistics for the numerical features only.
# The float format is a global pandas display option, so it also changes how
# floats are rendered by every later cell.
pd.set_option('display.float_format', '{:.2f}'.format)
numeric_features = df[FEATURES].select_dtypes(include="number").columns
df.loc[:, numeric_features].describe()


Unnamed: 0,magnitude,cdi,mmi,sig,nst,dmin,gap,depth,latitude,longitude
count,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0
mean,6.94,4.33,5.96,870.11,230.25,1.33,25.04,75.88,3.54,52.61
std,0.45,3.17,1.46,322.47,250.19,2.22,24.23,137.28,27.3,117.9
min,6.5,0.0,1.0,650.0,0.0,0.0,0.0,2.7,-61.85,-179.97
25%,6.6,0.0,5.0,691.0,0.0,0.0,14.62,14.0,-14.6,-71.67
50%,6.8,5.0,6.0,754.0,140.0,0.0,20.0,26.3,-2.57,109.43
75%,7.1,7.0,7.0,909.75,445.0,1.86,30.0,49.75,24.65,148.94
max,9.1,9.0,9.0,2910.0,934.0,17.65,239.0,670.81,71.63,179.66


In [4]:
# Count the missing values in each column.
df.isna().sum()

magnitude    0
cdi          0
mmi          0
sig          0
net          0
nst          0
dmin         0
gap          0
magType      0
depth        0
latitude     0
longitude    0
tsunami      0
dtype: int64

In [5]:
# Class balance of the 'tsunami' label: number of rows per label value.
tsunami_counts = df["tsunami"].value_counts()
tsunami_counts

tsunami
0    478
1    304
Name: count, dtype: int64

### Automated EDA

In [6]:
# Build a separate frame for ydata-profiling from the feature and label columns.
# The label is cast to strings so that ydata-profiling interprets it as a
# categorical variable; astype returns a new frame, so df itself is unchanged.
dfy = df[FEATURES + LABEL_tsunami].astype({'tsunami': str})

dfy.head()

Unnamed: 0,magnitude,cdi,mmi,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,tsunami
0,7.0,8,7,768,us,117,0.51,17.0,mww,14.0,-9.8,159.6,1
1,6.9,4,4,735,us,99,2.23,34.0,mww,25.0,-4.96,100.74,0
2,7.0,3,3,755,us,147,3.12,18.0,mww,579.0,-20.05,-178.35,1
3,7.3,5,5,833,us,149,1.86,21.0,mww,37.0,-19.29,-172.13,1
4,6.6,0,2,670,us,131,5.0,27.0,mww,624.46,-25.59,178.28,1


In [7]:
# Print the dtype and non-null count of each column in the profiling copy.
dfy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   magnitude  782 non-null    float64
 1   cdi        782 non-null    int64  
 2   mmi        782 non-null    int64  
 3   sig        782 non-null    int64  
 4   net        782 non-null    object 
 5   nst        782 non-null    int64  
 6   dmin       782 non-null    float64
 7   gap        782 non-null    float64
 8   magType    782 non-null    object 
 9   depth      782 non-null    float64
 10  latitude   782 non-null    float64
 11  longitude  782 non-null    float64
 12  tsunami    782 non-null    object 
dtypes: float64(6), int64(4), object(3)
memory usage: 79.6+ KB


In [8]:
# Generate the automated ydata-profiling report for dfy and write it to an HTML file.
from ydata_profiling import ProfileReport

REPORT_FILE = "dataset_profiling_earthquake.html"
profile = ProfileReport(dfy, title="Earthquake report")
profile.to_file(REPORT_FILE)

  from .autonotebook import tqdm as notebook_tqdm
Summarize dataset: 100%|██████████| 122/122 [00:13<00:00,  9.11it/s, Completed]                   
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.42s/it]
Render HTML: 100%|██████████| 1/1 [00:04<00:00,  4.26s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 22.72it/s]
