In [6]:
import pandas as pd
df = pd.read_csv('cleaned_earthquake_data.csv')
df.head()

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,...,latitude,longitude,location,continent,country,year,month,day,hour,radius
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,2023-08-16 12:47:00,7,4,green,0,657,us,114,...,-13.8814,167.158,"Sola, Vanuatu",Oceania,Vanuatu,2023,8,16,12,5.0
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,2023-07-19 00:22:00,8,6,yellow,0,775,us,92,...,12.814,-88.1265,"Intipucá, El Salvador",North America,El Salvador,2023,7,19,0,10.0
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,2023-07-17 03:05:00,7,5,green,0,899,us,70,...,-38.1911,-70.3731,"Loncopué, Argentina",South America,Argentina,2023,7,17,3,5.610092
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,2023-07-16 06:48:00,6,6,green,1,860,us,173,...,54.3844,-160.699,"Sand Point, Alaska",North America,Alaska,2023,7,16,6,22.387211
4,M 7.3 - Alaska Peninsula,9.49,2023-07-16 06:48:00,0,5,unknown,1,820,at,79,...,54.49,-160.796,Alaska Peninsula,unknown,unknown,2023,7,16,6,312.607937


In [7]:
df['country'].nunique()

77

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Load the data (replace with your actual file path)
df = pd.read_csv("cleaned_earthquake_data.csv")

# Define features and target variables
features = ['sig', 'nst', 'dmin', 'gap', 'depth', 'radius']  # Add other relevant features as needed
target_cdi = 'cdi'
target_mmi = 'mmi'

# Standardize numerical features
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Split data into training and testing sets (Ensure separate splits for each target)
X_train_cdi, X_test_cdi, y_train_cdi, y_test_cdi = train_test_split(
    df[features], df[target_cdi], test_size=0.2, random_state=42
)
X_train_mmi, X_test_mmi, y_train_mmi, y_test_mmi = train_test_split(
    df[features], df[target_mmi], test_size=0.2, random_state=42
)

# Define LightGBM parameters
params = {
    'objective': 'regression',  # Regression task
    'metric': 'rmse',          # Root Mean Squared Error
    'boosting_type': 'gbdt',   # Gradient Boosting Decision Trees
    'learning_rate': 0.05,     # Step size
    'num_leaves': 31,          # Maximum tree leaves
    'max_depth': -1,           # No maximum depth
    'verbose': -1              # Suppress warnings
}

# Train the LightGBM model for CDI
train_data_cdi = lgb.Dataset(X_train_cdi, label=y_train_cdi)
test_data_cdi = lgb.Dataset(X_test_cdi, label=y_test_cdi, reference=train_data_cdi)

gbm_cdi = lgb.train(
    params,
    train_data_cdi,
    num_boost_round=100,
    valid_sets=[test_data_cdi],
    valid_names=["validation"],
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

# Train the LightGBM model for MMI
train_data_mmi = lgb.Dataset(X_train_mmi, label=y_train_mmi)
test_data_mmi = lgb.Dataset(X_test_mmi, label=y_test_mmi, reference=train_data_mmi)

gbm_mmi = lgb.train(
    params,
    train_data_mmi,
    num_boost_round=100,
    valid_sets=[test_data_mmi],
    valid_names=["validation"],
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

# Make predictions
y_pred_cdi = gbm_cdi.predict(X_test_cdi)
y_pred_mmi = gbm_mmi.predict(X_test_mmi)

from math import sqrt

# Compute RMSE (Root Mean Squared Error) manually
rmse_cdi = sqrt(mean_squared_error(y_test_cdi, y_pred_cdi))
rmse_mmi = sqrt(mean_squared_error(y_test_mmi, y_pred_mmi))

print(f"RMSE for CDI: {rmse_cdi}")
print(f"RMSE for MMI: {rmse_mmi}")



Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[96]	validation's rmse: 1.66878
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[52]	validation's rmse: 1.03161
RMSE for CDI: 1.668777656467126
RMSE for MMI: 1.0316082496527206


In [9]:
gbm_cdi.save_model("cdi_gbm.model")
gbm_mmi.save_model("mmi_gbm.model")

<lightgbm.basic.Booster at 0x7da9196aca30>

In [13]:
import numpy as np
# Baseline predictions using the mean value of the target for each test set
baseline_cdi = np.mean(y_train_cdi)  # Mean of the training target for CDI
baseline_mmi = np.mean(y_train_mmi)  # Mean of the training target for MMI

# Compute Baseline RMSE for CDI and MMI
baseline_rmse_cdi = np.sqrt(mean_squared_error(y_test_cdi, [baseline_cdi] * len(y_test_cdi)))
baseline_rmse_mmi = np.sqrt(mean_squared_error(y_test_mmi, [baseline_mmi] * len(y_test_mmi)))

# Print Baseline RMSE
print(f"Baseline RMSE for CDI: {baseline_rmse_cdi}")
print(f"Baseline RMSE for MMI: {baseline_rmse_mmi}")

Baseline RMSE for CDI: 3.4079839882824547
Baseline RMSE for MMI: 1.3739519991829379


In [11]:
print(f"Range of CDI: {y_test_cdi.min()} to {y_test_cdi.max()}")
print(f"Range of MMI: {y_test_mmi.min()} to {y_test_mmi.max()}")


Range of CDI: 0 to 9
Range of MMI: 2 to 9


In [12]:
range_cdi = y_test_cdi.max() - y_test_cdi.min()
range_mmi = y_test_mmi.max() - y_test_mmi.min()

# Normalize the RMSE by dividing by the range
normalized_rmse_cdi = rmse_cdi / range_cdi
normalized_rmse_mmi = rmse_mmi / range_mmi


# Print Normalized RMSE for CDI and MMI
print(f"Normalized RMSE for CDI: {normalized_rmse_cdi}")
print(f"Normalized RMSE for MMI: {normalized_rmse_mmi}")

Normalized RMSE for CDI: 0.18541973960745844
Normalized RMSE for MMI: 0.1473726070932458
