In [60]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [61]:
# Load the earthquake features parquet file into `df`; every later cell works on this frame.
df = pd.read_parquet("data/features/earthquake_features.parquet")

In [None]:
# Load the raw earthquake table under its own name instead of rebinding `df`.
# This cell carries no execution count, and the cells below read columns such as
# `max_mag_next_30d` and `target_class` from `df`; overwriting `df` here on a
# top-to-bottom run would swap the features frame for the raw one.
# NOTE(review): presumably the raw file lacks the engineered columns — confirm.
df_raw = pd.read_parquet("data/raw/earthquake_data.parquet")

In [62]:
# Parse the timestamp column, then expose each calendar/clock component
# as its own integer column (added in the order listed).
df['time_utc'] = pd.to_datetime(df['time_utc'])
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[part] = getattr(df['time_utc'].dt, part)

# Remove the event identifier column; this mutates `df` in place, so running
# the cell a second time raises KeyError because the column is already gone.
df.drop(columns=['event_id'], inplace=True)

# Replace category labels with integer codes, fitting a fresh encoder per column.
for col in ['magnitude_type', 'event_type']:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])


In [63]:
# Keep only the rows whose timestamp is on or after 2022-01-01.
is_recent = df["time_utc"] >= "2022-01-01"
df = df.loc[is_recent]

In [64]:
# Regression target is the `max_mag_next_30d` column; the feature matrix is
# everything else except `target_class`.
# NOTE(review): `target_class` is presumably an alternative label — confirm.
target_col = "max_mag_next_30d"
y = df[target_col]
X = df.drop(columns=[target_col, "target_class"])

In [65]:
# Columns deliberately left out of the model's feature matrix.
excluded_features = [
    "station_latitude",
    "station_longitude",
    "second",
    "magnitude_type",
    "station_elevation_m",
]
X = X.drop(columns=excluded_features)

In [66]:
# Temporal split: rows before 2024-01-01 form the training set and rows from
# 2024-01-30 onward form the test set. Rows dated in between belong to neither.
# NOTE(review): the gap is presumably a buffer for the 30-day target horizon — confirm.
is_train = X["time_utc"] < "2024-01-01"
is_test = X["time_utc"] >= "2024-01-30"

# The timestamp is only needed for splitting, so it is removed from the features.
X_train = X.loc[is_train].drop(columns=["time_utc"])
X_test = X.loc[is_test].drop(columns=["time_utc"])

y_train = y.loc[is_train]
y_test = y.loc[is_test]

In [67]:
# Display the dtype of every column that will be fed to the model.
X_train.dtypes

latitude                  float64
longitude                 float64
depth_km                  float64
magnitude                 float64
event_type                  int64
magnitude_ml              float64
distance_to_station_km    float64
rolling_mean_depth        float64
T_since_last_days         float64
b_value                   float64
b_value_shift2            float64
Delta_b_i_i_2             float64
M_last_week_max           float64
N_eq_30                   float64
etas_intensity            float64
year                        int32
month                       int32
day                         int32
hour                        int32
minute                      int32
dtype: object

In [68]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# XGBoost regressor constructed with no arguments, i.e. library-default hyperparameters.
model = XGBRegressor()

In [69]:
# Train on the earlier period, then predict the held-out later period.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report each regression metric on the test set, one per line.
for label, metric in (
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
    ("R2", r2_score),
):
    print(f"{label}:", metric(y_test, y_pred))

MSE: 0.3617526939252195
MAE: 0.43769762934366874
R2: -0.038493527429592556
