In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import joblib

In [39]:
# 1. Load the tsunami dataset
file_path = "tsunami_dataset.csv"
df = pd.read_csv(file_path)

# 2. Initial info and head
print("=== DataFrame Info ===")
df.info()
print("\n=== First 5 Rows ===")
print(df.head())

=== DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2259 entries, 0 to 2258
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        2259 non-null   int64  
 1   YEAR                      2259 non-null   int64  
 2   MONTH                     2155 non-null   float64
 3   DAY                       2082 non-null   float64
 4   HOUR                      1308 non-null   float64
 5   MINUTE                    1235 non-null   float64
 6   LATITUDE                  2259 non-null   float64
 7   LONGITUDE                 2259 non-null   float64
 8   LOCATION_NAME             2250 non-null   object 
 9   COUNTRY                   2259 non-null   object 
 10  REGION                    2258 non-null   object 
 11  CAUSE                     2258 non-null   object 
 12  EVENT_VALIDITY            2259 non-null   object 
 13  EQ_MAGNITUDE              1474 non-null 

In [40]:
# 3. Drop rows with missing LATITUDE or LONGITUDE and filter YEAR >= 1800
df = df.dropna(subset=['LATITUDE', 'LONGITUDE'])
df = df[df['YEAR'] >= 1800]
print(f"\nRows after filtering LAT/LON and YEAR >=1800: {len(df)}")

# 4. Missing value summary
total = len(df)
missing_summary = pd.DataFrame({
    'Non-Missing': df.notnull().sum(),
    'Missing': df.isnull().sum(),
    'Total': total,
    'Missing Ratio (%)': (df.isnull().sum() / total * 100).round(2)
})
print("\n=== Missing Value Summary ===")
print(missing_summary)




Rows after filtering LAT/LON and YEAR >=1800: 1796

=== Missing Value Summary ===
                          Non-Missing  Missing  Total  Missing Ratio (%)
ID                               1796        0   1796               0.00
YEAR                             1796        0   1796               0.00
MONTH                            1765       31   1796               1.73
DAY                              1725       71   1796               3.95
HOUR                             1224      572   1796              31.85
MINUTE                           1185      611   1796              34.02
LATITUDE                         1796        0   1796               0.00
LONGITUDE                        1796        0   1796               0.00
LOCATION_NAME                    1790        6   1796               0.33
COUNTRY                          1796        0   1796               0.00
REGION                           1795        1   1796               0.06
CAUSE                            1796    

In [41]:
# 5. Drop unneeded columns
drop_cols = ['ID', 'HOUSES_TOTAL_DESCRIPTION', 'DEATHS_TOTAL_DESCRIPTION',
             'URL', 'COMMENTS', 'DAMAGE_TOTAL_DESCRIPTION']
df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

In [42]:
# 6. Classify continent region
asian = ['JAPAN','INDONESIA','PHILIPPINES','INDIA','CHINA','MALAYSIA',
         'THAILAND','SRI LANKA','VIETNAM','MYANMAR','BANGLADESH','TAIWAN',
         'PAKISTAN','TURKEY','RUSSIA','IRAN','IRAQ','SOUTH KOREA',
         'NORTH KOREA','ISRAEL','JORDAN']
european = ['GREECE','ITALY','UK','PORTUGAL','SPAIN','FRANCE','GERMANY',
            'NORWAY','CROATIA','ROMANIA','ALBANIA','BULGARIA','SWEDEN',
            'FINLAND','NETHERLANDS','IRELAND','CYPRUS','MONTENEGRO']
df['COUNTRY'] = df['COUNTRY'].str.upper()
df['CONTINENT_REGION'] = df['COUNTRY'].apply(lambda c: 'Asia' if c in asian
                                             else ('Europe' if c in european else 'Other'))
print("\n=== First 5 Rows ===")
print(df.head())


=== First 5 Rows ===
     YEAR  MONTH   DAY  HOUR  MINUTE  LATITUDE  LONGITUDE  \
261  1861    6.0   5.0   NaN     NaN      -6.3      107.3   
262  1856    3.0   NaN   NaN     NaN     -44.0      172.0   
263  1867    8.0   5.0   NaN     NaN     -33.8      151.3   
264  1844    3.0  22.0  10.0    13.0      43.4       16.7   
265  1854    1.0  15.0   NaN     NaN      20.9      134.8   

                           LOCATION_NAME                  COUNTRY  \
261                                 JAVA                INDONESIA   
262  S CANTERBURY PROVINCE, SOUTH ISLAND              NEW ZEALAND   
263                        SYDNEY HARBOR                AUSTRALIA   
264                            DUBROVNIK                  CROATIA   
265                       PHILIPPINE SEA  NORTHWEST PACIFIC OCEAN   

                                                REGION       CAUSE  \
261   Indian Ocean (including west coast of Australia)     Unknown   
262  E Coast Australia, New Zealand, South Pacific Is.  

In [43]:
# 8. Build final DataFrame
features = ['MONTH','EQ_MAGNITUDE','LATITUDE','LONGITUDE',
            'COUNTRY','EQ_DEPTH','TS_INTENSITY','CAUSE',
            'CONTINENT_REGION','EVENT_VALIDITY']
df_final = df[features].copy()

In [44]:
# 9. Imputation
# Impute MONTH using mode (most frequent value)
df_final['MONTH'] = df_final['MONTH'].fillna(df_final['MONTH'].mode()[0])

# Impute EQ_MAGNITUDE, EQ_DEPTH, TS_INTENSITY using mean
for col in ['EQ_MAGNITUDE', 'EQ_DEPTH', 'TS_INTENSITY']:
    df_final[col] = df_final[col].fillna(df_final[col].mean())

# Ensure smallest circle radius ≥ 0.5 so every circle is visible
df_final['TS_INTENSITY_PLOT'] = df_final['TS_INTENSITY'] - df_final['TS_INTENSITY'].min() + 0.5

df_final = (
    df_final
    .drop(columns=['TS_INTENSITY'])
    .rename(columns={'TS_INTENSITY_PLOT': 'TS_INTENSITY'})
)
df_final.head()

Unnamed: 0,MONTH,EQ_MAGNITUDE,LATITUDE,LONGITUDE,COUNTRY,EQ_DEPTH,CAUSE,CONTINENT_REGION,EVENT_VALIDITY,TS_INTENSITY
261,6.0,7.047709,-6.3,107.3,INDONESIA,36.295045,Unknown,Asia,Questionable Tsunami,5.64
262,3.0,7.047709,-44.0,172.0,NEW ZEALAND,36.295045,Unknown,Other,Probable Tsunami,5.14
263,8.0,7.047709,-33.8,151.3,AUSTRALIA,36.295045,Unknown,Other,Very Doubtful Tsunami,3.64
264,3.0,7.047709,43.4,16.7,CROATIA,36.295045,Earthquake,Europe,Questionable Tsunami,6.64
265,1.0,7.047709,20.9,134.8,NORTHWEST PACIFIC OCEAN,36.295045,Volcano,Other,Very Doubtful Tsunami,5.617934


In [45]:
# 12. Encode categorical for SMOTE
le_country = LabelEncoder()
le_cause = LabelEncoder()
le_cont = LabelEncoder()
le_target = LabelEncoder()

df_final['COUNTRY_CODE'] = le_country.fit_transform(df_final['COUNTRY'])
df_final['CAUSE_CODE'] = le_cause.fit_transform(df_final['CAUSE'])
df_final['CONT_CODE'] = le_cont.fit_transform(df_final['CONTINENT_REGION'])
y = le_target.fit_transform(df_final['EVENT_VALIDITY'])
X = df_final[['MONTH','EQ_MAGNITUDE','LATITUDE','LONGITUDE','EQ_DEPTH',
              'TS_INTENSITY','COUNTRY_CODE','CAUSE_CODE','CONT_CODE']]

In [46]:
df_final[features].isnull().sum()


Unnamed: 0,0
MONTH,0
EQ_MAGNITUDE,0
LATITUDE,0
LONGITUDE,0
COUNTRY,0
EQ_DEPTH,0
TS_INTENSITY,0
CAUSE,0
CONTINENT_REGION,0
EVENT_VALIDITY,0


In [47]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# 1. Encode labels (if not already)
from sklearn.preprocessing import LabelEncoder
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(df_final['EVENT_VALIDITY'])

# 2. Prepare features (already done)
X = df_final[['MONTH','EQ_MAGNITUDE','LATITUDE','LONGITUDE',
              'EQ_DEPTH','TS_INTENSITY',
              'COUNTRY_CODE','CAUSE_CODE','CONT_CODE']]

# 3. Apply SMOTE
print("Before SMOTE:", np.bincount(y_encoded))
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
print("After SMOTE:", np.bincount(y_resampled))

# 4. Split into training/testing
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.3,
    stratify=y_resampled,
    random_state=42
)


Before SMOTE: [794  78 278 346 300]
After SMOTE: [794 794 794 794 794]


# **Model Development**

In [48]:
# 4. Define all of your models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Naive Bayes': GaussianNB(),
    'SVM (RBF Kernel)': SVC(kernel='rbf', probability=True, random_state=42),
    'Neural Network (MLP)': MLPClassifier(hidden_layer_sizes=(256, 128, 64), activation='relu',
        solver='adam',max_iter=500, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}

# 5. Train & evaluate
accuracies = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    accuracies.append((name, acc))
    report = classification_report(
        y_test,
        y_pred,
        target_names=le_target.classes_,
        zero_division=0
    )

    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(report)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Logistic Regression ===
Accuracy: 0.3451
Confusion Matrix:
[[120  43  34  23  18]
 [ 52 106   7  36  37]
 [ 61  65  53  40  19]
 [ 31  38  46  72  52]
 [ 37  57  24  60  60]]
Classification Report:
                                                                   precision    recall  f1-score   support

                                                 Definite Tsunami       0.40      0.50      0.45       238
Event that only caused a seiche or disturbance in an inland river       0.34      0.45      0.39       238
                                                 Probable Tsunami       0.32      0.22      0.26       238
                                             Questionable Tsunami       0.31      0.30      0.31       239
                                            Very Doubtful Tsunami       0.32      0.25      0.28       238

                                                         accuracy                           0.35      1191
                                              

Parameters: { "use_label_encoder" } are not used.




=== XGBoost ===
Accuracy: 0.7557
Confusion Matrix:
[[186   8  22  14   8]
 [  5 227   1   1   4]
 [ 18   9 166  24  21]
 [ 21   5  27 148  38]
 [ 16   6  19  24 173]]
Classification Report:
                                                                   precision    recall  f1-score   support

                                                 Definite Tsunami       0.76      0.78      0.77       238
Event that only caused a seiche or disturbance in an inland river       0.89      0.95      0.92       238
                                                 Probable Tsunami       0.71      0.70      0.70       238
                                             Questionable Tsunami       0.70      0.62      0.66       239
                                            Very Doubtful Tsunami       0.71      0.73      0.72       238

                                                         accuracy                           0.76      1191
                                                        ma

In [49]:
# Sort by accuracy descending
accuracies.sort(key=lambda x: x[1], reverse=True)

# Print results
print("Model accuracies (high→low):")
for name, acc in accuracies:
    print(f"{name:25s} : {acc:.4f}")

Model accuracies (high→low):
Random Forest             : 0.7708
XGBoost                   : 0.7557
LightGBM                  : 0.7531
Neural Network (MLP)      : 0.6549
Decision Tree             : 0.4492
SVM (RBF Kernel)          : 0.4274
Naive Bayes               : 0.3661
Logistic Regression       : 0.3451


In [50]:
# === Train best model ===
best_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on SMOTE-balanced data
best_rf.fit(X_train, y_train)

# Predict on test set
y_pred = best_rf.predict(X_test)

# Calculate and print accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy of best Random Forest model: {test_accuracy:.4f}")

Test set accuracy of best Random Forest model: 0.7708


In [51]:
# Save the model
joblib.dump(best_rf, 'tsunami_rf_model.pkl')
print("RandomForest model saved as 'tsunami_rf_model.pkl'")

# Save the label encoder for decoding predictions later
joblib.dump(le_target, 'tsunami_label_encoder.pkl')
print("Label encoder saved as 'tsunami_label_encoder.pkl'")

RandomForest model saved as 'tsunami_rf_model.pkl'
Label encoder saved as 'tsunami_label_encoder.pkl'
