"Dado um terremoto recém-registrado, este evento tem alto risco de gerar um tsunami?"

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Carregar dataset
df = pd.read_csv("earthquake_data_tsunami.csv")

# Features e target
X = df.drop(columns=['tsunami'])
y = df['tsunami']

# Dividir treino/teste
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Pipeline de pré-processamento + modelo
pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('model', RandomForestClassifier())
])

# Param grid com vários modelos
param_grid = [
    {
        'model': [RandomForestClassifier()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 5, 10]
    },
    {
        'model': [GradientBoostingClassifier()],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [3, 5]
    },
    {
        'model': [SVC()],
        'model__C': [1, 5],
        'model__kernel': ['linear', 'rbf'],
        'model__gamma': ['scale', 'auto']
    },
    {
        'model': [LogisticRegression(max_iter=1000)],
        'model__C': [0.1, 1, 10],
        'model__solver': ['lbfgs']
    }
]

# Balanceamento com SMOTE
Xtrain_bal, ytrain_bal = SMOTE(random_state=42).fit_resample(Xtrain, ytrain)

# Grid Search
search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=2)
search.fit(Xtrain_bal, ytrain_bal)

# Melhor modelo
best_model = search.best_estimator_
print("Melhor modelo:")
print(best_model)
print("Score médio CV:", search.best_score_)

# Avaliação no teste
y_pred = best_model.predict(Xtest)
print("\nClassification Report:\n", classification_report(ytest, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(ytest, y_pred))


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Melhor modelo:
Pipeline(steps=[('scaler', RobustScaler()),
                ('model',
                 GradientBoostingClassifier(learning_rate=0.01, max_depth=5))])
Score médio CV: 0.9385276917784656

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95        96
           1       0.89      0.95      0.92        61

    accuracy                           0.94       157
   macro avg       0.93      0.94      0.93       157
weighted avg       0.94      0.94      0.94       157


Confusion Matrix:
 [[89  7]
 [ 3 58]]
