In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import cv2
import os

In [2]:
# 1. Load Dataset
def load_dataset(folder_path, label_map, img_size=(128, 128)):
    """
    Load images and their labels from folder.
    Args:
        folder_path (str): Path to the folder containing class subfolders.
        label_map (dict): Mapping of folder names to class labels.
        img_size (tuple): Desired size of images (default is 128x128).
    Returns:
        tuple: (images, labels)
    """
    images = []
    labels = []
    for label_name, label in label_map.items():
        class_folder = os.path.join(folder_path, label_name)
        for file in os.listdir(class_folder):
            file_path = os.path.join(class_folder, file)
            try:
                # Read image, resize, and normalize to [0, 1]
                img = cv2.imread(file_path)
                img = cv2.resize(img, img_size)
                images.append(img.flatten())  # Flatten the image
                labels.append(label)
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
    return np.array(images), np.array(labels)

In [3]:
# Load and preprocess training and validation data
train_folder = '/kaggle/input/disasterclassification/train'
val_folder = '/kaggle/input/disasterclassification/validation'
# Map folder names to class labels
label_map = {'earthquake': 0, 'cyclone': 1, 'flood': 2, 'wildfire': 3}

X_train, y_train = load_dataset(train_folder, label_map)
X_val, y_val = load_dataset(val_folder, label_map)


In [4]:
# 2. Standardize Features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [5]:
# 3. Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [6]:
# 4. Predict and Evaluate
y_pred = rf_model.predict(X_val)

# Classification Report
print("Classification Report:")
print(classification_report(y_val, y_pred, target_names=label_map.keys()))

# Accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

# Feature Importance (optional)
importances = rf_model.feature_importances_
important_features = np.argsort(importances)[-10:][::-1]  # Top 10 features
print(f"Top 10 Important Features: {important_features}")

Classification Report:
              precision    recall  f1-score   support

  earthquake       0.61      0.59      0.60       100
     cyclone       0.94      0.76      0.84       100
       flood       0.66      0.67      0.67       100
    wildfire       0.72      0.88      0.79       100

    accuracy                           0.73       400
   macro avg       0.73      0.72      0.73       400
weighted avg       0.73      0.72      0.73       400

Validation Accuracy: 0.72
Top 10 Important Features: [27078 22479 23268 46644 26700 46632 24009 23640 23235 20151]


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200],        # Number of trees in the forest
    'max_depth': [None, 30,40],      # Maximum depth of the trees
    'min_samples_split': [1,5, 7],      # Minimum samples required to split an internal node
    'min_samples_leaf': [1,2, 4],        # Minimum samples required at each leaf node
    'bootstrap': [True, False],           # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',                   # Metric to optimize
    cv=3,                                 # 3-fold cross-validation
    verbose=2,                            # Print progress during the search
    n_jobs=-1                             # Use all available CPU cores
)

# Fit the Grid Search model
print("Starting Grid Search...")
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("\nBest Parameters:")
print(best_params)

# Validate the best model on the validation dataset
y_val_pred = best_model.predict(X_val)

# Evaluation metrics for best model
print("\nValidation Set Evaluation (Best Model):")
print(classification_report(y_val, y_val_pred, target_names=label_map.keys()))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.2f}")

# Get the validation scores for all models
cv_results = grid_search.cv_results_

# Create a DataFrame to display results
cv_results_df = pd.DataFrame({
    'params': cv_results['params'],
    'mean_test_score': cv_results['mean_test_score'],
    'std_test_score': cv_results['std_test_score'],
    'rank_test_score': cv_results['rank_test_score']
})

# Sort by rank to display the best performing models at the top
cv_results_df = cv_results_df.sort_values(by='rank_test_score')

# Display all models and their validation scores
print("\nValidation Scores for All Models:")
print(cv_results_df)


Starting Grid Search...
Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  40.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=7, n_estimators=100; total time=  20.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=7, n_estimators=200; total time=  40.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  19.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=7, n_estimators=100; total time=  19.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=7

108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/opt/conda/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(



Best Parameters:
{'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 200}

Validation Set Evaluation (Best Model):
              precision    recall  f1-score   support

  earthquake       0.62      0.65      0.64       100
     cyclone       0.97      0.75      0.85       100
       flood       0.67      0.72      0.70       100
    wildfire       0.77      0.86      0.81       100

    accuracy                           0.74       400
   macro avg       0.76      0.74      0.75       400
weighted avg       0.76      0.74      0.75       400

Validation Accuracy: 0.74

Validation Scores for All Models:
                                                params  mean_test_score  \
65   {'bootstrap': False, 'max_depth': None, 'min_s...         0.718740   
101  {'bootstrap': False, 'max_depth': 40, 'min_sam...         0.718740   
83   {'bootstrap': False, 'max_depth': 30, 'min_sam...         0.718740   
21   {'bootstrap': True, 'max_depth':