In [101]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/forest-fires-regression/forestfires.csv


# Imports

In [119]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer, classification_report, confusion_matrix, roc_auc_score, make_scorer
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt


# Forest Fire data representation

In [103]:
df = pd.read_csv("/kaggle/input/forest-fires-regression/forestfires.csv")
df.describe()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


# Implementing diff models and algorithms

## Helper functions & Data Preprocessing

In [104]:
def evaluate_model(model_name, y_test, y_pred):
    mad = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{model_name} Performance:")
    print(f"Mean Absolute Deviation (MAD): {mad}")
    print(f"Root Mean Squared Error (RMSE): {rmse}\n")



In [105]:

numerical_features = ['temp', 'RH', 'wind', 'rain']  
categorical_features = []  
# Preprocess the data
def preprocess_data(df, target, categorical_features, numerical_features):
    # Log-transform the target variable
    y = np.log1p(df[target])  
    
    # Remove the target column from the features
    X = df.drop(columns=[target])
    
    # Define transformers
    preprocess = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )
    
    # Apply transformations
    X_transformed = preprocess.fit_transform(X)
    
    # Split the dataset into train and test sets
    return train_test_split(X_transformed, y, test_size=0.2, random_state=42)



### Neural network

In [106]:
def neural_network(X_train, X_test, y_train, y_test):
    model = MLPRegressor(hidden_layer_sizes=(10,), max_iter=1000, solver='adam',learning_rate_init=0.001,early_stopping=True, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    evaluate_model("Neural Network", y_test, y_pred)


### Optimimzing MLP

In [107]:
# Cross-Validation for MLP
def cross_validate_mlp(X, y):
    model = MLPRegressor(hidden_layer_sizes=(10,), max_iter=500, solver='adam', learning_rate_init=0.001, random_state=42)
    scorer = make_scorer(mean_squared_error, squared=False)
    scores = cross_val_score(model, X, y, cv=5, scoring=scorer)
    print("Cross-validated RMSE scores:", scores)
    print("Mean RMSE:", scores.mean())

# Hyperparameter Tuning for MLP
def tune_mlp(X_train, y_train):
    param_grid = {
        'hidden_layer_sizes': [(10,), (50,), (100,)],
        'learning_rate_init': [0.001, 0.01, 0.1],
        'solver': ['adam', 'sgd'],
        'max_iter': [500, 1000]
    }
    
    model = MLPRegressor(random_state=42, early_stopping=True)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    print("Best Parameters:", grid_search.best_params_)
    print("Best RMSE:", np.sqrt(-grid_search.best_score_))
    return best_model


### Evaluate

In [108]:
# Define target and features
target_column = 'area'
numerical_features = ['temp', 'RH', 'wind', 'rain']
categorical_features = []  # Add categorical features if needed

# Preprocess data
X_train, X_test, y_train, y_test = preprocess_data(df, target_column, categorical_features, numerical_features)

# Cross-validate MLP
print("Cross-Validation Results for MLP:")
cross_validate_mlp(X_train, y_train)

# Hyperparameter tuning for MLP
print("\nHyperparameter Tuning for MLP:")
best_mlp = tune_mlp(X_train, y_train)

# Evaluate best MLP on the test set
y_pred = best_mlp.predict(X_test)
evaluate_model("Tuned MLP Regressor", y_test, y_pred)

Cross-Validation Results for MLP:




Cross-validated RMSE scores: [1.41556921 1.26431999 1.70147067 1.54545318 1.26838491]
Mean RMSE: 1.4390395931493056

Hyperparameter Tuning for MLP:
Best Parameters: {'hidden_layer_sizes': (50,), 'learning_rate_init': 0.001, 'max_iter': 500, 'solver': 'sgd'}
Best RMSE: 1.3859581187939796
Tuned MLP Regressor Performance:
Mean Absolute Deviation (MAD): 1.1573045980401029
Root Mean Squared Error (RMSE): 1.453821275015725



## SVM


In [109]:
def support_vector_machine(X_train, X_test, y_train, y_test):
    model = SVR(kernel='rbf', C=3, epsilon=0.1, gamma='scale')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    evaluate_model("Support Vector Machine", y_test, y_pred)


### Optimizing SVM

In [110]:
# Cross-validate SVM
def cross_validate_svm(X, y):
    model = SVR(kernel='rbf', C=3, epsilon=0.1, gamma='scale')
    scorer = make_scorer(mean_squared_error, squared=False)  # Use RMSE
    scores = cross_val_score(model, X, y, cv=5, scoring=scorer)
    
    # Print cross-validation scores
    print("Cross-Validation Results for SVM:")
    for i, score in enumerate(scores, 1):
        print(f"Fold {i}: RMSE = {score:.4f}")
    print(f"Mean RMSE: {np.mean(scores):.4f}")
    print(f"Standard Deviation of RMSE: {np.std(scores):.4f}\n")

# Hyperparameter tuning for SVM
def tune_svm(X_train, y_train):
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.01, 0.1, 0.5, 1],
        'gamma': ['scale', 'auto', 0.01, 0.1]
    }
    model = SVR(kernel='rbf')
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Extract best parameters and results
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = np.sqrt(-grid_search.best_score_)  # Convert negative MSE to RMSE
    
    print("Hyperparameter Tuning Results for SVM:")
    print(f"Best Parameters: {best_params}")
    print(f"Best RMSE: {best_score:.4f}\n")
    
    return best_model

### Evaluation

In [111]:
# Evaluate SVM
print("Cross-Validation Results for SVM:")
cross_validate_svm(X_train, y_train)

print("\nHyperparameter Tuning for SVM:")
best_svm = tune_svm(X_train, y_train)

# Evaluate best SVM on the test set
y_pred = best_svm.predict(X_test)
evaluate_model("Tuned Support Vector Machine", y_test, y_pred)

Cross-Validation Results for SVM:
Cross-Validation Results for SVM:
Fold 1: RMSE = 1.4865
Fold 2: RMSE = 1.4091
Fold 3: RMSE = 1.6665
Fold 4: RMSE = 1.6731
Fold 5: RMSE = 1.2988
Mean RMSE: 1.5068
Standard Deviation of RMSE: 0.1458


Hyperparameter Tuning for SVM:
Hyperparameter Tuning Results for SVM:
Best Parameters: {'C': 100, 'epsilon': 1, 'gamma': 0.01}
Best RMSE: 1.3699

Tuned Support Vector Machine Performance:
Mean Absolute Deviation (MAD): 1.1675611358607194
Root Mean Squared Error (RMSE): 1.4712687197338465



## Naive Bayes Algorithms


In [151]:
# Define the classification function
def classify_area(area):
    if area == 0:
        return 0  # No fire
    elif area <= 5:
        return 1  # Small fire
    elif area <= 20:
        return 2  # Medium fire
    else:
        return 3  # Large fire

# Apply the function to create a new column
df['area_class'] = df['area'].apply(classify_area)

# Check the class distribution
print("Class distribution for 'area_class':")
print(df['area_class'].value_counts())# Define the classification function
def classify_area(area):
    if area == 0:
        return 0  # No fire
    elif area <= 5:
        return 1  # Small fire
    elif area <= 20:
        return 2  # Medium fire
    else:
        return 3  # Large fire

# Apply the function to create a new column
df['area_class'] = df['area'].apply(classify_area)

# Check the class distribution
print("Class distribution for 'area_class':")
print(df['area_class'].value_counts())

Class distribution for 'area_class':
area_class
0    247
1    119
2     92
3     59
Name: count, dtype: int64
Class distribution for 'area_class':
area_class
0    247
1    119
2     92
3     59
Name: count, dtype: int64


In [152]:
def preprocess_classification_data(df, target, categorical_features, numerical_features):
    # Encode target labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df[target])  # Converts target labels to integers

    # Define preprocessing for features
    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(), categorical_features),  # One-hot encode categorical features
            ("num", MinMaxScaler(), numerical_features),  # Min-max scale numerical features
        ]
    )

    # Apply transformations to input features
    X = preprocess.fit_transform(df.drop(columns=[target]))

    # Split data into training and testing sets
    return train_test_split(X, y, test_size=0.3, random_state=42), label_encoder

## Multinomial Bayes


In [None]:
def calculate_specificity(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn = cm[0, 0]  # True Negatives
    fp = cm[0, 1]  # False Positives
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    return specificity
def multinomial_nb(X_train, X_test, y_train, y_test):
    # Initialize the model
    model = MultinomialNB()
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if len(model.classes_) == 2 else None

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)
    specificity = calculate_specificity(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else "Not applicable for multiclass"

    # Print Metrics
    print("Multinomial Naive Bayes Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    if y_proba is not None:
        print(f"AUC: {auc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("-" * 50)


# With area

In [153]:
# Define target and features
target_column = 'area'  # Burned area column
categorical_features = []  # Replace with categorical feature names if available
numerical_features = ['temp', 'RH', 'wind', 'rain']  # Numerical feature names from dataset

# Preprocess the dataset
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df, target_column, categorical_features, numerical_features
)

# Train and evaluate Multinomial Naive Bayes
multinomial_nb(train_X, test_X, train_y, test_y)

Multinomial Naive Bayes Performance:
Accuracy: 0.4936
Precision: 0.2436
Recall (Sensitivity): 0.4936
Specificity: 1.0000
Confusion Matrix:
[[77  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 ...
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]]
--------------------------------------------------


# With class 0 evaluating with area class

In [154]:
# Define target and features
target_column = 'area_class'  # New target column
categorical_features = []  # Replace with categorical feature names
numerical_features = ['temp', 'RH', 'wind', 'rain']  # Numerical feature names


# Preprocess the dataset
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df, target_column, categorical_features, numerical_features
)

# Train and evaluate Multinomial Naive Bayes
multinomial_nb(train_X, test_X, train_y, test_y)

Multinomial Naive Bayes Performance:
Accuracy: 0.4936
Precision: 0.2436
Recall (Sensitivity): 0.4936
Specificity: 1.0000
Confusion Matrix:
[[77  0  0  0]
 [34  0  0  0]
 [27  0  0  0]
 [18  0  0  0]]
--------------------------------------------------


# Without class 0

In [155]:
# Remove rows where area_class == 0
df_filtered = df[df['area_class'] != 0]

# Update target and features
target_column = 'area_class'
categorical_features = []  # Replace with categorical features if available
numerical_features = ['temp', 'RH', 'wind', 'rain']

# Preprocess the filtered dataset
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df_filtered, target_column, categorical_features, numerical_features
)