In [49]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/forest-fires-regression/forestfires.csv


# Imports

In [82]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
    classification_report,
    mean_absolute_error, 
    mean_squared_error
)
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt


# Forest Fire data representation

In [83]:
df = pd.read_csv("/kaggle/input/forest-fires-regression/forestfires.csv")
df.describe()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


# Implementing diff models and algorithms

## Helper functions & Data Preprocessing

In [84]:
def evaluate_model(model_name, y_test, y_pred):
    mad = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{model_name} Performance:")
    print(f"Mean Absolute Deviation (MAD): {mad}")
    print(f"Root Mean Squared Error (RMSE): {rmse}\n")



In [85]:

numerical_features = ['temp', 'RH', 'wind', 'rain']  
categorical_features = []  
# Preprocess the data
def preprocess_data(df, target, categorical_features, numerical_features):
    # Log-transform the target variable
    y = np.log1p(df[target])  
    
    # Remove the target column from the features
    X = df.drop(columns=[target])
    
    # Define transformers
    preprocess = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )
    
    # Apply transformations
    X_transformed = preprocess.fit_transform(X)
    
    # Split the dataset into train and test sets
    return train_test_split(X_transformed, y, test_size=0.2, random_state=42)



### Neural network

In [86]:
def neural_network(X_train, X_test, y_train, y_test):
    model = MLPRegressor(hidden_layer_sizes=(10,), max_iter=1000, solver='adam',learning_rate_init=0.001,early_stopping=True, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    evaluate_model("Neural Network", y_test, y_pred)


### Optimimzing MLP

In [87]:
# Cross-Validation for MLP
def cross_validate_mlp(X, y):
    model = MLPRegressor(hidden_layer_sizes=(10,), max_iter=500, solver='adam', learning_rate_init=0.001, random_state=42)
    scorer = make_scorer(mean_squared_error, squared=False)
    scores = cross_val_score(model, X, y, cv=5, scoring=scorer)
    print("Cross-validated RMSE scores:", scores)
    print("Mean RMSE:", scores.mean())

# Hyperparameter Tuning for MLP
def tune_mlp(X_train, y_train):
    param_grid = {
        'hidden_layer_sizes': [(10,), (50,), (100,)],
        'learning_rate_init': [0.001, 0.01, 0.1],
        'solver': ['adam', 'sgd'],
        'max_iter': [500, 1000]
    }
    
    model = MLPRegressor(random_state=42, early_stopping=True)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    print("Best Parameters:", grid_search.best_params_)
    print("Best RMSE:", np.sqrt(-grid_search.best_score_))
    return best_model


### Evaluate

In [88]:
# Define target and features
target_column = 'area'
numerical_features = ['temp', 'RH', 'wind', 'rain']
categorical_features = []  # Add categorical features if needed

# Preprocess data
X_train, X_test, y_train, y_test = preprocess_data(df, target_column, categorical_features, numerical_features)

# Cross-validate MLP
print("Cross-Validation Results for MLP:")
cross_validate_mlp(X_train, y_train)

# Hyperparameter tuning for MLP
print("\nHyperparameter Tuning for MLP:")
best_mlp = tune_mlp(X_train, y_train)

# Evaluate best MLP on the test set
y_pred = best_mlp.predict(X_test)
evaluate_model("Tuned MLP Regressor", y_test, y_pred)

Cross-Validation Results for MLP:




Cross-validated RMSE scores: [1.41556921 1.26431999 1.70147067 1.54545318 1.26838491]
Mean RMSE: 1.4390395931493056

Hyperparameter Tuning for MLP:
Best Parameters: {'hidden_layer_sizes': (50,), 'learning_rate_init': 0.001, 'max_iter': 500, 'solver': 'sgd'}
Best RMSE: 1.3859581187939796
Tuned MLP Regressor Performance:
Mean Absolute Deviation (MAD): 1.1573045980401029
Root Mean Squared Error (RMSE): 1.453821275015725



## SVM


In [89]:
def support_vector_machine(X_train, X_test, y_train, y_test):
    model = SVR(kernel='rbf', C=3, epsilon=0.1, gamma='scale')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    evaluate_model("Support Vector Machine", y_test, y_pred)


### Optimizing SVM

In [90]:
# Cross-validate SVM
def cross_validate_svm(X, y):
    model = SVR(kernel='rbf', C=3, epsilon=0.1, gamma='scale')
    scorer = make_scorer(mean_squared_error, squared=False)  # Use RMSE
    scores = cross_val_score(model, X, y, cv=5, scoring=scorer)
    
    # Print cross-validation scores
    print("Cross-Validation Results for SVM:")
    for i, score in enumerate(scores, 1):
        print(f"Fold {i}: RMSE = {score:.4f}")
    print(f"Mean RMSE: {np.mean(scores):.4f}")
    print(f"Standard Deviation of RMSE: {np.std(scores):.4f}\n")

# Hyperparameter tuning for SVM
def tune_svm(X_train, y_train):
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.01, 0.1, 0.5, 1],
        'gamma': ['scale', 'auto', 0.01, 0.1]
    }
    model = SVR(kernel='rbf')
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Extract best parameters and results
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = np.sqrt(-grid_search.best_score_)  # Convert negative MSE to RMSE
    
    print("Hyperparameter Tuning Results for SVM:")
    print(f"Best Parameters: {best_params}")
    print(f"Best RMSE: {best_score:.4f}\n")
    
    return best_model

### Evaluation

In [91]:
# Evaluate SVM
print("Cross-Validation Results for SVM:")
cross_validate_svm(X_train, y_train)

print("\nHyperparameter Tuning for SVM:")
best_svm = tune_svm(X_train, y_train)

# Evaluate best SVM on the test set
y_pred = best_svm.predict(X_test)
evaluate_model("Tuned Support Vector Machine", y_test, y_pred)

Cross-Validation Results for SVM:
Cross-Validation Results for SVM:
Fold 1: RMSE = 1.4865
Fold 2: RMSE = 1.4091
Fold 3: RMSE = 1.6665
Fold 4: RMSE = 1.6731
Fold 5: RMSE = 1.2988
Mean RMSE: 1.5068
Standard Deviation of RMSE: 0.1458


Hyperparameter Tuning for SVM:
Hyperparameter Tuning Results for SVM:
Best Parameters: {'C': 100, 'epsilon': 1, 'gamma': 0.01}
Best RMSE: 1.3699

Tuned Support Vector Machine Performance:
Mean Absolute Deviation (MAD): 1.1675611358607194
Root Mean Squared Error (RMSE): 1.4712687197338465



## Naive Bayes Algorithms
### Change the areas into diff. classes

In [92]:
# Define the classification function
def classify_area(area):
    if area == 0:
        return 0  # No fire
    elif area <= 5:
        return 1  # Small fire
    elif area <= 20:
        return 2  # Medium fire
    else:
        return 3  # Large fire

# Apply the function to create a new column
df['area_class'] = df['area'].apply(classify_area)

# Check the class distribution
print("Class distribution for 'area_class':")
print(df['area_class'].value_counts())# Define the classification function
def classify_area(area):
    if area == 0:
        return 0  # No fire
    elif area <= 5:
        return 1  # Small fire
    elif area <= 20:
        return 2  # Medium fire
    else:
        return 3  # Large fire

# Apply the function to create a new column
df['area_class'] = df['area'].apply(classify_area)

# Check the class distribution
print("Class distribution for 'area_class':")
print(df['area_class'].value_counts())

Class distribution for 'area_class':
area_class
0    247
1    119
2     92
3     59
Name: count, dtype: int64
Class distribution for 'area_class':
area_class
0    247
1    119
2     92
3     59
Name: count, dtype: int64


### pre-process the data

In [100]:
def preprocess_classification_data(df, target, categorical_features, numerical_features):
    # Encode target labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df[target])  # Converts target labels to integers

    # Define preprocessing for features
    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(), categorical_features),  # One-hot encode categorical features
            ("num", MinMaxScaler(), numerical_features),  # Min-max scale numerical features
        ]
    )

    # Apply transformations to input features
    X = preprocess.fit_transform(df.drop(columns=[target]))

    # Split data into training and testing sets
    return train_test_split(X, y, test_size=0.3, random_state=42), label_encoder

## Multinomial Bayes


In [101]:
# Cross-validation function
def cross_validate_multinomial_nb(X, y):
    model = MultinomialNB()
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    
    print("Cross-Validation Results for Multinomial Naive Bayes:")
    print(f"Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {scores.mean():.4f}")
    print(f"Standard Deviation: {scores.std():.4f}")
    print("-" * 50)

In [102]:
# Hyperparameter tuning using Grid Search
def grid_search_multinomial_nb(X_train, y_train):
    param_grid = {
        'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]  # Laplace smoothing parameter
    }
    model = MultinomialNB()
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print("Grid Search Results for Multinomial Naive Bayes:")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    print("-" * 50)
    return grid_search.best_estimator_

In [103]:
def calculate_specificity(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn = cm[0, 0]  # True Negatives
    fp = cm[0, 1]  # False Positives
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    return specificity
def multinomial_nb(X_train, X_test, y_train, y_test):
    # Cross-validation
    cross_validate_multinomial_nb(X_train, y_train)

    # Grid search
    best_model = grid_search_multinomial_nb(X_train, y_train)

    # Train the best model on the training set
    best_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1] if len(best_model.classes_) == 2 else None

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)
    specificity = calculate_specificity(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else "Not applicable for multiclass"

    # Print Metrics
    print("Final Evaluation with Tuned Multinomial Naive Bayes:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    if y_proba is not None:
        print(f"AUC: {auc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("-" * 50)


# With area

In [104]:
# Define target and features
target_column = 'area'  # Burned area column
categorical_features = []  # Replace with categorical feature names if available
numerical_features = ['temp', 'RH', 'wind', 'rain']  # Numerical feature names from dataset

# Preprocess the dataset
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df, target_column, categorical_features, numerical_features
)

# Train and evaluate Multinomial Naive Bayes
multinomial_nb(train_X, test_X, train_y, test_y)

Cross-Validation Results for Multinomial Naive Bayes:
Accuracy Scores: [0.46575342 0.47222222 0.47222222 0.47222222 0.47222222]
Mean Accuracy: 0.4709
Standard Deviation: 0.0026
--------------------------------------------------
Grid Search Results for Multinomial Naive Bayes:
Best Parameters: {'alpha': 0.1}
Best Cross-Validation Accuracy: 0.4709
--------------------------------------------------
Final Evaluation with Tuned Multinomial Naive Bayes:
Accuracy: 0.4936
Precision: 0.2436
Recall (Sensitivity): 0.4936
Specificity: 1.0000
Confusion Matrix:
[[77  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 ...
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]]
--------------------------------------------------




# With class 0 evaluating with area class

In [105]:
# Define target and features
target_column = 'area_class'  # New target column
categorical_features = []  # Replace with categorical feature names
numerical_features = ['temp', 'RH', 'wind', 'rain']  # Numerical feature names


# Preprocess the dataset
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df, target_column, categorical_features, numerical_features
)

# Train and evaluate Multinomial Naive Bayes
multinomial_nb(train_X, test_X, train_y, test_y)

Cross-Validation Results for Multinomial Naive Bayes:
Accuracy Scores: [0.46575342 0.47222222 0.47222222 0.47222222 0.47222222]
Mean Accuracy: 0.4709
Standard Deviation: 0.0026
--------------------------------------------------
Grid Search Results for Multinomial Naive Bayes:
Best Parameters: {'alpha': 0.1}
Best Cross-Validation Accuracy: 0.4709
--------------------------------------------------
Final Evaluation with Tuned Multinomial Naive Bayes:
Accuracy: 0.4936
Precision: 0.2436
Recall (Sensitivity): 0.4936
Specificity: 1.0000
Confusion Matrix:
[[77  0  0  0]
 [34  0  0  0]
 [27  0  0  0]
 [18  0  0  0]]
--------------------------------------------------


# Without class 0

In [108]:
# Remove rows where area_class == 0
df_filtered = df[df['area_class'] != 0]

# Update target column to 'area_class'
target_column = 'area_class'

# Preprocess and train
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df_filtered, target_column, categorical_features, numerical_features
)
multinomial_nb(train_X, test_X, train_y, test_y)

Cross-Validation Results for Multinomial Naive Bayes:
Accuracy Scores: [0.42105263 0.39473684 0.39473684 0.39473684 0.40540541]
Mean Accuracy: 0.4021
Standard Deviation: 0.0103
--------------------------------------------------
Grid Search Results for Multinomial Naive Bayes:
Best Parameters: {'alpha': 2.0}
Best Cross-Validation Accuracy: 0.4075
--------------------------------------------------
Final Evaluation with Tuned Multinomial Naive Bayes:
Accuracy: 0.5185
Precision: 0.2689
Recall (Sensitivity): 0.5185
Specificity: 1.0000
Confusion Matrix:
[[42  0  0]
 [24  0  0]
 [15  0  0]]
--------------------------------------------------


## Gaussian Naive Bayes

In [109]:
# Cross-validation for GaussianNB
def cross_validate_gaussian_nb(X, y):
    model = GaussianNB()
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    
    print("Cross-Validation Results for Gaussian Naive Bayes:")
    print(f"Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {scores.mean():.4f}")
    print(f"Standard Deviation: {scores.std():.4f}")
    print("-" * 50)

# Hyperparameter tuning for GaussianNB (tuning var_smoothing)
def grid_search_gaussian_nb(X_train, y_train):
    param_grid = {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]  # Example range for variance smoothing
    }
    model = GaussianNB()
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print("Grid Search Results for Gaussian Naive Bayes:")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    print("-" * 50)
    return grid_search.best_estimator_

In [112]:
from sklearn.naive_bayes import GaussianNB
# Train and evaluate GaussianNB with cross-validation and grid search
def gaussian_nb(X_train, X_test, y_train, y_test):
    # Cross-validation
    cross_validate_gaussian_nb(X_train, y_train)

    # Grid search
    best_model = grid_search_gaussian_nb(X_train, y_train)

    # Train the best model on the training set
    best_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1] if len(best_model.classes_) == 2 else None

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)
    specificity = calculate_specificity(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else "Not applicable for multiclass"

    # Print Metrics
    print("Final Evaluation with Tuned Gaussian Naive Bayes:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    if y_proba is not None:
        print(f"AUC: {auc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("-" * 50)

# Area

In [113]:
# Define target and features
target_column = 'area'
categorical_features = []  # Replace with categorical feature names if available
numerical_features = ['temp', 'RH', 'wind', 'rain']  # Numerical features

# Preprocess and train
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df, target_column, categorical_features, numerical_features
)
gaussian_nb(train_X, test_X, train_y, test_y)




Cross-Validation Results for Gaussian Naive Bayes:
Accuracy Scores: [0.01369863 0.04166667 0.01388889 0.375      0.27777778]
Mean Accuracy: 0.1444
Standard Deviation: 0.1521
--------------------------------------------------
Grid Search Results for Gaussian Naive Bayes:
Best Parameters: {'var_smoothing': 1e-05}
Best Cross-Validation Accuracy: 0.3491
--------------------------------------------------
Final Evaluation with Tuned Gaussian Naive Bayes:
Accuracy: 0.2821
Precision: 0.2088
Recall (Sensitivity): 0.2821
Specificity: 0.8800
Confusion Matrix:
[[44  6  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 ...
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]]
--------------------------------------------------


# with area class

In [114]:
# Update target column to 'area_class'
target_column = 'area_class'

# Preprocess and train
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df, target_column, categorical_features, numerical_features
)
gaussian_nb(train_X, test_X, train_y, test_y)


Cross-Validation Results for Gaussian Naive Bayes:
Accuracy Scores: [0.21917808 0.23611111 0.25       0.19444444 0.23611111]
Mean Accuracy: 0.2272
Standard Deviation: 0.0191
--------------------------------------------------
Grid Search Results for Gaussian Naive Bayes:
Best Parameters: {'var_smoothing': 1e-09}
Best Cross-Validation Accuracy: 0.2272
--------------------------------------------------
Final Evaluation with Tuned Gaussian Naive Bayes:
Accuracy: 0.2179
Precision: 0.5419
Recall (Sensitivity): 0.2179
Specificity: 0.0137
Confusion Matrix:
[[ 1 72  1  3]
 [ 0 33  1  0]
 [ 0 26  0  1]
 [ 0 18  0  0]]
--------------------------------------------------


# With area class excluding class 0

In [115]:
# Remove rows where area_class == 0
df_filtered = df[df['area_class'] != 0]

# Update target column to 'area_class'
target_column = 'area_class'

# Preprocess and train
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df_filtered, target_column, categorical_features, numerical_features
)
gaussian_nb(train_X, test_X, train_y, test_y)


Cross-Validation Results for Gaussian Naive Bayes:
Accuracy Scores: [0.23684211 0.23684211 0.31578947 0.23684211 0.2972973 ]
Mean Accuracy: 0.2647
Standard Deviation: 0.0346
--------------------------------------------------
Grid Search Results for Gaussian Naive Bayes:
Best Parameters: {'var_smoothing': 1e-09}
Best Cross-Validation Accuracy: 0.2647
--------------------------------------------------
Final Evaluation with Tuned Gaussian Naive Bayes:
Accuracy: 0.1852
Precision: 0.0343
Recall (Sensitivity): 0.1852
Specificity: 0.0000
Confusion Matrix:
[[ 0  0 42]
 [ 0  0 24]
 [ 0  0 15]]
--------------------------------------------------


## Categorical Naive Bayes

In [116]:
from sklearn.preprocessing import KBinsDiscretizer

def preprocess_for_categorical_nb(df, target, numerical_features, n_bins=5):
    """
    Preprocess data for CategoricalNB by discretizing numerical features.
    """
    # Encode the target as integers
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df[target])
    
    # Discretize numerical features
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    X_discretized = discretizer.fit_transform(df[numerical_features])

    # Return transformed features, target, and discretizer
    return train_test_split(X_discretized, y, test_size=0.3, random_state=42), label_encoder


In [117]:
from sklearn.naive_bayes import CategoricalNB

# Cross-validation for CategoricalNB
def cross_validate_categorical_nb(X, y):
    model = CategoricalNB()
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    
    print("Cross-Validation Results for Categorical Naive Bayes:")
    print(f"Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {scores.mean():.4f}")
    print(f"Standard Deviation: {scores.std():.4f}")
    print("-" * 50)

# Hyperparameter tuning for CategoricalNB
def grid_search_categorical_nb(X_train, y_train):
    param_grid = {
        'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]  # Laplace smoothing parameter
    }
    model = CategoricalNB()
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print("Grid Search Results for Categorical Naive Bayes:")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    print("-" * 50)
    return grid_search.best_estimator_

In [118]:
# Train and evaluate CategoricalNB
def categorical_nb(X_train, X_test, y_train, y_test):
    # Initialize and train the model
    model = CategoricalNB()
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    # Print Metrics
    print("Categorical Naive Bayes Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("-" * 50)


# With Area

In [119]:
# Define target and features
target_column = 'area'
categorical_features = []  # Replace with categorical feature names if available
numerical_features = ['temp', 'RH', 'wind', 'rain']  # Numerical features

# Preprocess and train
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df, target_column, categorical_features, numerical_features
)
categorical_nb(train_X, test_X, train_y, test_y)


Categorical Naive Bayes Performance:
Accuracy: 0.4936
Precision: 0.2436
Recall: 0.4936
Confusion Matrix:
[[77  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 ...
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]]
--------------------------------------------------


## With Area class


In [120]:
# Update target column to 'area_class'
target_column = 'area_class'

# Preprocess and train
(train_X, test_X, train_y, test_y), label_encoder = preprocess_classification_data(
    df, target_column, categorical_features, numerical_features
)
categorical_nb(train_X, test_X, train_y, test_y)


Categorical Naive Bayes Performance:
Accuracy: 0.4936
Precision: 0.2452
Recall: 0.4936
Confusion Matrix:
[[77  0  0  0]
 [34  0  0  0]
 [27  0  0  0]
 [17  1  0  0]]
--------------------------------------------------


# With area class wihtout class 0

In [121]:
# Remove rows where area_class == 0
df_filtered = df[df['area_class'] != 0]

# Update target column to 'area_class'
target_column = 'area_class'
numerical_features = ['temp', 'RH', 'wind', 'rain']

# Preprocess data for CategoricalNB
(train_X, test_X, train_y, test_y), label_encoder = preprocess_for_categorical_nb(
    df_filtered, target_column, numerical_features, n_bins=5
)
# Train and evaluate CategoricalNB
categorical_nb(train_X, test_X, train_y, test_y)


Categorical Naive Bayes Performance:
Accuracy: 0.4198
Precision: 0.4039
Recall: 0.4198
Confusion Matrix:
[[23 16  3]
 [14 10  0]
 [ 8  6  1]]
--------------------------------------------------


## Bernoulli Bayes

In [122]:
from sklearn.preprocessing import Binarizer

def preprocess_for_bernoulli_nb(df, target, numerical_features, threshold=None):
    """
    Preprocess data for BernoulliNB by binarizing numerical features.
    """
    # Encode the target as integers
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df[target])
    
    # Binarize numerical features
    binarizer = Binarizer(threshold=threshold) if threshold else Binarizer()
    X_binarized = binarizer.fit_transform(df[numerical_features])

    # Return transformed features, target, and binarizer
    return train_test_split(X_binarized, y, test_size=0.3, random_state=42), label_encoder


In [123]:
from sklearn.naive_bayes import BernoulliNB

def cross_validate_bernoulli_nb(X, y):
    model = BernoulliNB()
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    
    print("Cross-Validation Results for Bernoulli Naive Bayes:")
    print(f"Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {scores.mean():.4f}")
    print(f"Standard Deviation: {scores.std():.4f}")
    print("-" * 50)


In [124]:
def grid_search_bernoulli_nb(X_train, y_train):
    param_grid = {
        'alpha': [0.1, 0.5, 1.0, 2.0],  # Smoothing parameter
        'binarize': [0.0, 0.5, 1.0]  # Threshold for binarization
    }
    model = BernoulliNB()
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print("Grid Search Results for Bernoulli Naive Bayes:")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
    print("-" * 50)
    return grid_search.best_estimator_


In [125]:
def bernoulli_nb(X_train, X_test, y_train, y_test):
    # Cross-validation
    cross_validate_bernoulli_nb(X_train, y_train)

    # Grid search
    best_model = grid_search_bernoulli_nb(X_train, y_train)

    # Train the best model on the training set
    best_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = best_model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    # Print Metrics
    print("Final Evaluation with Tuned Bernoulli Naive Bayes:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("-" * 50)


# With area

In [130]:
# Define target and features
target_column = 'area'  # Use 'area_class' or other target column
numerical_features = ['temp', 'RH', 'wind', 'rain']

# Preprocess data for BernoulliNB
(train_X, test_X, train_y, test_y), label_encoder = preprocess_for_bernoulli_nb(
    df, target_column, numerical_features, threshold=0.5  # Example threshold
)

# Train and evaluate BernoulliNB
bernoulli_nb(train_X, test_X, train_y, test_y)


Cross-Validation Results for Bernoulli Naive Bayes:
Accuracy Scores: [0.46575342 0.47222222 0.47222222 0.47222222 0.47222222]
Mean Accuracy: 0.4709
Standard Deviation: 0.0026
--------------------------------------------------
Grid Search Results for Bernoulli Naive Bayes:
Best Parameters: {'alpha': 0.1, 'binarize': 0.0}
Best Cross-Validation Accuracy: 0.4709
--------------------------------------------------
Final Evaluation with Tuned Bernoulli Naive Bayes:
Accuracy: 0.4936
Precision: 0.2436
Recall: 0.4936
Confusion Matrix:
[[77  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 ...
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]]
--------------------------------------------------




# With area class

In [129]:
# Define target and features
target_column = 'area_class'  # Use 'area_class' or other target column
numerical_features = ['temp', 'RH', 'wind', 'rain']

# Preprocess data for BernoulliNB
(train_X, test_X, train_y, test_y), label_encoder = preprocess_for_bernoulli_nb(
    df, target_column, numerical_features, threshold=0.5  # Example threshold
)

# Train and evaluate BernoulliNB
bernoulli_nb(train_X, test_X, train_y, test_y)


Cross-Validation Results for Bernoulli Naive Bayes:
Accuracy Scores: [0.46575342 0.47222222 0.47222222 0.47222222 0.47222222]
Mean Accuracy: 0.4709
Standard Deviation: 0.0026
--------------------------------------------------
Grid Search Results for Bernoulli Naive Bayes:
Best Parameters: {'alpha': 0.1, 'binarize': 0.0}
Best Cross-Validation Accuracy: 0.4709
--------------------------------------------------
Final Evaluation with Tuned Bernoulli Naive Bayes:
Accuracy: 0.5000
Precision: 0.4632
Recall: 0.5000
Confusion Matrix:
[[77  0  0  0]
 [33  1  0  0]
 [27  0  0  0]
 [18  0  0  0]]
--------------------------------------------------


# With area class excluding class 0

In [131]:
# Remove rows where area_class == 0
df_filtered = df[df['area_class'] != 0]

# Update target column to 'area_class'
target_column = 'area_class'
numerical_features = ['temp', 'RH', 'wind', 'rain']

# Preprocess data for CategoricalNB
(train_X, test_X, train_y, test_y), label_encoder = preprocess_for_categorical_nb(
    df_filtered, target_column, numerical_features, n_bins=5
)
# Train and evaluate CategoricalNB
bernoulli_nb(train_X, test_X, train_y, test_y)


Cross-Validation Results for Bernoulli Naive Bayes:
Accuracy Scores: [0.47368421 0.39473684 0.44736842 0.39473684 0.35135135]
Mean Accuracy: 0.4124
Standard Deviation: 0.0432
--------------------------------------------------
Grid Search Results for Bernoulli Naive Bayes:
Best Parameters: {'alpha': 0.5, 'binarize': 1.0}
Best Cross-Validation Accuracy: 0.4502
--------------------------------------------------
Final Evaluation with Tuned Bernoulli Naive Bayes:
Accuracy: 0.5185
Precision: 0.3983
Recall: 0.5185
Confusion Matrix:
[[36  6  0]
 [18  6  0]
 [11  4  0]]
--------------------------------------------------
