In [10]:
import sqlite3
import numpy as np
from tqdm import tqdm
from joblib import dump
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [11]:
def parse_dataset(db_path="../backend/database.sqlite", samples_per_class=4000):
    """Load a class-balanced random sample of fires as feature/label arrays.

    For each fire size class "A" through "G", up to ``samples_per_class`` rows
    are drawn from the ``Fires`` table using SQLite's ``ORDER BY RANDOM()``.
    That sampling is not seeded, so the returned rows differ between calls.

    Args:
        db_path: Path to the SQLite database file containing the ``Fires``
            table. Defaults to the path the notebook originally hard-coded.
        samples_per_class: Maximum number of rows to draw per size class.
            A class with fewer rows contributes all of them.

    Returns:
        A ``(data, labels)`` tuple of float arrays:
        ``data`` has shape ``(n, 3)`` with columns latitude, longitude and
        ``DISCOVERY_DOY / 366``; ``labels`` has shape ``(n,)`` and holds
        ``FIRE_SIZE``. When no rows match, the shapes are ``(0, 3)`` and
        ``(0,)``.
    """
    query_template = """
    SELECT FIRE_SIZE, LATITUDE, LONGITUDE, DISCOVERY_DOY
    FROM Fires
    WHERE FIRE_SIZE_CLASS = ?
    ORDER BY RANDOM()
    LIMIT ?
    """
    fire_classes = ["A", "B", "C", "D", "E", "F", "G"]

    fires = []
    connection = sqlite3.connect(db_path)
    try:
        cursor = connection.cursor()
        for fire_class in fire_classes:
            cursor.execute(query_template, (fire_class, samples_per_class))
            fires.extend(cursor.fetchall())
    finally:
        # Always release the database handle, even if a query fails;
        # re-running the cell would otherwise accumulate open connections.
        connection.close()

    if not fires:
        # Keep a 2-D feature matrix so downstream code sees a stable shape.
        return np.empty((0, 3)), np.empty((0,))

    # One row per fire: [FIRE_SIZE, LATITUDE, LONGITUDE, DISCOVERY_DOY].
    # NULL column values, if present, become NaN under the float conversion.
    rows = np.array(fires, dtype=float)
    labels = rows[:, 0].copy()
    data = rows[:, 1:].copy()
    # Scale day-of-year by 366 (leap-year length) so it is at most 1.
    data[:, 2] /= 366

    return data, labels

In [12]:
# Sample the fires and build the feature matrix / target vector.
# Note: parse_dataset samples with an unseeded ORDER BY RANDOM(), so the
# numbers printed below vary between runs despite the fixed seeds here.
data, labels = parse_dataset()

# Hold out 20% of the samples for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest and persist it before evaluating.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
dump(model, "model.joblib")

# Score the held-out samples.
predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f"Mean Squared Error: {mse}")

# Importances follow the feature column order: latitude, longitude, day of year.
feature_importance = model.feature_importances_
print(
    f"Feature Importances: Latitude {feature_importance[0]}, "
    f"Longitude {feature_importance[1]}, "
    f"Day of Year {feature_importance[2]}"
)

100%|██████████| 27773/27773 [00:00<00:00, 2351495.92it/s]


Mean Squared Error: 418271024.7676328
Feature Importances: Latitude 0.447772708840805, Longitude 0.3972408638901661, Day of Year 0.15498642726902895
