In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.neighbors import NearestNeighbors

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Define a function to calculate the Mean Absolute Percentage Error (MAPE)
def mape(y_pred,y_true):
    # Calculate the absolute percentage error for each prediction
    # np.abs((y_true - y_pred) / y_true) computes the absolute percentage errors
    # np.mean() calculates the average of these errors
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100 # Return the MAPE as a percentage

In [4]:
# Reading the data from the CSV file
data = pd.read_csv('../model/data/modisTodos.csv', header=0)

# Selecting only the 'latitude' and 'longitude' columns from the dataset
lat_lon_data = data[['latitude', 'longitude']]

In [5]:
# Using Nearest Neighbors to find nearby regions
n_neighbors = 20  # Number of nearby regions to consider
# Initializing the NearestNeighbors model using the 'ball_tree' algorithm and fitting it with the latitude and longitude data
nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(lat_lon_data)
# Finding the distances and indices of the nearest neighbors for each point in the dataset
distances, indices = nbrs.kneighbors(lat_lon_data)

x_data = [] # List to store the x-coordinates or features
y_data = [] # List to store the y-coordinates or target values

In [6]:
# Formatting so that each row of the matrix X consists of the latitudes and longitudes of neighboring regions
for i in range(len(lat_lon_data)):
    # Using the coordinates of neighboring regions as features
    neighbors = lat_lon_data.iloc[indices[i]].values.ravel()  # Getting the latitudes and longitudes of the neighbors
    x_data.append(neighbors)
    y_data.append(lat_lon_data.iloc[i].values)

# Converting the lists to numpy arrays for further processing
x_data = np.array(x_data)
y_data = np.array(y_data)

In [7]:
# Splitting the data into training and testing sets
train_size = int(0.8 * len(x_data))  # Determining the size of the training set (80% of the data)

# Slicing the data arrays into training and testing sets
x_train, x_test = x_data[:train_size], x_data[train_size:]
y_train, y_test = y_data[:train_size], y_data[train_size:]

In [8]:
# Training the Gradient Boosting Regressor model
# Since we are predicting both latitude and longitude (two outputs), we will use two models, one for each variable
model_latitude = GradientBoostingRegressor()
model_longitude = GradientBoostingRegressor()

In [9]:
# Fitting the latitude model with training features and latitude targets
model_latitude.fit(x_train, y_train[:, 0])

# Fitting the longitude model with training features and longitude targets
model_longitude.fit(x_train, y_train[:, 1])

In [10]:
# Making predictions for latitude and longitude using the trained models
y_pred_latitude = model_latitude.predict(x_test)
y_pred_longitude = model_longitude.predict(x_test)

In [11]:
# Stacking the latitude and longitude predictions into a single array
y_pred = np.column_stack((y_pred_latitude, y_pred_longitude))

In [12]:
# Saving the results to a new CSV file
output_df = pd.DataFrame(y_pred, columns=['predicted_latitude', 'predicted_longitude'])
output_df.to_csv('predicted_boosting.csv', index=False)

In [13]:
# Testing the model's performance using Mean Absolute Percentage Error (MAPE) for latitude and longitude
print('\nMean Absolute Percentage Error')
print('MAPE Latitude:', mape(y_pred[:, 0], y_test[:, 0]))
print('MAPE Longitude:', mape(y_pred[:, 1], y_test[:, 1]))


Mean Absolute Percentage Error
MAPE Latitude: 25.240698675968858
MAPE Longitude: 0.07403579661384993


In [14]:
# Comparing predictions with the Last Value and Moving Average
y_pred_last = x_test[:, -2:]  # Extracting the last latitude and longitude values from the test features

# Calculating the moving average of the last two coordinates (latitude and longitude) for each test instance
y_pred_ma = np.array([np.mean(x_test[i].reshape(-1, 2), axis=0) for i in range(len(x_test))])

In [15]:
# Calculating and printing the Mean Absolute Percentage Error (MAPE) for the Last Value predictions
print('MAPE Último Valor (Latitude):', mape(y_pred_last[:, 0], y_test[:, 0]))  # MAPE for Last Value predictions (Latitude)
print('MAPE Último Valor (Longitude):', mape(y_pred_last[:, 1], y_test[:, 1]))  # MAPE for Last Value predictions (Longitude)

# Calculating and printing the Mean Absolute Percentage Error (MAPE) for the Moving Average predictions
print('MAPE Média Móvel (Latitude):', mape(y_pred_ma[:, 0], y_test[:, 0]))  # MAPE for Moving Average predictions (Latitude)
print('MAPE Média Móvel (Longitude):', mape(y_pred_ma[:, 1], y_test[:, 1]))  # MAPE for Moving Average predictions (Longitude)


MAPE Último Valor (Latitude): 1.719060750166217
MAPE Último Valor (Longitude): 0.08892044259311617
MAPE Média Móvel (Latitude): 0.5042896006555369
MAPE Média Móvel (Longitude): 0.02603155662388484
