In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import labolibrary as labo
from sklearn.preprocessing import MinMaxScaler

DATOS_DIR = 'D:/Dropbox/Python/LaboIII/labo3-2024v/src/GradientBoosting2/data/'

# Definir la métrica personalizada
def multinacional_metric(y_true, y_pred):
    return abs(sum(y_true - y_pred)) / sum(y_true)

# Función para escalar y devolver una serie
def minmax_scale_group(group):
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(group.values.reshape(-1, 1)).flatten()
    scalers[group.name] = scaler  # Almacenar el escalador para este grupo
    return pd.Series(scaled_values, index=group.index)

# Función para desescalar y devolver una serie
def inverse_minmax_scale_group(group):
    scaler = scalers[group.name]
    inversed_values = scaler.inverse_transform(group.values.reshape(-1, 1)).flatten()
    return pd.Series(inversed_values, index=group.index)

# Leer datos
df_final = pd.read_parquet(DATOS_DIR+'FE_dataset-CARLA.parquet') 
df_final.columns = df_final.columns.str.replace(' ', '_').str.replace(r'[^A-Za-z0-9_]', '', regex=True)

### Filtrar datos
df_true = df_final.loc['2019-12-01':'2020-02-01']
df_final = df_final.loc['2018-01-01':'2020-01-01']

### Indices
df_final.index = df_final.index.to_timestamp()
df_true.index = df_true.index.to_timestamp()

### Agrupar y escalar
scalers = {}

# Create the feature that represents the difference between TN+2 and TN
df_final['TN'] = df_final['tn']
df_final['TN+2'] = df_final.groupby('product_id')['tn'].shift(-2)
df_final['diff2'] = df_final['TN+2'] - df_final['TN']

# Normalize TN and TN+2
df_final['TN_normalized'] = df_final.groupby('product_id')['TN'].transform(minmax_scale_group)
df_final['TN+2_normalized'] = df_final.groupby('product_id')['TN+2'].transform(minmax_scale_group)

# Create the normalized difference feature
df_final['diff2_normalized'] = df_final['TN+2_normalized'] - df_final['TN_normalized']

# Correr Modelo
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'None',
}
model, average_metric = labo.train_lightgbm_model(df_final, params, col='diff2_normalized')
print("Overall custom metric: ", average_metric)

def predict_next_month(model, last_data_points, col='diff2_normalized'):
    predictions = []
    last_month = last_data_points.index.max() + pd.DateOffset(months=2)  # Predict 2 months ahead
    last_data_points.index = [last_month] * len(last_data_points)
    
    predictions = model.predict(last_data_points, num_iteration=model.best_iteration,
                                predict_disable_shape_check=True)
    
    prediction_df = last_data_points[['product_id', 'TN_normalized']].copy()
    prediction_df['diff2_normalized'] = predictions
    prediction_df.index = [last_month] * len(last_data_points)

    return prediction_df

def prepare_data_for_prediction(df, last_date):
    last_data = df[df.index <= last_date].copy()
    last_data['TN+2'] = np.nan  # Set TN+2 to NaN for the last two months
    last_data['diff2'] = np.nan  # Set diff2 to NaN for the last two months
    
    # Normalize TN and create other necessary features
    last_data['TN_normalized'] = last_data.groupby('product_id')['TN'].transform(minmax_scale_group)
    
    return last_data

def update_dataset_with_prediction(df, predictions, prediction_date):
    for idx, row in predictions.iterrows():
        df.loc[(df.index == prediction_date) & (df['product_id'] == row['product_id']), 'TN'] = row['TN+2']
        df.loc[(df.index == prediction_date) & (df['product_id'] == row['product_id']), 'TN_normalized'] = row['TN+2_normalized']
    return df




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "d:\Dropbox\Python\LaboIII\labo3-2024v\.venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\Dr

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.268756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81641
[LightGBM] [Info] Number of data points in the train set: 257834, number of used features: 804
[LightGBM] [Info] Start training from score 0.000064
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[15]	validation's multinacional_metric: 0.222672
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.540121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81660
[LightGBM] [Info] Number of data points in the train set: 515667, number of used features: 814
[LightGBM] [Info] Start training from score 0.000038
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[18]	validation's multinacional_metric: 0.102019
[LightGBM] [Info] Au

In [2]:


# Predict for December, January, and February
months_to_predict = ['2019-12-01', '2020-01-01', '2020-02-01']
all_predictions = []

for i, predict_month in enumerate(months_to_predict):
    print(f"Predicting for {predict_month}")
    
    # Prepare data for prediction
    last_date = pd.to_datetime(predict_month) - pd.DateOffset(months=2)
    last_data_points = prepare_data_for_prediction(df_final, last_date)
    
    # Make prediction
    predictions = predict_next_month(model, last_data_points)
    
    # Denormalize predictions
    predictions['TN+2_normalized'] = predictions['TN_normalized'] + predictions['diff2_normalized']
    predictions['TN+2'] = predictions.groupby('product_id')['TN+2_normalized'].transform(inverse_minmax_scale_group)
    
    # Store predictions
    predictions['prediction_date'] = predict_month
    all_predictions.append(predictions)
    
    # Update dataset with new prediction for next iteration
    df_final = update_dataset_with_prediction(df_final, predictions, pd.to_datetime(predict_month))

# Combine all predictions
final_predictions = pd.concat(all_predictions)

# Get February predictions
february_predictions = final_predictions[final_predictions['prediction_date'] == '2020-02-01']
february_predictions = february_predictions.groupby('product_id')['TN+2'].sum().reset_index()
february_predictions.columns = ['product_id', 'tn']

# Save predictions
february_predictions.to_csv(DATOS_DIR+'/pred/predicciones-v1.csv', index=False, header=True)

print("Predictions for February saved.")

# Calculate error for February
if '2020-02-01' in df_true.index:
    true_values = df_true.loc['2020-02-01'].groupby('product_id')['tn'].sum()
    predicted_values = february_predictions.set_index('product_id')['tn']
    error = abs(sum(true_values - predicted_values)) / sum(true_values)
    print("Error for February: ", error)

# Calculate overall error for December, January, and February
all_true_values = df_true.groupby('product_id')['tn'].sum()
all_predicted_values = final_predictions.groupby('product_id')['TN+2'].sum()
overall_error = abs(sum(all_true_values - all_predicted_values)) / sum(all_true_values)
print("Overall Error for Dec, Jan, Feb: ", overall_error)

Predicting for 2019-12-01
