In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import labolibrary as labo

from sklearn.preprocessing import MinMaxScaler


In [25]:

# Base directory for the input dataset and the prediction output; later cells
# build file paths by string concatenation with this prefix, so it is expected
# to end with a path separator.
# Alternative location, currently disabled:
#DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'

def minmax_scale_group(group):
    """Center one group on its median, min-max scale it, and return a Series.

    Intended to be used with ``groupby(...).transform``: ``group`` is the
    Series of values for one group and ``group.name`` is the group key.

    Side effects: the fitted scaler and the median are stored in the
    module-level ``scalers`` and ``medians`` dicts under ``group.name`` so the
    transformation can be undone later. An existing entry for the same key is
    overwritten.

    Returns a Series with the same index and name as ``group``.
    """
    group_median = group.median()
    # MinMaxScaler works on 2-D input, so present the centered values as a
    # single column and flatten the result back to 1-D afterwards.
    centered_column = (group - group_median).values.reshape(-1, 1)
    fitted_scaler = MinMaxScaler()
    scaled = fitted_scaler.fit_transform(centered_column).flatten()

    # Remember what is needed to invert the transformation for this group.
    scalers[group.name] = fitted_scaler
    medians[group.name] = group_median

    return pd.Series(scaled, index=group.index, name=group.name)

def inverse_minmax_scale_group(group):
    """Undo ``minmax_scale_group`` for one group and return a Series.

    Looks up the scaler and median stored under ``group.name`` in the
    module-level ``scalers`` and ``medians`` dicts, inverts the min-max
    scaling, and adds the median back.

    Raises ``KeyError`` if no scaler/median was stored for ``group.name``.

    Returns a Series with the same index and name as ``group``.
    """
    key = group.name
    fitted_scaler, offset = scalers[key], medians[key]
    # The scaler works on 2-D input: reshape to a single column, then flatten.
    as_column = group.values.reshape(-1, 1)
    centered = fitted_scaler.inverse_transform(as_column).flatten()
    return pd.Series(centered + offset, index=group.index, name=key)

def multinacional_metric(y_true, y_pred_scaled):
    """Relative absolute total error between true values and de-scaled predictions.

    Parameters
    ----------
    y_true : values in the original (unscaled) scale, aligned with the
        de-scaled predictions by index.
    y_pred_scaled : pandas Series of scaled predictions. It is grouped by its
        own index, and each index value is used as the key into the stored
        scalers/medians (assumes the index holds the same keys that were used
        when scaling, e.g. product ids -- TODO confirm against the caller).

    Returns
    -------
    ``abs(sum(y_true - y_pred)) / sum(y_true)``.
    """
    # ``group_keys`` is a ``groupby`` option. Passing it to ``apply`` would
    # forward it to ``inverse_minmax_scale_group`` as an unexpected keyword
    # argument and raise TypeError.
    y_pred_original = (
        y_pred_scaled
        .groupby(y_pred_scaled.index, group_keys=False)
        .apply(inverse_minmax_scale_group)
    )

    # True values are already in the original scale, so compare directly.
    metric = abs(sum(y_true - y_pred_original)) / sum(y_true)

    return metric




In [None]:

# Load the data and normalise column names: spaces become underscores and any
# character outside [A-Za-z0-9_] is removed.
df_final = pd.read_parquet(DATOS_DIR+'FE_dataset-CARLA.parquet')
df_final.columns = df_final.columns.str.replace(' ', '_').str.replace(r'[^A-Za-z0-9_]', '', regex=True)

### Filter the data
# Label-based slices on the index (assumes a sorted, date-like index -- TODO
# confirm); both end points are inclusive.
# Explicit copies: df_final has columns overwritten in a later cell, and
# without a copy both frames are slices of the same loaded frame. Depending on
# the pandas version / copy-on-write mode that can raise SettingWithCopyWarning
# or let the scaled values leak into df_true, which must stay in the original
# scale for the final error computation.
df_true = df_final.loc['2019-12-01':'2020-01-01'].copy()
df_final = df_final.loc['2018-01-01':'2020-01-01'].copy()


In [None]:

### Group by product and scale
# Registries filled by minmax_scale_group, keyed by group name (product_id).
# Both columns write to the same dicts, so after this cell they hold the
# parameters fitted on 'tn_2' (the pass that runs last).
# NOTE: this cell overwrites the columns in place; running it twice re-scales
# already scaled values and replaces the stored scalers.
scalers, medians = {}, {}
for target_col in ('tn', 'tn_2'):
    df_final[target_col] = (
        df_final.groupby('product_id')[target_col].transform(minmax_scale_group)
    )


In [None]:
# Run the model: one LightGBM model is trained per product and used to predict
# that product's next value.
# Parameters handed to labo.train_lightgbm_model; the commented entries are
# tuning options that are currently disabled.
params={
        'boosting_type': 'gbdt',
        'objective': 'Regression',
        'metric':'none',
        #'n_jobs': -1,
        #'seed': 113,
        #'learning_rate': 0.2,
        #'bagging_fraction': 0.85,
        #'bagging_freq': 1, 
        #'colsample_bytree': 0.85,
        #'colsample_bynode': 0.85,
        #'min_data_per_leaf': 25,
        #'num_leaves': 200,
        #'lambda_l1': 0.5,
        #'lambda_l2': 0.5
}

# Default result (empty frame) in case there are no products to process.
predictions_all = pd.DataFrame(columns=['tn'])
products = df_final['product_id'].unique()
tot = len(products)  # not used below; kept so the notebook namespace is unchanged
nro = 0              # not used below; kept so the notebook namespace is unchanged

# One single-column ('tn') frame per product, concatenated once after the loop
# instead of growing predictions_all on every iteration.
predictions_by_product = []
for producto in products:
    print(f'Fitting and predicting for product_id: {producto}')
    # Rows belonging to this product only
    df_producto = df_final[df_final['product_id'] == producto]
    model, average_metric = labo.train_lightgbm_model(df_producto,params)
    print("Overall rmse metric: ", average_metric)

    # Rows at the latest index value are the input for the prediction; the
    # target column 'tn_2' is removed before predicting.
    is_last = df_producto.index == df_producto.index.max()
    last_data_points = df_producto[is_last].drop(columns=['tn_2'])

    # Predict with the trained model, then bring the scaled 'tn_2' prediction
    # back to the original scale using the scaler/median stored per product.
    predictions = labo.predict_next_month(model, last_data_points)
    predictions['tn'] = predictions.groupby('product_id')['tn_2'].transform(inverse_minmax_scale_group)
    predictions = predictions.drop(columns=['tn_2']).reset_index()

    # groupby(...).sum() returns a Series indexed by product_id. Assigning
    # `.columns` on a Series has no effect, and concatenating a bare Series
    # onto a DataFrame row-wise may (depending on the pandas version) place the
    # values under a column named 0 instead of 'tn'. Converting to an explicit
    # one-column frame guarantees the values end up in 'tn'.
    product_total = predictions.groupby('product_id')['tn'].sum().to_frame(name='tn')
    predictions_by_product.append(product_total)
    print(product_total[-1:])

if predictions_by_product:
    predictions_all = pd.concat(predictions_by_product)


In [None]:

# Store the per-product predictions as float32, indexed by product_id, and
# write them to CSV (index and header included).
predictions_all['tn'] = predictions_all['tn'].astype('float32')
predictions_all.index.names = ['product_id']
output_path = DATOS_DIR+'/pred/0007-prediccion-custom_scaled-product_id.csv'
predictions_all.to_csv(output_path, index=True, header=True)

# NOTE: average_metric is the value left over from the last iteration of the
# training loop, i.e. it refers to the last product only.
print("Overall custom metric: ", average_metric)

# Relative absolute error of the total: actual 'tn' summed per product in
# df_true versus the predicted 'tn', aligned on product_id.
actual_by_product = df_true.groupby('product_id')['tn'].sum()
residuals = actual_by_product - predictions_all['tn']
print("Error prediccion 12-19: " ,
      abs(sum(residuals))/sum(actual_by_product))