In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import labolibrary as labo

from sklearn.preprocessing import MinMaxScaler


In [26]:

# Alternative data location, kept for reference:
#DATOS_DIR = '~/buckets/b1/datasets/'
# Base directory for the input parquet and the prediction CSV. File paths are
# built by plain string concatenation (DATOS_DIR + '...'), so keep the
# trailing slash.
DATOS_DIR = '../data/'

# Custom evaluation metric
def multinacional_metric(y_true, y_pred):
    """Return the absolute net error relative to the total of the actuals.

    Computes ``|sum(y_true - y_pred)| / sum(y_true)``. The absolute value is
    taken after summing, so over- and under-predictions cancel each other
    out: this measures aggregate bias, not per-row error.

    Parameters
    ----------
    y_true, y_pred : array-like supporting element-wise subtraction
        Observed and predicted values.

    Returns
    -------
    Ratio of the absolute net error to ``sum(y_true)``. No guard is applied
    for ``sum(y_true) == 0``.
    """
    residuals = y_true - y_pred
    net_error = abs(sum(residuals))
    total_actual = sum(y_true)
    return net_error / total_actual
# Min-max scale one group and return it as a Series
def minmax_scale_group(group):
    """Fit a MinMaxScaler on one group's values and return the scaled Series.

    Intended as the callable for ``groupby(...).transform``. The fitted
    scaler is stored in the module-level ``scalers`` dict under
    ``group.name`` so the scaling can be undone later by
    ``inverse_minmax_scale_group``.

    Note: the key is only ``group.name``, so scaling a second column with
    the same grouping overwrites the scalers stored for the first one.

    Parameters
    ----------
    group : pandas.Series
        Values of a single group.

    Returns
    -------
    pandas.Series
        Scaled values carrying the same index as ``group``.
    """
    column = group.values.reshape(-1, 1)  # scikit-learn expects a 2-D array
    fitted = MinMaxScaler()
    scaled = fitted.fit_transform(column)
    scalers[group.name] = fitted  # keep the scaler so this group can be un-scaled
    return pd.Series(scaled.ravel(), index=group.index)

# Undo the min-max scaling of one group and return it as a Series
def inverse_minmax_scale_group(group):
    """Map one group's scaled values back to the original units.

    Looks up the scaler stored for ``group.name`` in the module-level
    ``scalers`` dict (filled by ``minmax_scale_group``) and applies its
    ``inverse_transform``.

    Parameters
    ----------
    group : pandas.Series
        Scaled values of a single group.

    Returns
    -------
    pandas.Series
        Un-scaled values carrying the same index as ``group``.

    Raises
    ------
    KeyError
        If no scaler was stored for ``group.name``.
    """
    fitted = scalers[group.name]
    restored = fitted.inverse_transform(group.values.reshape(-1, 1))
    return pd.Series(restored.ravel(), index=group.index)

# Read the feature-engineered dataset
df_final = pd.read_parquet(DATOS_DIR+'FE_dataset-CARLA.parquet') 
# Sanitize column names: spaces become underscores, then every character that
# is not an ASCII letter, digit or underscore is removed.
df_final.columns = df_final.columns.str.replace(' ', '_').str.replace(r'[^A-Za-z0-9_]', '', regex=True)

### Filter data
# Label-based slices on the index (presumably a date-like index - TODO confirm).
# df_true is the hold-out window used for the final error check. It must be
# taken BEFORE df_final is rebound to the training window on the next line.
df_true = df_final.loc['2019-12-01':'2020-01-01']
# Training window. From here on df_final no longer contains the hold-out rows.
df_final = df_final.loc['2018-01-01':'2019-10-01']


In [27]:

### Group and scale

# Registry of fitted scalers, filled as a side effect of minmax_scale_group
# (one entry per group key, i.e. per product_id).
scalers = {}

# Both columns are scaled per product_id and overwritten in place, so
# re-running this cell re-scales already scaled values and replaces the stored
# scalers. Order matters: both calls store their scaler under the same
# product_id key, so after the second line `scalers` holds the tn_2 scalers
# only. Those are the ones the prediction step needs to un-scale tn_2.
df_final['tn'] = df_final.groupby('product_id')['tn'].transform(minmax_scale_group) # scaled
df_final['tn_2'] = df_final.groupby('product_id')['tn_2'].transform(minmax_scale_group) # scaled


In [28]:
def custom_objective(y_pred, dataset):
    """Custom LightGBM objective: sign-of-error gradient scaled by relative error.

    With ``diff = y_pred - y_true`` and
    ``norm = sum(|diff|) / sum(|y_true|)`` this returns
    ``grad = sign(diff) * norm`` and a constant ``hess = norm``.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Current raw predictions.
    dataset : object exposing ``get_label()``
        Training data; ``get_label()`` supplies the targets.

    Returns
    -------
    (grad, hess) : tuple of numpy.ndarray
        Per-row gradient and hessian, same length as the labels.

    Notes
    -----
    When ``sum(|y_true|)`` is exactly zero (all-zero or empty labels) the
    ratio is undefined and would turn every gradient/hessian into inf/NaN.
    In that case ``norm`` falls back to 1.0, i.e. a plain sign gradient with
    a unit hessian. All other inputs produce the same values as before.
    """
    y_true = dataset.get_label()
    diff = y_pred - y_true
    label_mass = np.sum(np.abs(y_true))
    if label_mass == 0:
        # Degenerate labels: avoid dividing by zero.
        norm = 1.0
    else:
        norm = np.sum(np.abs(diff)) / label_mass
    grad = np.sign(diff) * norm
    hess = np.ones_like(y_true) * norm  # Second derivative is approximated as a constant here
    return grad, hess
# Run the model
# Only the four settings below are active; everything else is left at the
# library defaults. 'metric' is the string 'None' (not the None object).
params = dict(
    boosting_type='gbdt',
    objective=custom_objective,
    metric='None',
    seed=113,
    # Currently disabled tuning knobs:
    # n_jobs=-1,
    # learning_rate=0.2,
    # bagging_fraction=0.85,
    # bagging_freq=1,
    # feature_fraction=0.8,
    # colsample_bytree=0.85,
    # colsample_bynode=0.85,
    # min_data_per_leaf=25,
    # num_leaves=200,
    # lambda_l1=0.5,
    # lambda_l2=0.5,
)

# The helper returns the fitted model together with a metric value
# (see labolibrary for how it is computed).
model, average_metric = labo.train_lightgbm_model(df_final, params)
print("Overall custom metric: ", average_metric)




[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.314397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 80118
[LightGBM] [Info] Number of data points in the train set: 237898, number of used features: 790
[LightGBM] [Info] Using self-defined objective function




[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.462536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 80132
[LightGBM] [Info] Number of data points in the train set: 475795, number of used features: 807
[LightGBM] [Info] Using self-defined objective function




[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.934709 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 113756
[LightGBM] [Info] Number of data points in the train set: 713692, number of used features: 984
[LightGBM] [Info] Using self-defined objective function




[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.179861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115770
[LightGBM] [Info] Number of data points in the train set: 951589, number of used features: 993
[LightGBM] [Info] Using self-defined objective function




[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.365507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115849
[LightGBM] [Info] Number of data points in the train set: 1189486, number of used features: 1026
[LightGBM] [Info] Using self-defined objective function




Overall custom metric:  0.2764194970848789


In [29]:
def predict_next_month(model, last_data_points):
    """Predict ``tn_2`` for every row of ``last_data_points``.

    Parameters
    ----------
    model : object with ``predict(data, num_iteration=...)`` and ``best_iteration``
        Trained model.
    last_data_points : pandas.DataFrame
        Feature rows to score. Must contain a ``product_id`` column and have
        an index whose maximum supports ``+ 1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id`` and ``tn_2`` (the raw model output), with
        every row indexed by ``last_data_points.index.max() + 1``.

    Notes
    -----
    The input frame is left untouched. Previously its index was overwritten
    in place, so calling the function twice on the same frame shifted the
    resulting month by one more step each time.
    """
    next_month = last_data_points.index.max() + 1

    predicted = model.predict(last_data_points, num_iteration=model.best_iteration)

    prediction_df = last_data_points[['product_id']].copy()
    prediction_df['tn_2'] = predicted
    prediction_df.index = [next_month] * len(prediction_df)

    return prediction_df

In [30]:
# Prepare the prediction input: the rows of the most recent index value in
# the training frame, without the target column tn_2.
last_data_points = (
    df_final
    .loc[df_final.index == df_final.index.max()]
    .drop(columns=['tn_2'])
)


In [31]:
# Predict the next month's value using the trained model
predictions = predict_next_month(model, last_data_points)

# Undo the per-product min-max scaling of the predicted tn_2 values. This
# uses the scalers stored under each product_id by the scaling step.
preds = predictions.groupby('product_id')['tn_2'].transform(inverse_minmax_scale_group)
predictions['tn'] = preds

# Total predicted tn per product: a Series indexed by product_id, named 'tn'.
# (The former `predictions.columns = [...]` assignment was dropped: on a
# Series it only set a stray attribute and never renamed anything. The CSV
# header already comes from the index name and the Series name.)
predictions = predictions.groupby('product_id')['tn'].sum()
predictions.to_csv(DATOS_DIR+'/pred/0002-predicciones-tn2-custom.csv', index=True,header=True)
print("Overall custom metric: ", average_metric)


Overall custom metric:  0.2764194970848789


In [32]:

# Hold-out check: absolute net error of the predicted totals relative to the
# actual total tn in df_true.
tn_true_by_product = df_true.groupby('product_id')['tn'].sum()
# Align on product_id instead of subtracting raw arrays by position, so a
# product present on only one side counts as 0 on the other instead of
# raising a shape error.
tn_gap = tn_true_by_product.sub(predictions, fill_value=0)
print("Error: " ,abs(tn_gap.sum())/tn_true_by_product.sum())

Error:  0.6795516282269598
