In [204]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
from datetime import timedelta
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from itertools import product
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [205]:
data = pd.read_csv("data/data_01.txt", sep="\t", parse_dates=['fecha'])
df = data.copy()

## Pre-procesamiento

In [206]:
terminales_interes = [1774, 1908, 1964, 1910, 1980]
operaciones_interes = [0, 1, 3, 4, 7]
df_filtrado = df[df['idTerminal'].isin(terminales_interes) & df['oper'].isin(operaciones_interes)]
df_filtrado['fecha'] = pd.to_datetime(df_filtrado['fecha'], format='%Y-%m-%d').dt.date



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [207]:
df_agrupado = df_filtrado.groupby(['fecha', 'idTerminal', 'oper']).size().reset_index(name='conteo')

In [208]:
df_concat =  df_agrupado.copy()
df_concat.rename(columns={'conteo': 'Transacciones'}, inplace=True)
df_concat['fecha'] = pd.to_datetime(df_concat['fecha'])

In [209]:
df_agrupado['dia_semana'] = df['fecha'].dt.dayofweek
df_agrupado['mes'] = df['fecha'].dt.month
df_agrupado['dia_mes'] = df['fecha'].dt.day
y = df_agrupado['conteo']
df_agrupado.drop('conteo', inplace = True, axis=1)

In [210]:
enc = OneHotEncoder(sparse_output=False, categories=[sorted(terminales_interes), sorted(operaciones_interes)])
terminales_operaciones = np.array(list(zip(df_agrupado['idTerminal'], df_agrupado['oper'])))
enc.fit(terminales_operaciones)
terminales_operaciones_enc = enc.transform(terminales_operaciones)
X = np.hstack((terminales_operaciones_enc, df_agrupado[['dia_semana', 'mes', 'dia_mes']].values))

In [211]:
# División en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest

### GridSearch

In [212]:
# Definir el espacio de hiperparámetros
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}

# Crear el modelo base
modelo_rf = RandomForestRegressor(random_state=42)

# Configuración de GridSearchCV
grid_search_rf = GridSearchCV(estimator=modelo_rf, param_grid=param_grid_rf, 
                              cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2)

# Ajuste del modelo
grid_search_rf.fit(X_train, y_train)

# Mejores hiperparámetros
mejores_parametros_rf = grid_search_rf.best_params_

print("Mejores hiperparámetros para Random Forest:", mejores_parametros_rf)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Mejores hiperparámetros para Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


### Ajuste del modelo con hiperparametros seleccionados

In [213]:
# Modelo Random Forest
modelo_rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=None, max_features='sqrt', min_samples_leaf=2,
                                  min_samples_split=2)
modelo_rf.fit(X_train, y_train)

### Evaluación del modelo

In [214]:
y_pred = modelo_rf.predict(X_test)

# Cálculo de MSE y MAE
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2_rf = r2_score(y_test, y_pred)

print("MSE:", round(mse, 3))
print("MAE:", round(mae, 3))
print("R^2:", round(r2_rf, 3))

MSE: 1131.189
MAE: 18.701
R^2: 0.643


# Gradient Boosting

In [215]:
from sklearn.ensemble import GradientBoostingRegressor

### GridSearch 

In [216]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 4, 6]
}


modelo_gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
grid_search = GridSearchCV(estimator=modelo_gb, param_grid=param_grid, 
                           cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2)

grid_search.fit(X_train, y_train)
mejores_parametros = grid_search.best_params_

print("Mejores hiperparámetros:", mejores_parametros)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Mejores hiperparámetros: {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 100}


### Ajuste del modelo con hiperparametros seleccionados

In [217]:
modelo_gb = GradientBoostingRegressor(n_estimators=100, random_state=42, learning_rate=0.1, max_depth=4, min_samples_split=2)
modelo_gb.fit(X_train, y_train)

### Evaluación del modelo

In [218]:
y_pred_gb = modelo_gb.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print("MSE (Gradient Boosting):", round(mse_gb, 3))
print("MAE (Gradient Boosting):", round(mae_gb, 3))
print("R^2 (Gradient Boosting):", round(r2_gb, 3))

MSE (Gradient Boosting): 1131.218
MAE (Gradient Boosting): 18.719
R^2 (Gradient Boosting): 0.643


# SVM

In [219]:
from sklearn.svm import SVR

### GridSearch 

In [220]:
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1],
    'degree': [2, 3, 4]
}

# Crear el modelo base
modelo_svm = SVR()

# Configuración de GridSearchCV
grid_search_svm = GridSearchCV(estimator=modelo_svm, param_grid=param_grid_svm, 
                                cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2)

# Ajustar el modelo
grid_search_svm.fit(X_train, y_train)

# Mejores hiperparámetros
mejores_parametros_svm = grid_search_svm.best_params_

print("Mejores hiperparámetros para SVM:", mejores_parametros_svm)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


Mejores hiperparámetros para SVM: {'C': 10, 'degree': 2, 'gamma': 1, 'kernel': 'rbf'}


### Ajuste del modelo con hiperparametros seleccionados

In [221]:
modelo_svm = SVR(kernel='rbf', C=10, degree=2, gamma=1)
modelo_svm.fit(X_train, y_train)

### Evaluación del modelo

In [222]:
y_pred_svm = modelo_svm.predict(X_test)
mse_svm = mean_squared_error(y_test, y_pred_svm)
mae_svm = mean_absolute_error(y_test, y_pred_svm)
r2_svm = r2_score(y_test, y_pred_svm)

In [223]:
print("MSE (SVM):", round(mse_svm, 3))
print("MAE (SVM):", round(mae_svm, 3))
print("R^2 (SVM):", round(r2_svm, 3))

MSE (SVM): 1199.368
MAE (SVM): 18.572
R^2 (SVM): 0.621


# En resumen:

In [224]:
data = {
    "SVM": [mse_svm, mae_svm, r2_svm],
    "GB": [mse_gb, mae_gb, r2_gb],
    "RF": [mse, mae, r2_rf]
}

index = ["MSE", "MAE", "R2"]
df = pd.DataFrame(data, index=index)
df

def highlight_min_max(s):
    '''
    Subraya el valor mínimo en las filas MSE y MAE, y el valor máximo en la fila R2.
    '''
    if s.name == "R2":
        #Resaltar el valor máximo en R2
        is_max = s == s.max()
        return ['text-decoration: underline' if v else '' for v in is_max]
    else:
        #Resaltar el valor mínimo en R2MSE y MAE
        is_min = s == s.min()
        return ['text-decoration: underline' if v else '' for v in is_min]

styled_df = df.style.apply(highlight_min_max, axis=1)
styled_df

Unnamed: 0,SVM,GB,RF
MSE,1199.367574,1131.21778,1131.189023
MAE,18.572086,18.718598,18.700737
R2,0.62098,0.642517,0.642526


Se observa que de los 3 modelos implementados, por votación el que tiene un mejor desempeño en las métricas establecidas (MSE, MAE, R2) es el modelo Random Forest, el cual presenta un MSE menor respecto a los demás modelos y un coeficiente de determinación superior.

En consecuencia, se procede a realizar el respectivo pronostico:

# Pronostico

In [225]:
# Generación de datos para predicción
ultima_fecha = df_agrupado['fecha'].max()
fechas_futuras = [ultima_fecha + timedelta(days=i) for i in range(1, 16)]
fechas_futuras = pd.to_datetime(fechas_futuras, format='%Y-%m-%d')  # Convertir a datetime aquí
combinaciones_futuras = list(product(fechas_futuras, terminales_interes, operaciones_interes))
datos_prediccion = pd.DataFrame(combinaciones_futuras, columns=['fecha', 'idTerminal', 'oper'])
datos_prediccion['dia_semana'] = datos_prediccion['fecha'].dt.dayofweek
datos_prediccion['mes'] = datos_prediccion['fecha'].dt.month
datos_prediccion['dia_mes'] = datos_prediccion['fecha'].dt.day

In [226]:
datos_prediccion

Unnamed: 0,fecha,idTerminal,oper,dia_semana,mes,dia_mes
0,2017-06-01,1774,0,3,6,1
1,2017-06-01,1774,1,3,6,1
2,2017-06-01,1774,3,3,6,1
3,2017-06-01,1774,4,3,6,1
4,2017-06-01,1774,7,3,6,1
...,...,...,...,...,...,...
370,2017-06-15,1980,0,3,6,15
371,2017-06-15,1980,1,3,6,15
372,2017-06-15,1980,3,3,6,15
373,2017-06-15,1980,4,3,6,15


In [227]:
#One-hot encoding
terminales_operaciones_pred = np.array(list(zip(datos_prediccion['idTerminal'], datos_prediccion['oper'])))
terminales_operaciones_enc_pred = enc.transform(terminales_operaciones_pred)
X_pred = np.hstack((terminales_operaciones_enc_pred, 
                    datos_prediccion[['dia_semana', 'mes', 'dia_mes']].values))

In [228]:
# Predicciones
predicciones = modelo_rf.predict(X_pred)
datos_prediccion['Transacciones'] = predicciones

In [229]:
predicciones

array([145.61831071,  26.77260294,  55.47801787,  10.63347997,
        10.94699614, 131.24201182,  20.47546795,  53.00955722,
        18.82309232,  15.92872343, 131.36490092,  26.56848485,
        52.28325376,  15.86600515,   9.51491338, 135.14482688,
        17.1286597 ,  30.90440269,   7.28930324,  19.73457946,
        99.97214217,  34.93678622,  28.26083797,   6.05116915,
        12.98964957, 145.61831071,  26.77260294,  55.47801787,
        10.63347997,  10.94699614, 131.24201182,  20.47546795,
        53.00955722,  18.82309232,  15.92872343, 131.36490092,
        26.56848485,  52.28325376,  15.86600515,   9.51491338,
       135.14482688,  17.1286597 ,  30.90440269,   7.28930324,
        19.73457946,  99.97214217,  34.93678622,  28.26083797,
         6.05116915,  12.98964957, 145.61831071,  26.77260294,
        55.47801787,  10.63347997,  10.94699614, 131.24201182,
        20.47546795,  53.00955722,  18.82309232,  15.92872343,
       131.36490092,  26.56848485,  52.28325376,  15.86

In [230]:
# Resultados
predicciones_finales = datos_prediccion[['fecha', 'idTerminal', 'oper', 'Transacciones']]
predicciones_finales

Unnamed: 0,fecha,idTerminal,oper,Transacciones
0,2017-06-01,1774,0,145.618311
1,2017-06-01,1774,1,26.772603
2,2017-06-01,1774,3,55.478018
3,2017-06-01,1774,4,10.633480
4,2017-06-01,1774,7,10.946996
...,...,...,...,...
370,2017-06-15,1980,0,99.972142
371,2017-06-15,1980,1,34.936786
372,2017-06-15,1980,3,28.260838
373,2017-06-15,1980,4,6.051169


In [231]:
df_final = pd.concat([df_concat, predicciones_finales], ignore_index=True)

In [236]:
fig = px.line(
    df_final[df_final['fecha'] >= '2017-05-15'], 
    x='fecha', 
    y='Transacciones',
    color='oper',
    facet_row='idTerminal',  
    title='Pronóstico transacciones'
)

fig.update_layout(height=300 * df_final['idTerminal'].nunique())  
fig.add_vline(x='2017-06-01', line_width=2, line_dash="dash", line_color="black")

fig.show()