## **First Steps**

In [None]:
# Set imports
import pandas
import numpy
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Set Variables
# Column analysed by the plots, the outlier filters and the regressions below.
selected_item = 'TOTAL'
# Plots in the following cells are only rendered when this flag is True.
show_plots = False

In [None]:
def plotter(x, y, title, y_limit):
   """Draw a single scatter plot of ``y`` against the dates in ``x``.

   Args:
      x: date values; ``min(x)``/``max(x)`` must expose ``.item().strftime``
         (the notebook passes ``datetime64[s]`` arrays).
      y: values plotted on the vertical axis.
      title: two-item sequence; the "YYYY-MM - YYYY-MM" span of ``x`` is
         placed between both items in the axes title.
      y_limit: y-axis limits forwarded to the axes.

   The legend label is read from the module-level ``selected_item``.
   """
   figure, ax = pyplot.subplots(figsize=(12, 4))

   ax.scatter(x, y, s=12, label=selected_item)
   ax.tick_params(axis='x', labelrotation=0)
   ax.margins(x=0.03, y=0.04)

   # First and last month covered by the plotted dates
   first_month = min(x).item().strftime("%Y-%m")
   last_month = max(x).item().strftime("%Y-%m")

   ax.set_title('  |  '.join([title[0], f'{first_month} - {last_month}', title[1]]))
   ax.set_xlabel('Fecha')
   ax.set_ylabel('No. de Ventas')
   ax.set_ylim(y_limit)
   ax.legend()

   figure.tight_layout()

In [None]:
def standart_deviation_filter(data, value):
   """Keep the rows whose ``value`` column lies within three standard deviations of its mean.

   Args:
      data: DataFrame to filter.
      value: name of the column the limits are computed on.

   Returns:
      The rows of ``data`` with ``mean - 3*std <= data[value] <= mean + 3*std``
      (both bounds inclusive).
   """
   column = data[value]
   center = column.mean()
   spread = 3 * column.std()

   # between() is inclusive on both ends by default
   return data[column.between(center - spread, center + spread)]

In [None]:
def quantiles_filter(data, value):
   """Keep the rows whose ``value`` column lies inside the 1.5 * IQR fences.

   Args:
      data: DataFrame to filter.
      value: name of the column the quartiles are computed on.

   Returns:
      The rows of ``data`` with ``Q1 - 1.5*IQR <= data[value] <= Q3 + 1.5*IQR``
      (both bounds inclusive).
   """
   column = data[value]
   lower_quartile, upper_quartile = column.quantile(0.25), column.quantile(0.75)
   whisker = 1.5 * (upper_quartile - lower_quartile)

   # Rows inside the valid range
   inside = (column >= lower_quartile - whisker) & (column <= upper_quartile + whisker)
   return data.loc[inside]

In [None]:
def linear_regression(data_x, data_y, model: LinearRegression):
   """Fit ``model`` on an 80/20 split of the data and print its test-set metrics.

   Args:
      data_x: predictor values passed to ``train_test_split``.
      data_y: target values; must provide ``min()`` and ``max()``.
      model: estimator that is fitted in place on the training part.

   Prints the target range, MSE, its square root, R^2 and R^2 as a
   percentage. Nothing is returned.
   """
   # Fixed random_state keeps the split identical between runs
   x_fit, x_eval, y_fit, y_eval = train_test_split(
      data_x, data_y, train_size=0.8, test_size=0.2, random_state=42
   )

   # Train, then predict on the held-out part
   model.fit(x_fit, y_fit)
   predicted = model.predict(x_eval)

   # Error and goodness-of-fit metrics on the held-out part
   mse = mean_squared_error(y_eval, predicted)
   rmse = numpy.sqrt(mse)
   r2 = r2_score(y_eval, predicted)

   print(
      f'Rango: {data_y.min()} y {data_y.max()}',  # range of the target values
      f'MSE: {mse}',  # mean squared error
      f"RECM: {rmse}",  # root of the mean squared error
      f'R^2: {r2}',  # coefficient of determination
      f'R^2(%): {r2 * 100:.2f}%',
      sep='\n',
   )

In [None]:
def model_results(freq, data):
   """Count the rows of ``data`` per ``freq``-sized window of its 'fecha' column.

   Args:
      freq: pandas offset alias for the window size, e.g. 'D', '3D', 'W',
         'ME', '3ME', '6ME' or 'YE'.
      data: DataFrame with a datetime-like 'fecha' column.

   Returns:
      DataFrame indexed by window holding, for each window, the number of
      non-null values of the columns.
   """
   # NOTE(review): treating `freq` as the single window to compute is an
   # assumption based on the signature — confirm against the intended use.
   return data.resample(freq, on='fecha').count()

In [None]:
class Struct:
    """Attribute-style view over keyword arguments.

    Every keyword becomes an instance attribute; values that are dicts are
    converted recursively into nested ``Struct`` instances. Other values
    (including lists of dicts) are stored unchanged.
    """

    def __init__(self, **kwargs):
        vars(self).update(
            (name, Struct(**entry) if isinstance(entry, dict) else entry)
            for name, entry in kwargs.items()
        )

## **Prepare Data**

In [None]:
# Load the dataset, parsing the 'fecha' column as dates
datos = pandas.read_csv(
   './assets/data_filtered.csv',
   parse_dates=['fecha'],
   date_format='%Y-%m-%d',
)

# Item columns: everything between the first and the last column
items = datos.columns[1:-1].tolist()
# Columns used as predictors (data_x) by the regression cells
independent_data = ['año', 'mes', 'dia']

In [None]:
# Build one dataset per time frequency
time_frequencies = ['D', '3D', 'W', 'ME', '3ME']


def _add_elapsed_periods(frame):
   """Insert 1-based 'año', 'mes' and 'dia' counters right after column 0 of ``frame``.

   Each counter is the number of years / months / days elapsed since the
   earliest 'fecha' in ``frame``, starting at 1. ``frame`` is modified in place.
   """
   # Inserting at position 1 in this order leaves the columns as: año, mes, dia
   for column, period in (('dia', 'D'), ('mes', 'M'), ('año', 'Y')):
      periods = frame['fecha'].dt.to_period(period)
      frame.insert(1, column, (periods + 1 - periods.min()).apply(lambda x: x.n))


def _frequency_entry(frame):
   """Bundle ``frame`` with its date axis and the y-range used when plotting it."""
   return {
      'dataset': frame,
      'time': numpy.asarray(frame['fecha'], dtype='datetime64[s]'),
      'range': (-3, frame[selected_item].max() * 1.1),
   }


data_frequencies = {}

# 'D' is skipped here: the raw data is used as the 'D' dataset below
for frequency in time_frequencies[1:]:
   freq_data = datos.groupby(pandas.Grouper(key='fecha', freq=frequency, sort=True)).sum().reset_index()
   _add_elapsed_periods(freq_data)
   data_frequencies[frequency] = _frequency_entry(freq_data)

# Must stay after the loop: otherwise the grouped sum() above would also
# add up the counter columns.
_add_elapsed_periods(datos)
# Add the 'D' entry to the existing dict instead of rebinding it, so the
# aggregated frequencies computed in the loop are kept.
data_frequencies['D'] = _frequency_entry(datos)

data = Struct(**data_frequencies)

## **Atypical Data (Outlier) Filter**

In [None]:
# Scatter of the unfiltered 'D' dataset
if show_plots:
   plotter(
      x=data.D.time,
      y=data.D.dataset[selected_item],
      title=[selected_item, 'Data Origin'],
      y_limit=data.D.range,
   )

In [None]:
# Outlier limits based on standard deviations
st_dev_data = standart_deviation_filter(data=data.D.dataset, value=selected_item)
st_dev_time = numpy.asarray(st_dev_data['fecha'], dtype='datetime64[s]')

if show_plots:
   plotter(
      x=st_dev_time,
      y=st_dev_data[selected_item],
      title=[selected_item, 'Filtrado de Datos Atípicos (Desviación Estandar)'],
      y_limit=data.D.range,
   )

In [None]:
# Outlier limits based on quartiles
quant_data = quantiles_filter(data=data.D.dataset, value=selected_item)
quant_time = numpy.asarray(quant_data['fecha'], dtype='datetime64[s]')

if show_plots:
   plotter(
      x=quant_time,
      y=quant_data[selected_item],
      title=[selected_item, 'Filtrado de Datos Atípicos (Cuartiles)'],
      y_limit=data.D.range,
   )

In [None]:
# Side-by-side comparison of the unfiltered and filtered datasets
if show_plots:
   figure, axes = pyplot.subplots(ncols=3, figsize=(12, 6))

   # (dataset, panel title, show outlier markers, extra axes settings)
   panels = [
      (data.D.dataset, 'Totals  |  Totals', False, {'ylabel': selected_item}),
      (st_dev_data, 'Ventas  |  Desv. Estándar', False, {}),
      (quant_data, 'Ventas  |  Lim. Cuartiles', True, {}),
   ]

   for ax, (frame, panel_title, show_fliers, extra) in zip(axes, panels):
      ax.boxplot(frame[selected_item], showfliers=show_fliers)
      # Raw points drawn in a column to the left of the box
      ax.scatter(x=[.8] * len(frame), y=frame[selected_item], alpha=0.2)
      ax.grid()
      # All panels share the y-range of the unfiltered data
      ax.set(title=panel_title, ylim=data.D.range, xticks=[], **extra)


## **Model Train and Test**

In [None]:
# Linear regression | Unfiltered data
data_model = LinearRegression()
linear_regression(
   data_x=data.D.dataset[independent_data],
   data_y=data.D.dataset[selected_item],
   model=data_model,
)

In [None]:
# Linear regression | Outliers filtered by standard deviation
# Trains on st_dev_data so the data matches this cell's label and model name.
st_dev_data_model = LinearRegression()
linear_regression(data_x=st_dev_data[independent_data], data_y=st_dev_data[selected_item], model=st_dev_data_model)

In [None]:
# Linear regression | Outliers filtered by quartiles
# Trains on quant_data so the data matches this cell's label and model name.
quant_data_model = LinearRegression()
linear_regression(data_x=quant_data[independent_data], data_y=quant_data[selected_item], model=quant_data_model)

## **Model Predictions**

In [None]:
# ventas_predichas = modelo.predict([])
# print(f"Las ventas predichas para el mes {mes_siguiente} son: {ventas_predichas[0]}")