# **Linear Regression Model**

## **First Steps**

In [None]:
# Set imports
import pandas
import numpy
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score


In [None]:
# Set Variables
# Location of the input CSV (relative path).
data_path = '../assets/data_filtered.csv'

# Dataset column that is plotted and used as the regression target.
selected_item = 'FZ150'
# Frequency aliases handed to pandas.Grouper when the data is aggregated.
time_frequencies = ['D', '3D', 'W', 'ME', '3ME']

# Plotting cells only run when this flag is truthy.
show_plots = True

In [None]:
class Struct:
    """Attribute-style container built from keyword arguments.

    Every keyword becomes an instance attribute. A value that is a ``dict``
    is converted recursively into a nested ``Struct``; any other value is
    stored unchanged.
    """

    def __init__(self, **kwargs):
        for name, item in kwargs.items():
            # Wrap nested dictionaries so they are reachable with dot access too.
            wrapped = Struct(**item) if isinstance(item, dict) else item
            setattr(self, name, wrapped)

In [None]:
def plotter(xy, title, y_limit, grid):
    """Scatter-plot the selected item over time together with a linear trend.

    Args:
        xy: DataFrame with a ``'fecha'`` datetime column and the column named
            by the module-level ``selected_item``.
        title: Sequence of strings, joined with ``'  |  '`` to form the title.
        y_limit: Value forwarded to the axes' y-limits (callers in this file
            pass a ``(low, high)`` tuple).
        grid: When truthy, ``ax.grid()`` is called.
    """
    dates = xy['fecha']
    sales = xy[selected_item]

    # polyfit needs numeric x values, so dates are mapped to proleptic ordinals.
    ordinals = dates.map(pandas.Timestamp.toordinal)
    timeline = numpy.asarray(dates, dtype='datetime64[s]')

    # Degree-1 least-squares fit evaluated at every observed date.
    trend_coefficients = numpy.polyfit(ordinals, sales, 1)
    trend_values = numpy.polyval(trend_coefficients, ordinals)

    figure, ax = pyplot.subplots(nrows=1, ncols=1, figsize=(10, 3))

    ax.scatter(timeline, sales, s=12, label=selected_item)
    ax.plot(timeline, trend_values, color='red', label='Tendencia lineal')
    ax.tick_params(axis='x', labelrotation=0)
    ax.margins(x=0.03, y=0.04)

    ax.set_title('  |  '.join(title))
    ax.set_xlabel('Fecha')
    ax.set_ylabel('No. de Ventas')
    ax.set_ylim(y_limit)

    ax.legend()
    if grid:
        ax.grid()

    figure.tight_layout()

In [None]:
def standart_deviation_filter(data, value):
    """Keep rows whose ``value`` column lies within three standard deviations of its mean.

    Args:
        data: DataFrame to filter.
        value: Name of the column the limits are computed on.

    Returns:
        The rows of ``data`` where ``data[value]`` is inside
        ``[mean - 3 * std, mean + 3 * std]``, both ends included.
    """
    column = data[value]
    center = column.mean()
    spread = 3 * column.std()

    # `between` is inclusive on both sides, so values exactly on a limit stay.
    return data[column.between(center - spread, center + spread)]

In [None]:
def quantiles_filter(data, value):
    """Keep rows whose ``value`` column lies inside the 1.5 * IQR fences.

    Args:
        data: DataFrame to filter.
        value: Name of the column the limits are computed on.

    Returns:
        The rows of ``data`` where ``data[value]`` is inside
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, both ends included.
    """
    column = data[value]
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)

    # Distance the fences extend beyond the quartiles.
    whisker = 1.5 * (upper_quartile - lower_quartile)

    above_floor = column >= lower_quartile - whisker
    below_ceiling = column <= upper_quartile + whisker
    return data[above_floor & below_ceiling]

In [None]:
def linear_regression(data_x, data_y):
    """Fit a linear regression on an 80/20 split and report test-set metrics.

    The split uses ``random_state=42``, so the same inputs always produce the
    same train/test partition.

    Args:
        data_x: Feature table handed to ``train_test_split``.
        data_y: Target values; must provide ``.min()`` and ``.max()``.

    Returns:
        Struct with the attributes:
            range: ``(min, max)`` of the whole ``data_y`` (not only the test part).
            ecm: mean squared error on the test split.
            recm: root mean squared error on the test split.
            precm: ``recm`` expressed as a percentage of the target range.
            r2: coefficient of determination on the test split.
            pr2: ``r2`` multiplied by 100.
    """
    # Split the data into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        data_x, data_y, train_size=0.8, test_size=0.2, random_state=42
    )

    # Create and train the model, then predict on the held-out part.
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    data_min = data_y.min()
    data_max = data_y.max()

    # Each metric is computed once and reused when building the result.
    ecm = mean_squared_error(y_test, y_pred)
    recm = root_mean_squared_error(y_test, y_pred)
    # NOTE(review): the divisor is zero when every target value is equal — confirm
    # whether callers need a guard for that case.
    precm = recm / (data_max - data_min) * 100
    r2 = r2_score(y_test, y_pred)

    return Struct(
        range=(data_min, data_max),
        ecm=ecm,
        recm=recm,  # Error margin of the predictions
        precm=precm,  # Error as a percentage
        r2=r2,
        pr2=r2 * 100,
    )

In [None]:
def frequency_spliter(data):
    """Aggregate ``data`` at every configured frequency and attach plot metadata.

    For each alias in the module-level ``time_frequencies`` the rows are summed
    per ``'fecha'`` bucket (the ``'D'`` alias just copies ``data``), and three
    1-based elapsed-period columns are inserted right after ``'fecha'``.

    Args:
        data: DataFrame with a ``'fecha'`` datetime column and the column named
            by the module-level ``selected_item``.

    Returns:
        dict mapping each frequency alias to a ``Struct`` with:
            dataset: the aggregated DataFrame.
            time: ``'fecha'`` as a ``datetime64[s]`` array.
            range: ``(low, high)`` y-limits padded by 1/20 of the value span.
    """
    # (column name, period alias) in insertion order. Every column goes to
    # position 1, so the final order after 'fecha' is 'año', 'mes', 'dia'.
    elapsed_columns = (('dia', 'D'), ('mes', 'M'), ('año', 'Y'))

    result = {}
    for frequency in time_frequencies:
        if frequency == 'D':
            resampled = data.copy()
        else:
            grouper = pandas.Grouper(key='fecha', freq=frequency, sort=True)
            resampled = data.groupby(grouper).sum().reset_index()

        for column, unit in elapsed_columns:
            periods = resampled['fecha'].dt.to_period(unit)
            # Period differences are offsets; `.n` extracts the integer count.
            elapsed = (periods + 1 - periods.min()).apply(lambda offset: offset.n)
            resampled.insert(1, column, elapsed)

        lowest = resampled[selected_item].min()
        highest = resampled[selected_item].max()
        padding = (highest - lowest) / 20

        # NOTE(review): with a negative minimum the lower limit becomes the
        # (positive) padding rather than a value below the minimum — confirm intent.
        if lowest < 0:
            lower_limit = padding
        else:
            lower_limit = lowest - padding

        result[frequency] = Struct(
            dataset=resampled,
            time=numpy.asarray(resampled['fecha'], dtype='datetime64[s]'),
            range=(lower_limit, highest + padding),
        )

    return result

## **Prepare Data**

In [None]:
# Load and prepare Dataset
# The 'fecha' column is parsed as dates written as year-month-day.
datos = pandas.read_csv(data_path, parse_dates=['fecha'], date_format='%Y-%m-%d')

# Set Independent Data
# Names of every column except the first and the last one.
# NOTE(review): `items` is not referenced again in the visible cells.
items = list( datos.iloc[:, 1:-1].keys() )
# Regression features; frequency_spliter inserts these columns.
independent_data = ['año', 'mes', 'dia']

# Aggregate data by time-frequencies
# Result maps each alias in time_frequencies to a Struct (dataset, time, range).
data = frequency_spliter(data=datos)

In [None]:
# Prepare data example to plot
# Frequency alias (a key of `data`) used by the example plots.
time_frequency_example = '3D'
# Passed as `grid` to plotter.
show_plot_grids = True
data_example = data[time_frequency_example]

## **Atypical Data (Outlier) Filter**

In [None]:
# Filter the example data, removing atypical (extreme) values

# Standard deviation limits
st_dev_data = standart_deviation_filter(data=data_example.dataset, value=selected_item)
# NOTE(review): st_dev_time is not read by the visible cells.
st_dev_time = numpy.asarray(st_dev_data['fecha'], dtype='datetime64[s]')

# Quantile limits
quant_data = quantiles_filter(data=data_example.dataset, value=selected_item)
# NOTE(review): quant_time is not read by the visible cells.
quant_time = numpy.asarray(quant_data['fecha'], dtype='datetime64[s]')

In [None]:
# Example data plot: unfiltered example dataset with its linear trend
if show_plots:
   plotter(
      xy=data_example.dataset[['fecha' ,selected_item]], 
      title=[selected_item, f'freq = {time_frequency_example}', 'Data Origin'], 
      y_limit=data_example.range,
      grid=show_plot_grids,
   )

In [None]:
# Standard deviation plot: example data after the standard-deviation filter.
# The y-limits of the unfiltered data are reused so the plots share one scale.
if show_plots:
   plotter(
      xy=st_dev_data[['fecha' ,selected_item]], 
      title=[selected_item, f'freq = {time_frequency_example}', 'Filtrado de Datos Atípicos (Desviación Estandar)'], 
      y_limit=data_example.range,
      grid=show_plot_grids,
   )

In [None]:
# Quantiles plot: example data after the quantile filter.
# The y-limits of the unfiltered data are reused so the plots share one scale.
if show_plots:
   plotter(
      xy=quant_data[['fecha' ,selected_item]], 
      title=[selected_item, f'freq = {time_frequency_example}', 'Filtrado de Datos Atípicos (Cuantiles)'], 
      y_limit=data_example.range,
      grid=show_plot_grids,
   )

In [None]:
# Limits comparison plot: one box plot per dataset (original, standard
# deviation filter, quantile filter), all on the y-limits of the original data.
if show_plots:
   figure, axes = pyplot.subplots(ncols=3, figsize=(12, 6))

   # (dataset, panel title, whether the box plot draws its fliers)
   panels = (
      (data_example.dataset, 'Ventas  |  Original Data', False),
      (st_dev_data, 'Ventas  |  Desv. Estándar', False),
      (quant_data, 'Ventas  |  Lim. Cuantiles', True),
   )

   for ax, (panel_data, panel_title, with_fliers) in zip(axes, panels):
      sales = panel_data[selected_item]
      ax.boxplot(sales, showfliers=with_fliers)
      # Raw points are drawn at x = 0.8, beside the box.
      ax.scatter(x=[.8] * len(panel_data), y=sales, alpha=0.2, s=24)
      ax.grid()
      ax.set(title=panel_title, ylim=data_example.range, xticks=[])

   # Only the first panel carries the y-axis label.
   axes[0].set(ylabel=selected_item)


## **Model Training and Testing**

In [None]:
# Linear regression results | example data
data_example = data[time_frequency_example]

# Fit on the elapsed-period features with the selected item as the target.
data_example_model_results = linear_regression(data_x=data_example.dataset[independent_data], data_y=data_example.dataset[selected_item])

print(f'Rango: {data_example_model_results.range}') # Range of the target values
# print(f'MSE: {data_example_model_results.ecm}') # Mean squared error
print(f'RECM: ± {data_example_model_results.recm:.0f}') # Root mean squared error
print(f'RECM(%): {data_example_model_results.precm:.2f} %') # Root mean squared error as a percentage of the range

print(f'R^2: {data_example_model_results.r2}') # R-squared (model fit)
print(f'R^2(%): {data_example_model_results.pr2:.2f}%') # R-squared as a percentage

In [None]:
# Results mappers
# Each list receives one entry per frequency; the dictionaries are later
# turned into DataFrames indexed by time_frequencies.

# Target range and RMSE for each dataset variant.
rmse_results = {
   'Orig. Range': [],
   'Orig.': [], 
   'Stand. Dev. Range': [],
   'Stand. Dev.': [], 
   'Quant. Range': [],
   'Quant.': [],
}

# RMSE as a percentage of the target range.
prmse_results = {
   'Orig.': [],
   'Stand. Dev.': [],
   'Quant.': [],
}

# R-squared as a percentage.
pr2_results = {
   'Orig.': [],
   'Stand. Dev.': [], 
   'Quant.': [],
}

In [None]:
# Get linear model results by frequency and outlier filter.
# Every pass appends one entry to each result list, keeping the lists aligned
# with time_frequencies.
for freq in time_frequencies:

   # Unfiltered data
   data_model_results = linear_regression(data_x=data[freq].dataset[independent_data], data_y=data[freq].dataset[selected_item])
   rmse_results['Orig. Range'].append( f'{data_model_results.range}' )
   rmse_results['Orig.'].append( f'{data_model_results.recm:.0f}' )
   prmse_results['Orig.'].append( f'{data_model_results.precm:.1f}%' )
   pr2_results['Orig.'].append( f'{data_model_results.pr2:.0f}%' )

   # Standard-deviation filter
   st_dev_data = standart_deviation_filter(data=data[freq].dataset, value=selected_item)
   st_dev_model_results = linear_regression(data_x=st_dev_data[independent_data], data_y=st_dev_data[selected_item])
   rmse_results['Stand. Dev. Range'].append( f'{st_dev_model_results.range}' )
   rmse_results['Stand. Dev.'].append( f'{st_dev_model_results.recm:.0f}' )
   prmse_results['Stand. Dev.'].append( f'{st_dev_model_results.precm:.1f}%' )
   pr2_results['Stand. Dev.'].append( f'{st_dev_model_results.pr2:.0f}%' )

   # Quantile filter. This must call quantiles_filter: using the
   # standard-deviation filter here would make the 'Quant.' columns a copy
   # of the 'Stand. Dev.' ones.
   quant_data = quantiles_filter(data=data[freq].dataset, value=selected_item)
   quant_model_results = linear_regression(data_x=quant_data[independent_data], data_y=quant_data[selected_item])
   rmse_results['Quant. Range'].append( f'{quant_model_results.range}' )
   rmse_results['Quant.'].append( f'{quant_model_results.recm:.0f}' )
   prmse_results['Quant.'].append( f'{quant_model_results.precm:.1f}%' )
   pr2_results['Quant.'].append( f'{quant_model_results.pr2:.0f}%' )

In [None]:
# Build the RMSE results table (one row per frequency).
General_RMSE_Results = pandas.DataFrame(rmse_results, index=time_frequencies)
# Display is disabled; uncomment the next line to show the table.
# General_RMSE_Results

In [None]:
# Show RMSE (%) results (one row per frequency)
General_PRMSE_Results = pandas.DataFrame(prmse_results, index=time_frequencies)
General_PRMSE_Results

In [None]:
# Show R2 (%) results (one row per frequency)
General_R2_Results = pandas.DataFrame(pr2_results, index=time_frequencies)
General_R2_Results

## **Model Predictions**

In [None]:
# ventas_predichas = modelo.predict([])
# print(f"Las ventas predichas para el mes {mes_siguiente} son: {ventas_predichas[0]}")