# **Polynomial Regression Model**

In [None]:
# Set Variables
# Path of the CSV file loaded by the data-preparation cell
data_path = '../assets/data_filtered.csv'
# Frequency aliases handed to pandas.Grouper / pandas.date_range ('D' is handled without grouping)
time_frequencies = ['D', '3D', 'W', '3W', 'ME', '3ME']

# Name of the column used as the regression target
selected_item = 'TOTAL'

# Highest polynomial degree evaluated by the degree search (inclusive)
degrees_limit = 20
# Passed to train_test_split as random_state so the split is reproducible
model_random_seed = 0
# NOTE(review): not read by any cell visible here; confirm whether plots should be gated on it
show_plots = False

## **First Steps**

In [None]:
# Set imports
import numpy
import pandas
from matplotlib import pyplot
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

In [None]:
class Struct:
    """Attribute-style container built from keyword arguments.

    Every keyword becomes an instance attribute. A ``dict`` value whose keys
    are all strings is converted recursively into a nested ``Struct``. Any
    other value is stored unchanged, including dicts with non-string keys,
    which cannot be expanded as keyword arguments.
    """

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            # Only string-keyed dicts can be unpacked with ** safely
            if isinstance(value, dict) and all(isinstance(k, str) for k in value):
                value = Struct(**value)
            self.__dict__[key] = value

    def __repr__(self):
        # Readable output when an instance is displayed in a notebook cell
        fields = ', '.join(
            f'{key}={value!r}' for key, value in self.__dict__.items()
        )
        return f'{type(self).__name__}({fields})'

In [None]:
def frequency_grouper(data):
    """Build one resampled dataset, plus plotting metadata, per frequency.

    For every alias in the module-level ``time_frequencies`` the rows of
    ``data`` are summed into bins of that frequency over the ``'fecha'``
    column. The ``'D'`` alias skips the grouping and works on a copy of
    ``data``. A 1-based ``'dia'`` column is then added, counting days from
    the earliest ``'fecha'`` of the resulting frame.

    Returns:
        dict mapping each alias to a ``Struct`` with the attributes
        ``min`` / ``max`` (extremes of the ``selected_item`` column),
        ``dataset`` (the frame), ``time`` (``'fecha'`` as a
        ``datetime64[s]`` array) and ``range`` (a ``(low, high)`` tuple
        padded by 1/20 of the value spread).
    """
    grouped = {}

    for freq in time_frequencies:
        if freq == 'D':
            # No aggregation needed; copy so the caller's frame is not
            # given the extra 'dia' column
            frame = data.copy()
        else:
            by_period = pandas.Grouper(key='fecha', freq=freq, sort=True)
            frame = data.groupby(by_period).sum().reset_index()

        # Day counter relative to the first date, starting at 1
        first_date = frame['fecha'].min()
        frame['dia'] = (frame['fecha'] - first_date).dt.days + 1

        lowest = frame[selected_item].min()
        highest = frame[selected_item].max()
        padding = (highest - lowest) / 20

        # Lower bound: one padding below zero when the data reaches zero
        # or less, otherwise exactly zero
        if lowest <= 0:
            lower_bound = -padding
        else:
            lower_bound = 0

        grouped[freq] = Struct(
            min=lowest,
            max=highest,
            dataset=frame,
            time=numpy.asarray(frame['fecha'], dtype='datetime64[s]'),
            range=(lower_bound, highest + padding),
        )

    return grouped

In [None]:
def polynomial_train(data_x, data_y, degree):
    """Fit a polynomial regression on an 80/20 split and predict the test part.

    The split is seeded with the module-level ``model_random_seed``. The
    polynomial expansion is fitted on the training rows only and then
    applied to the held-out rows.

    Returns:
        Struct with ``y_test`` (held-out targets) and ``y_pred`` (the
        model's predictions for the held-out rows).
    """
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        data_x,
        data_y,
        train_size=0.8,
        test_size=0.2,
        random_state=model_random_seed,
    )

    # Expand the inputs to polynomial terms and fit a linear model on them
    expander = PolynomialFeatures(degree=degree)
    regressor = LinearRegression()
    regressor.fit(expander.fit_transform(x_fit), y_fit)

    # Score on the held-out rows, reusing the already-fitted expansion
    holdout_pred = regressor.predict(expander.transform(x_holdout))

    return Struct(y_test=y_holdout, y_pred=holdout_pred)

In [None]:
def polynomial_test(data_test, data_pred, min, max):
    """Score predictions and return the RMSE as a percentage of a value range.

    Args:
        data_test: observed target values.
        data_pred: predicted target values.
        min: lower end of the range the RMSE is normalised by.
        max: upper end of that range.

    ``min`` and ``max`` shadow the builtins inside this function; the names
    are part of the signature and are therefore kept.

    Returns:
        Struct with a single attribute ``precm``: RMSE / (max - min) * 100.
    """
    # MSE, R2 and R2 (%) are computed but not part of the returned result
    mse = mean_squared_error(data_test, data_pred)
    rmse = root_mean_squared_error(data_test, data_pred)
    rmse_pct = rmse / (max - min) * 100

    r2 = r2_score(data_test, data_pred)
    r2_pct = r2 * 100

    return Struct(precm=rmse_pct)

In [None]:
def polynomial_regression(data_x, data_y, degree):
    """Fit a polynomial regression of the given degree on all supplied rows.

    No train/test split is made: the same rows are used for fitting and
    for the returned in-sample predictions.

    Returns:
        Struct with ``model`` (the fitted ``LinearRegression``),
        ``features`` (the fitted ``PolynomialFeatures``) and ``y_pred``
        (predictions for ``data_x``).
    """
    # Polynomial expansion of the inputs up to `degree`
    expander = PolynomialFeatures(degree=degree)
    design = expander.fit_transform(data_x)

    # Ordinary least squares on the expanded inputs
    regressor = LinearRegression()
    regressor.fit(design, data_y)

    return Struct(
        model=regressor,
        features=expander,
        y_pred=regressor.predict(design),
    )

In [None]:
def polynomial(data_x, data_y, degree):
    """Train and score a polynomial regression on an 80/20 split.

    The split is seeded with the module-level ``model_random_seed``. The
    polynomial expansion is fitted on the training rows and reused for the
    held-out rows.

    Returns:
        Struct with
        ``model``: the fitted ``LinearRegression``;
        ``data``: nested Struct holding ``x_test``, ``x_train``, ``y_test``,
        ``y_train`` and ``y_pred`` (predictions for ``x_test``);
        ``results``: nested Struct holding ``range`` (min, max of
        ``data_y``), ``ecm`` (MSE), ``recm`` (RMSE), ``precm`` (RMSE as a
        percentage of the range), ``r2`` and ``pr2`` (R2 * 100).
    """
    # Spread of the full target, used to normalise the RMSE
    y_low, y_high = data_y.min(), data_y.max()

    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        data_x,
        data_y,
        train_size=0.8,
        test_size=0.2,
        random_state=model_random_seed,
    )

    # Fit on the training rows only
    expander = PolynomialFeatures(degree=degree)
    regressor = LinearRegression()
    regressor.fit(expander.fit_transform(x_fit), y_fit)

    # Predict the held-out rows with the already-fitted expansion
    holdout_pred = regressor.predict(expander.transform(x_holdout))

    # Error metrics on the held-out rows
    mse = mean_squared_error(y_holdout, holdout_pred)
    rmse = root_mean_squared_error(y_holdout, holdout_pred)
    rmse_pct = rmse / (y_high - y_low) * 100

    r2 = r2_score(y_holdout, holdout_pred)
    r2_pct = r2 * 100

    return Struct(
        model=regressor,
        data={
            'x_test': x_holdout,
            'x_train': x_fit,
            'y_test': y_holdout,
            'y_train': y_fit,
            'y_pred': holdout_pred,
        },
        results={
            'range': (y_low, y_high),
            'ecm': mse,
            'recm': rmse,
            'precm': rmse_pct,
            'r2': r2,
            'pr2': r2_pct,
        },
    )

## **Prepare Data**

In [None]:
# Load filtered data; the 'fecha' column is parsed as a date in YYYY-MM-DD format
datos = pandas.read_csv(data_path, parse_dates=['fecha'], date_format='%Y-%m-%d')
# Preview the first 5 rows
datos.head(5)

In [None]:
# Keep only the necessary columns: the date and the selected target
datos = datos[['fecha', selected_item]]
# Preview the first rows
datos.head()

In [None]:
# Split data by time frequencies: one entry per alias in time_frequencies
data = frequency_grouper(data=datos)
# Column(s) used as model input; 'dia' is the day counter added by frequency_grouper
independent_data = ['dia']

## **Optimal Degree**

In [None]:
# Evaluate the model for every (frequency, degree) pair
degrees = numpy.arange(1, degrees_limit + 1)
degrees_prmse = {'degree': range(1, degrees_limit + 1)}

for frequency in time_frequencies:
    # Inputs and target of the dataset resampled at this frequency
    X = data[frequency].dataset[independent_data]
    y = data[frequency].dataset[selected_item]

    # One rounded RMSE (%) per degree, collected in degree order
    degrees_prmse[frequency] = []
    for degree in degrees:
        model_results = polynomial(data_x=X, data_y=y, degree=degree)
        rounded_score = round(model_results.results.precm, 3)
        degrees_prmse[frequency].append(rounded_score)

# RMSE (%) table: one row per degree, one column per frequency
prmse_results = pandas.DataFrame(degrees_prmse).set_index('degree')
prmse_results

In [None]:
# # Eval model by frequency-degree
# degrees = numpy.arange(1, degrees_limit+1)
# degrees_prmse = { 'degree': range(1, degrees_limit+1) }

# for frequency in time_frequencies:

#    degrees_prmse[frequency] = []

#    # independent and target data
#    X_data = data[frequency].dataset[independent_data]
#    y_data = data[frequency].dataset[selected_item]

#    # Get max and min data values
#    data_min = y_data.min()
#    data_max = y_data.max()

#    for degree in degrees:

#       train_results = polynomial_train(
#          data_x=X_data, 
#          data_y=y_data, 
#          degree=degree
#       )
#       test_results = polynomial_test(
#          data_test=train_results.y_test, 
#          data_pred=train_results.y_pred, 
#          min=data_min, 
#          max=data_max
#       )
#       degrees_prmse[frequency].append( round(test_results.precm, 3) )
#       # print(round(results.precm, 3))

# # Show RMSE (%) Results by Degree
# prmse_results = pandas.DataFrame(degrees_prmse).set_index('degree')
# prmse_results

In [None]:
# Lowest RMSE (%) across the whole degree x frequency table
min_prmse = prmse_results.values.min()
print(f'Min. error (prmse): {min_prmse}%')

# Locate the minimum: the flat argmin position is split into a row offset
# (degree) and a column offset (frequency)
min_degree, min_freq = divmod(
    prmse_results.values.argmin(),
    len(prmse_results.columns),
)
print(f'degree: °{min_degree + 1}')
print(f'freq: {prmse_results.columns[min_freq]}')

## **Run Model**

In [None]:
# Manual override: when enabled, the custom values replace the searched optimum
customs_selects = True
custom_freq = 'D'
custom_degree = 6

if customs_selects:
    selected_freq = custom_freq
    selected_degree = custom_degree
else:
    # Best pair found by the degree search (row offset -> degree is offset + 1)
    selected_freq = prmse_results.columns[min_freq]
    selected_degree = min_degree + 1

# Dataset resampled at the chosen frequency
selected_data = data[selected_freq].dataset
# Number of future periods to forecast
periods_to_predict = 30 * 7

In [None]:
# Create new model instance, fitted on every row of the selected dataset
polynomial_model = polynomial_regression(
   # Inputs are passed as a plain 2-D array with one column per row value
   data_x=numpy.array(selected_data[independent_data]).reshape(-1, 1),
   data_y=selected_data[selected_item],
   degree=selected_degree,
)

# In-sample predictions of the fitted model
y_pred = polynomial_model.y_pred

In [None]:
# Show Original Data
pyplot.figure(figsize=(16, 4))
# Observed values as points, in-sample model predictions as a red line
pyplot.scatter(selected_data['fecha'], selected_data[selected_item], label='Current Data')
pyplot.plot(selected_data['fecha'], y_pred, color='red', label='Polynomial Regression Model')
# Title lists the target column, the frequency alias and the polynomial degree
pyplot.title('  |  '.join([selected_item, f'freq = {selected_freq}', f'Polyn. Reg. °{selected_degree}']))
pyplot.xlabel('X')
pyplot.ylabel('y')
pyplot.legend()
pyplot.show()


## **Model Predictions**

In [None]:
# Get instance model and features
model = polynomial_model.model
features = polynomial_model.features

# The transformer was already fitted on a plain array when the model was built,
# so it is only applied here. Re-fitting it on a DataFrame would record feature
# names and make later transforms of plain arrays emit a feature-name warning.
X_poly = features.transform(
    numpy.array(selected_data[independent_data]).reshape(-1, 1)
)

In [None]:
# Build the future inputs: [first observed date, last observed date]
date_references = [selected_data['fecha'].min(), selected_data['fecha'].max()]

# One extra period is generated because the range starts at the last
# observed date; that first row is discarded below
future_dates = pandas.date_range(
    start=date_references[1],
    periods=periods_to_predict + 1,
    freq=selected_freq,
)
data_future_dates = pandas.DataFrame({
    'fecha': future_dates,
    # Same 1-based day counter as the training data, measured from the first date
    'dia': (future_dates - date_references[0]).days + 1,
})

# Drop the starting row and renumber from zero
data_predictions = data_future_dates.iloc[1:].reset_index(drop=True)
data_predictions

In [None]:
# Future day numbers as a single-column 2-D array
new_dates = numpy.array(data_predictions[independent_data]).reshape(-1, 1)
# Polynomial terms of those day numbers, from the already-fitted transformer
new_sales = features.transform(new_dates)

# Forecast each future period and round it to a whole number
predictions = [round(item) for item in model.predict(new_sales).flatten()]

# Store the forecasts next to their dates
data_predictions[selected_item] = predictions
data_predictions

In [None]:
# Plot Data with predictions
pyplot.figure(figsize=(16, 4))
# Observed values as points
pyplot.scatter(selected_data['fecha'], selected_data[selected_item], label='Current Data')
# Fitted curve over the observed dates
pyplot.plot(selected_data['fecha'], model.predict(X_poly), color='red', label='Polynomial Regression Model')
# Forecasts for the future dates, drawn as green crosses
pyplot.scatter(data_predictions['fecha'], predictions, color='green', marker='x', label='Predictions')
pyplot.title(f'{selected_item}  |  freq = {selected_freq}  |  Regresión de Grado °{selected_degree}')
pyplot.xlabel('Fechas')
pyplot.ylabel('No. de Ventas')
pyplot.legend()
pyplot.show()