# **Predicción Ventas de Yamaha 2024-2025**

**(Data Mining Linear Regression)**

In [None]:
# Import needed libraries
import os
import dotenv
import pyodbc
import pandas
import numpy
import seaborn
import matplotlib
import matplotlib.pyplot as pyplot
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Project parameters

# Time window of the analysis
time_start = '2020'
time_end = '2024'

# When True the database query is skipped and the CSV already on disk is reused
fetched = True

# Time granularity of the aggregation: one of [año, mes, dia]
data_time_freq = 'mes'
# Measure to aggregate: one of [cantidad, costo]
data_values = 'cantidad'
# Dimension the sales are broken down by: one of [asesor, modelo, clasificacion]
data_filter = 'modelo'

# Set Model params

## **Get and Prepare Data**

Data connection, cleaning and filtering.

In [None]:
# File locations of the data pipeline.
# Every stage reads the file written by the stage before it.

# Connect: SQL query to run and destination of the raw result
connect_sql_path = './assets/query.sql'
connect_save_path = './assets/data.csv'

# Clean: raw export in, cleaned CSV out
clean_data_path = connect_save_path
clean_save_path = './assets/data_cleaned.csv'

# Filter: cleaned CSV in, filtered CSV out
filter_data_path = clean_save_path
filter_save_path = './assets/data_filtered.csv'

### Connect

Start by initializing the **connection** and **getting the data from the DB**.

In [None]:
# Load the environment configuration through python-dotenv
dotenv.load_dotenv()

# Database connection settings; each one is None when its variable is not set
DRIVER = os.environ.get('PROJECT_DRIVER')
SERVER = os.environ.get('PROJECT_SERVER')
DATABASE = os.environ.get('PROJECT_DATABASE')
USERNAME = os.environ.get('PROJECT_USERNAME')
PASSWORD = os.environ.get('PROJECT_PASSWORD')

# Echo the configured server as the cell output
SERVER

In [None]:
# Read the complete SQL statement that will be sent to the database
with open(connect_sql_path) as sql_file:
   sql_query = sql_file.read()

In [None]:
if not fetched:
   # Build the ODBC connection string from the environment settings
   connectionString = f'DRIVER={DRIVER};SERVER={SERVER};DATABASE={DATABASE};UID={USERNAME};PWD={PASSWORD};TrustServerCertificate=YES;'
   connection = pyodbc.connect(connectionString)

   try:
      # Execute the query with pandas; the result is stored in `query`
      query = pandas.read_sql_query(
         sql_query,
         connection,
      )
   finally:
      # Always release the database connection, even when the query fails
      connection.close()

In [None]:
if not fetched:
   # Store the query result as a ';'-separated CSV with a header row and no index column
   results = pandas.DataFrame(query)
   results.to_csv(
      connect_save_path,
      sep=";",
      header=True,
      index=False,
   )

### Clean

Clean the data by removing extra characters and export it to a new .CSV file.

In [None]:
# Read the raw export and separate the header row from the data rows
with open(clean_data_path, 'r') as raw_file:
   headers, *file_content = raw_file.read().split('\n')
   clean_data = '\n'.join(file_content)

   # Show what was loaded
   print(headers)
   print(clean_data)

In [None]:
# Delete extra chars.
# Whitespace is normalised line by line and cell by cell, so characters that
# legitimately belong to the data (for example '_') are never rewritten and
# no placeholder character is needed to protect the line breaks.

# decimals to int values
clean_data = clean_data.replace('.0;', ';')
# extra commas
clean_data = clean_data.replace(',', '')
# extra quotation marks (removed first so the spaces they enclose get trimmed too)
clean_data = clean_data.replace('"', '')
# leading, trailing and repeated whitespace inside every ';'-separated cell
clean_data = '\n'.join(
   ';'.join(' '.join(cell.split()) for cell in line.split(';'))
   for line in clean_data.split('\n')
)

In [None]:
# Switch header and body to the default CSV separator (',')
headers, clean_data = (text.replace(';', ',') for text in (headers, clean_data))

# Write the header row followed by the cleaned rows
with open(clean_save_path, 'w') as cleaned_file:
   cleaned_file.write(f'{headers}\n{clean_data}')

### Filter

Filter the cleaned data and export it to a new .CSV file

In [None]:
# Load the cleaned CSV (filter_data_path) into a DataFrame to be filtered
filter_data = pandas.read_csv(filter_data_path)
# Preview the first rows as the cell output
filter_data.head()

In [None]:
# Parse the 'fecha' column as datetime (text formatted as year-month-day)
filter_data['fecha'] = pandas.to_datetime(filter_data['fecha'], format="%Y-%m-%d")

# Columns the analysis does not use
unused_columns = [
   'sw',
   'bodega',
   'ident_asesor',
   'ident_cliente',
   'nom_cliente',
   'utilidad',
   'modelo',
   'financiera',
   'dias_inv',
   'doc_ref',
]
filter_data = filter_data.drop(columns=unused_columns)
filter_data.head()

In [None]:
# Create params maps

# Time frequency option -> pandas offset alias ('symbol') and date text format
date_mappings = {
   'año': {
      'symbol': 'YE',
      'format': '%Y'
   },
   'mes': {
      'symbol': 'ME',
      'format': '%Y-%m'
   },
   'dia': {
      'symbol': 'D',
      'format': '%Y-%m-%d'
   }
}
# Filter option -> dataframe column used as the secondary grouping key
# NOTE(review): the 'financiera' column is removed together with the other
# unused columns before grouping, so that option cannot work as-is — confirm
# whether it should stay here.
filter_mappings = {
   'asesor': 'nom_asesor',
   'modelo': 'des_modelo',
   'financiera': 'financiera',
   'clasificacion': 'clasificacion'
}
# Value option -> dataframe column to aggregate and the type it is cast to
values_mappings = {
   'cantidad': {
      'name': 'cantidad',
      'type': int
   },
   'costo': {
      'name': 'costo_unitario',
      'type': float
   }
}

# Fail early with a clear message when a project param is not a known option;
# a silent None here would only surface later as an obscure TypeError.
for _param_name, _param_value, _options in (
   ('data_time_freq', data_time_freq, date_mappings),
   ('data_filter', data_filter, filter_mappings),
   ('data_values', data_values, values_mappings),
):
   if _param_value not in _options:
      raise ValueError(
         f'Invalid {_param_name} {_param_value!r}; expected one of {sorted(_options)}'
      )

# Set the variables according to the filtering params
selected_time = date_mappings[data_time_freq]
selected_filter = filter_mappings[data_filter]
selected_value = values_mappings[data_values]

In [None]:
# Sum the selected value per time period ('fecha', primary key) and per
# selected filter column (secondary key)
time_grouper = pandas.Grouper(key='fecha', freq=selected_time['symbol'], sort=True)
grouped = filter_data.groupby([time_grouper, selected_filter])
data_group = grouped[selected_value['name']].sum()
data_group

In [None]:
# Convert the grouped series into a new dataframe: the second index level
# (level=1, the selected filter) is pivoted into columns
data_filtered = data_group.unstack(level=1)
# Show the resulting dataframe as the cell output
data_filtered

In [None]:
# Replace missing combinations with 0 and cast every column to the value type
data_filtered = data_filtered.fillna(0).astype(selected_value['type'])
# Add the row-wise sum over all columns as 'total'
data_filtered['total'] = data_filtered.sum(axis=1)
data_filtered.head()

In [None]:
# Make sure the index holds datetimes, then keep the rows inside the time window
data_filtered.index = pandas.to_datetime(data_filtered.index, format=selected_time['format'])
# NOTE(review): the bounds are plain strings compared against the datetime
# index; a bare year such as '2024' presumably resolves to its first day, so
# later dates of that year are excluded — confirm this is intended.
in_window = (data_filtered.index >= time_start) & (data_filtered.index <= time_end)
data_filtered = data_filtered[in_window]
data_filtered.head()

In [None]:
# Export the filtered data to CSV (filter_save_path); dates are written with
# the text format of the selected time frequency
data_filtered.to_csv(filter_save_path, date_format=selected_time['format'])
# Optional JSON export of the same data (disabled):
# data_filtered.to_json(filter_save_path.replace('.csv', '.json'), date_format=selected_time['format'], indent=3)

## **Implement Data Model**

Creation and implementation of the Linear Regression model.

### Show

**Data plot** with pandas and matplotlib.

In [None]:
# IPython magic: render figures inline in the notebook, so plots appear
# without calling show()
%matplotlib inline

In [None]:
# Read the filtered data written by the export step
data = pandas.read_csv(filter_save_path)

# Format the date column
data['fecha'] = pandas.to_datetime(data['fecha'], format=selected_time['format'])

# Every column except the date holds values. They are selected by name: the
# file has just been read, so the item columns start right after 'fecha' and a
# positional offset of 2 would skip the first item.
value_columns = [column for column in data.columns if column != 'fecha']
# Column-wise assignment (unlike an iloc slice assignment) really changes dtypes
data[value_columns] = data[value_columns].astype(selected_value['type'])

# Items to analyse: every value column except the aggregated 'total'
items = [column for column in value_columns if column != 'total']

In [None]:
# Add a 1-based period counter next to the date column: the number of periods
# elapsed since the earliest date, plus one
period_alias = selected_time['symbol'][0]
periods = data['fecha'].dt.to_period(period_alias)
period_count = (periods + 1 - periods.min()).apply(lambda offset: offset.n)
data.insert(1, data_time_freq, period_count)
data.head()

In [None]:
# Timestamps for the x axis, at second resolution
time = numpy.asarray(data['fecha'], dtype='datetime64[s]')

# Colours used for the per-item subplots
colors = [
   '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
   '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
]

# Grid lines are off by default; axes that need them call grid() themselves
pyplot.rcParams['axes.grid'] = False

In [None]:
# Overview figure: every item series plotted over time on a single axes
figure_a, axes_a = pyplot.subplots(figsize=(12, 4))

ax = axes_a
ax.plot(time, data[items])
ax.tick_params(axis='x', labelrotation=45)
# The title follows the configured time window instead of a hard-coded end year
ax.set_title(f'No. de Ventas {time_start} - {time_end}')
ax.set_xlabel('Fecha')
ax.set_ylabel('Ventas')
ax.margins(x=0.03, y=0.01)
ax.grid()

figure_a.tight_layout()

In [None]:
# Subgroup to plot individually: the first ten items (fewer when not available)
subitems = items[:10]

In [None]:
# Grid of small plots, five per row, one per item of the subgroup.
# Rows are rounded up so a partially filled last row still gets its axes, and
# squeeze=False keeps axes_b two-dimensional even when there is a single row.
grid_columns = 5
grid_rows = max(1, -(-len(subitems) // grid_columns))
figure_b, axes_b = pyplot.subplots(nrows=grid_rows, ncols=grid_columns, figsize=(20, 5), squeeze=False)

# Iterate over each item and draw it in its own subplot
for index, item in enumerate(subitems):
    ax = axes_b[index // grid_columns, index % grid_columns]
    ax.plot(time, data[item], label=item, color=colors[index % len(colors)])
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend()

# Hide the grid cells that received no item
for unused_ax in axes_b.ravel()[len(subitems):]:
    unused_ax.set_visible(False)
figure_b.tight_layout()

In [None]:
# Same grid as before, but every subplot shares a fixed y range (0 to 250) so
# the items can be compared at the same scale.
# Rows are rounded up so a partially filled last row still gets its axes, and
# squeeze=False keeps axes_c two-dimensional even when there is a single row.
grid_columns = 5
grid_rows = max(1, -(-len(subitems) // grid_columns))
figure_c, axes_c = pyplot.subplots(nrows=grid_rows, ncols=grid_columns, figsize=(20, 5), squeeze=False)

# Iterate over each item and draw it in its own subplot
for index, item in enumerate(subitems):
    ax = axes_c[index // grid_columns, index % grid_columns]
    ax.plot(time, data[item], label=item, color=colors[index % len(colors)])
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend()
    ax.set(ylim=(0, 250))

# Hide the grid cells that received no item
for unused_ax in axes_c.ravel()[len(subitems):]:
    unused_ax.set_visible(False)
figure_c.tight_layout()

In [None]:
# Example items used in the detailed plots, picked by their position in `items`
example_items = [items[position] for position in (8, 40)]

In [None]:
# One subplot per example item, laid out side by side
figure_c, axes_c = pyplot.subplots(nrows=1, ncols=len(example_items), figsize=(8, 4))

# Draw each example item against the overall total
for ax, key in zip(axes_c, example_items):
   ax.plot(time, data[key], label=key)
   ax.plot(time, data['total'], label='Total')
   ax.set_title(f'{key}')
   ax.set_xlabel('Fecha')
   ax.set_ylabel('No. de Ventas')
   ax.tick_params(axis='x', labelrotation=45)
   ax.margins(x=0.03, y=0.04)
   ax.grid()
   ax.legend()

# Fix figure layout
figure_c.tight_layout()

### Model

Create a Linear Regression prediction model with scikit-learn.

In [None]:
# Create a new linear regression model instance
# linearRegressionModel = LinearRegression()