# **Predicción Ventas de Yamaha 2024-2025**

**(Data Mining Linear Regression)**

In [None]:
# Import needed libraries
import os
import dotenv
import pyodbc
import pandas
import numpy
import matplotlib
import matplotlib.pyplot as pyplot
import seaborn
import json

## **Get Data**

Data connection, cleaning and filtering.

In [None]:
# Settings for the "Get Data" stage.
# Each step reads the file that the previous step wrote.

# Connection: SQL file to run and destination of the raw extract
connection_sql_path = './assets/query.sql'
connection_save_path = './assets/data.csv'

# Cleaning: takes the raw extract, writes the cleaned CSV
cleaning_data_path = connection_save_path
cleaning_save_path = './assets/data_cleaned.csv'

# Filtering: takes the cleaned CSV, writes the aggregated CSV
filtering_data_path = cleaning_save_path
filtering_save_path = './assets/data_filtered.csv'
filtering_time_range = 'M'  # one of: Y, M, D
filtering_filter = 'modelo'  # one of: asesor, modelo, clasificacion
filtering_values = 'cantidad'  # one of: cantidad, costo

### Connection

Start by initializing the **connection** and **getting the data from the DB**.

In [None]:
# Load the .env file so the PROJECT_* settings can be read from the environment
dotenv.load_dotenv()

# Database connection settings; each one is None when its variable is not set
DRIVER = os.environ.get('PROJECT_DRIVER')
SERVER = os.environ.get('PROJECT_SERVER')
DATABASE = os.environ.get('PROJECT_DATABASE')
USERNAME = os.environ.get('PROJECT_USERNAME')
PASSWORD = os.environ.get('PROJECT_PASSWORD')

In [None]:
# Read the whole SQL file into a string; it is sent to the server as-is
with open(connection_sql_path, 'r') as file:
   sql_query = file.read()

In [None]:
# Run the SQL query against the MSSQL server and keep the result as a DataFrame.
# `conn_state` records whether `query` was produced, so later cells can check it.
conn_state = True

try:
   # Establish connection
   connectionString = f'DRIVER={DRIVER};SERVER={SERVER};DATABASE={DATABASE};UID={USERNAME};PWD={PASSWORD};TrustServerCertificate=YES;'
   connection = pyodbc.connect(connectionString)

   # Execute query with pandas
   query = pandas.read_sql_query(
      sql_query,
      connection,
   )
except Exception as error:
   # `Exception` instead of a bare `except` so KeyboardInterrupt / SystemExit
   # still propagate; the error detail is printed to make failures diagnosable.
   conn_state = False
   print('Exception at connection with MSSQL-Server')
   print(f'{type(error).__name__}: {error}')

In [None]:
# Persist the query result only when the connection cell succeeded
if conn_state:
   # Write a ';'-separated CSV with a header row and without the index column
   results = pandas.DataFrame(query)
   results.to_csv(connection_save_path, sep=";", header=True, index=False)

### Cleaning

Clean the data by removing extra characters and export it to a new .CSV file

In [None]:
# Read the raw CSV as plain text and separate the header row from the data rows
with open(cleaning_data_path, 'r') as file:
   # First line holds the column names; the remaining lines are the data
   headers, *file_content = file.read().split('\n')

clean_data = '\n'.join(file_content)

# Show the header line followed by the data, each ending in a newline
print(headers, clean_data, sep='\n')

In [None]:
# Delete extra chars:

# decimals to int values
clean_data = clean_data.replace('.0;', ';')
# extra commas
clean_data = clean_data.replace(',', '')
# single space right after / before a separator
clean_data = clean_data.replace('; ', ';')
clean_data = clean_data.replace(' ;', ';')
# repeated whitespace inside any cell
# Done row by row so no placeholder character is needed for the line breaks:
# underscores that belong to the data are kept instead of becoming row breaks.
# Leading/trailing whitespace of each row is trimmed as a side effect.
clean_data = '\n'.join(
   ' '.join(row.split()) for row in clean_data.split('\n')
)
# extra quotation marks
clean_data = clean_data.replace('"', '')

In [None]:
# Switch the header row and the data rows from ';' to the CSV default ','
headers, clean_data = (text.replace(';', ',') for text in (headers, clean_data))

# Write the header line followed by the data rows
with open(cleaning_save_path, 'w') as file:
   file.write(f'{headers}\n{clean_data}')

### Filtering

Filter the cleaned data and export it to a new .CSV file

In [None]:
# Read the cleaned CSV into a DataFrame; the bare name on the last line displays it
filter_data = pandas.read_csv(filepath_or_buffer=filtering_data_path)
filter_data

In [None]:
# Parse the 'fecha' column (YYYY-MM-DD text) into datetime values
filter_data['fecha'] = pandas.to_datetime(filter_data['fecha'], format="%Y-%m-%d")

# Columns that are not needed from here on
unused_columns = [
   'sw',
   'bodega',
   'ident_asesor',
   'ident_cliente',
   'nom_cliente',
   'utilidad',
   'modelo',
   'financiera',
   'dias_inv',
   'doc_ref',
]
filter_data = filter_data.drop(columns=unused_columns)
filter_data

In [None]:
# Create params maps

# Time range option -> pandas frequency alias and the strftime pattern for labels
date_mappings = {
   'Y': {
      'symbol': 'YE',
      'format': '%Y'
   },
   'M': {
      'symbol': 'ME',
      'format': '%Y-%m'
   },
   'D': {
      'symbol': 'D',
      'format': '%Y-%m-%d'
   }
}
# Filter option -> dataframe column used as the secondary grouping key
# NOTE(review): 'financiera' is listed here, but that column is dropped from
# filter_data before grouping — confirm whether this option is meant to work.
filter_mappings = {
   'asesor': 'nom_asesor',
   'modelo': 'des_modelo',
   'financiera': 'financiera',
   'clasificacion': 'clasificacion'
}
# Value option -> dataframe column to aggregate and the type of the result
values_mappings = {
   'cantidad': {
      'name': 'cantidad',
      'type': int
   },
   'costo': {
      'name': 'costo_unitario',
      'type': float
   }
}

# Validate the filtering variables up front: an unknown option would otherwise
# yield None here and only fail later with an unrelated-looking TypeError.
for _setting, _choice, _options in (
   ('filtering_time_range', filtering_time_range, date_mappings),
   ('filtering_filter', filtering_filter, filter_mappings),
   ('filtering_values', filtering_values, values_mappings),
):
   if _choice not in _options:
      raise ValueError(
         f'{_setting}={_choice!r} is not valid; choose one of {list(_options)}'
      )

# Set the variables according to the filtering variables
selected_time = date_mappings[filtering_time_range]
selected_filter = filter_mappings[filtering_filter]
selected_value = values_mappings[filtering_values]

In [None]:
# Bucket rows by 'fecha' at the selected frequency (primary key) ...
time_grouper = pandas.Grouper(key='fecha', freq=selected_time['symbol'], sort=True)
# ... and by the selected filter column (secondary key), then sum the selected value
grouped = filter_data.groupby([time_grouper, selected_filter])
data_group = grouped[selected_value['name']].sum()
data_group

In [None]:
# Pivot the innermost index level of the grouped series into columns,
# turning the series into a dataframe indexed by the remaining level
data_filtered = data_group.unstack(level=-1)
data_filtered

In [None]:
# Replace missing combinations with 0, then cast every value to the configured type
data_filtered = data_filtered.fillna(0).astype(selected_value['type'])
# Turn the datetime index into text labels using the configured format
data_filtered.index = data_filtered.index.strftime(selected_time['format'])
data_filtered

In [None]:
# Export filtered_data

# CSV export
data_filtered.to_csv(filtering_save_path, date_format=selected_time['format'])

# JSON export next to the CSV. Only the file extension is swapped, so a '.csv'
# elsewhere in the path is left alone and a path without the '.csv' suffix
# still yields a separate '.json' file instead of overwriting the CSV.
json_save_path = os.path.splitext(filtering_save_path)[0] + '.json'
# No date_format here: to_json only supports 'epoch' or 'iso', not strftime
# patterns, and the index has already been converted to text labels.
data_filtered.to_json(json_save_path, indent=3)

## **Show Data**

Data plot with pandas and matplotlib.