## Generate Data
Generate data following the example: https://gurobi-machinelearning.readthedocs.io/en/stable/auto_examples/example4_price_optimization.html#sphx-glr-auto-examples-example4-price-optimization-py

In [1]:
import pandas as pd
import numpy as np

### 0. Root repo

In [2]:
import os

# Make the parent of the start-up directory the working directory, so that the
# relative output paths used later resolve from there (assumed to be the
# repository root - the notebook is expected to live one level below it).
# The guard keeps this cell idempotent: without it, every re-run in the same
# kernel would climb one more directory level.
if "root_path" not in globals():
    actual_path = os.path.abspath(os.getcwd())
    # os.path.dirname handles the platform's path separator and the filesystem
    # root, unlike splitting the path on '/' and re-joining it by hand.
    root_path = os.path.dirname(actual_path)
    os.chdir(root_path)
print('root path: ', root_path)

root path:  /Users/joseortega/Documents/GitHub/gurobi-ml-optimization-prices


### 1. Download data

In [3]:
# Base location of the two CSV files that make up the dataset.
data_url = "https://raw.githubusercontent.com/Gurobi/modeling-examples/master/price_optimization/"

# Dataset downloaded directly from HAB.
avocado = pd.read_csv(data_url + "HABdata_2019_2022.csv")
# Dataset downloaded from Kaggle.
avocado_old = pd.read_csv(data_url + "kaggledata_till2018.csv")

# The two sources encode dates differently, so each one is parsed with its own format.
for frame, date_format in ((avocado, "%m/%d/%y %H:%M"), (avocado_old, "%m/%d/%y")):
    frame["date"] = pd.to_datetime(frame["date"], format=date_format)

# Stack the two datasets into a single frame (each part keeps its own row labels).
avocado = pd.concat([avocado, avocado_old])
avocado

Unnamed: 0,region,date,type,price,units_sold
0,Great_Lakes,2019-01-07,Conventional,1.106743,3812441.96
1,Great_Lakes,2019-01-07,Organic,1.371280,275987.52
2,Great_Lakes,2019-01-13,Conventional,1.063457,3843318.68
3,Great_Lakes,2019-01-13,Organic,1.493384,244991.95
4,Great_Lakes,2019-01-20,Conventional,1.049931,4587957.69
...,...,...,...,...,...
3703,West,2018-11-18,Organic,1.610000,334096.14
3704,West,2018-11-25,Conventional,1.240000,3260102.17
3705,West,2018-11-25,Organic,1.730000,268362.34
3706,West,2018-12-02,Conventional,1.200000,4594863.86


### 2. Prepare the dataset.
- Agregar índice con el año
- Agregar columna que indica la temporada alta: de febrero a julio (meses 2 a 7)
- Transformar las unidades vendidas a millones de unidades vendidas (units/1000000)
- Elegir solo tipo de palta convencional

In [4]:
# Add a `year` column taken from each row's date, then order the rows chronologically.
avocado = avocado.assign(year=avocado["date"].dt.year).sort_values(by="date")

# Add a `month` column; the peak-season flag is derived from it.
avocado = avocado.assign(month=avocado["date"].dt.month)

# Months treated as "peak season": range(2, 8) covers months 2 through 7
# (February-July). Change this range to redefine the season.
peak_months = range(2, 8)


def peak_season(row):
    """Return 1 when the row's month is one of ``peak_months``, otherwise 0.

    ``row`` must support ``row["month"]`` with a value that ``int`` accepts.
    ``peak_months`` is read from the notebook-level namespace.
    """
    month_number = int(row["month"])
    if month_number in peak_months:
        return 1
    return 0


# Flag peak-season rows with a vectorized membership test instead of a
# row-by-row ``apply``: 1 when the month is in ``peak_months``, otherwise 0.
avocado["peak"] = avocado["month"].isin(list(peak_months)).astype("int64")

# Scale the number of avocados to millions
avocado["units_sold"] = avocado["units_sold"] / 1000000

# Keep only conventional avocados and the selected columns, then renumber
# the rows from 0.
avocado = avocado.loc[
    avocado["type"] == "Conventional",
    ["date", "units_sold", "price", "region", "year", "month", "peak"],
].reset_index(drop=True)

avocado

Unnamed: 0,date,units_sold,price,region,year,month,peak
0,2015-01-04,3.382800,1.020000,Great_Lakes,2015,1,0
1,2015-01-04,2.578275,1.100000,Midsouth,2015,1,0
2,2015-01-04,5.794411,0.890000,West,2015,1,0
3,2015-01-04,3.204112,0.980000,Southeast,2015,1,0
4,2015-01-04,0.321824,1.050000,Northern_New_England,2015,1,0
...,...,...,...,...,...,...,...
3397,2022-05-15,4.150433,1.269883,SouthCentral,2022,5,1
3398,2022-05-15,4.668815,1.644873,Northeast,2022,5,1
3399,2022-05-15,32.745321,1.527357,Total_US,2022,5,1
3400,2022-05-15,3.542902,1.514583,Midsouth,2022,5,1


### 3. Save data

In [5]:
# Persist the prepared dataset as a pickle; the path is relative to the
# current working directory.
path_data = 'artifacts/data/data_raw.pkl'
# to_pickle does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(path_data), exist_ok=True)
avocado.to_pickle(path_data)