In [5]:
import pandas as pd
import numpy as np

# In your Jupyter notebook
import sys
sys.path.append('../../../src')  # Add src to Python path if needed

from utils import DataFrameUtils

In [6]:
# Load the raw flash-offers CSV; the relative path is resolved against the
# notebook's working directory.
df_offer = pd.read_csv(
    filepath_or_buffer='../../../data/00_raw/ofertas_relampago.csv',
)

Siguiendo el análisis del notebook 0:
- Convertir las columnas de tiempo al tipo datetime
- Eliminar la columna OFFER_TYPE
- Añadir la categoría 'Otros' a la columna ORIGIN para los valores nulos

In [7]:
# Parse the three offer time columns into pandas datetimes, one column at a time.
for time_col in ('OFFER_START_DATE', 'OFFER_START_DTTM', 'OFFER_FINISH_DTTM'):
    df_offer[time_col] = pd.to_datetime(df_offer[time_col])

In [8]:
# Drop OFFER_TYPE by rebinding instead of mutating in place; errors='ignore'
# keeps this cell safe to re-run once the column is already gone (the in-place
# version raised KeyError on a second execution).
df_offer = df_offer.drop(columns='OFFER_TYPE', errors='ignore')

In [9]:
# Replace null ORIGIN values with the explicit 'Otros' label.
df_offer['ORIGIN'] = df_offer['ORIGIN'].fillna('Otros')

En teoría, las ofertas relámpago son ofertas que duran algunas horas. Vamos a crear variables de tiempo como:
- Horas que el producto estuvo en promoción
- Día de la semana en que empezó la oferta
- Mes en que empezó la oferta, si existe más de un mes

In [10]:
# Preview three random rows; the fixed seed keeps the displayed sample stable
# across kernel restarts and re-runs.
df_offer.sample(3, random_state=42)

Unnamed: 0,OFFER_START_DATE,OFFER_START_DTTM,OFFER_FINISH_DTTM,INVOLVED_STOCK,REMAINING_STOCK_AFTER_END,SOLD_AMOUNT,SOLD_QUANTITY,ORIGIN,SHIPPING_PAYMENT_TYPE,DOM_DOMAIN_AGG1,VERTICAL,DOMAIN_ID
11215,2021-07-27,2021-07-27 19:00:00+00:00,2021-07-28 00:14:18+00:00,5,0,15.32,5.0,Otros,none,SPORTS,APP & SPORTS,MLM-SPORT_SHORTS
17521,2021-07-24,2021-07-24 19:00:00+00:00,2021-07-25 01:00:01+00:00,10,10,,,Otros,none,HOME&DECOR,HOME & INDUSTRY,MLM-CHRISTMAS_LIGHTS
265,2021-06-22,2021-06-22 07:00:00+00:00,2021-06-22 13:00:00+00:00,5,3,7.34,2.0,Otros,none,STATIONARY,HOME & INDUSTRY,MLM-ADHESIVE_TAPES


In [11]:
def compute_duration_hours(df, start_col, end_col):
    """
    Return the elapsed time, in hours, between two datetime columns.

    Args:
        df (pd.DataFrame): Frame holding both timestamp columns.
        start_col (str): Name of the column with the start timestamp.
        end_col (str): Name of the column with the end timestamp.

    Returns:
        pd.Series: ``df[end_col] - df[start_col]`` expressed as float hours,
        one value per row.

    Raises:
        TypeError: If either column does not have a datetime dtype. The start
            column is checked before the end column.
    """
    seconds_per_hour = 3600

    # Validate in (start, end) order so the first offending column is the one reported.
    for col in (start_col, end_col):
        if not pd.api.types.is_datetime64_any_dtype(df[col]):
            raise TypeError(f"La columna '{col}' debe ser de tipo datetime.")

    elapsed = df[end_col] - df[start_col]
    return elapsed.dt.total_seconds() / seconds_per_hour

In [12]:
# Offer length between start and finish timestamps, via the helper imported from `utils`.
# NOTE(review): a function with the same name is also defined in this notebook;
# this call uses the DataFrameUtils version — confirm the two stay in sync.
df_offer['DURATION_HOURS'] = DataFrameUtils.compute_duration_hours(
    df_offer,
    'OFFER_START_DTTM',
    'OFFER_FINISH_DTTM',
)

In [13]:
# Preview three random rows including DURATION_HOURS; the fixed seed keeps the
# displayed sample stable across kernel restarts and re-runs.
df_offer.sample(3, random_state=42)

Unnamed: 0,OFFER_START_DATE,OFFER_START_DTTM,OFFER_FINISH_DTTM,INVOLVED_STOCK,REMAINING_STOCK_AFTER_END,SOLD_AMOUNT,SOLD_QUANTITY,ORIGIN,SHIPPING_PAYMENT_TYPE,DOM_DOMAIN_AGG1,VERTICAL,DOMAIN_ID,DURATION_HOURS
38353,2021-07-29,2021-07-29 07:00:00+00:00,2021-07-29 13:00:05+00:00,5,3,11.34,2.0,Otros,free_shipping,TOOLS AND CONSTRUCTION,HOME & INDUSTRY,MLM-SHOWER_HEADS,6.001389
22301,2021-07-28,2021-07-28 19:00:00+00:00,2021-07-29 01:00:01+00:00,5,5,,,Otros,free_shipping,COMPUTERS,CE,MLM-COMPUTER_EQUIPMENT_AND_SPARE_PARTS,6.000278
46037,2021-07-26,2021-07-26 19:00:00+00:00,2021-07-27 01:00:03+00:00,15,10,12.4,5.0,Otros,none,COMPUTERS,CE,MLM-AUDIO_AND_VIDEO_CABLES_AND_ADAPTERS,6.000833


In [14]:
# Calendar features taken from the offer start timestamp.
# Day of week follows the pandas convention: Monday=0 ... Sunday=6.
# NOTE(review): values are derived from the timestamp as stored (the previews
# show a +00:00 offset) — confirm whether local time is wanted instead.
df_offer['START_DAY_OF_WEEK'] = df_offer['OFFER_START_DTTM'].dt.dayofweek
df_offer['START_MONTH'] = df_offer['OFFER_START_DTTM'].dt.month

La función `dt.day_of_week` de pandas empieza con lunes=0 y termina con domingo=6.

In [15]:
# Preview three random rows including the calendar features; the fixed seed
# keeps the displayed sample stable across kernel restarts and re-runs.
df_offer.sample(3, random_state=42)

Unnamed: 0,OFFER_START_DATE,OFFER_START_DTTM,OFFER_FINISH_DTTM,INVOLVED_STOCK,REMAINING_STOCK_AFTER_END,SOLD_AMOUNT,SOLD_QUANTITY,ORIGIN,SHIPPING_PAYMENT_TYPE,DOM_DOMAIN_AGG1,VERTICAL,DOMAIN_ID,DURATION_HOURS,START_DAY_OF_WEEK,START_MONTH
42994,2021-07-02,2021-07-02 15:00:00+00:00,2021-07-02 23:00:00+00:00,46,46,,,A,none,HOME&DECOR,HOME & INDUSTRY,MLM-TABLE_AND_DESK_LAMPS,8.0,4,7
25616,2021-07-18,2021-07-18 19:00:00+00:00,2021-07-19 01:50:41+00:00,8,8,,,Otros,free_shipping,SPORTS,APP & SPORTS,MLM-LEGGINGS,6.844722,6,7
23879,2021-07-31,2021-07-31 19:00:00+00:00,2021-08-01 01:00:03+00:00,15,12,6.58,3.0,Otros,none,COMPUTERS,CE,MLM-COMPUTER_MICE,6.000833,5,7


**Vamos a crear columnas relacionadas con el valor monetario, como:**
- Cantidad vendida por hora
- Valor vendido por hora

Nota: todavía tendremos que agregar los datos y después volver a hacer estos cálculos, dado que nuestro dataset está a nivel de línea de venta y necesitamos entender el comportamiento de los grupos de ofertas.

In [16]:
# Per-hour sales rates for each offer row.
# A zero or negative duration would make the division return +/-inf or a
# negative rate, which would then be persisted and distort later aggregates,
# so non-positive durations are masked to NaN before dividing.
# Rows with missing SOLD_QUANTITY / SOLD_AMOUNT still produce NaN, as before.
valid_duration_hours = df_offer['DURATION_HOURS'].where(df_offer['DURATION_HOURS'] > 0)
df_offer['QUANTITY_PER_HOUR'] = df_offer['SOLD_QUANTITY'] / valid_duration_hours
df_offer['AMOUNT_PER_HOUR'] = df_offer['SOLD_AMOUNT'] / valid_duration_hours

In [17]:
# Preview three random rows including the per-hour rates; the fixed seed keeps
# the displayed sample stable across kernel restarts and re-runs.
df_offer.sample(3, random_state=42)

Unnamed: 0,OFFER_START_DATE,OFFER_START_DTTM,OFFER_FINISH_DTTM,INVOLVED_STOCK,REMAINING_STOCK_AFTER_END,SOLD_AMOUNT,SOLD_QUANTITY,ORIGIN,SHIPPING_PAYMENT_TYPE,DOM_DOMAIN_AGG1,VERTICAL,DOMAIN_ID,DURATION_HOURS,START_DAY_OF_WEEK,START_MONTH,QUANTITY_PER_HOUR,AMOUNT_PER_HOUR
27462,2021-06-24,2021-06-24 19:00:00+00:00,2021-06-25 01:00:02+00:00,15,14,5.06,1.0,Otros,free_shipping,APPAREL ACCESORIES,APP & SPORTS,MLM-RINGS,6.000556,3,6,0.166651,0.843255
46628,2021-07-16,2021-07-16 11:00:00+00:00,2021-07-16 19:00:04+00:00,1000,793,797.02,207.0,A,free_shipping,PHARMACEUTICS,BEAUTY & HEALTH,MLM-SURGICAL_AND_INDUSTRIAL_MASKS,8.001111,4,7,25.871407,99.613665
24129,2021-07-31,2021-07-31 13:00:00+00:00,2021-07-31 20:00:51+00:00,7,0,60.25,5.0,A,free_shipping,SECURITY,HOME & INDUSTRY,MLM-SURVEILLANCE_CAMERAS,7.014167,5,7,0.712843,8.589759


Vamos a guardar los datos y empezar el EDA.

In [18]:
# Persist the engineered frame for the EDA step; the row index is not written.
df_offer.to_parquet(
    path='../../../data/01_processed/ofertas_relampago.parquet',
    index=False,
)