In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw flash-offer dataset; the path is relative to this notebook's folder.
df_offer = pd.read_csv(
    filepath_or_buffer='../../../data/00_raw/ofertas_relampago.csv',
)

Siguiendo el análisis del notebook 0:
- Convertir las columnas de fecha/hora al tipo datetime
- Eliminar la columna OFFER_TYPE
- Añadir la categoría 'Otros' para los valores nulos de la columna ORIGIN

In [3]:
# Parse every date/time column that was read from the CSV as plain text.
for datetime_col in ('OFFER_START_DATE', 'OFFER_START_DTTM', 'OFFER_FINISH_DTTM'):
    df_offer[datetime_col] = pd.to_datetime(df_offer[datetime_col])

In [4]:
# Drop OFFER_TYPE; reassigning keeps the cell free of in-place mutation.
df_offer = df_offer.drop(columns=['OFFER_TYPE'])

In [5]:
# Missing ORIGIN values get their own explicit category.
df_offer['ORIGIN'] = df_offer['ORIGIN'].fillna('Otros')

En teoría, las ofertas relámpago son ofertas que duran algunas horas. Vamos a crear variables de tiempo como:
- Horas que el producto estuvo en promoción
- Día de la semana en que empezó la oferta
- Mes en que empezó la oferta, si existe más de un mes

In [6]:
# Spot-check a few random rows after the cleaning steps.
df_offer.sample(n=3)

Unnamed: 0,OFFER_START_DATE,OFFER_START_DTTM,OFFER_FINISH_DTTM,INVOLVED_STOCK,REMAINING_STOCK_AFTER_END,SOLD_AMOUNT,SOLD_QUANTITY,ORIGIN,SHIPPING_PAYMENT_TYPE,DOM_DOMAIN_AGG1,VERTICAL,DOMAIN_ID
25579,2021-07-18,2021-07-18 13:00:00+00:00,2021-07-18 18:09:03+00:00,5,0,85.59,5.0,Otros,free_shipping,APPAREL ACCESORIES,APP & SPORTS,MLM-WRISTWATCHES
40225,2021-06-01,2021-06-01 19:00:00+00:00,2021-06-02 01:00:39+00:00,5,5,,,Otros,free_shipping,HOME&DECOR,HOME & INDUSTRY,MLM-DINING_CHAIRS
1710,2021-07-08,2021-07-08 16:00:00+00:00,2021-07-09 00:00:00+00:00,15,15,,,A,free_shipping,TOYS AND GAMES,T & B,MLM-BOARD_GAMES


In [7]:
def compute_duration_hours(df, start_col, end_col):
    """
    Compute the elapsed time, in hours, between two datetime columns of a DataFrame.

    Parameters:
        df (pd.DataFrame): DataFrame holding both datetime columns.
        start_col (str): Name of the column with the start date/time.
        end_col (str): Name of the column with the end date/time.

    Returns:
        pd.Series: Duration in hours (float) for each row, computed as
        ``end_col - start_col``.

    Raises:
        TypeError: If either column does not have a datetime dtype.
    """
    # Validate the start column first, then the end column, so the first
    # offending column is the one reported.
    for col_name in (start_col, end_col):
        if not pd.api.types.is_datetime64_any_dtype(df[col_name]):
            raise TypeError(f"La columna '{col_name}' debe ser de tipo datetime.")

    elapsed = df[end_col] - df[start_col]
    seconds_per_hour = 3600
    return elapsed.dt.total_seconds() / seconds_per_hour

In [8]:
# Offer length in hours, measured from start timestamp to finish timestamp.
df_offer['DURATION_HOURS'] = compute_duration_hours(
    df=df_offer,
    start_col='OFFER_START_DTTM',
    end_col='OFFER_FINISH_DTTM',
)

In [9]:
# Spot-check a few random rows to confirm DURATION_HOURS looks sensible.
df_offer.sample(n=3)

Unnamed: 0,OFFER_START_DATE,OFFER_START_DTTM,OFFER_FINISH_DTTM,INVOLVED_STOCK,REMAINING_STOCK_AFTER_END,SOLD_AMOUNT,SOLD_QUANTITY,ORIGIN,SHIPPING_PAYMENT_TYPE,DOM_DOMAIN_AGG1,VERTICAL,DOMAIN_ID,DURATION_HOURS
36951,2021-06-18,2021-06-18 13:00:00+00:00,2021-06-18 19:00:00+00:00,5,5,,,Otros,free_shipping,ELECTRONICS,CE,MLM-ELECTRONIC_PRODUCTS,6.0
8067,2021-07-07,2021-07-07 19:00:00+00:00,2021-07-08 01:00:00+00:00,15,13,18.12,2.0,Otros,free_shipping,TOOLS AND CONSTRUCTION,HOME & INDUSTRY,MLM-AIR_COMPRESSORS,6.0
44365,2021-06-29,2021-06-29 07:00:00+00:00,2021-06-29 13:00:03+00:00,5,5,,,Otros,free_shipping,STATIONARY,HOME & INDUSTRY,MLM-ADHESIVE_TAPES,6.000833


In [10]:
# Calendar features derived from the offer start timestamp.
# NOTE(review): the displayed timestamps carry a +00:00 offset, so weekday and
# month are taken in UTC rather than local time — confirm that is intended.
df_offer = df_offer.assign(
    START_DAY_OF_WEEK=lambda offers: offers['OFFER_START_DTTM'].dt.day_of_week,
    START_MONTH=lambda offers: offers['OFFER_START_DTTM'].dt.month,
)

El atributo de pandas `dt.day_of_week` empieza con lunes=0 y termina con domingo=6.

In [11]:
# Spot-check a few random rows to confirm the new calendar columns.
df_offer.sample(n=3)

Unnamed: 0,OFFER_START_DATE,OFFER_START_DTTM,OFFER_FINISH_DTTM,INVOLVED_STOCK,REMAINING_STOCK_AFTER_END,SOLD_AMOUNT,SOLD_QUANTITY,ORIGIN,SHIPPING_PAYMENT_TYPE,DOM_DOMAIN_AGG1,VERTICAL,DOMAIN_ID,DURATION_HOURS,START_DAY_OF_WEEK,START_MONTH
2257,2021-06-02,2021-06-02 07:00:00+00:00,2021-06-02 07:00:09+00:00,5,5,,,Otros,free_shipping,MOBILE,CE,MLM-SMARTWATCHES,0.0025,2,6
33954,2021-06-28,2021-06-28 15:00:00+00:00,2021-06-28 23:00:00+00:00,20,20,,,A,free_shipping,HOME&DECOR,HOME & INDUSTRY,MLM-MATTRESS_COVERS,8.0,0,6
26286,2021-06-17,2021-06-17 19:00:00+00:00,2021-06-18 03:00:00+00:00,8,8,,,A,none,PETS FOOD,CPG,MLM-CATS_AND_DOGS_FOODS,8.0,3,6


<b>Vamos a crear columnas relacionadas con el valor monetario, como:</b>
- Cantidad vendida por hora
- Monto vendido por hora

Nota: aún tendremos que agregar los datos y recalcular estas métricas después, dado que nuestros datos están a nivel de línea de venta y necesitamos entender el comportamiento de los grupos de ofertas.

In [12]:
# Hourly sales rates. A rate is undefined when the offer lasted zero (or
# negative) time: dividing by such a duration would write +/-inf or a negative
# rate into the features. Mask non-positive durations to NaN so those rows
# yield NaN instead. Rows with no sales (NaN SOLD_*) remain NaN as before.
positive_duration_hours = df_offer['DURATION_HOURS'].where(df_offer['DURATION_HOURS'] > 0)
df_offer['QUANTITY_PER_HOUR'] = df_offer['SOLD_QUANTITY'] / positive_duration_hours
df_offer['AMOUNT_PER_HOUR'] = df_offer['SOLD_AMOUNT'] / positive_duration_hours

In [13]:
# Spot-check a few random rows to confirm the per-hour rate columns.
df_offer.sample(n=3)

Unnamed: 0,OFFER_START_DATE,OFFER_START_DTTM,OFFER_FINISH_DTTM,INVOLVED_STOCK,REMAINING_STOCK_AFTER_END,SOLD_AMOUNT,SOLD_QUANTITY,ORIGIN,SHIPPING_PAYMENT_TYPE,DOM_DOMAIN_AGG1,VERTICAL,DOMAIN_ID,DURATION_HOURS,START_DAY_OF_WEEK,START_MONTH,QUANTITY_PER_HOUR,AMOUNT_PER_HOUR
19714,2021-07-14,2021-07-14 13:00:00+00:00,2021-07-14 21:00:00+00:00,40,40,,,A,free_shipping,ELECTRONICS,CE,MLM-TELEVISIONS,8.0,2,7,,
14918,2021-07-09,2021-07-09 07:00:00+00:00,2021-07-09 13:00:07+00:00,15,15,,,Otros,free_shipping,TOOLS AND CONSTRUCTION,HOME & INDUSTRY,MLM-COMBUSTION_CHAINSAWS,6.001944,4,7,,
7838,2021-07-07,2021-07-07 13:00:00+00:00,2021-07-07 20:00:01+00:00,132,123,52.97,9.0,A,free_shipping,APPAREL,APP & SPORTS,MLM-JACKETS_AND_COATS,7.000278,2,7,1.285663,7.566843


Vamos a guardar los datos procesados y empezar el EDA.

In [14]:
# Persist the processed frame for the EDA notebook; the row index is not written.
df_offer.to_parquet(
    path='../../../data/01_processed/ofertas_relampago.parquet',
    index=False,
)