In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# --- CONFIGURACI√ìN DEL DATASET ---
num_rows = 1_500_000
num_users = 160_000 
start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 12, 31)

# Diccionarios de dimensiones con pesos para realismo
countries = {'MEX': 0.3, 'BRA': 0.25, 'COL': 0.15, 'CHL': 0.1, 'ARG': 0.1, 'PER': 0.1}
devices = {'Mobile': 0.7, 'Desktop': 0.25, 'Tablet': 0.05}
categories = ['Tech', 'Fashion', 'Home', 'Beauty']
sources = ['organic', 'paid_ad', 'social', 'email', 'affiliate']
payments = ['Credit Card', 'Debit Card', 'PayPal', 'Mercado Pago']

data = []

# Pre-generar usuarios para mantener consistencia de pa√≠s y dispositivo
print("Creando base de usuarios...")
user_pool = []
for i in range(num_users):
    u_id = f"USR-{random.randint(100000, 999999)}"
    country = random.choices(list(countries.keys()), weights=list(countries.values()))[0]
    device = random.choices(list(devices.keys()), weights=list(devices.values()))[0]
    user_pool.append((u_id, country, device))

print(f"Generando {num_rows} registros de eventos...")

while len(data) < num_rows:
    user_id, country, device = random.choice(user_pool)
    session_id = f"SES-{random.getrandbits(32)}"
    source = random.choice(sources)
    category = random.choice(categories)
    sku = f"SKU-{category[:2].upper()}-{random.randint(1000, 9999)}"
    
    # L√ìGICA DE FUNNEL: Pesos para profundidad de la sesi√≥n (Abandono progresivo)
    # 1:home, 2:page_view, 3:scroll, 4:click, 5:add_to_cart, 6:checkout, 7:purchase
    depth = random.choices(range(1, 8), weights=[25, 20, 15, 12, 10, 10, 8])[0]
    stages = ['home_page', 'page_view', 'scroll', 'click_product', 'add_to_cart', 'begin_checkout', 'purchase']
    
    # Timestamp inicial de la sesi√≥n
    curr_time = start_date + timedelta(seconds=random.randint(0, int((end_date - start_date).total_seconds())))
    cart_id = None

    for i in range(depth):
        event = stages[i]
        
        # Estructura base de la fila
        row = {
            'user_id': user_id,
            'session_id': session_id,
            'event_timestamp': curr_time,
            'event_name': event,
            'country': country,
            'device': device,
            'traffic_source': source,
            'category': None,
            'sku': None,
            'cart_id': None,
            'units': None,
            'amount': None,
            'payment_method': None
        }

        # L√≥gica de relleno seg√∫n el evento
        if event in ['page_view', 'click_product', 'add_to_cart', 'begin_checkout', 'purchase']:
            row['category'] = category
            row['sku'] = sku
            
        if event >= 'add_to_cart':
            if cart_id is None: cart_id = f"CRT-{random.getrandbits(24)}"
            row['cart_id'] = cart_id
            
        if event == 'add_to_cart':
            row['units'] = random.randint(1, 3)
            
        if event == 'purchase':
            row['units'] = random.randint(1, 3)
            row['amount'] = round(row['units'] * random.uniform(15.0, 350.0), 2)
            row['payment_method'] = random.choice(payments)

        data.append(row)
        
        # El tiempo avanza entre 20 segundos y 5 minutos por cada paso del usuario
        curr_time += timedelta(seconds=random.randint(20, 300))
        
        if len(data) >= num_rows: break

# Crear DataFrame
df = pd.DataFrame(data)

# --- INYECCI√ìN DE ERRORES PARA LIMPIEZA EN CLASE ---
# 1. Valores nulos en pa√≠s y fuente de tr√°fico (3%)
mask_nulls = df.sample(frac=0.03).index
df.loc[mask_nulls, ['country', 'traffic_source']] = np.nan

# 2. Registros con montos en cero (para limpiar en an√°lisis de ventas)
mask_zeros = df[df['event_name'] == 'purchase'].sample(frac=0.05).index
df.loc[mask_zeros, 'amount'] = 0.0

# Exportar
df.to_csv('data_clase_sql_final.csv', index=False)

print("-" * 30)
print(f"Dataset exitoso: 'data_clase_sql_final.csv'")
print(f"Total filas: {len(df)}")
print(f"Usuarios √∫nicos: {df['user_id'].nunique()}")
print(f"Sesiones √∫nicas: {df['session_id'].nunique()}")
print("-" * 30)

Creando base de usuarios...
Generando 1500000 registros de eventos...
------------------------------
Dataset exitoso: 'data_clase_sql_final.csv'
Total filas: 1500000
Usuarios √∫nicos: 139016
Sesiones √∫nicas: 463095
------------------------------


In [11]:
df.sort_values(by=['user_id', 'event_timestamp'], ascending=True).head(10)

Unnamed: 0,user_id,session_id,event_timestamp,event_name,country,device,traffic_source,category,sku,cart_id,units,amount,payment_method
1432591,USR-100006,SES-868924841,2024-10-19 00:36:08,home_page,COL,Mobile,social,,,CRT-2460177,,,
1351640,USR-100006,SES-2683765444,2024-10-24 05:09:14,home_page,COL,Mobile,email,,,CRT-14637485,,,
1004787,USR-100006,SES-744110330,2025-01-30 15:08:01,home_page,COL,Mobile,paid_ad,,,CRT-2614225,,,
1004788,USR-100006,SES-744110330,2025-01-30 15:08:42,page_view,COL,Mobile,paid_ad,Home,SKU-HO-5161,CRT-2614225,,,
1004789,USR-100006,SES-744110330,2025-01-30 15:10:17,scroll,COL,Mobile,paid_ad,,,CRT-2614225,,,
1004790,USR-100006,SES-744110330,2025-01-30 15:14:29,click_product,COL,Mobile,paid_ad,Home,SKU-HO-5161,CRT-2614225,,,
484582,USR-100008,SES-484045906,2024-02-07 06:16:41,home_page,MEX,Mobile,social,,,CRT-2645505,,,
416032,USR-100008,SES-1209410973,2024-08-20 11:36:52,home_page,MEX,Mobile,email,,,CRT-3718464,,,
416033,USR-100008,SES-1209410973,2024-08-20 11:38:46,page_view,MEX,Mobile,email,Beauty,SKU-BE-1451,CRT-3718464,,,
544767,USR-100008,SES-2142417534,2024-08-27 17:30:02,home_page,MEX,Mobile,paid_ad,,,CRT-8750576,,,


In [12]:
import pandas as pd
import sqlite3
import time
from tqdm.notebook import tqdm
from IPython.core.magic import register_cell_magic
from IPython.display import display, HTML, clear_output

# 1. Configuraci√≥n de enlaces (URL LFS Directa)
datasets = {
    'ecommerce_events': "https://media.githubusercontent.com/media/hector1994/e_commerce_funnel_data_generator/refs/heads/master/data_clase_sql_final.csv"
}

# 2. Conexi√≥n a la base de datos en memoria
# Aumentamos el timeout para procesar el gran volumen de datos
connector = sqlite3.connect(':memory:', check_same_thread=False)

# 3. Proceso de carga con Barra de Progreso
print("‚¨áÔ∏èüóÇÔ∏è Descargando y sincronizando Dataset Masivo (1.5M registros).....üîÑ‚öôÔ∏è")
summary_data = []

for name, url in tqdm(datasets.items(), desc="Cargando Tablas"):
    # Descarga de datos
    df = pd.read_csv(url)

    # --- CAMBIO IMPORTANTE: DEFINICI√ìN DE FECHAS ---
    # Convertimos la columna de timestamp a formato datetime real
    if 'event_timestamp' in df.columns:
        df['event_timestamp'] = pd.to_datetime(df['event_timestamp'])

    # Conversi√≥n a SQL (con chunksize para optimizar memoria)
    df.to_sql(name, connector, index=False, if_exists='replace', chunksize=10000)

    # Guardar info para el resumen
    summary_data.append({
        "Table Name": f"{name}",
        "Rows": f"{len(df):,}",
        "Columns": len(df.columns)
    })

# 4. Definici√≥n de la "Palabra m√°gica" %%sql
@register_cell_magic
def sql(line, cell):
    try:
        # Ejecuci√≥n de la consulta
        resultado = pd.read_sql(cell, connector)
        clear_output(wait=True)
        display(HTML("<b style='color: #4CAF50;'>‚úÖ Query completed successfully:</b>"))
        return display(resultado)
    except Exception as e:
        clear_output(wait=True)
        display(HTML(f"<b style='color: #F44336;'>‚ùå Query execution failed:</b><br><code style='color: grey;'>{str(e)}</code>"))

# 5. Interfaz final para el alumno
clear_output()
display(HTML("<h2 style='color: #8e62f3'>‚ú® Database initialized - E-commerce Funnel Data ‚ú®</h2>"))
display(HTML("<p>The 1.5M rows dataset has been loaded. You can now use <b>ecommerce_events</b> in your queries:</p>"))

# Mostramos el resumen
display(pd.DataFrame(summary_data))
print("\n ‚ú®üöÄ SYSTEM READY! START ANALYZING THE FUNNEL! üöÄ‚ú®")

Unnamed: 0,Table Name,Rows,Columns
0,ecommerce_events,1500000,13



 ‚ú®üöÄ SYSTEM READY! START ANALYZING THE FUNNEL! üöÄ‚ú®


In [15]:
%%sql
SELECT COUNT(DISTINCT user_id) FROM ecommerce_events 

Unnamed: 0,COUNT(DISTINCT user_id)
0,139016
