In [5]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DecimalType, DateType, IntegerType

# Configuraci√≥n de Spark con conectores de Cassandra y ClickHouse
spark = SparkSession.builder \
    .appName("ETL_Cassandra_Spark_ClickHouse") \
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.5.0,com.clickhouse:clickhouse-jdbc:0.5.0") \
    .config("spark.cassandra.connection.host", "cassandra") \
    .config("spark.cassandra.connection.localDC", "dc1") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

In [6]:
print("--- 1. Leyendo datos de Cassandra (ventas_db.ventas_crudas) ---")
start_read = time.time()

df_raw = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="ventas_crudas", keyspace="ventas_db") \
    .load()

count_raw = df_raw.count()
end_read = time.time()

print(f"‚úÖ Lectura completada en {end_read - start_read:.2f} segundos")
print(f"Total de registros crudos: {count_raw}")
df_raw.printSchema()
df_raw.show(5)

--- 1. Leyendo datos de Cassandra (ventas_db.ventas_crudas) ---
‚úÖ Lectura completada en 0.98 segundos
Total de registros crudos: 280012
root
 |-- fecha_venta: timestamp (nullable = false)
 |-- categoria: string (nullable = true)
 |-- id_cliente: string (nullable = true)
 |-- id_producto: string (nullable = true)
 |-- id_venta: string (nullable = true)
 |-- monto_total: decimal(38,18) (nullable = true)

+-------------------+-----------+----------+-----------+--------------------+--------------------+
|        fecha_venta|  categoria|id_cliente|id_producto|            id_venta|         monto_total|
+-------------------+-----------+----------+-----------+--------------------+--------------------+
|2024-02-06 10:37:00|Electronica|   CLI-189|   PROD-918|63f71c34-fd47-44b...|414.6400000000000...|
|2024-12-14 02:35:00|  Alimentos|   CLI-192|   PROD-804|6920e5a7-3109-45c...|702.9500000000000...|
|2024-03-08 06:09:00|Electronica|   CLI-487|   PROD-466|5f9e5a47-59b4-457...|1459.320000000000...

In [7]:
print("--- 2. Transformando datos (Agregaci√≥n por Fecha y Categor√≠a) ---")
start_transform = time.time()

# Transformaci√≥n: Casting, GroupBy, Aggregation
df_aggregated = df_raw \
    .withColumn("fecha_dia", col("fecha_venta").cast(DateType())) \
    .groupBy("fecha_dia", "categoria") \
    .agg(
        sum("monto_total").alias("ventas_totales"),
        count("id_venta").alias("cantidad_transacciones")
    )

df_result = df_aggregated.select(
    col("fecha_dia").alias("fecha_venta"),
    col("categoria"),
    col("ventas_totales").cast(DecimalType(18, 2)),
    col("cantidad_transacciones").cast(IntegerType())
)

# Forzamos una acci√≥n para medir el tiempo real de transformaci√≥n (Spark es lazy)
count_result = df_result.count()
end_transform = time.time()

print(f"‚úÖ Transformaci√≥n completada en {end_transform - start_transform:.2f} segundos")
print(f"Total de filas agregadas: {count_result}")
df_result.show(5)

--- 2. Transformando datos (Agregaci√≥n por Fecha y Categor√≠a) ---
‚úÖ Transformaci√≥n completada en 1.56 segundos
Total de filas agregadas: 1830
+-----------+-----------+--------------+----------------------+
|fecha_venta|  categoria|ventas_totales|cantidad_transacciones|
+-----------+-----------+--------------+----------------------+
| 2024-07-21|  Alimentos|      99253.84|                   130|
| 2024-10-30|   Deportes|     127066.25|                   161|
| 2024-01-16|Electronica|     111586.49|                   154|
| 2024-07-02|      Hogar|     112151.44|                   144|
| 2024-01-22|      Hogar|     110991.60|                   146|
+-----------+-----------+--------------+----------------------+
only showing top 5 rows



In [8]:
print("--- 3. Escribiendo en ClickHouse (dw_analitico.ventas_resumen) ---")
start_write = time.time()

jdbc_url = "jdbc:clickhouse://clickhouse:8123/dw_analitico"
properties = {
    "driver": "com.clickhouse.jdbc.ClickHouseDriver"
}

try:
    df_result.write \
        .mode("append") \
        .jdbc(url=jdbc_url, table="ventas_resumen", properties=properties)
    
    end_write = time.time()
    print(f"‚úÖ Carga en ClickHouse exitosa en {end_write - start_write:.2f} segundos")
except Exception as e:
    print(f"‚ùå Error al escribir en ClickHouse: {e}")

--- 3. Escribiendo en ClickHouse (dw_analitico.ventas_resumen) ---
‚úÖ Carga en ClickHouse exitosa en 1.49 segundos


In [None]:
print("--- Resumen de M√©tricas de Rendimiento ---")
print(f"1. Lectura (Cassandra):    {end_read - start_read:.2f} s")
print(f"2. Transformaci√≥n (Spark): {end_transform - start_transform:.2f} s")
print(f"3. Carga (ClickHouse):     {end_write - start_write:.2f} s")
print(f"-------------------------------------------")
print(f"Tiempo Total ETL:          {(end_read - start_read) + (end_transform - start_transform) + (end_write - start_write):.2f} s")

--- üìä Resumen de M√©tricas de Rendimiento ---
1. Lectura (Cassandra):    0.98 s
2. Transformaci√≥n (Spark): 1.56 s
3. Carga (ClickHouse):     1.49 s
-------------------------------------------
Tiempo Total ETL:          4.03 s
