In [52]:
from dotenv import load_dotenv
import os

# Esto carga las variables desde tu archivo .env en el directorio actual
load_dotenv()

credencialesSnowflakeRaw = {
    "sfURL" : os.getenv("SNOWFLAKE_ACCOUNT"),
    "sfUser" :  os.getenv("SNOWFLAKE_USER"),
    "sfPassword" : os.getenv("SNOWFLAKE_PASSWORD"),
    "sfDatabase" : os.getenv("SNOWFLAKE_DATABASE"),
    "sfSchema" : os.getenv("SNOWFLAKE_SCHEMA_RAW"),
    "sfWarehouse" : os.getenv("SNOWFLAKE_WAREHOUSE"),
    "sfRole" : os.getenv("SNOWFLAKE_ROLE"),
}

print(f"Estas son mis credenciales para Snowflake: {credencialesSnowflakeRaw}")

Estas son mis credenciales para Snowflake: {'sfURL': 'LSNDJXB-RHC82043.snowflakecomputing.com', 'sfUser': 'usuario_spark', 'sfPassword': 'EstudianteEstudiante64', 'sfDatabase': 'NY_TAXI', 'sfSchema': 'raw', 'sfWarehouse': 'WAREHOUSE_TAXIS', 'sfRole': 'rol_pocos_privilegios'}


In [53]:
import pyspark
from pyspark.sql import SparkSession

# Crear SparkSession para conexión con Snowflake
spark = (SparkSession.builder.appName("IngestaNewYorkTaxis").config("spark.jars.packages", "net.snowflake:snowflake-jdbc:3.13.30,net.snowflake:spark-snowflake_2.12:2.9.0-spark_3.1").config("spark.sql.catalog.snowflake", "org.apache.spark.sql.execution.datasources.v2.snowflake.SnowflakeCatalog").config("spark.sql.catalog.snowflake.sfURL", credencialesSnowflake["sfURL"]).config("spark.sql.catalog.snowflake.sfUser", credencialesSnowflake["sfUser"]).config("spark.sql.catalog.snowflake.sfPassword", credencialesSnowflake["sfPassword"]).config("spark.sql.catalog.snowflake.sfDatabase", credencialesSnowflake["sfDatabase"]).config("spark.sql.catalog.snowflake.sfSchema", credencialesSnowflake["sfSchema"]).config("spark.sql.catalog.snowflake.sfWarehouse", credencialesSnowflake["sfWarehouse"]).config("spark.sql.catalog.snowflake.sfRole", credencialesSnowflake["sfRole"]).getOrCreate())

print(spark)
print("Spark Version : " + spark.version)

<pyspark.sql.session.SparkSession object at 0x72abedd23cd0>
Spark Version : 3.5.0


In [54]:
!pip install python-dotenv



In [65]:
def crear_tabla_raw_taxis(service: str):
    crear_tabla_taxis = f"""
    CREATE TABLE IF NOT EXISTS NY_TAXI_RAW_{service.upper()} (
        VENDORID INT,
        TPEP_PICKUP_DATETIME TIMESTAMP_NTZ,
        TPEP_DROPOFF_DATETIME TIMESTAMP_NTZ,
        PASSENGER_COUNT INT,
        TRIP_DISTANCE FLOAT,
        RATECODEID INT,
        STORE_AND_FWD_FLAG STRING,
        PULOCATIONID INT,
        DOLOCATIONID INT,
        PAYMENT_TYPE INT,
        FARE_AMOUNT FLOAT,
        EXTRA FLOAT,
        MTA_TAX FLOAT,
        TIP_AMOUNT FLOAT,
        TOLLS_AMOUNT FLOAT,
        IMPROVEMENT_SURCHARGE FLOAT,
        TOTAL_AMOUNT FLOAT,
        CONGESTION_SURCHARGE FLOAT,
        AIRPORT_FEE FLOAT,
        RUN_ID STRING,
        SERVICE_TYPE STRING,
        SOURCE_YEAR INT,
        SOURCE_MONTH INT,
        INGESTED_AT_UTC TIMESTAMP_NTZ,
        SOURCE_PATH STRING
    )
    """
    
    #Genero la tabla desde la presente funcion con primary key compuesta para idempotencia
    try:
        spark.sql(crear_tabla_taxis)
        print(f"Tabla NY_TAXI_RAW_{service.upper()} creada correctamente en Snowflake")
    except Exception as e:
        print(f"Fallo la creacion de la tabla de taxis: {e}")
    else:
        print("Tabla NY_TAXI_RAW creada correctamente en Snowflake")

In [66]:
import datetime

#Hago la presente funcion para generar un identificador unico asociado a cada carga de datos para el RUN_ID 
def generar_run_id():
    return datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

In [67]:
import os
import requests
from pyspark.sql.functions import lit, current_timestamp, to_utc_timestamp
from pyspark.sql.types import TimestampType

def ingestar_parquet_a_raw(service: str, year: int, month: int):
    SOURCE_PATH=os.getenv("SOURCE_PATH")
    path_url = f"{SOURCE_PATH}/trip-data/{service}_tripdata_{year}-{month:02d}.parquet"
    local_path = f"/tmp/{service}_tripdata_{year}-{month:02d}.parquet"
    
    # Descargar el archivo parquet en carpeta temporal para ingesta de datos
    try:
        r = requests.get(path_url, stream=True)
        if r.status_code == 200:
            with open(local_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=10000000):
                    f.write(chunk)
        else:
            print(f"Archivo no encontrado en {path_url} (status {r.status_code})")
            return None
    except Exception as e:
        print(f"Error descargando {path_url}: {e}")
        return None
    else:
        print(f"Archivo obtenido existosamente de: {path_url}")
    
    # Leo el archivo parquet localmente
    try:
        df = spark.read.parquet(local_path)
    except Exception as e:
        print(f"No se pudo leer {local_path}: {e}")
        return None
    else:
        print(f"Archivo leído existosamente por Spark: {local_path}")

    run_id = generar_run_id()

    #Elimino nombre de columna presente en pocos archivos
    if 'cbd_congestion_fee' in df.columns:
        df = df.drop('cbd_congestion_fee')

    #Homogenizo nombres de columnas a mayusculas para evitar errores en ingesta
    df = df.toDF(*[c.upper() for c in df.columns])

    # Añadir metadatos
    df_meta = df.withColumn("run_id", lit(run_id)) \
                .withColumn("service_type", lit(service)) \
                .withColumn("source_year", lit(year)) \
                .withColumn("source_month", lit(month)) \
                .withColumn("ingested_at_utc", to_utc_timestamp(current_timestamp(), 'UTC')) \
                .withColumn("source_path", lit(path_url))

    
    for field in df_meta.schema.fields:
        if field.dataType.typeName() == "timestamp_ntz":
            df_meta = df_meta.withColumn(field.name, df_meta[field.name].cast(TimestampType()))

    
    conteoFilas = df_meta.count()
    print(f"Ingestando hacia Snowflake {service} {year}-{month}. Total de filas: {conteoFilas}")

    # Creo una vista temporal para los nuevos datos
    df_meta.createOrReplaceTempView("nueva_data")

    # Ejecuto el Merge para actualizar o insertar los datos en Snowflake y de esa forma asegurar idempotencia
    # Aplico validacion con combinacion de columnas para emular primary key compuesta
    merge_query = f"""
    MERGE INTO NY_TAXI_RAW_{service} AS target
    USING nueva_data AS source
    ON target.VENDORID = source.VENDORID 
       AND target.TPEP_PICKUP_DATETIME = source.TPEP_PICKUP_DATETIME 
       AND target.TPEP_DROPOFF_DATETIME = source.TPEP_DROPOFF_DATETIME
       AND target.RATECODEID = source.RATECODEID
       AND target.PULOCATIONID = source.PULOCATIONID
       AND target.DOLOCATIONID = source.DOLOCATIONID
    WHEN MATCHED THEN
        UPDATE SET
            target.PASSENGER_COUNT = source.PASSENGER_COUNT,
            target.TRIP_DISTANCE = source.TRIP_DISTANCE,
            target.STORE_AND_FWD_FLAG = source.STORE_AND_FWD_FLAG,
            target.PAYMENT_TYPE = source.PAYMENT_TYPE,
            target.FARE_AMOUNT = source.FARE_AMOUNT,
            target.EXTRA = source.EXTRA,
            target.MTA_TAX = source.MTA_TAX,
            target.TIP_AMOUNT = source.TIP_AMOUNT,
            target.TOLLS_AMOUNT = source.TOLLS_AMOUNT,
            target.IMPROVEMENT_SURCHARGE = source.IMPROVEMENT_SURCHARGE,
            target.TOTAL_AMOUNT = source.TOTAL_AMOUNT,
            target.CONGESTION_SURCHARGE = source.CONGESTION_SURCHARGE,
            target.AIRPORT_FEE = source.AIRPORT_FEE,
            target.RUN_ID = source.RUN_ID,
            target.SERVICE_TYPE = source.SERVICE_TYPE,
            target.SOURCE_YEAR = source.SOURCE_YEAR,
            target.SOURCE_MONTH = source.SOURCE_MONTH,
            target.INGESTED_AT_UTC = source.INGESTED_AT_UTC,
            target.SOURCE_PATH = source.SOURCE_PATH
    WHEN NOT MATCHED THEN
        INSERT (
            VENDORID, TPEP_PICKUP_DATETIME, TPEP_DROPOFF_DATETIME, PASSENGER_COUNT, TRIP_DISTANCE, RATECODEID,
            STORE_AND_FWD_FLAG,PULOCATIONID,DOLOCATIONID,PAYMENT_TYPE,FARE_AMOUNT,EXTRA,MTA_TAX,TIP_AMOUNT,
            TOLLS_AMOUNT,IMPROVEMENT_SURCHARGE,TOTAL_AMOUNT,CONGESTION_SURCHARGE,AIRPORT_FEE,RUN_ID,
            SERVICE_TYPE,SOURCE_YEAR,SOURCE_MONTH,INGESTED_AT_UTC,SOURCE_PATH
        ) VALUES (
            source.VENDORID, source.TPEP_PICKUP_DATETIME, source.TPEP_DROPOFF_DATETIME, source.PASSENGER_COUNT, 
            source.TRIP_DISTANCE, source.RATECODEID, source.STORE_AND_FWD_FLAG, source.PULOCATIONID,
            source.DOLOCATIONID, source.PAYMENT_TYPE, source.FARE_AMOUNT, source.EXTRA,
            source.MTA_TAX, source.TIP_AMOUNT, source.TOLLS_AMOUNT, source.IMPROVEMENT_SURCHARGE,
            source.TOTAL_AMOUNT, source.CONGESTION_SURCHARGE, source.AIRPORT_FEE, source.RUN_ID,
            source.SERVICE_TYPE, source.SOURCE_YEAR, source.SOURCE_MONTH, source.INGESTED_AT_UTC, source.SOURCE_PATH
        )
    """
    
    try:
        spark.sql(merge_query)
        print(f"Datos de {service} {year}-{month} actualizados o insertados correctamente.")
    except Exception as e:
        print(f"Error al ejecutar Merge en Snowflake: {e}")
        return None

    try:
        os.remove(local_path)
        print(f"Archivo parquet temporal removido: {local_path}")
    except OSError as e:
        print(f"No se pudo remover el archivo parquet temporal {local_path}: {e}")

    return {
        "year": year,
        "month": month,
        "count": conteoFilas
    }

In [68]:
tipos_taxis=os.getenv("SERVICES").split(',')
lista_years=os.getenv("YEARS").split(',')
lista_months=os.getenv("MONTHS").split(',')
lista_years = [int(item) for item in lista_years]
lista_months = [int(item) for item in lista_months]

print(tipos_taxis)
print(lista_years)
print(lista_months)

['yellow', 'green']
[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [69]:
import json
import os

def save_checkpoint(year, month):
    with open(CHECKPOINT_FILE, "w") as f:
        json.dump({"year": year, "month": month}, f)

def load_checkpoint(CHECKPOINT_FILE):
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r") as f:
            return json.load(f)
    return {"year": 0, "month": 0}

resultadosGeneralesIngesta=[]

try:
    for tipo_taxi in tipos_taxis:
        CHECKPOINT_FILE = f"checkpointTaxis{tipo_taxi.capitalize()}.json"
        crear_tabla_raw_taxis(tipo_taxi)
        checkpoint=load_checkpoint(CHECKPOINT_FILE)
        lista_years= lista_years[lista_years.index(checkpoint["year"]):]
        lista_months= lista_months[lista_months.index(checkpoint["month"])+1:]

        for year_taxi in lista_years:
            
            for month_taxi in lista_months:  
                print(f"Iniciando ingesta de datos de taxis {tipo_taxi}: {month_taxi}-{year_taxi}")
                resultadosParciales=ingest_parquet_to_raw(tipo_taxi, year_taxi, month_taxi)
                resultadosGeneralesIngesta.append(resultadosParciales)
                save_checkpoint(year_taxi,month_taxi)
                
except Exception as e:
    print(f"Fallo el proceso de ingesta masiva de datos de taxis NY: {e}")
else:
    print("El proceso de ingesta masiva de taxis NY fue exitoso")

Fallo la creacion de la tabla de taxis: [NOT_SUPPORTED_COMMAND_WITHOUT_HIVE_SUPPORT] CREATE Hive TABLE (AS SELECT) is not supported, if you want to enable it, please set "spark.sql.catalogImplementation" to "hive".;
'CreateTable `spark_catalog`.`default`.`NY_TAXI_RAW_YELLOW`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Ignore

Fallo el proceso de ingesta masiva de datos de taxis NY: 0 is not in list


In [71]:
import pyspark
from pyspark.sql import SparkSession
import os

# Configuración de credenciales de Snowflake
credencialesSnowflake = {
    "sfURL": os.getenv("SNOWFLAKE_URL", "tu_account.snowflakecomputing.com"),
    "sfUser": os.getenv("SNOWFLAKE_USER", "tu_usuario"),
    "sfPassword": os.getenv("SNOWFLAKE_PASSWORD", "tu_password"),
    "sfDatabase": os.getenv("SNOWFLAKE_DATABASE", "NY_TAXI_DB"),
    "sfSchema": os.getenv("SNOWFLAKE_SCHEMA"),
    "sfWarehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "sfRole": os.getenv("SNOWFLAKE_ROLE", "")
}

# Crear SparkSession para conexión con Snowflake - SIN configurar catálogo
spark = (SparkSession.builder
    .appName("IngestaNewYorkTaxis")
    .config("spark.jars.packages", "net.snowflake:snowflake-jdbc:3.13.30,net.snowflake:spark-snowflake_2.12:2.9.0-spark_3.1")
    # Removemos la configuración del catálogo para evitar conflictos con Hive
    .getOrCreate())

# Configuraciones adicionales de Spark para mejor performance
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")

print(spark)
print("Spark Version: " + spark.version)
print("✓ SparkSession configurada exitosamente")

# Función para verificar la conexión a Snowflake
def verificar_conexion_snowflake():
    """
    Verifica que la conexión a Snowflake funcione correctamente
    """
    try:
        test_query = "SELECT CURRENT_TIMESTAMP() as current_time, CURRENT_VERSION() as version"
        
        df_test = spark.read \
            .format("snowflake") \
            .options(**credencialesSnowflake) \
            .option("query", test_query) \
            .load()
        
        resultado = df_test.collect()[0]
        print(f"✓ Conexión a Snowflake exitosa")
        print(f"  Hora actual en Snowflake: {resultado['current_time']}")
        print(f"  Versión de Snowflake: {resultado['version']}")
        return True
        
    except Exception as e:
        print(f"✗ Error en la conexión a Snowflake: {e}")
        return False

verificar_conexion_snowflake()