### Importar Librerías

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, TimestampType
from pyspark.sql.functions import to_timestamp

### Crear una sesión en PySpark

In [2]:
# Start (or reuse) the Spark session that the rest of the notebook reaches
# through the `spark` name.
spark = (
    SparkSession.builder
    .appName("CargarDatosNYC")
    .getOrCreate()
)

### Definir las variables y sus tipos

In [3]:
# Explicit schema for the CSV columns, in file order. Every field is declared
# nullable (third argument True). The two datetime columns are declared as
# plain strings at read time.
schema = (
    StructType()
    .add("VendorID", IntegerType(), True)
    .add("tpep_pickup_datetime", StringType(), True)
    .add("tpep_dropoff_datetime", StringType(), True)
    .add("passenger_count", IntegerType(), True)
    .add("trip_distance", DoubleType(), True)
    .add("RatecodeID", IntegerType(), True)
    .add("store_and_fwd_flag", StringType(), True)
    .add("PULocationID", IntegerType(), True)
    .add("DOLocationID", IntegerType(), True)
    .add("payment_type", IntegerType(), True)
    .add("fare_amount", DoubleType(), True)
    .add("extra", DoubleType(), True)
    .add("mta_tax", DoubleType(), True)
    .add("tip_amount", DoubleType(), True)
    .add("tolls_amount", DoubleType(), True)
    .add("improvement_surcharge", DoubleType(), True)
    .add("total_amount", DoubleType(), True)
    .add("congestion_surcharge", DoubleType(), True)
)

### Leer los CSVs con los campos y tipos definidos

In [4]:
# Load every CSV matched by the 2019 glob using the explicit schema.
# With header=True and a user-supplied schema, Spark's default
# (enforceSchema=True) ignores the header row and applies the schema purely by
# position. enforceSchema=False makes Spark validate each file's header names
# against the schema (position by position), so a file whose column layout
# differs fails loudly instead of being loaded with misaligned columns.
df = spark.read.csv(
    "../data/raw/yellow_tripdata_2019-*.csv",
    header=True,
    schema=schema,
    enforceSchema=False,
)

### Convertir a timestamp

In [5]:
# Replace the two string datetime columns with timestamp columns of the same
# name. No format is passed, so to_timestamp uses its default parsing rules.
df = (
    df
    .withColumn("tpep_pickup_datetime", to_timestamp("tpep_pickup_datetime"))
    .withColumn("tpep_dropoff_datetime", to_timestamp("tpep_dropoff_datetime"))
)

### Guardar en Parquet

In [6]:
# Persist the DataFrame as Parquet; "overwrite" replaces any existing output
# at the target path.
df.write.parquet("../data/processed/yellow_2019_raw.parquet", mode="overwrite")

# Confirmation message shown once the write call has returned.
print("✅ Datos cargados y guardados como Parquet")

✅ Datos cargados y guardados como Parquet


### Cerrar Spark

In [7]:
# Stop the Spark session; `spark` cannot be used by later cells after this.
spark.stop()