In [96]:
!pip install geopandas

[0m--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1752, in print
    extend(render(renderable, render_options))
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1390, in render
    for render_output in iter_render:
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/segment.py", line 245, in split_lines
    for segment in segments:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1368, in render
    renderable = rich_cast(renderable)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/protocol.py", lin

In [97]:
!hdfs dfs -mkdir /raw

mkdir: `/raw': File exists


In [49]:
!hadoop fs -rm -r /raw/data_raw.parquet

Deleted /raw/data_raw.parquet


In [98]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
import json

# Build (or reuse) the single SparkSession used by the whole notebook.
# Hive support and the warehouse location are static settings: they only take effect
# when the session is first created, so they are declared here rather than in a later
# getOrCreate() call, which would simply return this already-running session.
spark = (
    SparkSession.builder
    .appName("streaming")
    .master("local[*]")
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [99]:
# Schema of a single order event; used for the static (batch) DataFrame.
static_schema = StructType([
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("date", StringType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("employee_id", IntegerType(), True),
    StructField("quantity_products", IntegerType(), True),
    StructField("order_id", StringType(), True)
])

# The streaming parser expects an array of those events; derive it from the single
# definition above so the two schemas cannot drift apart.
schema = ArrayType(static_schema)

In [100]:
# Capture events from the socket source in short bursts and persist them to HDFS.
MAX_POLLS = 30            # number of start/stop cycles of the streaming query
POLL_SECONDS = 2          # how long each query is allowed to run before being stopped
PAUSE_SECONDS = 5         # idle time between cycles
FLUSH_THRESHOLD = 20      # buffered rows above which the buffer is written out
RAW_OUTPUT_PATH = '/raw/data_raw.parquet'

# Buffer of rows collected since the last write.
static_df = spark.createDataFrame([], static_schema)

for n in range(1, MAX_POLLS + 1):
    # Read text lines from the socket and parse each one as a JSON array of events.
    streaming_df = spark.readStream.format("socket").option("host", "localhost").option("port", "8000").load()
    json_df = streaming_df.select(from_json(col("value"), schema).alias("data"))
    json_df = json_df.selectExpr("explode(data) as dict").select("dict.*")

    # The memory sink exposes the received rows as the temporary table "socketData".
    writing_df = json_df.writeStream.format("memory").queryName("socketData").outputMode("update").start()
    try:
        static_stream_df = spark.sql("SELECT * FROM socketData")
        static_df = static_df.union(static_stream_df)
        writing_df.awaitTermination(POLL_SECONDS)
    finally:
        # Always stop the query: a query left running keeps the name "socketData"
        # in use and makes the next start() fail.
        writing_df.stop()

    # Write in batches instead of once per cycle.
    if static_df.count() > FLUSH_THRESHOLD:
        static_df.write.mode('append').parquet(RAW_OUTPUT_PATH)
        static_df = spark.createDataFrame([], static_schema)
    time.sleep(PAUSE_SECONDS)

# Flush whatever is still buffered; otherwise the rows collected after the last
# write (up to FLUSH_THRESHOLD of them) would be silently dropped.
if static_df.count() > 0:
    static_df.write.mode('append').parquet(RAW_OUTPUT_PATH)
    static_df = spark.createDataFrame([], static_schema)

In [101]:
# Re-read everything persisted so far and report the total number of rows.
persisted_events = spark.read.parquet("/raw/data_raw.parquet")
print("Tabla estatica:", persisted_events.count())

# Show the rows sorted by order_id in descending order, without truncating columns.
df = persisted_events.sort(desc("order_id"))
df.show(truncate=False)

Tabla estatica: 132
+------------------+------------------+-------------------+-----------+-----------+-----------------+------------------------------------+
|latitude          |longitude         |date               |customer_id|employee_id|quantity_products|order_id                            |
+------------------+------------------+-------------------+-----------+-----------+-----------------+------------------------------------+
|6.25793920439463  |-75.52284945246791|19/06/2024 20:52:47|8647       |1473       |92               |d8b9b417-b098-4344-b137-362894e4dcep|
|6.183064659371221 |-75.67390941700788|19/06/2024 04:48:54|2354       |9435       |60               |d8b9b417-b098-4344-b137-362894e4dcep|
|6.198929982040268 |-75.68510755820814|19/06/2024 20:52:47|8505       |2470       |147              |d8b9b417-b098-4344-b137-362894e4dceo|
|6.230269129471986 |-75.6407396896264 |19/06/2024 04:48:54|3340       |2232       |89               |d8b9b417-b098-4344-b137-362894e4dceo|
|6.3184

In [102]:
# Local parquet datasets (file:// URIs) that are copied into the raw zone.
_local_base = 'file:///workspace/base.data'
_dataset_names = ('50001', 'customers', 'employees', 'medellin_neighborhoods')
paths = [f'{_local_base}/{name}.parquet' for name in _dataset_names]

# HDFS directory that receives the copies.
hdfs_destination = "hdfs:///raw/"


Función para mover archivos a HDFS

In [103]:
def move_to_hdfs(file_paths, hdfs_dest):
    """Copy parquet datasets into an HDFS directory.

    Each path is read with the notebook-level ``spark`` session and rewritten under
    ``hdfs_dest``, using the last component of the source path as the target name.
    Despite the name, the source data is not removed.

    Args:
        file_paths: Iterable of parquet paths/URIs readable by Spark.
        hdfs_dest: Destination directory URI; a trailing "/" is optional.

    Raises:
        ValueError: If a path has no final component to use as the target name.
    """
    # Normalise once so "hdfs:///raw" and "hdfs:///raw/" produce the same targets.
    dest_dir = hdfs_dest if hdfs_dest.endswith("/") else hdfs_dest + "/"

    for path in file_paths:
        # Strip trailing slashes first; otherwise the name would be empty and the
        # overwrite below would target the destination directory itself.
        file_name = path.rstrip("/").split("/")[-1]
        if not file_name:
            raise ValueError(f"Cannot derive a file name from path: {path!r}")

        # Read the source dataset and write it to HDFS, replacing any previous copy.
        df = spark.read.parquet(path)
        df.write.parquet(dest_dir + file_name, mode="overwrite")

        print(f"Archivo {file_name} movido a {hdfs_dest}")

# Copy the datasets listed in `paths` into the HDFS destination directory.
move_to_hdfs(paths, hdfs_destination)


Archivo 50001.parquet movido a hdfs:///raw/
Archivo customers.parquet movido a hdfs:///raw/
Archivo employees.parquet movido a hdfs:///raw/
Archivo medellin_neighborhoods.parquet movido a hdfs:///raw/


Listar Archivos en HDFS desde Python

In [104]:
from subprocess import Popen, PIPE

# Command that lists the contents of /raw in HDFS.
command = ['hadoop', 'fs', '-ls', '/raw']

# Run the command, capturing both output streams as bytes.
process = Popen(command, stdout=PIPE, stderr=PIPE)
stdout, stderr = process.communicate()

# Surface failures instead of silently printing an empty listing.
if process.returncode != 0:
    print(f"'{' '.join(command)}' exited with code {process.returncode}:")
    print(stderr.decode(errors="replace"))

# Decode the listing and display it.
output = stdout.decode()
print(output)


Found 5 items
drwxr-xr-x   - root supergroup          0 2024-06-19 20:56 /raw/50001.parquet
drwxr-xr-x   - root supergroup          0 2024-06-19 20:56 /raw/customers.parquet
drwxr-xr-x   - root supergroup          0 2024-06-19 20:52 /raw/data_raw.parquet
drwxr-xr-x   - root supergroup          0 2024-06-19 20:56 /raw/employees.parquet
drwxr-xr-x   - root supergroup          0 2024-06-19 20:56 /raw/medellin_neighborhoods.parquet



In [105]:
# Base location of the raw zone in HDFS.
hdfs_base_path = 'hdfs://localhost:9000/raw'

# Load every raw parquet dataset, in the listed order, and bind each to its own name.
_raw_names = ('50001', 'customers', 'data_raw', 'employees', 'medellin_neighborhoods')
(
    df_50001,
    df_customers,
    df_data_raw,
    df_employees,
    df_neighborhoods,
) = [spark.read.parquet(f"{hdfs_base_path}/{name}.parquet") for name in _raw_names]


In [106]:
# Preview the first 10 rows of the raw events.
df_data_raw.show(10)


+------------------+------------------+-------------------+-----------+-----------+-----------------+--------------------+
|          latitude|         longitude|               date|customer_id|employee_id|quantity_products|            order_id|
+------------------+------------------+-------------------+-----------+-----------+-----------------+--------------------+
| 6.264198325828029|-75.70341889601379|19/06/2024 20:51:02|       7041|       9438|              135|d8b9b417-b098-434...|
| 6.186075326766365| -75.5543462599129|19/06/2024 20:51:55|       9869|       9438|              104|d8b9b417-b098-434...|
|6.3031184264160895| -75.6061881263135|19/06/2024 20:50:01|       6121|       1679|               78|d8b9b417-b098-434...|
| 6.213752084841632| -75.6560596963381|19/06/2024 04:45:56|       5370|       6659|              179|d8b9b417-b098-434...|
| 6.256734161883117|-75.65165960270286|19/06/2024 04:46:48|       5670|       1737|               44|d8b9b417-b098-434...|
|6.2937101019925

In [107]:
# Display df_50001 using show()'s default row limit and truncation.
df_50001.show()

+--------+----------+----------+----------+----------+--------------------+
|DPTOMPIO|DPTO_CCDGO|MPIO_CCDGO|MPIO_CNMBR|MPIO_CCNCT|            geometry|
+--------+----------+----------+----------+----------+--------------------+
|   05001|        05|       001|  MEDELLÍN|     05001|[01 03 00 00 00 0...|
+--------+----------+----------+----------+----------+--------------------+



In [108]:
# Preview the first 5 rows of the neighborhoods dataset.
df_neighborhoods.show(5)

+--------+------+----------+--------------+-----------------+---------------------------+--------------+----------------+------------------+--------------------+
|OBJECTID|CODIGO|    NOMBRE|IDENTIFICACION|LIMITEMUNICIPIOID|SUBTIPO_COMUNACORREGIMIENTO|LINK_DOCUMENTO|       SHAPEAREA|          SHAPELEN|            geometry|
+--------+------+----------+--------------+-----------------+---------------------------+--------------+----------------+------------------+--------------------+
|     321|    01|   POPULAR|      COMUNA 1|              001|                          1|          null|3098289.60257159| 9604.987826371042|[01 03 00 00 00 0...|
|     322|    02|SANTA CRUZ|      COMUNA 2|              001|                          1|          null|2195874.52580248| 8597.714448746181|[01 03 00 00 00 0...|
|     323|    03|  MANRIQUE|      COMUNA 3|              001|                          1|          null|5096746.29132065|  12078.2371083362|[01 03 00 00 00 0...|
|     324|    04|  ARANJUEZ|

In [109]:
# Preview the first 5 rows of the employees dataset.
# NOTE(review): the output contains names, phone numbers and e-mail addresses;
# confirm it may be shared before committing the rendered notebook.
df_employees.show(5)

+-----------+----------------+--------------+--------------------+--------------------+---------+
|employee_id|            name|         phone|               email|             address|comission|
+-----------+----------------+--------------+--------------------+--------------------+---------+
|       3830|Shaeleigh Turner|1-382-217-5724|pellentesque.ultr...| Ap #497-3659 Eu St.|     0.06|
|       8362|  Catherine King|1-721-878-1085|  sed@localstack.com|Ap #897-2636 Enim...|     0.07|
|       6696|    Patricia Cox|1-265-643-2312|imperdiet.erat.no...|Ap #775-1599 Sed Av.|     0.04|
|       1482|   Elijah Parker|1-960-392-6387|blandit.congue@lo...|P.O. Box 351, 382...|     0.13|
|       9435|    Ryan Nichols|1-746-416-6687|porttitor.tellus....|P.O. Box 829, 407...|     0.18|
+-----------+----------------+--------------+--------------------+--------------------+---------+
only showing top 5 rows



In [110]:
# Print the schema of the raw events DataFrame
df_data_raw.printSchema()

# Count the events
event_count = df_data_raw.count()
# Report the number of events
print(f"La cantidad de eventos es: {event_count}")

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- quantity_products: integer (nullable = true)
 |-- order_id: string (nullable = true)

La cantidad de eventos es: 132


In [111]:
# Prepare a session to store the bronze tables through Hive.
# NOTE(review): a SparkSession was already created earlier in this notebook, so
# getOrCreate() presumably returns that existing session; confirm that
# enableHiveSupport() and spark.sql.warehouse.dir actually take effect here, since
# both are normally fixed when the session is first created.
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark SQL Hive Integration") \
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()

In [112]:
# Try to raise the IOPub data rate limit so large outputs are not cut off.
# NOTE(review): this stores the value in the 'notebook' config section through
# ConfigManager; iopub_data_rate_limit is usually a server option given at launch
# (e.g. --ServerApp.iopub_data_rate_limit=...), so confirm this cell has any effect.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {
    'ServerApp': {
        'iopub_data_rate_limit': 10000000
    }
})


In [113]:
# Create the bronze database if it does not exist
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

DataFrame[]

In [114]:
# Save the raw events as table almacenamiento_bronze in the bronze database,
# replacing the table if it already exists
df_data_raw.write.mode('overwrite').saveAsTable('bronze.almacenamiento_bronze')

In [115]:
# Check that the table was stored by querying it and showing rows without truncation
spark.sql("SELECT * FROM bronze.almacenamiento_bronze").show(truncate=False)

+------------------+------------------+-------------------+-----------+-----------+-----------------+------------------------------------+
|latitude          |longitude         |date               |customer_id|employee_id|quantity_products|order_id                            |
+------------------+------------------+-------------------+-----------+-----------+-----------------+------------------------------------+
|6.207646785033583 |-75.67205276992796|19/06/2024 04:48:11|3118       |3455       |47               |d8b9b417-b098-4344-b137-362894e4dcea|
|6.194005736634873 |-75.55700426663965|19/06/2024 20:51:17|9678       |3830       |24               |d8b9b417-b098-4344-b137-362894e4dcdd|
|6.260501418879362 |-75.53895438134626|19/06/2024 20:52:10|7459       |1561       |90               |d8b9b417-b098-4344-b137-362894e4dcea|
|6.272821397970896 |-75.70678988842589|19/06/2024 20:50:16|1215       |9435       |111              |d8b9b417-b098-4344-b137-362894e4dcci|
|6.352380414515474 |-75.672

In [116]:
# Create the bronze database if it does not exist
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")
# Save the municipality data as table bronze.municipio (replacing any existing table)
df_50001.write.mode('overwrite').saveAsTable('bronze.municipio')
# Verification query for bronze.municipio, left disabled
#spark.sql("SELECT * FROM bronze.municipio").show(truncate=False, n=20)


In [117]:
# Create the bronze database if it does not exist
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")
# Save the neighborhoods data as table bronze.neighborhoods (replacing any existing table)
df_neighborhoods.write.mode('overwrite').saveAsTable('bronze.neighborhoods')
# Check the stored table: show 5 rows of a subset of its columns
spark.sql("SELECT CODIGO, NOMBRE, IDENTIFICACION, LIMITEMUNICIPIOID,SUBTIPO_COMUNACORREGIMIENTO  FROM bronze.neighborhoods").show(truncate=False, n=5)


+------+----------+--------------+-----------------+---------------------------+
|CODIGO|NOMBRE    |IDENTIFICACION|LIMITEMUNICIPIOID|SUBTIPO_COMUNACORREGIMIENTO|
+------+----------+--------------+-----------------+---------------------------+
|01    |POPULAR   |COMUNA 1      |001              |1                          |
|02    |SANTA CRUZ|COMUNA 2      |001              |1                          |
|03    |MANRIQUE  |COMUNA 3      |001              |1                          |
|04    |ARANJUEZ  |COMUNA 4      |001              |1                          |
|05    |CASTILLA  |COMUNA 5      |001              |1                          |
+------+----------+--------------+-----------------+---------------------------+
only showing top 5 rows



In [118]:
# Create the bronze database if it does not exist
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")
# Save the customers data as table bronze.customers (replacing any existing table)
df_customers.write.mode('overwrite').saveAsTable('bronze.customers')
# Check the stored table by showing its rows without truncation.
# NOTE(review): the output contains names, phone numbers, e-mails and addresses;
# confirm it may be shared before committing the rendered notebook.
spark.sql("SELECT * FROM bronze.customers").show(truncate=False)

+-----------+--------------------+--------------+------------------------------+-------------------------------+
|customer_id|name                |phone         |email                         |address                        |
+-----------+--------------------+--------------+------------------------------+-------------------------------+
|4758       |Callie Reyes        |1-765-410-5785|magnis.dis@protonmail.ca      |151-4553 Interdum Road         |
|5379       |Elizabeth Washington|1-955-634-5542|vel@google.edu                |4063 Nunc St.                  |
|8111       |Hasad Wright        |1-324-830-5595|sed.auctor@aol.org            |Ap #625-8512 Non Rd.           |
|9258       |Kirk Watts          |1-578-784-1146|laoreet.ipsum@protonmail.org  |211-2213 Pede St.              |
|9142       |Cally Robbins       |1-887-472-0478|at.augue.id@google.com        |Ap #287-6324 A, Av.            |
|5041       |Benedict Underwood  |1-138-146-9856|ante.ipsum@yahoo.ca           |2927 Velit Rd.  

In [119]:
# Create the bronze database if it does not exist
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")
# Save the employees data as table bronze.employees (replacing any existing table)
df_employees.write.mode('overwrite').saveAsTable('bronze.employees')
# Check the stored table by showing its rows without truncation.
# NOTE(review): the output contains names, phone numbers, e-mails and addresses;
# confirm it may be shared before committing the rendered notebook.
spark.sql("SELECT * FROM bronze.employees").show(truncate=False)

+-----------+-----------------+--------------+--------------------------------------+-----------------------------+---------+
|employee_id|name             |phone         |email                                 |address                      |comission|
+-----------+-----------------+--------------+--------------------------------------+-----------------------------+---------+
|3830       |Shaeleigh Turner |1-382-217-5724|pellentesque.ultricies@localstack.com |Ap #497-3659 Eu St.          |0.06     |
|8362       |Catherine King   |1-721-878-1085|sed@localstack.com                    |Ap #897-2636 Enim Av.        |0.07     |
|6696       |Patricia Cox     |1-265-643-2312|imperdiet.erat.nonummy@localstack.com |Ap #775-1599 Sed Av.         |0.04     |
|1482       |Elijah Parker    |1-960-392-6387|blandit.congue@localstack.com         |P.O. Box 351, 3827 Dolor. Ave|0.13     |
|9435       |Ryan Nichols     |1-746-416-6687|porttitor.tellus.non@localstack.com   |P.O. Box 829, 4074 Et Rd.    |0.1

# Capa Bronze: las tablas quedaron registradas en Hive, en la base de datos `bronze`: (1) almacenamiento_bronze, (2) municipio, (3) neighborhoods, (4) customers y (5) employees.

## Capa Silver

In [120]:
# List the tables in the bronze database
tables_bronze = spark.sql("SHOW TABLES IN bronze")
tables_bronze.show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
|  bronze|almacenamiento_br...|      false|
|  bronze|           customers|      false|
|  bronze|           employees|      false|
|  bronze|           municipio|      false|
|  bronze|       neighborhoods|      false|
|        |          socketdata|       true|
+--------+--------------------+-----------+



In [121]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, to_timestamp, month, dayofmonth, hour, minute, second, date_format


In [122]:
# Obtain the session used for the bronze-to-silver stage.
# NOTE(review): a session already exists at this point, so getOrCreate() presumably
# reuses it and the new appName may not be applied — confirm.
spark = SparkSession.builder \
    .appName("Transformación Bronze a Silver") \
    .getOrCreate()


In [123]:
# HDFS directory that holds the raw parquet datasets read by the silver stage.
bronze_path_hdfs = 'hdfs://localhost:9000/raw'


In [124]:
# Load the raw event dataset from HDFS.
df_bronze = spark.read.parquet(bronze_path_hdfs + '/data_raw.parquet')


In [137]:
# Inspect the input of the silver stage: schema first, then 10 untruncated rows.
df_bronze.printSchema()
df_bronze.show(10, truncate=False)


root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- quantity_products: integer (nullable = true)
 |-- order_id: string (nullable = true)

+------------------+------------------+-------------------+-----------+-----------+-----------------+------------------------------------+
|latitude          |longitude         |date               |customer_id|employee_id|quantity_products|order_id                            |
+------------------+------------------+-------------------+-----------+-----------+-----------------+------------------------------------+
|6.264198325828029 |-75.70341889601379|19/06/2024 20:51:02|7041       |9438       |135              |d8b9b417-b098-4344-b137-362894e4dccx|
|6.186075326766365 |-75.5543462599129 |19/06/2024 20:51:55|9869       |9438       |104              |d8b9b417-b098-4344-b137-362894e4dcdu|
|6.3

In [138]:
!hdfs dfs -mkdir /silver

mkdir: `/silver': File exists


# Definir la ruta base en HDFS para la capa Silver

In [139]:
# Base HDFS location for the silver layer outputs.
silver_path_hdfs = "hdfs://localhost:9000/silver"

In [140]:
# Build the silver DataFrame: parse the raw "date" string (dd/MM/yyyy HH:mm:ss) into a
# timestamp, derive its calendar/time parts plus a ddMMyyyy partition key, and drop
# the original string column. All other columns are carried over unchanged.
df_silver = (
    df_bronze
    .withColumn("event_date", to_timestamp(col("date"), "dd/MM/yyyy HH:mm:ss"))
    .withColumn("partition_date", date_format(col("event_date"), "ddMMyyyy"))
    .withColumn("event_day", dayofmonth(col("event_date")))
    .withColumn("event_hour", hour(col("event_date")))
    .withColumn("event_minute", minute(col("event_date")))
    .withColumn("event_month", month(col("event_date")))
    .withColumn("event_second", second(col("event_date")))
    .withColumn("event_year", year(col("event_date")))
    .drop("date")
)

# Show the resulting schema and the first 10 rows
df_silver.printSchema()
df_silver.show(10, truncate=False)

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- quantity_products: integer (nullable = true)
 |-- order_id: string (nullable = true)
 |-- event_date: timestamp (nullable = true)
 |-- partition_date: string (nullable = true)
 |-- event_day: integer (nullable = true)
 |-- event_hour: integer (nullable = true)
 |-- event_minute: integer (nullable = true)
 |-- event_month: integer (nullable = true)
 |-- event_second: integer (nullable = true)
 |-- event_year: integer (nullable = true)

+------------------+------------------+-----------+-----------+-----------------+------------------------------------+-------------------+--------------+---------+----------+------------+-----------+------------+----------+
|latitude          |longitude         |customer_id|employee_id|quantity_products|order_id                            |event_date         |partition_date|event

In [141]:
# Save df_silver as parquet under the silver HDFS path, replacing any previous output.
# NOTE(review): df_silver carries a partition_date column but this write is not
# partitioned by it — confirm whether partitionBy("partition_date") was intended.
df_silver.write.mode("overwrite").parquet(silver_path_hdfs + "/df_silver")

In [142]:
# Create the silver database if it does not exist.
# The statement returns a DataFrame without columns, so show() prints an empty table.
tables_silver = spark.sql("CREATE DATABASE IF NOT EXISTS silver")
tables_silver.show()

++
||
++
++



In [143]:
# Create the silver database if it does not exist
spark.sql("CREATE DATABASE IF NOT EXISTS silver")
# Save df_silver as table silver.silver (replacing any existing table)
df_silver.write.mode('overwrite').saveAsTable('silver.silver')
# Check the stored table by showing its rows without truncation
spark.sql("SELECT * FROM silver.silver").show(truncate=False)

+------------------+------------------+-----------+-----------+-----------------+------------------------------------+-------------------+--------------+---------+----------+------------+-----------+------------+----------+
|latitude          |longitude         |customer_id|employee_id|quantity_products|order_id                            |event_date         |partition_date|event_day|event_hour|event_minute|event_month|event_second|event_year|
+------------------+------------------+-----------+-----------+-----------------+------------------------------------+-------------------+--------------+---------+----------+------------+-----------+------------+----------+
|6.205638533075326 |-75.62882804888227|2561       |5668       |-2               |d8b9b417-b098-4344-b137-362894e4dcdv|2024-06-19 20:51:55|19062024      |19       |20        |51          |6          |55          |2024      |
|6.230532427919653 |-75.50037567966193|9526       |2232       |161              |d8b9b417-b098-4344-b137

In [136]:
#df_silver = df_silver.join(df_communes, on=[df_silver.latitude == df_communes.latitude, df_silver.longitude == df_communes.longitude], how="left") \
#    .select(df_silver["*"], df_communes["commune"], df_communes["neighborhood"]

# Unión (join) de eventos con clientes

In [45]:
# Load the bronze tables under the names used by the join below, so the cell does not
# depend on variables left over from an earlier kernel session.
almacenamiento_bronze = spark.table("bronze.almacenamiento_bronze")
customers = spark.table("bronze.customers")

# Left-join events with customers: every event is kept and gains the customer's name.
df_eventos = almacenamiento_bronze.join(
    customers,
    almacenamiento_bronze.customer_id == customers.customer_id,
    "left"
).select(
    almacenamiento_bronze["*"],  # all event columns
    customers["name"].alias("customer_name")  # customer name from the lookup table
)

In [46]:
# Mostrar el DataFrame
#df_eventos.show(truncate=False, n=5)

+-----------------+------------------+-------------------+-----------+-----------+-----------------+------------------------------------+--------------+
|latitude         |longitude         |date               |customer_id|employee_id|quantity_products|order_id                            |customer_name |
+-----------------+------------------+-------------------+-----------+-----------+-----------------+------------------------------------+--------------+
|6.287253619436304|-75.59859567560713|18/06/2024 21:23:57|9723       |3830       |386              |d8b9b417-b098-4344-b137-362894e4dcel|Owen Davenport|
|6.203530513155968|-75.63432984287904|18/06/2024 21:22:55|9059       |1561       |593              |d8b9b417-b098-4344-b137-362894e4dccl|Garrett Booth |
|6.225420074041011|-75.65419144934765|18/06/2024 21:24:20|3770       |3455       |636              |d8b9b417-b098-4344-b137-362894e4dcfl|Carol Santiago|
|6.238686550450639|-75.56815898930817|18/06/2024 21:26:22|9595       |1737       |

In [47]:
# Print the schema of the joined events DataFrame.
df_eventos.printSchema()

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- quantity_products: integer (nullable = true)
 |-- order_id: string (nullable = true)
 |-- customer_name: string (nullable = true)

