# Trabajar con datos sucios

Creación de un _dataset_ para poder ejemplificar los nulos

In [0]:
# Sample sales rows with deliberately missing values (None) and a
# misspelled country ("Espain"), used by the cells below to demonstrate
# null handling. The last row is entirely null.
malasVentas = [
    (6666, "2022-03-22", "03206", 33, 3333.33, "Spain"),
    (6666, "2022-03-22", None, 33, 3333.33, "Spain"),
    (6666, "2022-03-23", "03206", None, 2222.22, "Spain"),
    (6666, "2022-03-24", "03206", None, None, "Espain"),
    (6666, "2022-03-25", None, None, None, "Espain"),
    (None, None, None, None, None, None),
]
# Column names are passed as the schema; column types are inferred by Spark.
malDF = spark.createDataFrame(
    data=malasVentas,
    schema=[
        "ProductID",
        "Date",
        "Zip",
        "Units",
        "Revenue",
        "Country",
    ],
)
malDF.show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
|     6666|2022-03-22| null|   33|3333.33|  Spain|
|     6666|2022-03-23|03206| null|2222.22|  Spain|
|     6666|2022-03-24|03206| null|   null| Espain|
|     6666|2022-03-25| null| null|   null| Espain|
|     null|      null| null| null|   null|   null|
+---------+----------+-----+-----+-------+-------+



#### Filtrado sobre nulos

In [0]:
# Show only the rows whose Zip column is null
malDF.where(malDF["Zip"].isNull()).show()

+---------+----------+----+-----+-------+-------+
|ProductID|      Date| Zip|Units|Revenue|Country|
+---------+----------+----+-----+-------+-------+
|     6666|2022-03-22|null|   33|3333.33|  Spain|
|     6666|2022-03-25|null| null|   null| Espain|
|     null|      null|null| null|   null|   null|
+---------+----------+----+-----+-------+-------+



#### Eliminar filas con nulos

In [0]:
# New DataFrame without the rows that contain at least one null value.
# The DataFrame is stored first and displayed in a separate step: show()
# only prints and returns None, so chaining it into the assignment would
# leave sinNulosDF set to None instead of the cleaned DataFrame.
sinNulosDF = malDF.na.drop()
sinNulosDF.show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
+---------+----------+-----+-----+-------+-------+



In [0]:
# New DataFrame without the rows in which every column is null
# ("all" drops a row only when all of its values are null).
# The DataFrame is stored first and displayed in a separate step: show()
# only prints and returns None, so chaining it into the assignment would
# leave todosNulosDF set to None instead of the filtered DataFrame.
todosNulosDF = malDF.na.drop("all")
todosNulosDF.show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
|     6666|2022-03-22| null|   33|3333.33|  Spain|
|     6666|2022-03-23|03206| null|2222.22|  Spain|
|     6666|2022-03-24|03206| null|   null| Espain|
|     6666|2022-03-25| null| null|   null| Espain|
+---------+----------+-----+-----+-------+-------+



In [0]:
# New DataFrame without the rows that contain at least one null value.
# "any" is the default mode, so omitting it gives the same result.
algunNuloDF = malDF.dropna(how="any")
algunNuloDF.show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
+---------+----------+-----+-----+-------+-------+



In [0]:
# Drop the rows whose Revenue column is null; nulls in other columns are kept
malDF.dropna(subset=["Revenue"]).show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
|     6666|2022-03-22| null|   33|3333.33|  Spain|
|     6666|2022-03-23|03206| null|2222.22|  Spain|
+---------+----------+-----+-----+-------+-------+



#### Rellenar los nulos

In [0]:
# Fill the missing zip codes with the placeholder 99999
malDF.na.fill(value="99999", subset=["Zip"]).show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
|     6666|2022-03-22|99999|   33|3333.33|  Spain|
|     6666|2022-03-23|03206| null|2222.22|  Spain|
|     6666|2022-03-24|03206| null|   null| Espain|
|     6666|2022-03-25|99999| null|   null| Espain|
|     null|      null|99999| null|   null|   null|
+---------+----------+-----+-----+-------+-------+



In [0]:
# Another way to fill null values: fillna() with a {column: value} dict.
# Revenue is a numeric column, so the replacement is given as a float
# rather than as a string that Spark would have to cast implicitly.
sinRevenueDF = malDF.fillna({"Revenue": 100.0})
sinRevenueDF.show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
|     6666|2022-03-22| null|   33|3333.33|  Spain|
|     6666|2022-03-23|03206| null|2222.22|  Spain|
|     6666|2022-03-24|03206| null|  100.0| Espain|
|     6666|2022-03-25| null| null|  100.0| Espain|
|     null|      null| null| null|  100.0|   null|
+---------+----------+-----+-----+-------+-------+



In [0]:
# Replace the misspelled value "Espain" with "Spain"
malDF.replace(to_replace="Espain", value="Spain").show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
|     6666|2022-03-22| null|   33|3333.33|  Spain|
|     6666|2022-03-23|03206| null|2222.22|  Spain|
|     6666|2022-03-24|03206| null|   null|  Spain|
|     6666|2022-03-25| null| null|   null|  Spain|
|     null|      null| null| null|   null|   null|
+---------+----------+-----+-----+-------+-------+



#### Transformar valores de columnas

In [0]:
# Es necesario importar las funciones trim y col
from pyspark.sql.functions import col, trim

In [0]:
# Strip leading and trailing whitespace from the values of the
# Country and Zip columns
sinBlancosDF = (
    malDF
    .withColumn("Country", trim(col("Country")))
    .withColumn("Zip", trim(col("Zip")))
)
sinBlancosDF.show()

+---------+----------+-----+-----+-------+-------+
|ProductID|      Date|  Zip|Units|Revenue|Country|
+---------+----------+-----+-----+-------+-------+
|     6666|2022-03-22|03206|   33|3333.33|  Spain|
|     6666|2022-03-22| null|   33|3333.33|  Spain|
|     6666|2022-03-23|03206| null|2222.22|  Spain|
|     6666|2022-03-24|03206| null|   null| Espain|
|     6666|2022-03-25| null| null|   null| Espain|
|     null|      null| null| null|   null|   null|
+---------+----------+-----+-----+-------+-------+



In [0]:
# Overwrite Revenue with its value divided by 1000; null revenues stay null
malDF.withColumn("Revenue", malDF["Revenue"] / 1000).show()

+---------+----------+-----+-----+------------------+-------+
|ProductID|      Date|  Zip|Units|           Revenue|Country|
+---------+----------+-----+-----+------------------+-------+
|     6666|2022-03-22|03206|   33|           3.33333|  Spain|
|     6666|2022-03-22| null|   33|           3.33333|  Spain|
|     6666|2022-03-23|03206| null|2.2222199999999996|  Spain|
|     6666|2022-03-24|03206| null|              null| Espain|
|     6666|2022-03-25| null| null|              null| Espain|
|     null|      null| null| null|              null|   null|
+---------+----------+-----+-----+------------------+-------+

