# Agregaciones de datos

#### Creación del DataFrame para los ejemplos

In [0]:
# Load the semicolon-separated sales CSV: the first row holds the column names
# and the column types are inferred from the data.
df = spark.read.csv(
    "/FileStore/tables/pdi_sales_small.csv",
    sep=";",
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Units: integer (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Country: string (nullable = true)



#### Importaciones necesarias

In [0]:
# Spark SQL aggregate functions used by the cells below.
# NOTE: importing min, max and sum by name shadows Python's built-ins of the
# same name in the notebook namespace from this cell onwards.
from pyspark.sql.functions import count, count_distinct, approx_count_distinct
from pyspark.sql.functions import min, max
from pyspark.sql.functions import sum, sum_distinct, avg
from pyspark.sql.functions import collect_list, collect_set

#### Contar elementos

In [0]:
# Count the non-null values of the Country column
(
    df
    .select(count("Country"))
    .show()
)

+--------------+
|count(Country)|
+--------------+
|        120239|
+--------------+



In [0]:
# Count the distinct non-null values of Country and of Zip
df.select(
    count_distinct("Country"),
    count_distinct("Zip"),
).show()

+-----------------------+-------------------+
|count(DISTINCT Country)|count(DISTINCT Zip)|
+-----------------------+-------------------+
|                      5|               2585|
+-----------------------+-------------------+



In [0]:
# Approximate count of distinct non-null values. The optional second argument
# (rsd) sets the maximum relative standard deviation allowed; it defaults to 5%.
df.select(
    approx_count_distinct("Country"),
    approx_count_distinct("Zip"),
).show()

+------------------------------+--------------------------+
|approx_count_distinct(Country)|approx_count_distinct(Zip)|
+------------------------------+--------------------------+
|                             5|                      2737|
+------------------------------+--------------------------+



In [0]:
# Same approximate count, allowing a relative standard deviation of 15%
(
    df
    .select(approx_count_distinct("Zip", rsd=0.15))
    .show()
)

+--------------------------+
|approx_count_distinct(Zip)|
+--------------------------+
|                      2195|
+--------------------------+



#### Cálculos

In [0]:
# Smallest and largest value of the Units column
df.select(
    min("Units"),
    max("Units"),
).show()

+----------+----------+
|min(Units)|max(Units)|
+----------+----------+
|         1|        77|
+----------+----------+



In [0]:
# Column totals for Units and Revenue
df.select(
    sum("Units"),
    sum("Revenue"),
).show()

+----------+--------------------+
|sum(Units)|        sum(Revenue)|
+----------+--------------------+
|    125728|5.0107274999986745E7|
+----------+--------------------+



In [0]:
# Totals that add each distinct value only once
df.select(
    sum_distinct("Units"),
    sum_distinct("Revenue"),
).show()

+-------------------+---------------------+
|sum(DISTINCT Units)|sum(DISTINCT Revenue)|
+-------------------+---------------------+
|                308|   1189127.0999999985|
+-------------------+---------------------+



In [0]:
# Arithmetic mean: avg() shown next to the equivalent sum / count ratio
df.select(
    avg("Revenue"),
    sum("Revenue") / count("Revenue"),
).show()

+-----------------+-------------------------------+
|     avg(Revenue)|(sum(Revenue) / count(Revenue))|
+-----------------+-------------------------------+
|416.7306364822291|              416.7306364822291|
+-----------------+-------------------------------+



#### Agrupaciones

In [0]:
# Number of sales per country.
# NOTE(review): the recorded output lists France twice; the Country values
# appear to differ only in whitespace padding - confirm, and trim the column
# before grouping if a single France group is wanted.
(
    df
    .groupBy("Country")
    .count()
    .show()
)

+-------+-----+
|Country|count|
+-------+-----+
|Germany|30059|
|France |30059|
|Canada |30060|
|Mexico |30060|
| France|    1|
+-------+-----+



In [0]:
# Total revenue per country
(
    df
    .groupBy("Country")
    .sum("Revenue")
    .show()
)

+-------+--------------------+
|Country|        sum(Revenue)|
+-------+--------------------+
|Germany|1.4982119999999512E7|
|France |1.2086961900000831E7|
|Canada |1.1642614200001905E7|
|Mexico | 1.139459870000116E7|
| France|               980.2|
+-------+--------------------+



In [0]:
# Several aggregations over one grouping column: total revenue and the number
# of non-null Revenue values per country
(
    df
    .groupBy("Country")
    .agg(
        sum("Revenue"),
        count("Revenue"),
    )
    .show()
)

+-------+--------------------+--------------+
|Country|        sum(Revenue)|count(Revenue)|
+-------+--------------------+--------------+
|Germany|1.4982119999999512E7|         30059|
|France |1.2086961900000831E7|         30059|
|Canada |1.1642614200001905E7|         30060|
|Mexico | 1.139459870000116E7|         30060|
| France|               980.2|             1|
+-------+--------------------+--------------+



In [0]:
# A different aggregation per column, given as a {column: aggregate name} mapping
df.groupBy("Country").agg(
    {
        "Zip": "count",
        "Revenue": "avg",
    }
).show()

+-------+----------+------------------+
|Country|count(Zip)|      avg(Revenue)|
+-------+----------+------------------+
|Germany|     30059| 498.4237665923521|
|France |     30059| 402.1079177617629|
|Canada |     30060|387.31251497012323|
|Mexico |     30060| 379.0618330007039|
| France|         1|             980.2|
+-------+----------+------------------+



#### Colecciones de un grupo

In [0]:
# Per country, the Zip codes of the orders with more than 5 units:
# collect_list keeps repeated values, collect_set keeps each value once
(
    df
    .where("Units > 5")
    .groupBy("Country")
    .agg(
        collect_list("Zip"),
        collect_set("Zip"),
    )
    .show()
)

+-------+--------------------+--------------------+
|Country|   collect_list(Zip)|    collect_set(Zip)|
+-------+--------------------+--------------------+
|Germany|[22397          ,...|[22111          ,...|
|France |[75213 CEDEX 16 ,...|[75391 CEDEX 08 ,...|
|Mexico |[7100           ,...|[10300          ,...|
|Canada |[T2X            ,...|[T6V            ,...|
+-------+--------------------+--------------------+



In [0]:
# Same grouping returned as a list of Row objects, so the full lists are
# visible instead of the truncated table rendering
(
    df
    .where("Units > 5")
    .groupBy("Country")
    .agg(collect_list("Zip"))
    .collect()
)

Out[19]: [Row(Country='Germany', collect_list(Zip)=['22397          ', '22111          ', '40213          ', '45481          ', '47551          ', '12589          ', '47551          ', '40213          ', '22397          ', '40213          ', '22397          ', '47798          ', '22397          ']),
 Row(Country='France ', collect_list(Zip)=['75213 CEDEX 16 ', '75213 CEDEX 16 ', '6300           ', '06083 CEDEX 1  ', '06175 CEDEX 2  ', '75213 CEDEX 16 ', '06175 CEDEX 2  ', '75213 CEDEX 16 ', '06083 CEDEX 1  ', '75213 CEDEX 16 ', '06082 CEDEX 1  ', '06082 CEDEX 1  ', '6470           ', '06175 CEDEX 2  ', '75391 CEDEX 08 ', '75412 CEDEX 08 ', '75391 CEDEX 08 ', '06236 CEDEX    ', '75391 CEDEX 08 ', '06236 CEDEX    ', '75391 CEDEX 08 ', '06175 CEDEX 2  ', '06175 CEDEX 2  ']),
 Row(Country='Mexico ', collect_list(Zip)=['7100           ', '7810           ', '9739           ', '10300          ', '10300          ']),
 Row(Country='Canada ', collect_list(Zip)=['T2X            ', 'V6G           