# API para DataFrame

En este cuaderno trabajamos con el fichero _pdi_sales_small.csv_, ya utilizado previamente.

In [0]:
# Read the semicolon-separated sales CSV (with header row), letting Spark infer
# the column types, then print the resulting schema
df = (
    spark.read
    .option("sep", ";")
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/FileStore/tables/pdi_sales_small.csv")
)
df.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Units: integer (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
# Preview the first five rows
df.show(n=5)

+---------+---------+---------------+-----+-------+-------+
|ProductID|     Date|            Zip|Units|Revenue|Country|
+---------+---------+---------------+-----+-------+-------+
|      725|1/15/1999|41540          |    1|  115.5|Germany|
|      787| 6/6/2002|41540          |    1|  314.9|Germany|
|      788| 6/6/2002|41540          |    1|  314.9|Germany|
|      940|1/15/1999|22587          |    1|  687.7|Germany|
|      396|1/15/1999|22587          |    1|  857.1|Germany|
+---------+---------+---------------+-----+-------+-------+
only showing top 5 rows



#### Selección de columnas (proyección)

In [0]:
# Projection: keep only the two named columns
df.select("ProductID", "Revenue").show(n=3)

+---------+-------+
|ProductID|Revenue|
+---------+-------+
|      725|  115.5|
|      787|  314.9|
|      788|  314.9|
+---------+-------+
only showing top 3 rows



In [0]:
# Using operations: a column expression can be computed inside the projection
# and given a name with alias()
df.select(
    df.ProductID,
    (df.Revenue + 10).alias("VentasMas10"),
).show(n=3)

+---------+-----------+
|ProductID|VentasMas10|
+---------+-----------+
|      725|      125.5|
|      787|      324.9|
|      788|      324.9|
+---------+-----------+
only showing top 3 rows



In [0]:
# Keep most of the columns by dropping only the ones that are not wanted
df.drop("Units", "Country").show(n=3)

+---------+---------+---------------+-------+
|ProductID|     Date|            Zip|Revenue|
+---------+---------+---------------+-------+
|      725|1/15/1999|41540          |  115.5|
|      787| 6/6/2002|41540          |  314.9|
|      788| 6/6/2002|41540          |  314.9|
+---------+---------+---------------+-------+
only showing top 3 rows



#### Modificación de columnas
Previamente hay que importar la función _col_

In [0]:
from pyspark.sql.functions import col

# Columns referenced by their plain string names
df.select("ProductID", "Revenue").show(n=3)

+---------+-------+
|ProductID|Revenue|
+---------+-------+
|      725|  115.5|
|      787|  314.9|
|      788|  314.9|
+---------+-------+
only showing top 3 rows



In [0]:
# Columns referenced as attributes of the DataFrame
df.select(df.ProductID, df.Revenue).show(n=3)

+---------+-------+
|ProductID|Revenue|
+---------+-------+
|      725|  115.5|
|      787|  314.9|
|      788|  314.9|
+---------+-------+
only showing top 3 rows



In [0]:
# Columns referenced by indexing the DataFrame with the column name
df.select(df["ProductID"], df["Revenue"]).show(n=3)

+---------+-------+
|ProductID|Revenue|
+---------+-------+
|      725|  115.5|
|      787|  314.9|
|      788|  314.9|
+---------+-------+
only showing top 3 rows



In [0]:
# Columns referenced through the col() function
df.select(col("ProductID"), col("Revenue")).show(n=3)

+---------+-------+
|ProductID|Revenue|
+---------+-------+
|      725|  115.5|
|      787|  314.9|
|      788|  314.9|
+---------+-------+
only showing top 3 rows



Añadir columnas

In [0]:
# Derive a "total" column as Units * Revenue; the result is bound to a new
# name, df is not reassigned
dfNuevo = df.withColumn("total", df["Units"] * df["Revenue"])
dfNuevo.show(n=4)

+---------+---------+---------------+-----+-------+-------+-----+
|ProductID|     Date|            Zip|Units|Revenue|Country|total|
+---------+---------+---------------+-----+-------+-------+-----+
|      725|1/15/1999|41540          |    1|  115.5|Germany|115.5|
|      787| 6/6/2002|41540          |    1|  314.9|Germany|314.9|
|      788| 6/6/2002|41540          |    1|  314.9|Germany|314.9|
|      940|1/15/1999|22587          |    1|  687.7|Germany|687.7|
+---------+---------+---------------+-----+-------+-------+-----+
only showing top 4 rows



Método _selectExpr_ usando una expresión, y con agregaciones de datos

In [0]:
# Same derived column, written as SQL expressions: "*" keeps every existing
# column and the second expression adds "total"
df.selectExpr(
    "*",
    "Units * Revenue as total",
).show(n=4)

+---------+---------+---------------+-----+-------+-------+-----+
|ProductID|     Date|            Zip|Units|Revenue|Country|total|
+---------+---------+---------------+-----+-------+-------+-----+
|      725|1/15/1999|41540          |    1|  115.5|Germany|115.5|
|      787| 6/6/2002|41540          |    1|  314.9|Germany|314.9|
|      788| 6/6/2002|41540          |    1|  314.9|Germany|314.9|
|      940|1/15/1999|22587          |    1|  687.7|Germany|687.7|
+---------+---------+---------------+-----+-------+-------+-----+
only showing top 4 rows



In [0]:
# Aggregations inside selectExpr: number of distinct products and countries
df.selectExpr(
    "count(distinct(ProductID)) as productos",
    "count(distinct(Country)) as paises",
).show()

+---------+------+
|productos|paises|
+---------+------+
|      799|     5|
+---------+------+



Cambio de nombre a una columna

In [0]:
# Show the data with the Zip column renamed to PostalCode
renamed = df.withColumnRenamed("Zip", "PostalCode")
renamed.show(n=3)

+---------+---------+---------------+-----+-------+-------+
|ProductID|     Date|     PostalCode|Units|Revenue|Country|
+---------+---------+---------------+-----+-------+-------+
|      725|1/15/1999|41540          |    1|  115.5|Germany|
|      787| 6/6/2002|41540          |    1|  314.9|Germany|
|      788| 6/6/2002|41540          |    1|  314.9|Germany|
+---------+---------+---------------+-----+-------+-------+
only showing top 3 rows



#### Filtrado de filas

In [0]:
# Using the filter method with a single column condition
df.filter(df["Country"] == "Germany").show(n=3)

+---------+---------+---------------+-----+-------+-------+
|ProductID|     Date|            Zip|Units|Revenue|Country|
+---------+---------+---------------+-----+-------+-------+
|      725|1/15/1999|41540          |    1|  115.5|Germany|
|      787| 6/6/2002|41540          |    1|  314.9|Germany|
|      788| 6/6/2002|41540          |    1|  314.9|Germany|
+---------+---------+---------------+-----+-------+-------+
only showing top 3 rows



In [0]:
# Using conditions with where (an alias of filter, so both behave the same).
# Each condition is parenthesized and the two are combined with &.
df.where((df.Country == "Germany") & (df.Units > 20)).show()

+---------+----------+---------------+-----+-------+-------+
|ProductID|      Date|            Zip|Units|Revenue|Country|
+---------+----------+---------------+-----+-------+-------+
|     2091| 6/15/1999|40213          |   41| 6240.1|Germany|
|     2091|10/15/1999|40213          |   41| 6347.7|Germany|
|     2091|12/15/1999|40213          |   23| 3560.9|Germany|
+---------+----------+---------------+-----+-------+-------+



In [0]:
# Rows matching either product id: conditions combined with |
df.filter(
    (df["ProductID"] == 2314) | (df["ProductID"] == 1322)
).show()

+---------+---------+---------------+-----+-------+-------+
|ProductID|     Date|            Zip|Units|Revenue|Country|
+---------+---------+---------------+-----+-------+-------+
|     2314|5/15/1999|46045          |    1|   13.9|Germany|
|     1322| 1/6/2000|75593 CEDEX 12 |    1|  254.5|France |
+---------+---------+---------------+-----+-------+-------+



Eliminación de registros repetidos

In [0]:
# Distinct values of the Country column
(
    df.select("Country")
    .distinct()
    .show()
)

+-------+
|Country|
+-------+
|Germany|
|France |
|Canada |
|Mexico |
| France|
+-------+



In [0]:
# Same result via dropDuplicates restricted to the Country column
(
    df.dropDuplicates(["Country"])
    .select("Country")
    .show()
)

+-------+
|Country|
+-------+
|Germany|
|France |
|Canada |
|Mexico |
| France|
+-------+



#### Ordenación de registros

In [0]:
# Sort by the values of one column, after a projection
(
    df.select("ProductID", "Revenue")
    .sort("Revenue")
    .show(n=5)
)

+---------+-------+
|ProductID|Revenue|
+---------+-------+
|     2314|   13.9|
|     1974|   52.4|
|     1975|   52.4|
|     1974|   52.4|
|     1974|   52.4|
+---------+-------+
only showing top 5 rows



In [0]:
# Descending sort on Revenue
df.sort("Revenue", ascending=False).show(n=5)

+---------+---------+---------------+-----+-------+-------+
|ProductID|     Date|            Zip|Units|Revenue|Country|
+---------+---------+---------------+-----+-------+-------+
|      495|3/15/1999|75213 CEDEX 16 |   77|43194.1|France |
|      495| 3/1/2000|75391 CEDEX 08 |   18|10395.0|France |
|      464|6/11/2003|75213 CEDEX 16 |   16|10075.8|France |
|      464| 8/1/2000|22397          |   17| 9817.5|Germany|
|      495| 3/1/2000|06175 CEDEX 2  |   16| 9240.0|France |
+---------+---------+---------------+-----+-------+-------+
only showing top 5 rows



In [0]:
# Two sort keys on different columns: Revenue descending, then Units ascending
df.sort(
    df["Revenue"].desc(),
    df["Units"].asc(),
).show(n=5)

+---------+---------+---------------+-----+-------+-------+
|ProductID|     Date|            Zip|Units|Revenue|Country|
+---------+---------+---------------+-----+-------+-------+
|      495|3/15/1999|75213 CEDEX 16 |   77|43194.1|France |
|      495| 3/1/2000|75391 CEDEX 08 |   18|10395.0|France |
|      464|6/11/2003|75213 CEDEX 16 |   16|10075.8|France |
|      464| 8/1/2000|22397          |   17| 9817.5|Germany|
|      495| 3/1/2000|06175 CEDEX 2  |   16| 9240.0|France |
+---------+---------+---------------+-----+-------+-------+
only showing top 5 rows



In [0]:
# The same ordering expressed with lists: one ascending flag per column
# (0 = descending for Revenue, 1 = ascending for Units)
df.sort(
    ["Revenue", "Units"],
    ascending=[0, 1],
).show(n=5)

+---------+---------+---------------+-----+-------+-------+
|ProductID|     Date|            Zip|Units|Revenue|Country|
+---------+---------+---------------+-----+-------+-------+
|      495|3/15/1999|75213 CEDEX 16 |   77|43194.1|France |
|      495| 3/1/2000|75391 CEDEX 08 |   18|10395.0|France |
|      464|6/11/2003|75213 CEDEX 16 |   16|10075.8|France |
|      464| 8/1/2000|22397          |   17| 9817.5|Germany|
|      495| 3/1/2000|06175 CEDEX 2  |   16| 9240.0|France |
+---------+---------+---------------+-----+-------+-------+
only showing top 5 rows



In [0]:
# Limit the number of records that reach the driver instead of bringing them all
(
    df.sort(df["Revenue"].desc(), df["Units"].asc())
    .limit(5)
    .show()
)

+---------+---------+---------------+-----+-------+-------+
|ProductID|     Date|            Zip|Units|Revenue|Country|
+---------+---------+---------------+-----+-------+-------+
|      495|3/15/1999|75213 CEDEX 16 |   77|43194.1|France |
|      495| 3/1/2000|75391 CEDEX 08 |   18|10395.0|France |
|      464|6/11/2003|75213 CEDEX 16 |   16|10075.8|France |
|      464| 8/1/2000|22397          |   17| 9817.5|Germany|
|      495| 3/1/2000|06175 CEDEX 2  |   16| 9240.0|France |
+---------+---------+---------------+-----+-------+-------+



#### Añadir filas

In [0]:
# New sales rows, listed in the same column order as df:
# (ProductID, Date, Zip, Units, Revenue, Country)
# NOTE(review): these dates are written as yyyy-MM-dd while the rows shown from
# the CSV use M/d/yyyy; Date is a string column so both are accepted, but
# confirm the mixed formats are intended.
nuevasVenta = [
    (6666, "2022-03-24", "03206", 33, 3333.33, "Spain"),
    (6666, "2022-03-25", "03206", 22, 2222.22, "Spain"),
]
# Build the new DataFrame with df's own schema so the appended rows carry the
# same column names and types. Without a schema Spark names the columns
# _1.._6 and infers its own types, which would alter the types of the union.
nvDF = spark.createDataFrame(nuevasVenta, schema=df.schema)
# Append the rows; union() matches columns by position, not by name
dfUpdated = df.union(nvDF)
# The original DataFrame is not modified by the union
df.count()

Out[28]: 120239

In [0]:
# Row count of the unioned DataFrame (df plus the two appended rows)
dfUpdated.count()

Out[27]: 120241

#### Muestras de datos

In [0]:
# Sample roughly 10% of the rows, without replacement. No seed is passed, so
# the exact count is expected to vary between runs.
muestra = df.sample(fraction=0.10)
muestra.count()

Out[29]: 12128

In [0]:
# Sample roughly 10% of the rows with replacement, so a row may appear more
# than once. No seed is passed here either.
muestraConRepetidos = df.sample(withReplacement=True, fraction=0.10)
muestraConRepetidos.count()

Out[30]: 12095

In [0]:
# Randomly split df into three DataFrames using weights 0.5 / 0.3 / 0.2
dfs = df.randomSplit([0.5, 0.3, 0.2])
df1, df2, df3 = dfs
# Count each part and report the three sizes on one line
n1, n2, n3 = df1.count(), df2.count(), df3.count()
print(f"df1: {n1}; df2: {n2}; df3: {n3}")

df1: 60090; df2: 36089; df3: 24060
