# Funciones

In [0]:
from pyspark.sql.functions import *

#### Fechas

In [0]:
from pyspark.sql.functions import to_date

# Build the DataFrame from pdi_sales_small.csv: fields are separated by ";",
# the first line is a header, and column types are inferred by the reader.
df = (
    spark.read
    .option("sep", ";")
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/FileStore/tables/pdi_sales_small.csv")
)
df.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Units: integer (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
# Convert the "Date" column from string to a real date type.
# The pattern is month/day/year without zero padding requirements on month
# and day; the year uses the conventional four-letter "yyyy" form.
# NOTE(review): this cell overwrites "Date" in place, so it is presumably not
# safe to run twice without re-running the cell that loads the CSV first.
df = df.withColumn("Date", to_date(df.Date, "M/d/yyyy"))
df.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Units: integer (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
# Preview the first five rows of the DataFrame.
df.show(n=5)

+---------+----------+---------------+-----+-------+-------+
|ProductID|      Date|            Zip|Units|Revenue|Country|
+---------+----------+---------------+-----+-------+-------+
|      725|1999-01-15|41540          |    1|  115.5|Germany|
|      787|2002-06-06|41540          |    1|  314.9|Germany|
|      788|2002-06-06|41540          |    1|  314.9|Germany|
|      940|1999-01-15|22587          |    1|  687.7|Germany|
|      396|1999-01-15|22587          |    1|  857.1|Germany|
+---------+----------+---------------+-----+-------+-------+
only showing top 5 rows



In [0]:
# Date helper functions applied to the "Date" column, one expression per line.
# NOTE(review): the "dd-MM-yyy" pattern is kept verbatim because it is part of
# the auto-generated column name; "yyyy" would be the conventional spelling.
df.select(
    "Date",
    date_format("Date", "dd-MM-yyy"),  # date rendered as text with the given pattern
    next_day("Date", "Sun"),           # first Sunday after the date
    last_day("Date"),                  # last day of the date's month
    dayofmonth("Date"),
    dayofyear("Date"),
    month("Date"),
    year("Date"),
).show(2)

+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
|      Date|date_format(Date, dd-MM-yyy)|next_day(Date, Sun)|last_day(Date)|dayofmonth(Date)|dayofyear(Date)|month(Date)|year(Date)|
+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
|1999-01-15|                  15-01-1999|         1999-01-17|    1999-01-31|              15|             15|          1|      1999|
|2002-06-06|                  06-06-2002|         2002-06-09|    2002-06-30|               6|            157|          6|      2002|
+----------+----------------------------+-------------------+--------------+----------------+---------------+-----------+----------+
only showing top 2 rows



#### Cadenas

In [0]:
# String operations on the rows whose trimmed country equals "Canada":
# left trim, right trim, lower-casing and upper-casing of the Zip column.
(
    df.where(trim(df.Country) == "Canada")
    .select(
        "Zip",
        ltrim("Zip").alias("l"),
        rtrim("Zip").alias("r"),
        lower("Zip"),
        upper("Zip"),
    )
    .show(3)
)

+---------------+---------------+---+---------------+---------------+
|            Zip|              l|  r|     lower(Zip)|     upper(Zip)|
+---------------+---------------+---+---------------+---------------+
|H1B            |H1B            |H1B|h1b            |H1B            |
|H1B            |H1B            |H1B|h1b            |H1B            |
|H1B            |H1B            |H1B|h1b            |H1B            |
+---------------+---------------+---+---------------+---------------+
only showing top 3 rows



In [0]:
# More string operations on the Canada rows: capitalisation, reversal,
# length, and character-by-character replacement ("n" -> "p", "a" -> "e").
(
    df.where(trim(df.Country) == "Canada")
    .select(
        "Country",
        initcap("Country"),
        reverse("Country"),
        length("Country"),
        translate("Country", "na", "pe"),
    )
    .show(3)
)

+-------+----------------+----------------+---------------+--------------------------+
|Country|initcap(Country)|reverse(Country)|length(Country)|translate(Country, na, pe)|
+-------+----------------+----------------+---------------+--------------------------+
|Canada |         Canada |          adanaC|              7|                   Cepede |
|Canada |         Canada |          adanaC|              7|                   Cepede |
|Canada |         Canada |          adanaC|              7|                   Cepede |
+-------+----------------+----------------+---------------+--------------------------+
only showing top 3 rows



In [0]:
# Substring operations on the Canada rows: split on "a", position of the
# first "a", and the two characters starting at position 3.
(
    df.where(trim(df.Country) == "Canada")
    .select(
        "Country",
        split("Country", "a"),
        locate("a", "Country"),
        substring("Country", 3, 2),
    )
    .show(3)
)

+-------+---------------------+---------------------+------------------------+
|Country|split(Country, a, -1)|locate(a, Country, 1)|substring(Country, 3, 2)|
+-------+---------------------+---------------------+------------------------+
|Canada |         [C, n, d,  ]|                    2|                      na|
|Canada |         [C, n, d,  ]|                    2|                      na|
|Canada |         [C, n, d,  ]|                    2|                      na|
+-------+---------------------+---------------------+------------------------+
only showing top 3 rows



#### Colecciones

Para trabajar con colecciones vamos a usar el fichero _yelp_academic_dataset_business.json_ con datos de negocio de la empresa **Yelp** publicados para uso académico.

Los negocios tienen una propiedad denominada _categories_ que contiene un array con las categorías de los mismos.

In [0]:
# Although this statement loads only the first record of the file, that is
# enough to see how the collection functions work.
df = (
    spark.read
    .option("inferSchema", "true")
    .option("multiline", True)
    .json("/FileStore/tables/yelp_business.json")
)

# Alternative kept from the original notebook (reads the academic dataset file):
# df = spark.read.json("/FileStore/tables/yelp_academic_dataset_business.json")

df.printSchema()

root
 |-- attributes: struct (nullable = true)
 |    |-- Good for Kids: boolean (nullable = true)
 |-- business_id: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- city: string (nullable = true)
 |-- full_address: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhoods: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- open: boolean (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)



In [0]:
# Number of rows in the DataFrame loaded from the JSON file.
df.count()

Out[13]: 1

Realizamos una consulta sobre los datos del fichero siguiendo el esquema del dataframe.

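Obtenemos el nombre, la cantidad de categorías de cada comercio, un listado ordenado con sus categorías y si es un restaurante.
(El horario de los domingos se podría obtener utilizando la notación '.' para acceder a los campos anidados, pero el esquema mostrado más arriba no incluye ese campo, por lo que no aparece en la consulta.)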

In [0]:
# Per business: its name, how many categories it has, the categories sorted
# in ascending order, and whether "Restaurants" is one of them.
df.select(
    "name",
    size("categories").alias("totalCategorias"),
    sort_array("categories").alias("categorias"),
    array_contains("categories", "Restaurants").alias("Restaurantes"),
).show(10)

+--------------------+---------------+--------------------+------------+
|                name|totalCategorias|          categorias|Restaurantes|
+--------------------+---------------+--------------------+------------+
|Shauna Brown Fitness|              5|[Active Life, Fit...|       false|
+--------------------+---------------+--------------------+------------+



In [0]:
# explode() produces one output row per element of the "categories" array.
df.select(
    "name",
    explode("categories"),
).show(n=10, truncate=False)

+--------------------+---------------------+
|name                |col                  |
+--------------------+---------------------+
|Shauna Brown Fitness|Active Life          |
|Shauna Brown Fitness|Massage Therapy      |
|Shauna Brown Fitness|Health & Medical     |
|Shauna Brown Fitness|Trainers             |
|Shauna Brown Fitness|Fitness & Instruction|
+--------------------+---------------------+



#### Datos JSON

Las siguientes celdas muestran cómo transformar una colección de cadenas JSON, creada a partir de un RDD, en un DataFrame con columnas estructuradas.

Para ello hay que crear el esquema de los datos JSON y, con éste, transformar la columna de texto mediante la función _from_json_.

In [0]:
# A one-element Python list holding a JSON document as plain text.
tareas = [
    """{"dia": "Lunes", "tareas": ["Corregir ejercicios", "Ir a nadar", "Comprar pan"]}"""
]
# Distribute the list as an RDD of strings.
tareasRDD = spark.sparkContext.parallelize(tareas)
# tareasStrDF is a DataFrame with a single string column named "value".
tareasStrDF = tareasRDD.toDF(schema="string")
tareasStrDF.printSchema()
tareasStrDF.show()

root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
|{"dia": "Lunes", ...|
+--------------------+



In [0]:
# Goal: turn the raw JSON text into structured columns.
# First step: describe the structure of the JSON documents with a schema.
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Both fields, and the array elements, are declared as non-nullable here.
esquemaTareas = StructType(
    [
        StructField("dia", StringType(), nullable=False),
        StructField(
            "tareas",
            ArrayType(StringType(), containsNull=False),
            nullable=False,
        ),
    ]
)

In [0]:
# Parse the JSON text held in "value" using the schema; the result is a
# single struct column named "datos".
todosDF = tareasStrDF.select(
    from_json(col("value"), esquemaTareas).alias("datos")
)
todosDF.printSchema()

root
 |-- datos: struct (nullable = true)
 |    |-- dia: string (nullable = true)
 |    |-- tareas: array (nullable = true)
 |    |    |-- element: string (containsNull = true)



In [0]:
# Accessing the parsed data:
#   - one field of the struct column,
#   - the whole array through a dotted path,
#   - the first element of that array.
todosDF.select(
    col("datos").getItem("dia"),
    "datos.tareas",
    col("datos").getItem("tareas").getItem(0).alias("tarea1"),
).show(truncate=False)

+---------+----------------------------------------------+-------------------+
|datos.dia|tareas                                        |tarea1             |
+---------+----------------------------------------------+-------------------+
|Lunes    |[Corregir ejercicios, Ir a nadar, Comprar pan]|Corregir ejercicios|
+---------+----------------------------------------------+-------------------+



In [0]:
# Serialise the struct column back into its JSON text representation.
todosDF.select(
    to_json(col("datos"))
).show(truncate=False)

+---------------------------------------------------------------------------+
|to_json(datos)                                                             |
+---------------------------------------------------------------------------+
|{"dia":"Lunes","tareas":["Corregir ejercicios","Ir a nadar","Comprar pan"]}|
+---------------------------------------------------------------------------+

