Notebook dedicado a explorar distintas funciones orientadas al tratamiento y conversión de fechas y horas

In [1]:
import spark.implicits._
// Sample data set: one row per person, holding the birth moment as a
// timestamp-formatted string and the birth day as a date-formatted string.
val columns = Seq("name","date_of_birth", "birthday_date")
val data = Seq(
  ("Jose", "1980-03-22 05:45:00.000", "1980-03-22"),
  ("María Isabel", "1982-12-10 21:45:00.000", "1982-12-10"),
  ("Antonio", "1993-01-21 12:45:34.445", "1993-01-21"),
  ("Norma", "1954-08-14 06:15:12.001", "1954-08-14")
)
// Build a DataFrame whose columns are named after `columns`, then print it.
val df = data.toDF(columns:_*)
df.show()

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.36:4046
SparkContext available as 'sc' (version = 3.0.2, master = local[*], app id = local-1615723460618)
SparkSession available as 'spark'


+------------+--------------------+-------------+
|        name|       date_of_birth|birthday_date|
+------------+--------------------+-------------+
|        Jose|1980-03-22 05:45:...|   1980-03-22|
|María Isabel|1982-12-10 21:45:...|   1982-12-10|
|     Antonio|1993-01-21 12:45:...|   1993-01-21|
|       Norma|1954-08-14 06:15:...|   1954-08-14|
+------------+--------------------+-------------+



import spark.implicits._
columns: Seq[String] = List(name, date_of_birth, birthday_date)
data: Seq[(String, String, String)] = List((Jose,1980-03-22 05:45:00.000,1980-03-22), (María Isabel,1982-12-10 21:45:00.000,1982-12-10), (Antonio,1993-01-21 12:45:34.445,1993-01-21), (Norma,1954-08-14 06:15:12.001,1954-08-14))
df: org.apache.spark.sql.DataFrame = [name: string, date_of_birth: string ... 1 more field]


In [2]:
import org.apache.spark.sql.functions._

// Extend the sample rows with two extra columns:
//   current_date      -> result of current_date()
//   current_timestamp -> result of current_timestamp()
val dfWithDate = df.select(
  col("*"),
  current_date().as("current_date"),
  current_timestamp().as("current_timestamp")
)
dfWithDate.show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |
+------------+-----------------------+-------------+------------+-----------------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:04:29.155|
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:04:29.155|
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:04:29.155|
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:04:29.155|
+------------+-----------------------+-------------+------------+-----------------------+



import org.apache.spark.sql.functions._
dfWithDate: org.apache.spark.sql.DataFrame = [name: string, date_of_birth: string ... 3 more fields]


In [3]:
// date_add: moves a date forward by the number of days given as argument.
// Note: the shift below is a fixed count of 365 days, not a calendar year.
dfWithDate
  .select(col("*"), date_add(col("birthday_date"), 365).as("first_birthday_date"))
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+-------------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |first_birthday_date|
+------------+-----------------------+-------------+------------+-----------------------+-------------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:04:39.309|1981-03-22         |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:04:39.309|1983-12-10         |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:04:39.309|1994-01-21         |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:04:39.309|1955-08-14         |
+------------+-----------------------+-------------+------------+-----------------------+-------------------+



In [4]:
// date_format: renders a date as text using the supplied pattern; this example
// switches to the day/month/year layout dd/MM/yyyy used in Spanish-speaking countries.
dfWithDate
  .select(col("*"), date_format(col("current_date"), "dd/MM/yyyy").as("current_date_formatted"))
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+----------------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |current_date_formatted|
+------------+-----------------------+-------------+------------+-----------------------+----------------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:04:52.574|14/03/2021            |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:04:52.574|14/03/2021            |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:04:52.574|14/03/2021            |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:04:52.574|14/03/2021            |
+------------+-----------------------+-------------+------------+-----------------------+----------------------+



In [5]:
// date_sub: subtracts the number of days given as argument from a date (here, 365 days before current_date)
dfWithDate.withColumn("current_date_1_year_ago", date_sub(col("current_date"), 365)).show(false)

+------------+-----------------------+-------------+------------+-----------------------+-----------------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |current_date_1_year_ago|
+------------+-----------------------+-------------+------------+-----------------------+-----------------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:05:04.102|2020-03-14             |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:05:04.102|2020-03-14             |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:05:04.102|2020-03-14             |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:05:04.102|2020-03-14             |
+------------+-----------------------+-------------+------------+-----------------------+-----------------------+



In [6]:
// datediff: number of days between two dates, counted here from birthday_date up to current_date.
dfWithDate
  .select(col("*"), datediff(col("current_date"), col("birthday_date")).as("date_diff"))
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+---------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |date_diff|
+------------+-----------------------+-------------+------------+-----------------------+---------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:05:15.673|14967    |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:05:15.673|13974    |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:05:15.673|10279    |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:05:15.673|24319    |
+------------+-----------------------+-------------+------------+-----------------------+---------+



In [7]:
// dayofyear, dayofmonth, dayofweek:
// day number within the year, within the month and within the week, respectively.
dfWithDate
  .select(
    col("*"),
    dayofyear(col("current_timestamp")).as("day_of_year"),
    dayofmonth(col("current_timestamp")).as("day_of_month"),
    dayofweek(col("current_timestamp")).as("day_of_week")
  )
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+-----------+------------+-----------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |day_of_year|day_of_month|day_of_week|
+------------+-----------------------+-------------+------------+-----------------------+-----------+------------+-----------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:05:26.386|73         |14          |1          |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:05:26.386|73         |14          |1          |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:05:26.386|73         |14          |1          |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:05:26.386|73         |14          |1          |
+------------+-----------------------+-------------+------------+-----------------------+-----------+------------+-----------+

In [8]:
// year, month, hour, minute, second:
// pull the individual year, month, hour, minute and second fields out of a timestamp.
dfWithDate
  .select(
    col("*"),
    year(col("current_timestamp")).as("year"),
    month(col("current_timestamp")).as("month"),
    hour(col("current_timestamp")).as("hour"),
    minute(col("current_timestamp")).as("minute"),
    second(col("current_timestamp")).as("second")
  )
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+----+-----+----+------+------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |year|month|hour|minute|second|
+------------+-----------------------+-------------+------------+-----------------------+----+-----+----+------+------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:05:36.884|2021|3    |13  |5     |36    |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:05:36.884|2021|3    |13  |5     |36    |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:05:36.884|2021|3    |13  |5     |36    |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:05:36.884|2021|3    |13  |5     |36    |
+------------+-----------------------+-------------+------------+-----------------------+----+-----+----+------+------+



In [9]:
// Same field extraction as above, this time applied to the date_of_birth column,
// which holds timestamp-formatted strings rather than a timestamp value.
dfWithDate
  .select(
    col("*"),
    year(col("date_of_birth")).as("year"),
    month(col("date_of_birth")).as("month"),
    hour(col("date_of_birth")).as("hour"),
    minute(col("date_of_birth")).as("minute"),
    second(col("date_of_birth")).as("second")
  )
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+----+-----+----+------+------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |year|month|hour|minute|second|
+------------+-----------------------+-------------+------------+-----------------------+----+-----+----+------+------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:05:47.482|1980|3    |5   |45    |0     |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:05:47.482|1982|12   |21  |45    |0     |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:05:47.482|1993|1    |12  |45    |34    |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:05:47.482|1954|8    |6   |15    |12    |
+------------+-----------------------+-------------+------------+-----------------------+----+-----+----+------+------+



In [10]:
// months_between: difference in months between two dates. Unless the optional third
// argument says otherwise, the result is rounded to 8 decimal places.
dfWithDate
  .select(
    col("*"),
    months_between(col("current_timestamp"), col("date_of_birth")).as("months_between")
  )
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+--------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |months_between|
+------------+-----------------------+-------------+------------+-----------------------+--------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:05:58.152|491.75181377  |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:05:58.152|459.11740517  |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:05:58.152|337.77465054  |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:05:58.152|799.0         |
+------------+-----------------------+-------------+------------+-----------------------+--------------+



In [11]:
// months_between with the third argument set to false: same difference in months,
// without the rounding to 8 decimal places.
dfWithDate
  .select(
    col("*"),
    months_between(col("current_timestamp"), col("date_of_birth"), false).as("months_between")
  )
  .show(truncate = false)

+------------+-----------------------+-------------+------------+----------------------+-----------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp     |months_between   |
+------------+-----------------------+-------------+------------+----------------------+-----------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:06:07.85|491.7518171296296|
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:06:07.85|459.1174085274791|
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:06:07.85|337.7746538978495|
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:06:07.85|799.0            |
+------------+-----------------------+-------------+------------+----------------------+-----------------+



In [12]:
// next_day: date of the next occurrence of the requested day of the week;
// this example asks when the next Friday ("Fri") will be.
dfWithDate
  .select(col("*"), next_day(col("current_timestamp"), "Fri").as("next_friday"))
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+-----------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |next_friday|
+------------+-----------------------+-------------+------------+-----------------------+-----------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:06:19.071|2021-03-19 |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:06:19.071|2021-03-19 |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:06:19.071|2021-03-19 |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:06:19.071|2021-03-19 |
+------------+-----------------------+-------------+------------+-----------------------+-----------+



In [13]:
// to_date: parses the timestamp-formatted date_of_birth string with an explicit pattern
// and keeps only its date part. The pattern declares three fraction digits (SSS) so that
// it matches the millisecond precision of the sample values, e.g. "1980-03-22 05:45:00.000".
dfWithDate.withColumn("date_of_birth_to_date", to_date(col("date_of_birth"), "yyyy-MM-dd HH:mm:ss.SSS")).show(false)

+------------+-----------------------+-------------+------------+-----------------------+---------------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |date_of_birth_to_date|
+------------+-----------------------+-------------+------------+-----------------------+---------------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:06:30.703|1980-03-22           |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:06:30.703|1982-12-10           |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:06:30.703|1993-01-21           |
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:06:30.703|1954-08-14           |
+------------+-----------------------+-------------+------------+-----------------------+---------------------+



In [14]:
// to_timestamp: converts a column to a timestamp, with and without an explicit pattern.
// NOTE(review): current_date is produced by current_date() rather than parsed from text,
// and the recorded output is identical for both calls, so the pattern presumably has no
// effect in this example — confirm.
dfWithDate
  .select(
    col("*"),
    to_timestamp(col("current_date"), "yyyy-MM-dd HH:mm:ss.SSSS").as("to_timestamp_with_format"),
    to_timestamp(col("current_date")).as("to_timestamp")
  )
  .show(truncate = false)

+------------+-----------------------+-------------+------------+-----------------------+------------------------+-------------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp      |to_timestamp_with_format|to_timestamp       |
+------------+-----------------------+-------------+------------+-----------------------+------------------------+-------------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:06:40.281|2021-03-14 00:00:00     |2021-03-14 00:00:00|
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:06:40.281|2021-03-14 00:00:00     |2021-03-14 00:00:00|
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:06:40.281|2021-03-14 00:00:00     |2021-03-14 00:00:00|
|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:06:40.281|2021-03-14 00:00:00     |2021-03-14 00:00:00|
+------------+-----------------------+-------------+------------+-----------------------+------------------------+-------------------+

In [15]:
// unix_timestamp: Unix timestamp (seconds since '1970-01-01 00:00:00' GMT) for the date
// passed as argument; called without arguments it returns the seconds elapsed from
// 1970-01-01 until now.
dfWithDate
  .select(
    col("*"),
    unix_timestamp(col("current_date"), "yyyy-MM-dd").as("current_date_to_unixtimestamp_with_format"),
    unix_timestamp(col("current_timestamp")).as("current_timestamp_to_unixtimestamp"),
    unix_timestamp().as("current_unixtimestamp")
  )
  .show()

+------------+--------------------+-------------+------------+--------------------+-----------------------------------------+----------------------------------+---------------------+
|        name|       date_of_birth|birthday_date|current_date|   current_timestamp|current_date_to_unixtimestamp_with_format|current_timestamp_to_unixtimestamp|current_unixtimestamp|
+------------+--------------------+-------------+------------+--------------------+-----------------------------------------+----------------------------------+---------------------+
|        Jose|1980-03-22 05:45:...|   1980-03-22|  2021-03-14|2021-03-14 13:06:...|                               1615676400|                        1615723610|           1615723610|
|María Isabel|1982-12-10 21:45:...|   1982-12-10|  2021-03-14|2021-03-14 13:06:...|                               1615676400|                        1615723610|           1615723610|
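|     Antonio|1993-01-21 12:45:...|   1993-01-21|  2021-03-14|2021-03-14 13:06:...|                               1615676400|                        1615723610|           1615723610|
|       Norma|1954-08-14 06:15:...|   1954-08-14|  2021-03-14|2021-03-14 13:06:...|                               1615676400|                        1615723610|           1615723610|
+------------+--------------------+-------------+------------+--------------------+-----------------------------------------+----------------------------------+---------------------+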

In [16]:
// weekofyear: week number of the year that a given date falls in.
dfWithDate
  .select(
    col("*"),
    weekofyear(col("date_of_birth")).as("date_of_birth_week"),
    weekofyear(col("current_date")).as("current_date_week")
  )
  .show()

+------------+--------------------+-------------+------------+--------------------+------------------+-----------------+
|        name|       date_of_birth|birthday_date|current_date|   current_timestamp|date_of_birth_week|current_date_week|
+------------+--------------------+-------------+------------+--------------------+------------------+-----------------+
|        Jose|1980-03-22 05:45:...|   1980-03-22|  2021-03-14|2021-03-14 13:07:...|                12|               10|
|María Isabel|1982-12-10 21:45:...|   1982-12-10|  2021-03-14|2021-03-14 13:07:...|                49|               10|
|     Antonio|1993-01-21 12:45:...|   1993-01-21|  2021-03-14|2021-03-14 13:07:...|                 3|               10|
|       Norma|1954-08-14 06:15:...|   1954-08-14|  2021-03-14|2021-03-14 13:07:...|                32|               10|
+------------+--------------------+-------------+------------+--------------------+------------------+-----------------+



In [17]:
// from_unixtime: turns a Unix timestamp back into a date/time, either in the default
// layout or rendered with an explicit pattern.
val dfWithCurrentUnixtimestamp = dfWithDate.select(
  col("*"),
  unix_timestamp().as("current_unixtimestamp")
)
dfWithCurrentUnixtimestamp
  .select(
    col("*"),
    from_unixtime(col("current_unixtimestamp")).as("from_unixtimestamp"),
    from_unixtime(col("current_unixtimestamp"), "dd/MM/yyyy").as("from_unixtimestamp_formatted")
  )
  .show(truncate = false)

+------------+-----------------------+-------------+------------+----------------------+---------------------+-------------------+----------------------------+
|name        |date_of_birth          |birthday_date|current_date|current_timestamp     |current_unixtimestamp|from_unixtimestamp |from_unixtimestamp_formatted|
+------------+-----------------------+-------------+------------+----------------------+---------------------+-------------------+----------------------------+
|Jose        |1980-03-22 05:45:00.000|1980-03-22   |2021-03-14  |2021-03-14 13:07:28.62|1615723648           |2021-03-14 13:07:28|14/03/2021                  |
|María Isabel|1982-12-10 21:45:00.000|1982-12-10   |2021-03-14  |2021-03-14 13:07:28.62|1615723648           |2021-03-14 13:07:28|14/03/2021                  |
|Antonio     |1993-01-21 12:45:34.445|1993-01-21   |2021-03-14  |2021-03-14 13:07:28.62|1615723648           |2021-03-14 13:07:28|14/03/2021                  |
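|Norma       |1954-08-14 06:15:12.001|1954-08-14   |2021-03-14  |2021-03-14 13:07:28.62|1615723648           |2021-03-14 13:07:28|14/03/2021                  |
+------------+-----------------------+-------------+------------+----------------------+---------------------+-------------------+----------------------------+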

dfWithCurrentUnixtimestamp: org.apache.spark.sql.DataFrame = [name: string, date_of_birth: string ... 4 more fields]


In [18]:
// last_day: last day of the month that the given date belongs to.
// trunc: truncates a date to the unit named by the format argument (year, month, week);
// "yyyy" and "year" are both used below and give the same result in the recorded output.
dfWithDate
  .select(
    col("*"),
    last_day(col("current_date")).as("last_date"),
    trunc(col("current_date"), "yyyy").as("trunc_year"),
    trunc(col("current_date"), "MM").as("trunc_year_month"),
    trunc(col("current_date"), "week").as("trunc_week"),
    trunc(col("current_date"), "year").as("trunc_year2")
  )
  .show()

+------------+--------------------+-------------+------------+--------------------+----------+----------+----------------+----------+-----------+
|        name|       date_of_birth|birthday_date|current_date|   current_timestamp| last_date|trunc_year|trunc_year_month|trunc_week|trunc_year2|
+------------+--------------------+-------------+------------+--------------------+----------+----------+----------------+----------+-----------+
|        Jose|1980-03-22 05:45:...|   1980-03-22|  2021-03-14|2021-03-14 13:07:...|2021-03-31|2021-01-01|      2021-03-01|2021-03-08| 2021-01-01|
|María Isabel|1982-12-10 21:45:...|   1982-12-10|  2021-03-14|2021-03-14 13:07:...|2021-03-31|2021-01-01|      2021-03-01|2021-03-08| 2021-01-01|
|     Antonio|1993-01-21 12:45:...|   1993-01-21|  2021-03-14|2021-03-14 13:07:...|2021-03-31|2021-01-01|      2021-03-01|2021-03-08| 2021-01-01|
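|       Norma|1954-08-14 06:15:...|   1954-08-14|  2021-03-14|2021-03-14 13:07:...|2021-03-31|2021-01-01|      2021-03-01|2021-03-08| 2021-01-01|
+------------+--------------------+-------------+------------+--------------------+----------+----------+----------------+----------+-----------+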

In [19]:
// date_trunc: returns a timestamp truncated to the requested unit
// (year, month and day in this example).
dfWithDate
  .select(
    col("*"),
    date_trunc("yyyy", col("current_timestamp")).as("date_trunc_year"),
    date_trunc("MM", col("current_timestamp")).as("date_trunc_month"),
    date_trunc("dd", col("current_timestamp")).as("date_trunc_day")
  )
  .show()

+------------+--------------------+-------------+------------+--------------------+-------------------+-------------------+-------------------+
|        name|       date_of_birth|birthday_date|current_date|   current_timestamp|    date_trunc_year|   date_trunc_month|     date_trunc_day|
+------------+--------------------+-------------+------------+--------------------+-------------------+-------------------+-------------------+
|        Jose|1980-03-22 05:45:...|   1980-03-22|  2021-03-14|2021-03-14 13:07:...|2021-01-01 00:00:00|2021-03-01 00:00:00|2021-03-14 00:00:00|
|María Isabel|1982-12-10 21:45:...|   1982-12-10|  2021-03-14|2021-03-14 13:07:...|2021-01-01 00:00:00|2021-03-01 00:00:00|2021-03-14 00:00:00|
|     Antonio|1993-01-21 12:45:...|   1993-01-21|  2021-03-14|2021-03-14 13:07:...|2021-01-01 00:00:00|2021-03-01 00:00:00|2021-03-14 00:00:00|
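|       Norma|1954-08-14 06:15:...|   1954-08-14|  2021-03-14|2021-03-14 13:07:...|2021-01-01 00:00:00|2021-03-01 00:00:00|2021-03-14 00:00:00|
+------------+--------------------+-------------+------------+--------------------+-------------------+-------------------+-------------------+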