In [2]:
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.functions import *

In [3]:
# Init a Session
spark=SparkSession.builder.appName('PySpark_SQL_Function').getOrCreate()

In [4]:
data=[["1","2020-02-01"],["2","2019-03-01"],["3","2021-03-01"]]
df=spark.createDataFrame(data,["id","input"])
df.show()

+---+----------+
| id|     input|
+---+----------+
|  1|2020-02-01|
|  2|2019-03-01|
|  3|2021-03-01|
+---+----------+



In [5]:
# Get current date
df.select(current_date().alias("current_date")).show(1)

+------------+
|current_date|
+------------+
|  2022-11-17|
+------------+
only showing top 1 row



In [12]:
# format date
df.select("input",date_format("input","dd/MM/yyyy").alias("date_format")).show()

+----------+-----------+
|     input|date_format|
+----------+-----------+
|2020-02-01| 01/02/2020|
|2019-03-01| 01/03/2019|
|2021-03-01| 01/03/2021|
+----------+-----------+



In [14]:
# Date diff
df.select("input",datediff(current_date(),"input").alias("datediff")).show()

+----------+--------+
|     input|datediff|
+----------+--------+
|2020-02-01|    1020|
|2019-03-01|    1357|
|2021-03-01|     626|
+----------+--------+



In [16]:
# Month diff
df.select("input",months_between(current_date(),"input").alias("monthdiff")).show()

+----------+-----------+
|     input|  monthdiff|
+----------+-----------+
|2020-02-01|33.51612903|
|2019-03-01|44.51612903|
|2021-03-01|20.51612903|
+----------+-----------+



In [6]:
#trunc()
df.select("input", 
    trunc("input","Month").alias("Month_Trunc"), 
    trunc("input","Year").alias("Month_Year"), 
    trunc("input","Month").alias("Month_Trunc")
   ).show()

+----------+-----------+----------+-----------+
|     input|Month_Trunc|Month_Year|Month_Trunc|
+----------+-----------+----------+-----------+
|2020-02-01| 2020-02-01|2020-01-01| 2020-02-01|
|2019-03-01| 2019-03-01|2019-01-01| 2019-03-01|
|2021-03-01| 2021-03-01|2021-01-01| 2021-03-01|
+----------+-----------+----------+-----------+



In [8]:
# Add month, add date & sub date
df.select("input",add_months("input",3).alias("add_months"),
                  add_months("input",-3).alias("sub_months"),
                  date_add("input",4).alias("add_date"),
                  date_sub("input",4).alias("sub_date") 
          ).show()

+----------+----------+----------+----------+----------+
|     input|add_months|sub_months|  add_date|  sub_date|
+----------+----------+----------+----------+----------+
|2020-02-01|2020-05-01|2019-11-01|2020-02-05|2020-01-28|
|2019-03-01|2019-06-01|2018-12-01|2019-03-05|2019-02-25|
|2021-03-01|2021-06-01|2020-12-01|2021-03-05|2021-02-25|
+----------+----------+----------+----------+----------+



In [10]:
# Year, month, next day, week of year
df.select("input",year("input").alias("year"),
                  month("input").alias("month"),
                  next_day("input","Sunday").alias("next_day"), 
                  weekofyear("input").alias("week_of_year")
                  ).show()

+----------+----+-----+----------+------------+
|     input|year|month|  next_day|week_of_year|
+----------+----+-----+----------+------------+
|2020-02-01|2020|    2|2020-02-02|           5|
|2019-03-01|2019|    3|2019-03-03|           9|
|2021-03-01|2021|    3|2021-03-07|           9|
+----------+----+-----+----------+------------+



In [11]:
# Day of week, day of month, day of year
df.select("input",  
     dayofweek("input").alias("dayofweek"), 
     dayofmonth("input").alias("dayofmonth"), 
     dayofyear("input").alias("dayofyear"), 
  ).show()

+----------+---------+----------+---------+
|     input|dayofweek|dayofmonth|dayofyear|
+----------+---------+----------+---------+
|2020-02-01|        7|         1|       32|
|2019-03-01|        6|         1|       60|
|2021-03-01|        2|         1|       60|
+----------+---------+----------+---------+



In [18]:
data=[["1","02-01-2020 11 01 19 06"],["2","03-01-2019 12 01 19 406"],["3","03-01-2021 12 01 19 406"]]
df2=spark.createDataFrame(data,["id","input"])

In [16]:
# Get current timestamp
df2.select("input",current_timestamp().alias("current_timestamp")).show(1)

+--------------------+--------------------+
|               input|   current_timestamp|
+--------------------+--------------------+
|02-01-2020 11 01 ...|2022-11-17 17:21:...|
+--------------------+--------------------+
only showing top 1 row



In [19]:
# To timestamp
df2.select("input",to_timestamp("input","dd-MM-yyyy HH mm ss SSS").alias("to_timestamp")).show()

+--------------------+--------------------+
|               input|        to_timestamp|
+--------------------+--------------------+
|02-01-2020 11 01 ...|2020-01-02 11:01:...|
|03-01-2019 12 01 ...|2019-01-03 12:01:...|
|03-01-2021 12 01 ...|2021-01-03 12:01:...|
+--------------------+--------------------+



In [20]:
#hour, minute,second
data=[["1","2020-02-01 11:01:19.06"],["2","2019-03-01 12:01:19.406"],["3","2021-03-01 12:01:19.406"]]
df3=spark.createDataFrame(data,["id","input"])

df3.select("input", 
    hour("input").alias("hour"), 
    minute("input").alias("minute"),
    second("input").alias("second") 
  ).show(truncate=False)

+-----------------------+----+------+------+
|input                  |hour|minute|second|
+-----------------------+----+------+------+
|2020-02-01 11:01:19.06 |11  |1     |19    |
|2019-03-01 12:01:19.406|12  |1     |19    |
|2021-03-01 12:01:19.406|12  |1     |19    |
+-----------------------+----+------+------+

