In [1]:
# Bootstrap step executed before any pyspark import in this notebook.
# NOTE(review): findspark.init() presumably locates the local Spark install and
# puts pyspark on sys.path — confirm against the findspark documentation.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook-wide session: local master using all cores,
# with the spark-avro package requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.4.0")
    .getOrCreate()
)

# create dataFrame

In [4]:
# Two sample employee rows; the stray spaces inside the name fields are kept as-is.
data = (
    (1, 'kiran ', 'chinta', 150000.00, 'usa', '+1 232 343 7889', '000 00 0000'),
    (2, 'goats ', ' machi', 100000.00, 'ind', '+91 111 222 7889', '123 45 6789'),
)
# DDL-style schema string: one "name TYPE" entry per tuple position.
schema = (
    "eid INT,first_name STRING,last_name STRING,salary FLOAT,"
    "country STRING,phone_number STRING,ssn STRING"
)
df = spark.createDataFrame(data=data, schema=schema)
df.show()

                                                                                

+---+----------+---------+--------+-------+----------------+-----------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|
+---+----------+---------+--------+-------+----------------+-----------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|
+---+----------+---------+--------+-------+----------------+-----------+



# Date manipulation functions

- getting current date & time : <i style="color:blue"> current_date, current_timestamp </i>
- date & time arithmetic : <i style="color:blue"> date_add, date_sub, datediff, months_between, add_months, next_day </i>
- date & time beginning : <i style="color:blue"> trunc, date_trunc </i>
- date & time formatting : <i style="color:blue"> to_date, to_timestamp, date_format </i>
- extract : <i style="color:blue"> year, quarter, month, hour, minute, second, weekofyear, dayofyear, dayofweek </i>
- unix time : <i style="color:blue"> unix_timestamp(), from_unixtime() </i>

###  get current date & timestamp

In [6]:
from pyspark.sql.functions import current_date, current_timestamp

# current_date(): today's date, added as a new column to every row.
df.withColumn('current_date', current_date()).show()

# current_timestamp(): date plus time of day with fractional seconds.
# The column is labelled 'current_timestamp' so the header describes the value
# it holds; truncate=False keeps the full timestamp visible.
df.withColumn('current_timestamp', current_timestamp()).show(truncate=False)

+---+----------+---------+--------+-------+----------------+-----------+------------+
|eid|first_name|last_name|  salary|country|    phone_number|        ssn|current_date|
+---+----------+---------+--------+-------+----------------+-----------+------------+
|  1|    kiran |   chinta|150000.0|    usa| +1 232 343 7889|000 00 0000|  2025-06-15|
|  2|    goats |    machi|100000.0|    ind|+91 111 222 7889|123 45 6789|  2025-06-15|
+---+----------+---------+--------+-------+----------------+-----------+------------+

+---+----------+---------+--------+-------+----------------+-----------+-------------------------+
|eid|first_name|last_name|salary  |country|phone_number    |ssn        |current_date             |
+---+----------+---------+--------+-------+----------------+-----------+-------------------------+
|1  |kiran     |chinta   |150000.0|usa    |+1 232 343 7889 |000 00 0000|2025-06-15 15:31:13.25003|
|2  |goats     | machi   |100000.0|ind    |+91 111 222 7889|123 45 6789|2025-06-15 15:3

### arithmetic operations on dates
- adding days to a date or timestamp : <i style="color:blue"> date_add </i>
- subtracting days from a date or timestamp : <i style="color:blue"> date_sub </i>
- getting the number of days between two dates or timestamps : <i style="color:blue"> datediff </i>
- getting the number of months between two dates or timestamps : <i style="color:blue"> months_between </i>
- adding months to a date or timestamp : <i style="color:blue"> add_months </i>
- getting the first date after a given date that falls on a given day of the week : <i style="color:blue"> next_day </i>

In [16]:
# Sample rows: a date string paired with a timestamp string (millisecond precision).
datetimes = [
    ("2023-08-23", "2023-08-23 10:00:00.123"),
    ("1993-01-23", "1993-01-23 21:30:50.143"),
    ("1995-08-11", "1995-08-11 08:12:34.003"),
]

# Both columns are declared as STRING in the schema.
d_df = spark.createDataFrame(datetimes, schema="date STRING, timestamp STRING")
d_df.show(truncate=False)

+----------+-----------------------+
|date      |timestamp              |
+----------+-----------------------+
|2023-08-23|2023-08-23 10:00:00.123|
|1993-01-23|1993-01-23 21:30:50.143|
|1995-08-11|1995-08-11 08:12:34.003|
+----------+-----------------------+



In [18]:
# date_add / date_sub: shift a value forwards or backwards by whole days
from pyspark.sql.functions import date_add, date_sub

(
    d_df
    .withColumn("add_date", date_add("date", 10))
    .withColumn("sub_date", date_sub("date", 2))
    # applied to the timestamp strings, the displayed result is a date only
    .withColumn("add_to_timestamp", date_add("timestamp", 5))
    .withColumn("substract_from_timestamp", date_sub("timestamp", 1))
    .show(truncate=False)
)

+----------+-----------------------+----------+----------+----------------+------------------------+
|date      |timestamp              |add_date  |sub_date  |add_to_timestamp|substract_from_timestamp|
+----------+-----------------------+----------+----------+----------------+------------------------+
|2023-08-23|2023-08-23 10:00:00.123|2023-09-02|2023-08-21|2023-08-28      |2023-08-22              |
|1993-01-23|1993-01-23 21:30:50.143|1993-02-02|1993-01-21|1993-01-28      |1993-01-22              |
|1995-08-11|1995-08-11 08:12:34.003|1995-08-21|1995-08-09|1995-08-16      |1995-08-10              |
+----------+-----------------------+----------+----------+----------------+------------------------+



In [21]:
# datediff(end, start): whole days from start to end
from pyspark.sql.functions import datediff

(
    d_df
    # stored date as "end", today as "start" -> negative counts for past dates
    .withColumn("date_diff", datediff("date", current_date()))
    # now as "end", stored timestamp as "start" -> positive counts
    .withColumn("date_diff_timestamp", datediff(current_timestamp(), "timestamp"))
    .show()
)

+----------+--------------------+---------+-------------------+
|      date|           timestamp|date_diff|date_diff_timestamp|
+----------+--------------------+---------+-------------------+
|2023-08-23|2023-08-23 10:00:...|     -662|                662|
|1993-01-23|1993-01-23 21:30:...|   -11831|              11831|
|1995-08-11|1995-08-11 08:12:...|   -10901|              10901|
+----------+--------------------+---------+-------------------+



In [30]:
# months_between(date1, date2): fractional number of months from date2 to date1
from pyspark.sql.functions import col, months_between

(
    d_df
    # stored date first, today second -> negative values for past dates
    .withColumn("months_bw_date", months_between("date", current_date()))
    # now first, stored timestamp second -> positive values
    .withColumn("months_bw_timestamp", months_between(current_timestamp(), "timestamp"))
    .show()
)

+----------+--------------------+--------------+-------------------+
|      date|           timestamp|months_bw_date|months_bw_timestamp|
+----------+--------------------+--------------+-------------------+
|2023-08-23|2023-08-23 10:00:...|  -21.74193548|        21.74982378|
|1993-01-23|1993-01-23 21:30:...| -388.74193548|       388.73434812|
|1995-08-11|1995-08-11 08:12:...| -358.12903226|       358.13932721|
+----------+--------------------+--------------+-------------------+



In [32]:
# add_months: move a value forward by a whole number of calendar months
from pyspark.sql.functions import add_months

(
    d_df
    .withColumn("add_months_date", add_months("date", 2))
    # applied to the timestamp strings, the displayed result is a date only
    .withColumn("add_months_timestamp", add_months("timestamp", 3))
    .show()
)

+----------+--------------------+---------------+--------------------+
|      date|           timestamp|add_months_date|add_months_timestamp|
+----------+--------------------+---------------+--------------------+
|2023-08-23|2023-08-23 10:00:...|     2023-10-23|          2023-11-23|
|1993-01-23|1993-01-23 21:30:...|     1993-03-23|          1993-04-23|
|1995-08-11|1995-08-11 08:12:...|     1995-10-11|          1995-11-11|
+----------+--------------------+---------------+--------------------+



### trunc operation on date & timestamp

In [37]:
# trunc: snap a value down to the first day of its month ('MM') or year ('yy');
# the displayed result is a date
from pyspark.sql.functions import trunc

(
    d_df
    .withColumn("mm_date_tunc", trunc("date", "MM"))
    .withColumn("mm_timestamp_tunc", trunc("timestamp", "MM"))
    .withColumn("yy_date_tunc", trunc("date", "yy"))
    .withColumn("yy_timestamp_tunc", trunc("timestamp", "yy"))
    .show()
)

+----------+--------------------+------------+-----------------+------------+-----------------+
|      date|           timestamp|mm_date_tunc|mm_timestamp_tunc|yy_date_tunc|yy_timestamp_tunc|
+----------+--------------------+------------+-----------------+------------+-----------------+
|2023-08-23|2023-08-23 10:00:...|  2023-08-01|       2023-08-01|  2023-01-01|       2023-01-01|
|1993-01-23|1993-01-23 21:30:...|  1993-01-01|       1993-01-01|  1993-01-01|       1993-01-01|
|1995-08-11|1995-08-11 08:12:...|  1995-08-01|       1995-08-01|  1995-01-01|       1995-01-01|
+----------+--------------------+------------+-----------------+------------+-----------------+



In [44]:
# date_trunc: timestamp at the start of the requested unit
# ('dd' = day, 'mm' = month, 'yy' = year); note the unit comes first
from pyspark.sql.functions import date_trunc

(
    d_df
    # driven by the date string column
    .withColumn("day_starting_using_date", date_trunc("dd", "date"))
    .withColumn("month_starting_using_date", date_trunc("mm", "date"))
    .withColumn("year_starting_using_date", date_trunc("yy", "date"))
    # driven by the timestamp string column
    .withColumn("day_starting_using_ts", date_trunc("dd", "timestamp"))
    .withColumn("month_starting_using_ts", date_trunc("mm", "timestamp"))
    .withColumn("year_starting_using_ts", date_trunc("yy", "timestamp"))
    .show()
)

+----------+--------------------+-----------------------+-------------------------+------------------------+---------------------+-----------------------+----------------------+
|      date|           timestamp|day_starting_using_date|month_starting_using_date|year_starting_using_date|day_starting_using_ts|month_starting_using_ts|year_starting_using_ts|
+----------+--------------------+-----------------------+-------------------------+------------------------+---------------------+-----------------------+----------------------+
|2023-08-23|2023-08-23 10:00:...|    2023-08-23 00:00:00|      2023-08-01 00:00:00|     2023-01-01 00:00:00|  2023-08-23 00:00:00|    2023-08-01 00:00:00|   2023-01-01 00:00:00|
|1993-01-23|1993-01-23 21:30:...|    1993-01-23 00:00:00|      1993-01-01 00:00:00|     1993-01-01 00:00:00|  1993-01-23 00:00:00|    1993-01-01 00:00:00|   1993-01-01 00:00:00|
|1995-08-11|1995-08-11 08:12:...|    1995-08-11 00:00:00|      1995-08-01 00:00:00|     1995-01-01 00:00:00|  

In [49]:
# date_trunc with hour / minute / second / week / quarter
# -> timestamp at the start of that hour, minute, second, week or quarter
(
    d_df
    .withColumn("hh_starting_using_ts", date_trunc("hour", "timestamp"))
    .withColumn("min_starting_using_ts", date_trunc("minute", "timestamp"))
    .withColumn("sec_starting_using_ts", date_trunc("second", "timestamp"))
    .withColumn("week_starting_using_ts", date_trunc("week", "timestamp"))
    .withColumn("quat_starting_using_ts", date_trunc("quarter", "timestamp"))
    .show()
)

+----------+--------------------+--------------------+---------------------+---------------------+----------------------+----------------------+
|      date|           timestamp|hh_starting_using_ts|min_starting_using_ts|sec_starting_using_ts|week_starting_using_ts|quat_starting_using_ts|
+----------+--------------------+--------------------+---------------------+---------------------+----------------------+----------------------+
|2023-08-23|2023-08-23 10:00:...| 2023-08-23 10:00:00|  2023-08-23 10:00:00|  2023-08-23 10:00:00|   2023-08-21 00:00:00|   2023-07-01 00:00:00|
|1993-01-23|1993-01-23 21:30:...| 1993-01-23 21:00:00|  1993-01-23 21:30:00|  1993-01-23 21:30:50|   1993-01-18 00:00:00|   1993-01-01 00:00:00|
|1995-08-11|1995-08-11 08:12:...| 1995-08-11 08:00:00|  1995-08-11 08:12:00|  1995-08-11 08:12:34|   1995-08-07 00:00:00|   1995-07-01 00:00:00|
+----------+--------------------+--------------------+---------------------+---------------------+----------------------+---------

### formatting
- to_date, to_timestamp : converts int or string columns to date,timestamp datatypes

In [59]:
# Raw inputs for the parsing demo: an integer shaped like yyyyMMdd and a
# timestamp string in a custom dd_MMM_yyyy layout.
datetimes = [
    (20231212, "28_Feb_2024 10:11:12.234"),
    (20121101, "18_Apr_2023 09:30:45.234"),
    (20100101, "30_Dec_2010 18:45:55.000"),
]

# Rebinds d_df: from this cell onwards it has the columns (date INT, time STRING).
d_df = spark.createDataFrame(datetimes, schema="date INT, time STRING")
d_df.show(truncate=False)

+--------+------------------------+
|date    |time                    |
+--------+------------------------+
|20231212|28_Feb_2024 10:11:12.234|
|20121101|18_Apr_2023 09:30:45.234|
|20100101|30_Dec_2010 18:45:55.000|
+--------+------------------------+



In [61]:
from pyspark.sql.functions import to_date, to_timestamp

# Parse the raw columns with explicit patterns:
#   date -> 'yyyyMMdd'                  (e.g. 20231212)
#   time -> 'dd_MMM_yyyy HH:mm:ss.SSS'  (e.g. 28_Feb_2024 10:11:12.234)
(
    d_df
    .withColumn("date_column", to_date("date", "yyyyMMdd"))
    .withColumn("timestamp_column", to_timestamp("time", "dd_MMM_yyyy HH:mm:ss.SSS"))
    .show(truncate=False)
)

+--------+------------------------+-----------+-----------------------+
|date    |time                    |date_column|timestamp_column       |
+--------+------------------------+-----------+-----------------------+
|20231212|28_Feb_2024 10:11:12.234|2023-12-12 |2024-02-28 10:11:12.234|
|20121101|18_Apr_2023 09:30:45.234|2012-11-01 |2023-04-18 09:30:45.234|
|20100101|30_Dec_2010 18:45:55.000|2010-01-01 |2010-12-30 18:45:55    |
+--------+------------------------+-----------+-----------------------+



In [65]:
# date_format --> render a date/timestamp as text in the given pattern
from pyspark.sql.functions import date_format

# Each column is named after the field its pattern starts with.
# Years are written with the full four-letter 'yyyy' pattern throughout
# (the month/day columns previously used a three-letter 'yyy').
(
    d_df
    .withColumn('year', date_format(current_date(), 'yyyy-MM-dd'))
    .withColumn('month', date_format(current_date(), 'MM-dd-yyyy'))
    .withColumn('day', date_format(current_date(), 'dd-MM-yyyy'))
    .withColumn('hour', date_format(current_timestamp(), 'HH:mm:ss'))
    .withColumn('minute', date_format(current_timestamp(), 'mm:ss-HH'))
    .withColumn('sec', date_format(current_timestamp(), 'ss-HH:mm'))
    .show(truncate=False)
)

+--------+------------------------+----------+----------+----------+--------+--------+--------+
|date    |time                    |year      |month     |day       |hour    |minute  |sec     |
+--------+------------------------+----------+----------+----------+--------+--------+--------+
|20231212|28_Feb_2024 10:11:12.234|2025-06-15|06-15-2025|15-06-2025|17:00:59|00:59-17|59-17:00|
|20121101|18_Apr_2023 09:30:45.234|2025-06-15|06-15-2025|15-06-2025|17:00:59|00:59-17|59-17:00|
|20100101|30_Dec_2010 18:45:55.000|2025-06-15|06-15-2025|15-06-2025|17:00:59|00:59-17|59-17:00|
+--------+------------------------+----------+----------+----------+--------+--------+--------+



# extract the info

In [70]:
from pyspark.sql.functions import (
    dayofweek, dayofyear, hour, minute, month, quarter, second, weekofyear, year,
)

# Pull individual calendar and clock fields out of the current timestamp;
# the first added column shows the timestamp the other fields are read from.
(
    d_df
    .withColumn("current_timestamp", current_timestamp())
    .withColumn("current_year", year(current_timestamp()))
    .withColumn("current_qurter", quarter(current_timestamp()))
    .withColumn("current_month", month(current_timestamp()))
    .withColumn("current_hour", hour(current_timestamp()))
    .withColumn("current_minute", minute(current_timestamp()))
    .withColumn("current_second", second(current_timestamp()))
    .withColumn("weekofyear", weekofyear(current_timestamp()))
    .withColumn("dayofyear", dayofyear(current_timestamp()))
    .withColumn("dayofweek", dayofweek(current_timestamp()))
    .show(truncate=False)
)

+--------+------------------------+--------------------------+------------+--------------+-------------+------------+--------------+--------------+----------+---------+---------+
|date    |time                    |current_timestamp         |current_year|current_qurter|current_month|current_hour|current_minute|current_second|weekofyear|dayofyear|dayofweek|
+--------+------------------------+--------------------------+------------+--------------+-------------+------------+--------------+--------------+----------+---------+---------+
|20231212|28_Feb_2024 10:11:12.234|2025-06-15 17:07:38.592857|2025        |2             |6            |17          |7             |38            |24        |166      |1        |
|20121101|18_Apr_2023 09:30:45.234|2025-06-15 17:07:38.592857|2025        |2             |6            |17          |7             |38            |24        |166      |1        |
|20100101|30_Dec_2010 18:45:55.000|2025-06-15 17:07:38.592857|2025        |2             |6            |1

# timestamp - epoch/unix
- an integer counting the seconds elapsed since January 1st, 1970, midnight UTC
- it is also known as epoch time; it increases by 1 every second
- to convert a regular timestamp to epoch seconds - unix_timestamp()
- to convert epoch seconds back to a regular timestamp - from_unixtime()

In [77]:
from pyspark.sql.functions import from_unixtime, lit, unix_timestamp

(
    d_df
    # current timestamp -> epoch seconds
    .withColumn("epochs_col", unix_timestamp(current_timestamp()))
    # fixed epoch value (wrapped in lit) -> readable date-time
    .withColumn("norma_col", from_unixtime(lit(1750026225)))
    .show()
)

+--------+--------------------+----------+-------------------+
|    date|                time|epochs_col|          norma_col|
+--------+--------------------+----------+-------------------+
|20231212|28_Feb_2024 10:11...|1750026441|2025-06-15 17:23:45|
|20121101|18_Apr_2023 09:30...|1750026441|2025-06-15 17:23:45|
|20100101|30_Dec_2010 18:45...|1750026441|2025-06-15 17:23:45|
+--------+--------------------+----------+-------------------+



In [78]:
# Final cell: stop the Spark session created at the top of the notebook.
spark.stop()