In [1]:
# Importing the necessary libraries
import findspark
import pyspark

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import to_date, col
from pyspark.sql.types import StructType, StructField, DateType, StringType

In [2]:
# Let findspark locate the local Spark installation and add pyspark to sys.path.
# NOTE(review): findspark's documented usage calls init() *before* `import pyspark`,
# but pyspark is already imported in the first cell — confirm the intended order.
findspark.init()

In [3]:
# Reuse the active SparkSession if one exists; otherwise create one with the
# builder's default settings. `spark` is the entry point used by the cells below.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Defining a transformation function
def to_date_df(df, fmt, fld):
    """Return ``df`` with column ``fld`` replaced by its value parsed as a date.

    Args:
        df: The DataFrame holding the column to convert.
        fmt: Datetime pattern passed to ``to_date`` (e.g. ``'M/d/y'``).
        fld: Name of the column to parse; the result is written back under
            the same name.

    Returns:
        The DataFrame produced by ``df.withColumn(fld, ...)``.
    """
    parsed_dates = to_date(col(fld), fmt)
    return df.withColumn(fld, parsed_dates)

In [5]:
# Creating a small DataFrame to test the transformation.
# Both columns are declared as strings so that EventDate arrives unparsed.
my_schema = StructType([
    StructField(name='ID', dataType=StringType()),
    StructField(name='EventDate', dataType=StringType()),
])

# The same calendar date written with and without zero padding on month/day.
my_rows = [
    Row(event_id, event_date)
    for event_id, event_date in (
        ('123', '04/05/2020'),
        ('124', '4/5/2020'),
        ('125', '04/5/2020'),
        ('126', '4/05/2020'),
    )
]

# Distribute the rows over 2 partitions, then attach the schema.
my_rdd = spark.sparkContext.parallelize(my_rows, 2)
my_df = spark.createDataFrame(my_rdd, my_schema)

In [6]:
# Inspect the input before the transformation: EventDate is still a string column.
my_df.printSchema()
my_df.show()

root
 |-- ID: string (nullable = true)
 |-- EventDate: string (nullable = true)

+---+----------+
| ID| EventDate|
+---+----------+
|123|04/05/2020|
|124|  4/5/2020|
|125| 04/5/2020|
|126| 4/05/2020|
+---+----------+



In [7]:
# Parse EventDate with the pattern 'M/d/y'. As the output below shows, this
# pattern accepts both zero-padded and unpadded month/day values, and the
# column type becomes `date`.
new_df = to_date_df(my_df, 'M/d/y', 'EventDate')
new_df.printSchema()
new_df.show()

root
 |-- ID: string (nullable = true)
 |-- EventDate: date (nullable = true)

+---+----------+
| ID| EventDate|
+---+----------+
|123|2020-04-05|
|124|2020-04-05|
|125|2020-04-05|
|126|2020-04-05|
+---+----------+



In [8]:
# Shut down the SparkSession now that the notebook is finished with it.
spark.stop()