In [0]:
#Create Sample Dataframe
employee_data= [(10,"Michael Robinson","1999-06-01","100", 2000),
                (29,"James Wood","2003-03-01","209",8000),
                (39,"Chris Andrews","2005-94-91","100",6900),
                (48,"Mark Bond","2008-10-01","100", 7000),
                (50,"Steve Watson","1996-02-01","480", 1000),
                (60,"Mathews Simon","1998-11-01","500",5000),
                (78, "Peter Paul", "2011-04-01","600",5000)
               ]
employee_schema= ["employee_id","Name","doj", "employee_dept_id","salary"]
empDF = spark.createDataFrame(data=employee_data, schema= employee_schema)
display(empDF)

employee_id,Name,doj,employee_dept_id,salary
10,Michael Robinson,1999-06-01,100,2000
29,James Wood,2003-03-01,209,8000
39,Chris Andrews,2005-94-91,100,6900
48,Mark Bond,2008-10-01,100,7000
50,Steve Watson,1996-02-01,480,1000
60,Mathews Simon,1998-11-01,500,5000
78,Peter Paul,2011-04-01,600,5000


In [0]:
#First Method of Split
from pyspark.sql.functions import split

df1 = empDF.withColumn('First_Name', split(empDF['Name'],' ' ).getItem(0)) \
           .withColumn('Last_Name',split(empDF['Name'],' ').getItem(1))

display (df1)


employee_id,Name,doj,employee_dept_id,salary,First_Name,Last_Name
10,Michael Robinson,1999-06-01,100,2000,Michael,Robinson
29,James Wood,2003-03-01,209,8000,James,Wood
39,Chris Andrews,2005-94-91,100,6900,Chris,Andrews
48,Mark Bond,2008-10-01,100,7000,Mark,Bond
50,Steve Watson,1996-02-01,480,1000,Steve,Watson
60,Mathews Simon,1998-11-01,500,5000,Mathews,Simon
78,Peter Paul,2011-04-01,600,5000,Peter,Paul


In [0]:
#Second Method of Split
import pyspark

split_col= pyspark.sql.functions.split(empDF['Name'],' ')

df2 = empDF.withColumn('First_Name', split_col.getItem(0)) \
            .withColumn('Last_Name', split_col.getItem(1))

display (df2)

employee_id,Name,doj,employee_dept_id,salary,First_Name,Last_Name
10,Michael Robinson,1999-06-01,100,2000,Michael,Robinson
29,James Wood,2003-03-01,209,8000,James,Wood
39,Chris Andrews,2005-94-91,100,6900,Chris,Andrews
48,Mark Bond,2008-10-01,100,7000,Mark,Bond
50,Steve Watson,1996-02-01,480,1000,Steve,Watson
60,Mathews Simon,1998-11-01,500,5000,Mathews,Simon
78,Peter Paul,2011-04-01,600,5000,Peter,Paul


In [0]:
#Third Method of Split
split_col= pyspark.sql.functions.split(empDF['doj'], '-')

df3= empDF.select("employee_id","Name","employee_dept_id","salary",
split_col.getItem(0).alias("joining_year'), split_col.getItem(1).alias('joining_month"), split_col.getItem(2).alias('joining_day'))

display (df3)

employee_id,Name,employee_dept_id,salary,"joining_year'), split_col.getItem(1).alias('joining_month",joining_day
10,Michael Robinson,100,2000,1999,1
29,James Wood,209,8000,2003,1
39,Chris Andrews,100,6900,2005,91
48,Mark Bond,100,7000,2008,1
50,Steve Watson,480,1000,1996,1
60,Mathews Simon,500,5000,1998,1
78,Peter Paul,600,5000,2011,1


In [0]:
#Combine multiple split
df4 = empDF.withColumn('First_Name',split(empDF['Name'], ' ').getItem(0)) \
        .withColumn('Last_Name',split(empDF['Name'],' ').getItem(1)) \
        .withColumn('Joining_Year',split (empDF ['doj'],'-').getItem(0)) \
        .withColumn('Joining_Month',split (empDF['doj'],'-').getItem(1)) \
        .withColumn('Joining_Day',split (empDF['doj'],'-').getItem(2))

display (df4)

employee_id,Name,doj,employee_dept_id,salary,First_Name,Last_Name,Joining_Year,Joining_Month,Joining_Day
10,Michael Robinson,1999-06-01,100,2000,Michael,Robinson,1999,6,1
29,James Wood,2003-03-01,209,8000,James,Wood,2003,3,1
39,Chris Andrews,2005-94-91,100,6900,Chris,Andrews,2005,94,91
48,Mark Bond,2008-10-01,100,7000,Mark,Bond,2008,10,1
50,Steve Watson,1996-02-01,480,1000,Steve,Watson,1996,2,1
60,Mathews Simon,1998-11-01,500,5000,Mathews,Simon,1998,11,1
78,Peter Paul,2011-04-01,600,5000,Peter,Paul,2011,4,1


In [0]:
#Split and Drop splitted columns

df5 = empDF.withColumn('First_Name', split (empDF['Name'], ' ').getItem(0)) \
        .withColumn('Last_Name', split (empDF ['Name'], ' ').getItem(1)) \
        .withColumn('Joining_Year', split (empDF['doj'], '-').getItem(0)) \
        .withColumn('Joining_Month', split (empDF ['doj'], '-').getItem(1)) \
        .withColumn('Joining_Day', split (empDF['doj'], '-').getItem(2)) \
        .drop (empDF ['Name']) \
        .drop (empDF ['doj'])

display (df5)

employee_id,employee_dept_id,salary,First_Name,Last_Name,Joining_Year,Joining_Month,Joining_Day
10,100,2000,Michael,Robinson,1999,6,1
29,209,8000,James,Wood,2003,3,1
39,100,6900,Chris,Andrews,2005,94,91
48,100,7000,Mark,Bond,2008,10,1
50,480,1000,Steve,Watson,1996,2,1
60,500,5000,Mathews,Simon,1998,11,1
78,600,5000,Peter,Paul,2011,4,1
