In [1]:
# Bootstrap step: findspark.init() is called before any pyspark import below.
# NOTE(review): per the findspark docs this locates the local Spark install and
# adds pyspark to sys.path — keep this cell first so later imports resolve.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
# Create the session for this notebook (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName('test1')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [4]:
# Sample rows of (sequence number, full name); both fields are plain strings.
columns = ['Seqno', 'Name']
data = [
    ('1', 'john jones'),
    ('2', 'tracey smith'),
    ('3', 'amy sanders'),
]
df = spark.createDataFrame(data, schema=columns)
# truncate=False prints every cell at full width.
df.show(truncate=False)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+



In [14]:
# Apply a built-in function using withColumn

# Import only the names this notebook uses. A star import of
# pyspark.sql.functions silently shadows Python builtins such as
# sum, min, max, abs and round for every cell that runs afterwards.
from pyspark.sql.functions import col, udf, upper
df.withColumn('Upper_Name', upper(df.Name)).show()

+-----+------------+------------+
|Seqno|        Name|  Upper_Name|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [6]:
# Apply a built-in function inside select(); the derived column is left
# unaliased, so Spark names it after the expression.

df.select(
    'Seqno',
    'Name',
    upper(df['Name']),
).show()

+-----+------------+------------+
|Seqno|        Name| upper(Name)|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [7]:
# Apply a built-in function through Spark SQL.
# The DataFrame is exposed as temp view TAB so the query can refer to it by name.
df.createOrReplaceTempView('TAB')
spark.sql(
    'select *, upper(Name) from TAB'
).show()

+-----+------------+------------+
|Seqno|        Name| upper(Name)|
+-----+------------+------------+
|    1|  john jones|  JOHN JONES|
|    2|tracey smith|TRACEY SMITH|
|    3| amy sanders| AMY SANDERS|
+-----+------------+------------+



In [8]:
# Create custom function 

def upperCase(str):
    """Return ``str`` converted to upper case.

    Args:
        str: The text to convert, or ``None``.

    Returns:
        The upper-cased text, or ``None`` when the input is ``None``.

    A ``None`` input is passed through instead of raising ``AttributeError``,
    so the function can be applied to values that may be missing (for
    example nullable columns when it is wrapped as a UDF).
    """
    # NOTE: the parameter name shadows the built-in ``str``; it is kept
    # unchanged so existing keyword callers continue to work.
    if str is None:
        return None
    return str.upper()

In [16]:
# Convert function to udf

# Explicit import instead of a star import: StringType is the only type used.
from pyspark.sql.types import StringType
# Hand the function to udf() directly; the extra lambda added nothing and hid
# the function's name from Spark (unaliased result columns showed "<lambda>").
upperCaseUDF = udf(upperCase, StringType())

In [17]:
# Use the custom UDF with withColumn(): adds a new column computed from Name.
df.withColumn(
    'Cureated Name',
    upperCaseUDF(col('Name')),
).show()

+-----+------------+-------------+
|Seqno|        Name|Cureated Name|
+-----+------------+-------------+
|    1|  john jones|   JOHN JONES|
|    2|tracey smith| TRACEY SMITH|
|    3| amy sanders|  AMY SANDERS|
+-----+------------+-------------+



In [20]:
# Use the custom UDF inside select(); alias() names the UDF result "Name".
df.select(
    col('Seqno'),
    upperCaseUDF(col('Name')).alias('Name'),
).show()

+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  JOHN JONES|
|    2|TRACEY SMITH|
|    3| AMY SANDERS|
+-----+------------+



In [21]:
# Use the custom UDF from Spark SQL.
# The UDF has to be registered under a SQL-visible name before a query can call it.
spark.udf.register('upperCaseUDF', upperCaseUDF)
# (Re)create the temp view so this cell does not depend on the earlier SQL cell.
df.createOrReplaceTempView('TAB')
spark.sql(
    'select Seqno, Name, upperCaseUDF(Name) from TAB'
).show()

+-----+------------+------------------+
|Seqno|        Name|upperCaseUDF(Name)|
+-----+------------+------------------+
|    1|  john jones|        JOHN JONES|
|    2|tracey smith|      TRACEY SMITH|
|    3| amy sanders|       AMY SANDERS|
+-----+------------+------------------+



In [22]:
# Stop the SparkSession created at the top of the notebook; cells that use
# `spark` or `df` will not work after this until the session is re-created.
spark.stop()