In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
# Build (or reuse) one local Spark session. `getOrCreate()` makes this cell
# safe to re-run: constructing `SparkContext(...)` directly a second time in
# the same kernel raises "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master("local").appName("udf").getOrCreate()
# Keep `sc` available for code that wants the low-level context; it is the
# context owned by the session above rather than a separately created one.
sc = spark.sparkContext

In [3]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

# Explicit schema for the sample people data set: five columns, all nullable.
# Built incrementally; each `add` appends one field and returns the StructType.
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Salary", IntegerType(), True)
    .add("Gender", StringType(), True)
    .add("City", StringType(), True)
)
# In-memory sample records, one tuple per person, laid out in the same field
# order as `schema`: (Name, Age, Salary, Gender, City). There are 51 rows.
# Gender is encoded as 'M' / 'F'. Names are not unique (e.g. 'Jacob' and
# 'Matthew' each appear twice), so Name must not be treated as a key.
data = [
    ('Aljandro', 28, 13500000, 'M', 'Barranquilla'), ('Sara', 30, 4700000, 'F', 'Bogota'),('Mauricio', 35, 13000000, 'M', 'Bogota'), ('Santiago', 32, 21000000, 'M', 'Barranquilla'),
    ('Andres', 33, 10000000, 'M', 'Medellin'),  ('Jacob', 27, 7000000, 'M', 'Barranquilla'), ('Jairo', 65, 2000000, 'M', 'Bogota'), ('Carmen', 65, 1500000, 'F', 'Cali'),
    ('Sonia', 52, 4500000, 'F', 'Cali'), ('Emily', 28, 6000000, 'F', 'Barranquilla'), ('David', 35, 4000000, 'M', 'Barranquilla'), ('Sophia', 45, 7500000, 'F', 'Medellin'),
    ('Daniel', 31, 3000000, 'M', 'Medellin'), ('Olivia', 26, 5500000, 'F', 'Barranquilla'), ('Matthew', 50, 8500000, 'M', 'Bogota'), ('Emma', 33, 7000000, 'F', 'Barranquilla'),
    ('Andrew', 29, 4500000, 'M', 'Bogota'),  ('Isabella', 39, 9000000, 'F', 'Cali'), ('John', 55, 5000000, 'M', 'Barranquilla'),('Ava', 30, 6500000, 'F', 'Barranquilla'),
    ('James', 42, 8000000, 'M', 'Medellin'), ('Mia', 32, 5500000, 'F', 'Bogota'), ('Alexander', 38, 7500000, 'M', 'Cali'), ('Abigail', 27, 6000000, 'F', 'Cali'),
    ('William', 48, 8500000, 'M', 'Bogota'),  ('Charlotte', 25, 7000000, 'F', 'Cali'), ('Michael', 41, 9500000, 'M', 'Barranquilla'), ('Harper', 29, 6500000, 'F', 'Barranquilla'),
    ('Ethan', 36, 8000000, 'M', 'Cali'),  ('Evelyn', 34, 5500000, 'F', 'Barranquilla'), ('Ryan', 51, 9000000, 'M', 'Cali'),  ('Amelia', 31, 7000000, 'F', 'Medellin'),
    ('Matthew', 37, 7500000, 'M', 'Medellin'), ('Elizabeth', 23, 6000000, 'F', 'Bogota'), ('Jacob', 49, 9500000, 'M', 'Medellin'), ('Samantha', 28, 6500000, 'F', 'Cali'),
    ('Benjamin', 43, 8000000, 'M', 'Medellin'), ('Lily', 35, 5500000, 'F', 'Barranquilla'), ('Henry', 52, 9000000, 'M', 'Medellin'),('Chloe', 33, 7000000, 'F', 'Bogota'),
    ('Lucas', 40, 7500000, 'M', 'Bogota'), ('Grace', 24, 6000000, 'F', 'Medellin'), ('Gabriel', 47, 9500000, 'M', 'Cali'), ('Madison', 30, 6500000, 'F', 'Barranquilla'),
    ('Carter', 39, 8000000, 'M', 'Bogota'), ('Avery', 26, 5500000, 'F', 'Barranquilla'), ('Owen', 53, 9000000, 'M', 'Medellin'),('Victoria', 32, 7000000, 'F', 'Barranquilla'),
    ('Jackson', 44, 8500000, 'M', 'Barranquilla'), ('Penelope', 29, 6500000, 'F', 'Medellin'), ('Sebastian', 45, 9000000, 'M', 'Barranquilla'),
]
# Materialise the sample tuples as a DataFrame using the explicit schema
# (records first, schema second), then preview the first three rows.
df_1 = spark.createDataFrame(data, schema)
df_1.show(n=3)

+--------+---+--------+------+------------+
|    Name|Age|  Salary|Gender|        City|
+--------+---+--------+------+------------+
|Aljandro| 28|13500000|     M|Barranquilla|
|    Sara| 30| 4700000|     F|      Bogota|
|Mauricio| 35|13000000|     M|      Bogota|
+--------+---+--------+------+------------+
only showing top 3 rows



In [6]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

def add_prefix_city(city):
    """Return the city name prefixed with ``"City of "``.

    Args:
        city: City name as a string, or ``None`` for a null cell.

    Returns:
        ``"City of <city>"``, or ``None`` when ``city`` is ``None``.
    """
    # The "City" column is declared nullable, and Spark hands nulls to a
    # Python UDF as None. Concatenating str + None raises TypeError and would
    # fail the whole task, so propagate the null instead.
    if city is None:
        return None
    return "City of " + city
# Wrap the plain Python helper as a Spark UDF that produces string values.
add_prefix_city_udf = udf(add_prefix_city, returnType=StringType())
# Append the derived column computed from "City"; df_1 itself is left as is.
df_city_prefix = df_1.withColumn(
    "City With Prefix",
    add_prefix_city_udf(col("City")),
)
df_city_prefix.show(n=3)
    

+--------+---+--------+------+------------+--------------------+
|    Name|Age|  Salary|Gender|        City|    City With Prefix|
+--------+---+--------+------+------------+--------------------+
|Aljandro| 28|13500000|     M|Barranquilla|City of Barranquilla|
|    Sara| 30| 4700000|     F|      Bogota|      City of Bogota|
|Mauricio| 35|13000000|     M|      Bogota|      City of Bogota|
+--------+---+--------+------+------------+--------------------+
only showing top 3 rows



In [8]:
def salutation(gender,name):
    """Build a formal form of address from a gender code and a name.

    Args:
        gender: Gender code; ``"M"`` and ``"F"`` are recognised.
        name: Person's name, or ``None`` for a null cell.

    Returns:
        ``"Mr. <name>"`` for ``"M"``, ``"Mrs. <name>"`` for ``"F"``, and
        ``None`` when the gender code is not recognised (including ``None``)
        or when ``name`` is ``None``.
    """
    # Both source columns are nullable. Without this guard a null name would
    # be formatted into the literal text "Mr. None" / "Mrs. None".
    if name is None:
        return None
    titles = {"M": "Mr.", "F": "Mrs."}
    title = titles.get(gender)
    # Unknown or missing gender code: return null explicitly rather than
    # falling off the end of the function.
    if title is None:
        return None
    return f"{title} {name}"
# Expose the two-argument Python helper to Spark as a string-returning UDF.
udf_salutation = udf(salutation, returnType=StringType())
# The UDF receives the Gender and Name cells of each row, in that order.
df_final = df_city_prefix.withColumn("Formal Name", udf_salutation(col("Gender"), col("Name")))
# 20 rows is show()'s default, spelled out here for the reader.
df_final.show(n=20)

+--------+---+--------+------+------------+--------------------+-------------+
|    Name|Age|  Salary|Gender|        City|    City With Prefix|  Formal Name|
+--------+---+--------+------+------------+--------------------+-------------+
|Aljandro| 28|13500000|     M|Barranquilla|City of Barranquilla| Mr. Aljandro|
|    Sara| 30| 4700000|     F|      Bogota|      City of Bogota|    Mrs. Sara|
|Mauricio| 35|13000000|     M|      Bogota|      City of Bogota| Mr. Mauricio|
|Santiago| 32|21000000|     M|Barranquilla|City of Barranquilla| Mr. Santiago|
|  Andres| 33|10000000|     M|    Medellin|    City of Medellin|   Mr. Andres|
|   Jacob| 27| 7000000|     M|Barranquilla|City of Barranquilla|    Mr. Jacob|
|   Jairo| 65| 2000000|     M|      Bogota|      City of Bogota|    Mr. Jairo|
|  Carmen| 65| 1500000|     F|        Cali|        City of Cali|  Mrs. Carmen|
|   Sonia| 52| 4500000|     F|        Cali|        City of Cali|   Mrs. Sonia|
|   Emily| 28| 6000000|     F|Barranquilla|City of B