In [1]:
from pyspark.sql import SparkSession
from delta import *


# Assemble the session builder: register the Delta Lake SQL extension and catalog,
# give the driver 4g of memory, and set the auto-broadcast-join threshold to -1.
builder = (
    SparkSession.builder.appName("transform")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
)

# Let the Delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/08 20:37:02 WARN Utils: Your hostname, javier-ubuntu, resolves to a loopback address: 127.0.1.1; using 172.17.0.1 instead (on interface docker0)
25/08/08 20:37:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/javier/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/javier/.ivy2.5.2/cache
The jars for the packages stored in: /home/javier/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7ca1a77e-b4a0-479b-a226-e45b01c7b883;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 154ms :: artifacts dl 7ms
	:: modules in use:
	i

## DataFrame.transform()

`pyspark.sql.DataFrame.transform()` is used to chain custom transformations: it calls the given function with the DataFrame and returns the new DataFrame that the function produces.

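`transform()` returns whatever DataFrame the custom function returns. The transformations in this notebook only add or modify columns, so the output has the same number of rows as the input; a function that filters or aggregates would change the row count.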

`DataFrame.transform(func: Callable[[…], DataFrame], *args: Any, **kwargs: Any) → pyspark.sql.dataframe.DataFrame`  
    * `func` – Custom function to call.  
    * `*args` – Positional arguments to pass to `func`.  
    * `**kwargs` – Keyword arguments to pass to `func`.  


In [2]:
# Obtain the SparkSession. getOrCreate() returns the already-running session
# when one exists (see the warning in this cell's output).
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: (course name, fee, discount)
simpleData = (
    ("Java", 4000, 5),
    ("Python", 4600, 10),
    ("Scala", 4100, 15),
    ("Scala", 4500, 15),
    ("PHP", 3000, 20),
)
columns = ["CourseName", "fee", "discount"]

# Build the DataFrame and print it without truncating cell values
df = spark.createDataFrame(data=simpleData, schema=columns)
df.show(truncate=False)

25/08/08 20:37:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+----------+----+--------+
|CourseName|fee |discount|
+----------+----+--------+
|Java      |4000|5       |
|Python    |4600|10      |
|Scala     |4100|15      |
|Scala     |4500|15      |
|PHP       |3000|20      |
+----------+----+--------+



### Create Custom functions

`to_upper_str_columns()` – This function converts the CourseName column to upper case and updates the same column.

In [3]:
# Custom transformation 1
from pyspark.sql.functions import upper

def to_upper_str_columns(df):
    """Return `df` with its CourseName column replaced by an upper-cased copy."""
    course_name_upper = upper(df.CourseName)
    return df.withColumn("CourseName", course_name_upper)

`reduce_price()` – This function takes an amount (`reduceBy`) and returns a transformation that subtracts that amount from the `fee` column, storing the result in a new column named `new_fee`.

In [4]:
# Custom transformation 2
def reduce_price(reduceBy):
    """Build a transformation that subtracts `reduceBy` from the fee.

    Args:
        reduceBy: Amount to subtract from each row's `fee`.

    Returns:
        A one-argument function that takes a DataFrame and returns it with an
        added `new_fee` column equal to `fee - reduceBy`.
    """
    def _subtract_from_fee(df):
        reduced_fee = df.fee - reduceBy
        return df.withColumn("new_fee", reduced_fee)

    return _subtract_from_fee

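`apply_discount()` – This creates a new column, `discounted_fee`, by reducing `new_fee` by the percentage in the `discount` column. It expects the `new_fee` column to exist already, so it must run after `reduce_price()`.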

In [5]:
# Custom transformation 3
def apply_discount(df):
    """Return `df` with a `discounted_fee` column.

    The new column is `new_fee` minus `discount` percent of `new_fee`, so the
    input must already contain both the `new_fee` and `discount` columns.
    """
    discount_amount = (df.new_fee * df.discount) / 100
    return df.withColumn("discounted_fee", df.new_fee - discount_amount)

#### Apply DataFrame.transform()

In [6]:
# Chain the three custom steps; each transform() call feeds its result to the next one.
df2 = (
    df.transform(to_upper_str_columns)
    .transform(reduce_price(1000))
    .transform(apply_discount)
)

df2.show(truncate=False)

                                                                                

+----------+----+--------+-------+--------------+
|CourseName|fee |discount|new_fee|discounted_fee|
+----------+----+--------+-------+--------------+
|JAVA      |4000|5       |3000   |2850.0        |
|PYTHON    |4600|10      |3600   |3240.0        |
|SCALA     |4100|15      |3100   |2635.0        |
|SCALA     |4500|15      |3500   |2975.0        |
|PHP       |3000|20      |2000   |1600.0        |
+----------+----+--------+-------+--------------+



### Example 2

In [7]:
# Create a DataFrame with a comma-separated name and two array-of-string columns
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"]),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"]),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"]),
]
df = spark.createDataFrame(data=data, schema=["Name", "Languages1", "Languages2"])
df.show()



+----------------+------------------+---------------+
|            Name|        Languages1|     Languages2|
+----------------+------------------+---------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|
+----------------+------------------+---------------+



### Syntax
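`pyspark.sql.functions.transform(col, f)`  
Unlike `DataFrame.transform()`, which operates on a whole DataFrame, this function applies a function to every element of an array column and returns the resulting array.  
The following are the parameters:

    * col – ArrayType column (a `Column` or a column name).
    * f – Required. Function to apply to each element of the array; it receives the element as a `Column` and returns a `Column`.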

In [8]:
# using transform() function
from pyspark.sql.functions import upper
from pyspark.sql.functions import transform

# Upper-case every element of the Languages1 array and expose the result as "languages1".
df.select(
    "Name",
    transform("Languages1", lambda language: upper(language)).alias("languages1"),
).show()

+----------------+------------------+
|            Name|        languages1|
+----------------+------------------+
|    James,,Smith|[JAVA, SCALA, C++]|
|   Michael,Rose,|[SPARK, JAVA, C++]|
|Robert,,Williams|      [CSHARP, VB]|
+----------------+------------------+



### Example 3

In [9]:

# Each row: an id, an IP address string, and an array of integer readings (the `temp` column)
data = [
    (10, "68.28.91.22", [35, 35, 35, 36, 35, 35, 32, 35, 30, 35, 32, 35]),
    (13, "67.185.72.1", [45, 45, 45, 46, 45, 45, 42, 35, 40, 45, 42, 45]),
    (8, "208.109.163.218", [40, 40, 40, 40, 40, 43, 42, 40, 40, 45, 42, 45]),
]
df = spark.createDataFrame(data=data, schema=["id", "ip", "temp"])
df.show(truncate=False)

+---+---------------+------------------------------------------------+
|id |ip             |temp                                            |
+---+---------------+------------------------------------------------+
|10 |68.28.91.22    |[35, 35, 35, 36, 35, 35, 32, 35, 30, 35, 32, 35]|
|13 |67.185.72.1    |[45, 45, 45, 46, 45, 45, 42, 35, 40, 45, 42, 45]|
|8  |208.109.163.218|[40, 40, 40, 40, 40, 43, 42, 40, 40, 45, 42, 45]|
+---+---------------+------------------------------------------------+



In [10]:
# pyspark: apply ((x * 9) / 5) + 32 to every element of the temp array
df.select(
    "id",
    "ip",
    transform("temp", lambda reading: ((reading * 9) / 5) + 32).alias("temp_in F"),
).show(truncate=False)

+---+---------------+------------------------------------------------------------------------------------+
|id |ip             |temp_in F                                                                           |
+---+---------------+------------------------------------------------------------------------------------+
|10 |68.28.91.22    |[95.0, 95.0, 95.0, 96.8, 95.0, 95.0, 89.6, 95.0, 86.0, 95.0, 89.6, 95.0]            |
|13 |67.185.72.1    |[113.0, 113.0, 113.0, 114.8, 113.0, 113.0, 107.6, 95.0, 104.0, 113.0, 107.6, 113.0] |
|8  |208.109.163.218|[104.0, 104.0, 104.0, 104.0, 104.0, 109.4, 107.6, 104.0, 104.0, 113.0, 107.6, 113.0]|
+---+---------------+------------------------------------------------------------------------------------+



In [11]:
# sql: register the DataFrame as a temp view, then run the same per-element
# conversion with the SQL transform() higher-order function
df.createOrReplaceTempView("df_view")
spark.sql(
    """select id, ip, 
             transform (temp, t -> ((t * 9) / 5) + 32 ) as fahrenheit_temp
             from df_view"""
).show(truncate=False)

+---+---------------+------------------------------------------------------------------------------------+
|id |ip             |fahrenheit_temp                                                                     |
+---+---------------+------------------------------------------------------------------------------------+
|10 |68.28.91.22    |[95.0, 95.0, 95.0, 96.8, 95.0, 95.0, 89.6, 95.0, 86.0, 95.0, 89.6, 95.0]            |
|13 |67.185.72.1    |[113.0, 113.0, 113.0, 114.8, 113.0, 113.0, 107.6, 95.0, 104.0, 113.0, 107.6, 113.0] |
|8  |208.109.163.218|[104.0, 104.0, 104.0, 104.0, 104.0, 109.4, 107.6, 104.0, 104.0, 113.0, 107.6, 113.0]|
+---+---------------+------------------------------------------------------------------------------------+



In [12]:
# sql: same conversion, but using `div` (integral division) instead of `/`,
# so each converted value is a whole number
df.createOrReplaceTempView("df_view")
spark.sql(
    """select id, ip, 
             transform (temp, t -> ((t * 9) div 5) + 32 ) as fahrenheit_temp
             from df_view"""
).show(truncate=False)

+---+---------------+------------------------------------------------------------+
|id |ip             |fahrenheit_temp                                             |
+---+---------------+------------------------------------------------------------+
|10 |68.28.91.22    |[95, 95, 95, 96, 95, 95, 89, 95, 86, 95, 89, 95]            |
|13 |67.185.72.1    |[113, 113, 113, 114, 113, 113, 107, 95, 104, 113, 107, 113] |
|8  |208.109.163.218|[104, 104, 104, 104, 104, 109, 107, 104, 104, 113, 107, 113]|
+---+---------------+------------------------------------------------------------+

