In [52]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.appName('Transforms').getOrCreate()

df = spark.read.format('csv').option('inferSchema',True).option('header',True).load('BigMart_Sales.csv')
df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

In [11]:
# SELECT
# df.select('Item_Identifier', 'Item_Weight', 'Item_Fat_Content').show() is also valid to select specific cols
# Best practice is to always use a col object as certain pyspark functions require a col object such as alias, aggregations, ...

df.select(col('Item_Identifier'), col('Item_Weight'), col('Item_Fat_Content')).show()

+---------------+-----------+----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|
+---------------+-----------+----------------+
|          FDA15|        9.3|         Low Fat|
|          DRC01|       5.92|         Regular|
|          FDN15|       17.5|         Low Fat|
|          FDX07|       19.2|         Regular|
|          NCD19|       8.93|         Low Fat|
|          FDP36|     10.395|         Regular|
|          FDO10|      13.65|         Regular|
|          FDP10|       NULL|         Low Fat|
|          FDH17|       16.2|         Regular|
|          FDU28|       19.2|         Regular|
|          FDY07|       11.8|         Low Fat|
|          FDA03|       18.5|         Regular|
|          FDX32|       15.1|         Regular|
|          FDS46|       17.6|         Regular|
|          FDF32|      16.35|         Low Fat|
|          FDP49|        9.0|         Regular|
|          NCB42|       11.8|         Low Fat|
|          FDP49|        9.0|         Regular|
|          DR

In [10]:
# ALIAS
df.select(col('Item_Identifier').alias('Item_Id')).show()

+-------+
|Item_Id|
+-------+
|  FDA15|
|  DRC01|
|  FDN15|
|  FDX07|
|  NCD19|
|  FDP36|
|  FDO10|
|  FDP10|
|  FDH17|
|  FDU28|
|  FDY07|
|  FDA03|
|  FDX32|
|  FDS46|
|  FDF32|
|  FDP49|
|  NCB42|
|  FDP49|
|  DRI11|
|  FDU02|
+-------+
only showing top 20 rows


In [13]:
# FILTER/WHERE
# Scenario 1
df.filter(col('Item_Fat_Content') == 'Regular').show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          FDX07|       19.2|         Regular|            0.0|Fruits and Vegeta...| 182.095|           OUT010|                     1998|       NULL|              Tier 3|    Gro

In [15]:
# Scenario 2
df.filter((col('Item_Type') == 'Soft Drinks') & (col('Item_Weight') < 10)).show()

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          DRC01|       5.92|         Regular|    0.019278216|Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          DRZ11|       8.85|         Regular|    0.113123893|Soft Drinks|122.5388|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|        1609.9044|
|          DRF4

In [16]:
# Scenario 3
df.filter((col('Outlet_Size').isNull()) & (col('Outlet_Location_Type').isin('Tier 1','Tier 2'))).show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDH17|       16.2|         Regular|    0.016687114|        Frozen Foods| 96.9726|           OUT045|                     2002|       NULL|              Tier 2|Supermarket Type1|        1076.5986|
|          FDU28|       19.2|         Regular|     0.09444959|        Frozen Foods|187.8214|           OUT017|                     2007|       NULL|              Tier 2|Superma

In [19]:
# withColumnRenamed
# Used to rename columns
# Only changes column name at the dataframe level
# Useful when performing joins
df.withColumnRenamed('Item_Weight','Item_Wt').show()

+---------------+-------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Wt|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|    9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|   5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         

In [19]:
# withColumn
# Used to create/modify columns

# lit() is a function used to create a Column of literal value.
from pyspark.sql.functions import lit, regexp_replace
from pyspark.sql.types import StringType

In [15]:
# Scenario 1 - Create new Column with a literal value
df = df.withColumn('flag',lit("new"))
df.select(col('flag')).show()

+----+
|flag|
+----+
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
| new|
+----+
only showing top 20 rows


In [16]:
# Scenario 2 - Create new Column based on 2 Columns
df = df.withColumn('multiply',col('Item_Weight')*col('Item_MRP'))
df.select(col('Item_Weight'), col('Item_MRP'), col('multiply')).show()

+-----------+--------+------------------+
|Item_Weight|Item_MRP|          multiply|
+-----------+--------+------------------+
|        9.3|249.8092|2323.2255600000003|
|       5.92| 48.2692|285.75366399999996|
|       17.5| 141.618|          2478.315|
|       19.2| 182.095|3496.2239999999997|
|       8.93| 53.8614|        480.982302|
|     10.395| 51.4008| 534.3113159999999|
|      13.65| 57.6588|         787.04262|
|       NULL|107.7622|              NULL|
|       16.2| 96.9726|1570.9561199999998|
|       19.2|187.8214|        3606.17088|
|       11.8| 45.5402|         537.37436|
|       18.5|144.1102|         2666.0387|
|       15.1|145.4786|2196.7268599999998|
|       17.6|119.6782|2106.3363200000003|
|      16.35|196.4426|        3211.83651|
|        9.0| 56.3614|507.25260000000003|
|       11.8|115.3492|        1361.12056|
|        9.0| 54.3614|489.25260000000003|
|       NULL|113.2834|              NULL|
|      13.35|230.5352|        3077.64492|
+-----------+--------+------------

In [20]:
# Scenario 3 - Modify Existing Column
df = df.withColumn('Item_Fat_Content',regexp_replace(col('Item_Fat_Content'),"Regular","Reg"))\
    .withColumn('Item_Fat_Content',regexp_replace(col('Item_Fat_Content'),"Low Fat","Lf"))
df.select(col('Item_Fat_Content')).show()

+----------------+
|Item_Fat_Content|
+----------------+
|              Lf|
|             Reg|
|              Lf|
|             Reg|
|              Lf|
|             Reg|
|             Reg|
|              Lf|
|             Reg|
|             Reg|
|              Lf|
|             Reg|
|             Reg|
|             Reg|
|              Lf|
|             Reg|
|              Lf|
|             Reg|
|              Lf|
|              Lf|
+----------------+
only showing top 20 rows


In [22]:
# Type Casting
# Modify Existing Column Type
df = df.withColumn('Item_Weight', col('Item_Weight').cast(StringType()))
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)
 |-- flag: string (nullable = false)
 |-- multiply: double (nullable = true)



In [25]:
# Sort/orderBy

# Scenario 1
df = df.sort(col('Item_Weight').desc())
df.select(col('Item_Weight')).show()

+-----------+
|Item_Weight|
+-----------+
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|      9.895|
|        9.8|
|        9.8|
|        9.8|
|        9.8|
|        9.8|
+-----------+
only showing top 20 rows


In [26]:
# Scenario 2
df = df.sort(col('Item_Visibility').asc())
df.select(col('Item_Visibility')).show()

+---------------+
|Item_Visibility|
+---------------+
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
|            0.0|
+---------------+
only showing top 20 rows


In [30]:
# Scenario 3
# ascending=[False, False] or ascending=[0, 0] is valid
df = df.sort(['Item_Weight','Item_Visibility'],ascending = [0,0])
df.select(col('Item_Weight'), col('Item_Visibility')).show()

+-----------+---------------+
|Item_Weight|Item_Visibility|
+-----------+---------------+
|      21.35|    0.130127365|
|      21.35|    0.115194717|
|      21.35|    0.078060605|
|      21.35|    0.069102831|
|      21.35|    0.068822477|
|      21.35|    0.068809463|
|      21.35|    0.068765205|
|      21.25|    0.156012631|
|      21.25|    0.155694794|
|      21.25|    0.155350299|
|      21.25|    0.155250377|
|      21.25|    0.114246019|
|      21.25|    0.114066204|
|      21.25|    0.113833823|
|      21.25|    0.024795057|
|      21.25|    0.024756031|
|      21.25|    0.024705597|
|      21.25|    0.024693927|
|      21.25|    0.024650932|
|      21.25|    0.019533098|
+-----------+---------------+
only showing top 20 rows


In [31]:
# Scenario 4
df = df.sort(['Item_weight','Item_Visibility'], ascending = [0,1])
df.select(col('Item_Weight'), col('Item_Visibility')).show()

+-----------+---------------+
|Item_Weight|Item_Visibility|
+-----------+---------------+
|      21.35|    0.068765205|
|      21.35|    0.068809463|
|      21.35|    0.068822477|
|      21.35|    0.069102831|
|      21.35|    0.078060605|
|      21.35|    0.115194717|
|      21.35|    0.130127365|
|      21.25|    0.009996872|
|      21.25|    0.009998763|
|      21.25|     0.01001904|
|      21.25|    0.010039493|
|      21.25|     0.01005532|
|      21.25|    0.016735879|
|      21.25|    0.019407069|
|      21.25|    0.019423232|
|      21.25|     0.01945343|
|      21.25|    0.019462623|
|      21.25|    0.019502354|
|      21.25|    0.019533098|
|      21.25|    0.024650932|
+-----------+---------------+
only showing top 20 rows


In [32]:
# LIMIT
df.limit(10).show() 

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDC02|      21.35|         Low Fat|    0.068765205|              Canned|260.4278|           OUT013|                     1987|       High|              Tier 3|Supermarket Type1|        3644.5892|
|          FDC02|      21.35|         Low Fat|    0.068809463|              Canned|258.5278|           OUT035|                     2004|      Small|              Tier 2|Superma

In [33]:
# DROP

# Scenario 1 - Drop Single Column
df.drop('Item_Visibility').show()

+---------------+-----------+----------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDC02|      21.35|         Low Fat|              Canned|260.4278|           OUT013|                     1987|       High|              Tier 3|Supermarket Type1|        3644.5892|
|          FDC02|      21.35|         Low Fat|              Canned|258.5278|           OUT035|                     2004|      Small|              Tier 2|Supermarket Type1|         5206.556|
|          FDC02|      21.35|         Low Fat|    

In [34]:
# Scenario 1 - Drop Multiple Columns
df.drop('Item_Visibility','Item_Type').show()

+---------------+-----------+----------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDC02|      21.35|         Low Fat|260.4278|           OUT013|                     1987|       High|              Tier 3|Supermarket Type1|        3644.5892|
|          FDC02|      21.35|         Low Fat|258.5278|           OUT035|                     2004|      Small|              Tier 2|Supermarket Type1|         5206.556|
|          FDC02|      21.35|         Low Fat|258.3278|           OUT046|                     1997|      Small|              Tier 1|Supermarket Type1|     

In [36]:
# Scenario 2 - Drop All Duplicates
df.dropDuplicates().show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDB57|      20.25|         Regular|    0.018801549|Fruits and Vegeta...|222.1772|           OUT035|                     2004|      Small|              Tier 2|Supermarket Type1|          5559.43|
|          FDI27|       8.71|         Regular|     0.04605781|               Dairy| 43.8744|           OUT049|                     1999|     Medium|              Tier 1|Superma

In [37]:
# Scenario 2 - Drop All Duplicates
df.distinct().show()

ConnectionRefusedError: [Errno 111] Connection refused

In [2]:
# Scenario 2 - Drop rows when a subset of columns have duplicates
df.drop_duplicates(subset=['Item_Type']).show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDP36|     10.395|         Regular|            0.0|        Baking Goods| 51.4008|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         556.6088|
|          FDO23|      17.85|         Low Fat|            0.0|              Breads| 93.1436|           OUT045|                     2002|       NULL|              Tier 2|Superma

In [5]:
# String Functions
# INITCAP() - Translate the first letter of each word to upper case in the sentence.

df.select(initcap('Item_Type')).show()

+--------------------+
|  initcap(Item_Type)|
+--------------------+
|               Dairy|
|         Soft Drinks|
|                Meat|
|Fruits And Vegeta...|
|           Household|
|        Baking Goods|
|         Snack Foods|
|         Snack Foods|
|        Frozen Foods|
|        Frozen Foods|
|Fruits And Vegeta...|
|               Dairy|
|Fruits And Vegeta...|
|         Snack Foods|
|Fruits And Vegeta...|
|           Breakfast|
|  Health And Hygiene|
|           Breakfast|
|         Hard Drinks|
|               Dairy|
+--------------------+
only showing top 20 rows


In [8]:
# UPPER() 
df.select(upper('Item_Type')).show()

+--------------------+
|    upper(Item_Type)|
+--------------------+
|               DAIRY|
|         SOFT DRINKS|
|                MEAT|
|FRUITS AND VEGETA...|
|           HOUSEHOLD|
|        BAKING GOODS|
|         SNACK FOODS|
|         SNACK FOODS|
|        FROZEN FOODS|
|        FROZEN FOODS|
|FRUITS AND VEGETA...|
|               DAIRY|
|FRUITS AND VEGETA...|
|         SNACK FOODS|
|FRUITS AND VEGETA...|
|           BREAKFAST|
|  HEALTH AND HYGIENE|
|           BREAKFAST|
|         HARD DRINKS|
|               DAIRY|
+--------------------+
only showing top 20 rows


In [9]:
df.select(upper('Item_Type').alias('upper_Item_Type')).show()

+--------------------+
|     upper_Item_Type|
+--------------------+
|               DAIRY|
|         SOFT DRINKS|
|                MEAT|
|FRUITS AND VEGETA...|
|           HOUSEHOLD|
|        BAKING GOODS|
|         SNACK FOODS|
|         SNACK FOODS|
|        FROZEN FOODS|
|        FROZEN FOODS|
|FRUITS AND VEGETA...|
|               DAIRY|
|FRUITS AND VEGETA...|
|         SNACK FOODS|
|FRUITS AND VEGETA...|
|           BREAKFAST|
|  HEALTH AND HYGIENE|
|           BREAKFAST|
|         HARD DRINKS|
|               DAIRY|
+--------------------+
only showing top 20 rows


In [10]:
# LOWER
df.select(lower('Item_Type').alias('upper_Item_Type')).show()

+--------------------+
|     upper_Item_Type|
+--------------------+
|               dairy|
|         soft drinks|
|                meat|
|fruits and vegeta...|
|           household|
|        baking goods|
|         snack foods|
|         snack foods|
|        frozen foods|
|        frozen foods|
|fruits and vegeta...|
|               dairy|
|fruits and vegeta...|
|         snack foods|
|fruits and vegeta...|
|           breakfast|
|  health and hygiene|
|           breakfast|
|         hard drinks|
|               dairy|
+--------------------+
only showing top 20 rows


In [40]:
# Date Functions
# CURRENT_DATE()

df = spark.read.format('csv').option('inferSchema',True).option('header',True).load('BigMart_Sales.csv')
df = df.withColumn('curr_date',current_date())

df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales| curr_date|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2025-08-19|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2

In [41]:
# DATE_ADD()

df = df.withColumn('week_after',date_add('curr_date',7))

df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales| curr_date|week_after|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2025-08-19|2025-08-26|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2

In [42]:
# DATE_SUB()

df = df.withColumn('week_before',date_sub('curr_date',7))
df.show()

# Equivalent with DATE_ADD()
# df.withColumn('week_before',date_add('curr_date',-7)).show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+-----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales| curr_date|week_after|week_before|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+-----------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2025-08-19|2025-08-26| 2025-08-12|
|          DRC01|       5.92|         Re

In [43]:
# DATEDIFF

df = df.withColumn('datediff',datediff('week_after','curr_date'))
df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+-----------+--------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales| curr_date|week_after|week_before|datediff|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+-----------+--------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2025-08-19|2025-08-26| 2025-08-12|       7|
|   

In [44]:
# DATE_FORMAT()

df = df.withColumn('week_before',date_format('week_before','dd-MM-yyyy'))
df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+-----------+--------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales| curr_date|week_after|week_before|datediff|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+-----------+--------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2025-08-19|2025-08-26| 12-08-2025|       7|
|   

In [50]:
# Null Handling

# Dropping Nulls
# df.count()
# df.dropna('all').count() # Drop rows when all columns are NULL 
# df.dropna('any').count() # Drop rows when any columns are NULL 
# df.dropna(subset=['Outlet_Size']).count() # Drop rows when specific columns are NULL 

In [51]:
# Fill Nulls
# df.show()
# df.fillna('NotAvailable').show() # Replace all NULLs with value
# df.fillna('NotAvailable',subset=['Outlet_Size']).show() # Replace all NULLs in specific columns with value

In [69]:
# Split and Indexing

# Split
df = spark.read.format('csv').option('inferSchema',True).option('header',True).load('BigMart_Sales.csv')
df.select(col('Outlet_Location_Type'), col('Outlet_Type')).limit(10).show()
df.withColumn('Outlet_Type',split('Outlet_Type',' ')).select(col('Outlet_Location_Type'), col('Outlet_Type')).limit(10).show()

+--------------------+-----------------+
|Outlet_Location_Type|      Outlet_Type|
+--------------------+-----------------+
|              Tier 1|Supermarket Type1|
|              Tier 3|Supermarket Type2|
|              Tier 1|Supermarket Type1|
|              Tier 3|    Grocery Store|
|              Tier 3|Supermarket Type1|
|              Tier 3|Supermarket Type2|
|              Tier 3|Supermarket Type1|
|              Tier 3|Supermarket Type3|
|              Tier 2|Supermarket Type1|
|              Tier 2|Supermarket Type1|
+--------------------+-----------------+

+--------------------+--------------------+
|Outlet_Location_Type|         Outlet_Type|
+--------------------+--------------------+
|              Tier 1|[Supermarket, Type1]|
|              Tier 3|[Supermarket, Type2]|
|              Tier 1|[Supermarket, Type1]|
|              Tier 3|    [Grocery, Store]|
|              Tier 3|[Supermarket, Type1]|
|              Tier 3|[Supermarket, Type2]|
|              Tier 3|[Superm

In [70]:
# Index
df = spark.read.format('csv').option('inferSchema',True).option('header',True).load('BigMart_Sales.csv')
df.select(col('Outlet_Location_Type'), col('Outlet_Type')).limit(10).show()
df.withColumn('Outlet_Type',split('Outlet_Type',' ')[1]).select(col('Outlet_Location_Type'), col('Outlet_Type')).limit(10).show()

+--------------------+-----------------+
|Outlet_Location_Type|      Outlet_Type|
+--------------------+-----------------+
|              Tier 1|Supermarket Type1|
|              Tier 3|Supermarket Type2|
|              Tier 1|Supermarket Type1|
|              Tier 3|    Grocery Store|
|              Tier 3|Supermarket Type1|
|              Tier 3|Supermarket Type2|
|              Tier 3|Supermarket Type1|
|              Tier 3|Supermarket Type3|
|              Tier 2|Supermarket Type1|
|              Tier 2|Supermarket Type1|
+--------------------+-----------------+

+--------------------+-----------+
|Outlet_Location_Type|Outlet_Type|
+--------------------+-----------+
|              Tier 1|      Type1|
|              Tier 3|      Type2|
|              Tier 1|      Type1|
|              Tier 3|      Store|
|              Tier 3|      Type1|
|              Tier 3|      Type2|
|              Tier 3|      Type1|
|              Tier 3|      Type3|
|              Tier 2|      Type1|
|    

In [75]:
# EXPLODE

df = spark.read.format('csv').option('inferSchema',True).option('header',True).load('BigMart_Sales.csv')
df.select(col('Outlet_Location_Type'), col('Outlet_Type')).limit(10).show()

df_exp = df.withColumn('Outlet_Type',split('Outlet_Type',' '))

df_exp.select(col('Outlet_Location_Type'), col('Outlet_Type')).limit(10).show()
df_exp.withColumn('Outlet_Type',explode('Outlet_Type')).select(col('Outlet_Location_Type'), col('Outlet_Type')).limit(10).show()

+--------------------+-----------------+
|Outlet_Location_Type|      Outlet_Type|
+--------------------+-----------------+
|              Tier 1|Supermarket Type1|
|              Tier 3|Supermarket Type2|
|              Tier 1|Supermarket Type1|
|              Tier 3|    Grocery Store|
|              Tier 3|Supermarket Type1|
|              Tier 3|Supermarket Type2|
|              Tier 3|Supermarket Type1|
|              Tier 3|Supermarket Type3|
|              Tier 2|Supermarket Type1|
|              Tier 2|Supermarket Type1|
+--------------------+-----------------+

+--------------------+--------------------+
|Outlet_Location_Type|         Outlet_Type|
+--------------------+--------------------+
|              Tier 1|[Supermarket, Type1]|
|              Tier 3|[Supermarket, Type2]|
|              Tier 1|[Supermarket, Type1]|
|              Tier 3|    [Grocery, Store]|
|              Tier 3|[Supermarket, Type1]|
|              Tier 3|[Supermarket, Type2]|
|              Tier 3|[Superm

In [74]:
# Array_Contains
df = spark.read.format('csv').option('inferSchema',True).option('header',True).load('BigMart_Sales.csv')
df_exp = df.withColumn('Outlet_Type',split('Outlet_Type',' '))
df_exp.withColumn('Type1_flag',array_contains('Outlet_Type','Type1')).select(col('Outlet_Location_Type'), col('Outlet_Type'), col('Type1_flag')).limit(10).show()

+--------------------+--------------------+----------+
|Outlet_Location_Type|         Outlet_Type|Type1_flag|
+--------------------+--------------------+----------+
|              Tier 1|[Supermarket, Type1]|      true|
|              Tier 3|[Supermarket, Type2]|     false|
|              Tier 1|[Supermarket, Type1]|      true|
|              Tier 3|    [Grocery, Store]|     false|
|              Tier 3|[Supermarket, Type1]|      true|
|              Tier 3|[Supermarket, Type2]|     false|
|              Tier 3|[Supermarket, Type1]|      true|
|              Tier 3|[Supermarket, Type3]|     false|
|              Tier 2|[Supermarket, Type1]|      true|
|              Tier 2|[Supermarket, Type1]|      true|
+--------------------+--------------------+----------+



In [78]:
# GrouBy
# Scenario 1 - Sum

df = spark.read.format('csv').option('inferSchema',True).option('header',True).load('BigMart_Sales.csv')
df.groupBy('Item_Type').agg(sum('Item_MRP')).show()

+--------------------+------------------+
|           Item_Type|     sum(Item_MRP)|
+--------------------+------------------+
|       Starchy Foods|21880.027399999995|
|        Baking Goods| 81894.73640000001|
|              Breads| 35379.11979999999|
|Fruits and Vegeta...|178124.08099999998|
|                Meat|59449.863799999956|
|         Hard Drinks|29334.676599999995|
|         Soft Drinks|58514.164999999964|
|           Household|135976.52539999998|
|           Breakfast|        15596.6966|
|               Dairy|101276.45959999996|
|         Snack Foods|175433.92040000003|
|              Others|22451.891600000006|
|             Seafood| 9077.870000000003|
|              Canned|  90706.7269999999|
|        Frozen Foods|118558.88140000001|
|  Health and Hygiene|        68025.8388|
+--------------------+------------------+



In [79]:
# Scenario 2 - Avg

df.groupBy('Item_Type').agg(avg('Item_MRP')).show()

+--------------------+------------------+
|           Item_Type|     avg(Item_MRP)|
+--------------------+------------------+
|       Starchy Foods|147.83802297297294|
|        Baking Goods|126.38076604938273|
|              Breads| 140.9526685258964|
|Fruits and Vegeta...|144.58123457792206|
|                Meat|139.88203247058814|
|         Hard Drinks|137.07792803738315|
|         Soft Drinks|131.49250561797746|
|           Household|149.42475318681318|
|           Breakfast|141.78815090909092|
|               Dairy|148.49920762463336|
|         Snack Foods|146.19493366666669|
|              Others|132.85142958579885|
|             Seafood|141.84171875000004|
|              Canned|139.76383204930647|
|        Frozen Foods|138.50336612149533|
|  Health and Hygiene|130.81892076923077|
+--------------------+------------------+



In [80]:
# Scenario 3 - GroupBy Multiple Cols

df.groupBy('Item_Type','Outlet_Size').agg(sum('Item_MRP').alias('Total_MRP')).show()

+--------------------+-----------+------------------+
|           Item_Type|Outlet_Size|         Total_MRP|
+--------------------+-----------+------------------+
|       Starchy Foods|     Medium| 7124.136199999997|
|Fruits and Vegeta...|     Medium|59047.217200000014|
|       Starchy Foods|       NULL|         6040.6402|
|              Breads|       NULL|        10011.5004|
|        Baking Goods|       NULL|23433.838799999994|
|Fruits and Vegeta...|       NULL|49758.730999999985|
|        Frozen Foods|       High|12588.291000000001|
|         Soft Drinks|       High| 6456.165199999999|
|           Breakfast|      Small|3917.0407999999998|
|                Meat|     Medium| 20326.45059999999|
|Fruits and Vegeta...|       High| 20671.34759999999|
|                Meat|       High| 5627.036400000002|
|        Baking Goods|       High| 9431.749199999998|
|           Household|     Medium| 42688.57439999998|
|                Meat|       NULL|16158.166000000005|
|         Hard Drinks|      

In [None]:
# Scenario 4 - GroupBy Multiple Cols and Agg

df.groupBy('Item_Type','Outlet_Size').agg(sum('Item_MRP'),avg('Item_MRP')).show()

+--------------------+-----------+------------------+------------------+
|           Item_Type|Outlet_Size|     sum(Item_MRP)|     avg(Item_MRP)|
+--------------------+-----------+------------------+------------------+
|       Starchy Foods|     Medium| 7124.136199999997| 148.4195041666666|
|Fruits and Vegeta...|     Medium|59047.217200000014| 142.9714702179177|
|       Starchy Foods|       NULL|         6040.6402|140.48000465116277|
|              Breads|       NULL|        10011.5004|139.04861666666667|
|        Baking Goods|       NULL|23433.838799999994|126.66939891891889|
|Fruits and Vegeta...|       NULL|49758.730999999985|142.57516045845267|
|        Frozen Foods|       High|12588.291000000001|         136.82925|
|         Soft Drinks|       High| 6456.165199999999|131.75847346938772|
|           Breakfast|      Small|3917.0407999999998|130.56802666666667|
|                Meat|     Medium| 20326.45059999999|136.41913154362408|
|Fruits and Vegeta...|       High| 20671.3475999999