In [None]:
The `array_except()` function in PySpark returns the elements of the first array that are NOT present in the second array, with duplicates removed. It works like a set-difference operation.

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array_except

# Start (or reuse) a local Spark session limited to a single worker thread
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Each record is (department, members of team A, members of team B)
data = [
    ("Sales", ["John", "Alice", "Bob", "Mike"], ["Alice", "Mike", "David"]),
    ("HR", ["Emma", "Sophia", "Olivia"], ["Sophia", "Liam", "Olivia"]),
    ("IT", ["Ryan", "Noah", "Ava"], ["Mason", "Ava", "Noah"]),
]

# Build the DataFrame and, in the same chain, append a column holding the
# members of team_A that do not appear in team_B
df = spark.createDataFrame(
    data,
    schema=["department", "team_A", "team_B"],
).withColumn(
    "only_in_team_A",
    array_except("team_A", "team_B"),
)

# Print every row without truncating the array columns
df.show(truncate=False)


+----------+------------------------+----------------------+--------------+
|department|team_A                  |team_B                |only_in_team_A|
+----------+------------------------+----------------------+--------------+
|Sales     |[John, Alice, Bob, Mike]|[Alice, Mike, David]  |[John, Bob]   |
|HR        |[Emma, Sophia, Olivia]  |[Sophia, Liam, Olivia]|[Emma]        |
|IT        |[Ryan, Noah, Ava]       |[Mason, Ava, Noah]    |[Ryan]        |
+----------+------------------------+----------------------+--------------+

