# Example 3: Filtering and Conditional Logic

Learn different ways to filter data and apply conditional logic.

This example demonstrates:
- Simple and complex filters (equality, `isin`, and range checks)
- Conditional columns with `when`/`otherwise`
- Multiple conditions combined with `&` (AND) and `|` (OR)
- Filter chaining

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session used by every cell below; getOrCreate() is called
# on a builder configured with the application name "FilteringData".
spark = (
    SparkSession.builder
    .appName("FilteringData")
    .getOrCreate()
)

In [None]:
# Sample employee records. Each tuple is laid out as
# (id, name, department, salary, age), matching the column names
# handed to createDataFrame below.
employees = [
    (1, "John", "IT", 75000, 28),
    (2, "Sarah", "HR", 65000, 35),
    (3, "Mike", "IT", 85000, 42),
    (4, "Lisa", "Sales", 70000, 31),
    (5, "Tom", "IT", 95000, 45),
    (6, "Emma", "HR", 60000, 28),
    (7, "David", "Sales", 80000, 38),
    (8, "Anna", "IT", 72000, 29),
]

# Build the DataFrame that all later cells filter and annotate.
df = spark.createDataFrame(
    employees,
    schema=["id", "name", "department", "salary", "age"],
)
print("Employee Data:")
df.show()

In [None]:
# Simple filter: a single equality test on one column.
# The predicate is named first so the filter call reads as plain English.
is_it = F.col("department") == "IT"
it_employees = df.filter(is_it)
print("\nIT Employees:")
it_employees.show()

In [None]:
# AND filter: a row is kept only when BOTH predicates hold.
# Column predicates are combined with `&`, not the Python keyword `and`.
in_it = F.col("department") == "IT"
earns_over_80k = F.col("salary") > 80000
high_earners_it = df.filter(in_it & earns_over_80k)
print("\nIT Employees earning >$80,000:")
high_earners_it.show()

In [None]:
# OR filter: a row is kept when EITHER department test matches.
# Column predicates are combined with `|`, not the Python keyword `or`.
department = F.col("department")
it_or_hr = df.filter((department == "IT") | (department == "HR"))
print("\nIT or HR Employees:")
it_or_hr.show()

In [None]:
# Membership filter: isin() matches any of the listed values, which is
# shorter than OR-ing one equality test per value. The values are passed
# here as separate arguments rather than wrapped in a list.
tech_sales = df.filter(F.col("department").isin("IT", "Sales"))
print("\nIT or Sales Employees (using isin):")
tech_sales.show()

In [None]:
# Range filter: between() is inclusive on both ends, so this keeps
# ages 30 through 40 (i.e. age >= 30 AND age <= 40).
mid_age = df.filter(F.col("age").between(30, 40))
print("\nEmployees aged 30-40:")
mid_age.show()

In [None]:
# Conditional column: bucket each salary into Low / Medium / High.
# The when() branches are tested in order and the first match wins, so the
# "Medium" branch is only reached once salary < 65000 has already failed.
salary = F.col("salary")
salary_category = (
    F.when(salary < 65000, "Low")
    .when(salary < 80000, "Medium")
    .when(salary >= 80000, "High")
    # No branch matched (e.g. the salary is null).
    .otherwise("Unknown")
)
df_categorized = df.withColumn("salary_category", salary_category)

print("\nEmployees with Salary Category:")
df_categorized.show()

In [None]:
# Age groups: label each employee as Young (< 30), Mid-Career (30-39)
# or Senior (40+). Branches are tested in order; the first match wins.
#
# "Senior" is matched explicitly instead of being the catch-all: a null age
# fails every comparison, so with a bare .otherwise("Senior") a missing age
# would be silently labelled "Senior". Such rows are reported as "Unknown",
# mirroring the fallback used for the salary categories.
df_age_groups = df.withColumn(
    "age_group",
    F.when(F.col("age") < 30, "Young")
     .when(F.col("age") < 40, "Mid-Career")
     .when(F.col("age") >= 40, "Senior")
     .otherwise("Unknown")
)

print("\nEmployees with Age Groups:")
df_age_groups.show()

In [None]:
# Complex condition: an employee counts as a high performer when EITHER
#   - they earn more than 75000 while being younger than 35, OR
#   - they earn more than 85000 and work in IT.
# Each rule is named separately so the final filter stays readable.
young_high_earner = (F.col("salary") > 75000) & (F.col("age") < 35)
top_paid_it = (F.col("salary") > 85000) & (F.col("department") == "IT")
high_performers = df.filter(young_high_earner | top_paid_it)

print("\nHigh Performers:")
high_performers.show()

In [None]:
# Stop the Spark session created at the top of this example.
spark.stop()