In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# Start (or reuse) the Spark session shared by every cell below.
spark = SparkSession.builder.appName("SparkSQL_Examples").getOrCreate()

# Sample employee rows, one tuple per employee, in the order given by `columns`.
columns = ["id", "name", "dept", "salary", "year"]
data = [
    (1, "Alice", "HR", 5000, 2023),
    (2, "Bob", "IT", 6000, 2024),
    (3, "Cathy", "IT", 7500, 2024),
    (4, "David", "HR", 4000, 2023),
    (5, "Eva", "Finance", 8000, 2024),
]
df = spark.createDataFrame(data, schema=columns)

# Register the DataFrame as a temp view so the SQL examples can query it
# under the name "employees".
df.createOrReplaceTempView("employees")

# NOTE(review): .display() is not part of the open-source PySpark DataFrame
# API; presumably this notebook runs on a platform (e.g. Databricks) that
# provides it -- confirm before running elsewhere.

# Filter to the IT department: DataFrame API, then the equivalent SQL.
it_employees = df.filter(df["dept"] == "IT")
it_employees.display()
spark.sql("SELECT * FROM employees WHERE dept = 'IT'").show()

# Sort by salary, highest first: DataFrame API, then the equivalent SQL.
employees_by_salary = df.orderBy("salary", ascending=False)
employees_by_salary.display()
spark.sql("SELECT * FROM employees ORDER BY salary DESC").show()



In [0]:
# Projection: keep only the name and dept columns (DataFrame API, then SQL).
name_and_dept = df.select(["name", "dept"])
name_and_dept.show()
spark.sql("SELECT name, dept FROM employees").show()

# Row filter: employees earning more than 5000 (DataFrame API, then SQL).
high_earners = df.where(df["salary"] > 5000)
high_earners.show()
spark.sql("SELECT * FROM employees WHERE salary > 5000").show()


### Group By + Aggregate

In [0]:
# Total salary per department: DataFrame API, then the equivalent SQL.
# The aggregate is referenced as F.sum (pyspark.sql.functions.sum) rather than
# a bare `sum`, so it does not depend on the wildcard import having replaced
# Python's built-in sum() in the notebook namespace.
salary_by_dept = df.groupBy("dept").agg(F.sum("salary").alias("total_salary"))
salary_by_dept.show()
spark.sql("SELECT dept, SUM(salary) AS total_salary FROM employees GROUP BY dept").show()


### Join

In [0]:
# Department lookup rows: (dept_name, location).
dept_data = [("HR", "Hyderabad"), ("IT", "Bangalore"), ("Finance", "Mumbai")]
dept_df = spark.createDataFrame(dept_data, schema=["dept_name", "location"])

# Register the lookup table so the SQL example can refer to it as "departments".
dept_df.createOrReplaceTempView("departments")

# Inner join employees to departments on the department name
# (DataFrame API, then the equivalent SQL).
same_department = df["dept"] == dept_df["dept_name"]
employees_with_location = df.join(dept_df, on=same_department, how="inner")
employees_with_location.select("name", "dept", "location").show()
spark.sql("SELECT e.name, e.dept, d.location FROM employees e JOIN departments d ON e.dept = d.dept_name").show()


### Set Operations

In [0]:
# Split the employees by year; these two DataFrames are the inputs to every
# DataFrame-API set operation below. Each operation is shown twice, like the
# other cells in this notebook: DataFrame API first, then the equivalent SQL.
df_2023 = df.filter(df.year == 2023)
df_2024 = df.filter(df.year == 2024)

# UNION (duplicates removed): DataFrame.union() keeps duplicates, so
# distinct() is needed to match SQL UNION.
df_2023.union(df_2024).distinct().show()
spark.sql("SELECT * FROM employees WHERE year=2023 UNION SELECT * FROM employees WHERE year=2024").display()

# UNION ALL (duplicates kept): this is what DataFrame.union() does on its own.
df_2023.union(df_2024).show()
spark.sql("SELECT * FROM employees WHERE year=2023 UNION ALL SELECT * FROM employees WHERE year=2024").display()

# INTERSECT: rows present in both inputs.
df_2023.intersect(df_2024).show()
spark.sql("SELECT * FROM employees WHERE year=2023 INTERSECT SELECT * FROM employees WHERE year=2024").display()

# EXCEPT: rows in the first input that are not in the second, de-duplicated.
# subtract() is the DataFrame counterpart of SQL EXCEPT (DISTINCT);
# exceptAll() would instead correspond to EXCEPT ALL and preserve duplicates.
df_2023.subtract(df_2024).show()
spark.sql("SELECT * FROM employees WHERE year=2023 EXCEPT SELECT * FROM employees WHERE year=2024").display()
