# Learning Spark SQL

In [1]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one if it already exists.
spark = SparkSession.builder.appName("SparkSQLExample").getOrCreate()

# Tiny in-memory dataset: one (Name, Age) tuple per person.
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]

# Build the DataFrame and expose it to SQL under the temp view name "people".
df = spark.createDataFrame(data, schema=columns)
df.createOrReplaceTempView("people")

# Keep only the people older than 28 and print the matching rows.
result = spark.sql("SELECT * FROM people WHERE Age > 28")
result.show()

+-------+---+
|   Name|Age|
+-------+---+
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [2]:
from pyspark.sql import SparkSession

# getOrCreate() returns the session that is already running in this kernel if
# there is one; a new session is only created when none exists yet.
spark = SparkSession.builder.appName("AdvancedSparkSQL").getOrCreate()

# One dict per employee; the dict keys become the DataFrame column names.
data = [
    {"name": "Alice", "age": 25, "department": "HR", "salary": 50000},
    {"name": "Bob", "age": 30, "department": "IT", "salary": 70000},
    {"name": "Charlie", "age": 35, "department": "Finance", "salary": 80000},
    {"name": "David", "age": 40, "department": "IT", "salary": 90000},
    {"name": "Eve", "age": 45, "department": "Finance", "salary": 100000},
]

# Register the rows as the temp view "employees" so they can be queried in SQL.
df = spark.createDataFrame(data)
df.createOrReplaceTempView("employees")

# Per-department head count and average salary for employees older than 30.
# With this data Finance and IT both average exactly 90000.0, so sorting by
# avg_salary alone leaves their relative order undefined. The secondary
# `department ASC` key breaks the tie and makes the displayed order repeatable.
query = """
 SELECT department, COUNT(*) AS employee_count, AVG(salary) AS avg_salary
 FROM employees
 WHERE age > 30
 GROUP BY department
 ORDER BY avg_salary DESC, department ASC
"""

result = spark.sql(query)
result.show()


+----------+--------------+----------+
|department|employee_count|avg_salary|
+----------+--------------+----------+
|   Finance|             2|   90000.0|
|        IT|             1|   90000.0|
+----------+--------------+----------+



In [7]:
from pyspark.sql import SparkSession

# Start a Spark session for the join example, or reuse one if it already exists.
spark = SparkSession.builder.appName("SparkSQL_Join_Union").getOrCreate()

# --- Source rows ------------------------------------------------------------
# Employee records: (emp_id, name, department).
employees_columns = ["emp_id", "name", "department"]
employees_data = [
    (1, "Alice", "HR"),
    (2, "Bob", "IT"),
    (3, "Charlie", "Finance"),
    (4, "David", "IT"),
    (5, "Eve", "Finance"),
]

# Salary records keyed by the same emp_id: (emp_id, salary).
salaries_columns = ["emp_id", "salary"]
salaries_data = [
    (1, 50000),
    (2, 70000),
    (3, 80000),
    (4, 90000),
    (5, 100000),
]

# --- DataFrames and temp views ----------------------------------------------
employees_df = spark.createDataFrame(employees_data, schema=employees_columns)
salaries_df = spark.createDataFrame(salaries_data, schema=salaries_columns)

# NOTE: this replaces any view already registered as "employees" in the
# session (an earlier cell registers one with different columns).
employees_df.createOrReplaceTempView("employees")
salaries_df.createOrReplaceTempView("salaries")

# --- Join --------------------------------------------------------------------
# Inner join on emp_id: attach each employee's salary to their record.
join_query = """
 SELECT e.emp_id, e.name, e.department, s.salary
 FROM employees e
 JOIN salaries s
 ON e.emp_id = s.emp_id
"""
joined_df = spark.sql(join_query)

print("Joined Table:")
joined_df.show()

Joined Table:
+------+-------+----------+------+
|emp_id|   name|department|salary|
+------+-------+----------+------+
|     1|  Alice|        HR| 50000|
|     2|    Bob|        IT| 70000|
|     3|Charlie|   Finance| 80000|
|     4|  David|        IT| 90000|
|     5|    Eve|   Finance|100000|
+------+-------+----------+------+



In [14]:
# Requires `spark` plus the "employees" and "salaries" temp views created by
# the join cell; none of them are defined in this cell.

# Extra employees whose rows already include a salary column.
new_employees_columns = ["emp_id", "name", "department", "salary"]
new_employees_data = [
    (6, "Frank", "Marketing", 110000),
    (7, "Grace", "HR", 60000),
]
new_employees_df = spark.createDataFrame(
    new_employees_data,
    schema=new_employees_columns,
)

# The temp view shares its name with the Python variable ("new_employees_df");
# the SQL below refers to the view, not to the variable.
new_employees_df.createOrReplaceTempView("new_employees_df")

# Combine the joined employee/salary rows with the new employees.
# UNION (as opposed to UNION ALL) drops duplicate rows, and the query has no
# ORDER BY, so the row order of the result is not guaranteed -- the recorded
# output is not sorted by emp_id.
union_query = """
 SELECT emp_id, name, department, salary FROM (
 SELECT e.emp_id, e.name, e.department, s.salary
 FROM employees e
 JOIN salaries s ON e.emp_id = s.emp_id
 )
 UNION
 SELECT emp_id, name, department, salary FROM new_employees_df
"""
union_df = spark.sql(union_query)

print("Union Table:")
union_df.show()

Union Table:
+------+-------+----------+------+
|emp_id|   name|department|salary|
+------+-------+----------+------+
|     5|    Eve|   Finance|100000|
|     3|Charlie|   Finance| 80000|
|     2|    Bob|        IT| 70000|
|     4|  David|        IT| 90000|
|     1|  Alice|        HR| 50000|
|     6|  Frank| Marketing|110000|
|     7|  Grace|        HR| 60000|
+------+-------+----------+------+

