In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Start a Spark session for this lab (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("SparkSQL_Lab")
    .getOrCreate()
)

# Sample employee records, one (name, dept, salary) tuple per row.
columns = ["name", "dept", "salary"]
data = [
    ("Alice", "Sales", 5000),
    ("Bob", "Marketing", 4000),
    ("Charlie", "Sales", 6000),
    ("David", "IT", 8000),
    ("Eve", "Marketing", 4500),
]

# Only column names are supplied here; no explicit column types are given.
df = spark.createDataFrame(data, schema=columns)
df.show()


+-------+---------+------+
|   name|     dept|salary|
+-------+---------+------+
|  Alice|    Sales|  5000|
|    Bob|Marketing|  4000|
|Charlie|    Sales|  6000|
|  David|       IT|  8000|
|    Eve|Marketing|  4500|
+-------+---------+------+



In [2]:
# Expose the DataFrame as a temporary (virtual) SQL table named "employees".
df.createOrReplaceTempView("employees")

# Average salary per department, keeping only departments whose average exceeds 4500.
dept_avg_query = """
    SELECT dept, AVG(salary) as avg_sal
    FROM employees
    GROUP BY dept
    HAVING avg_sal > 4500
"""

# SQL can now be run against the "employees" view.
sql_df = spark.sql(dept_avg_query)
sql_df.show()

# Proof that the SQL result is an ordinary DataFrame: it has a schema like any other.
sql_df.printSchema()

+-----+-------+
| dept|avg_sal|
+-----+-------+
|Sales| 5500.0|
|   IT| 8000.0|
+-----+-------+

root
 |-- dept: string (nullable = true)
 |-- avg_sal: double (nullable = true)



### Смешивание стилей

In [3]:
# Fraction of the salary that is paid out as a bonus.
BONUS_RATE = 0.1

# 1. Select the rows with SQL
top_earners = spark.sql("SELECT * FROM employees WHERE salary > 5000")

# 2. Continue processing the SQL result with the DataFrame API.
#    F is pyspark.sql.functions, imported at the top of the notebook.
final_res = top_earners.withColumn("bonus", F.col("salary") * BONUS_RATE)
final_res.show()

+-------+-----+------+-----+
|   name| dept|salary|bonus|
+-------+-----+------+-----+
|Charlie|Sales|  6000|600.0|
|  David|   IT|  8000|800.0|
+-------+-----+------+-----+



### Анализ планов (Доказательство равенства)

In [4]:
# The same query expressed twice: with the DataFrame API and with SQL.
df_api = df.filter("salary > 5000").select("name")
df_sql = spark.sql("SELECT name FROM employees WHERE salary > 5000")

# Print both physical plans (SQL first, then DataFrame API) and compare them:
# they should be identical, or nearly identical.
df_sql.explain()
df_api.explain()

== Physical Plan ==
*(1) Project [name#0]
+- *(1) Filter (isnotnull(salary#2L) AND (salary#2L > 5000))
   +- *(1) Scan ExistingRDD[name#0,dept#1,salary#2L]


== Physical Plan ==
*(1) Project [name#0]
+- *(1) Filter (isnotnull(salary#2L) AND (salary#2L > 5000))
   +- *(1) Scan ExistingRDD[name#0,dept#1,salary#2L]


