# PySpark 3.4 Demo Notebook

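This notebook demonstrates basic PySpark 3.4 functionality running in a Podman container: creating a Spark session, building a small DataFrame, aggregating and filtering it, querying it with Spark SQL, and plotting the result with matplotlib.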

In [None]:
# Import necessary libraries
import pyspark
from pyspark.sql import SparkSession
# `max` is imported under the alias `spark_max` so it does not shadow
# Python's built-in max() in the notebook namespace.
from pyspark.sql.functions import col, count, avg, max as spark_max

# Report the installed PySpark package version.
print(f"PySpark version: {pyspark.__version__}")

In [None]:
# Create (or reuse) the Spark session used by every cell below.
# getOrCreate() hands back the already-running session when this cell is
# re-run, so executing it twice does not start a second Spark application.
spark = (
    SparkSession.builder
    .appName("PySpark Notebook Demo")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

print(f"Spark version: {spark.version}")

# Spark binds its web UI to the first free port starting at 4040, so when
# another Spark application is already running the UI lands on 4041, 4042, ...
# Read the real port from the context instead of assuming 4040.
ui_url = spark.sparkContext.uiWebUrl
if ui_url:
    # uiWebUrl reports the driver's own hostname; "localhost" is kept as the
    # host, as in the original message.
    # NOTE(review): presumably the UI port is published from the container to
    # the host - confirm against the Podman run command.
    ui_port = ui_url.rsplit(":", 1)[-1]
    print(f"Spark UI: http://localhost:{ui_port}")
else:
    # No URL is reported when the UI is turned off (spark.ui.enabled=false).
    print("Spark UI: disabled")

In [None]:
# Build a small in-memory employee table that drives the rest of the demo.
columns = ["name", "age", "department", "salary"]

# One tuple per employee, in the same order as `columns`.
data = [
    ("Alice", 25, "Engineering", 75000),
    ("Bob", 30, "Sales", 65000),
    ("Charlie", 35, "Engineering", 85000),
    ("Diana", 28, "Marketing", 60000),
    ("Eve", 32, "Engineering", 90000),
    ("Frank", 29, "Sales", 70000),
]

# Only column names are supplied; Spark infers each column's type from the
# Python values in the tuples.
df = spark.createDataFrame(data, schema=columns)

df.show()

In [None]:
# Per-department headcount, mean salary and highest salary.
print("Department Statistics:")
department_stats = df.groupBy("department").agg(
    count("*").alias("count"),
    avg("salary").alias("avg_salary"),
    spark_max("salary").alias("max_salary"),
)
department_stats.show()

In [None]:
# Keep only the employees whose salary is strictly above 70,000.
print("High earners (>70k):")
high_earners = df.where(col("salary") > 70000)  # where() is an alias of filter()
high_earners.show()

In [None]:
# Run the same kind of per-department summary through Spark SQL.
# The DataFrame has to be registered as a temp view before SQL can name it.
df.createOrReplaceTempView("employees")

# Headcount and average salary (rounded to 2 decimals) per department,
# highest-paying department first.
department_summary_sql = """
    SELECT department, 
           COUNT(*) as employee_count,
           ROUND(AVG(salary), 2) as avg_salary
    FROM employees 
    GROUP BY department 
    ORDER BY avg_salary DESC
"""
result = spark.sql(department_summary_sql)

print("SQL Query Results:")
result.show()

In [None]:
# Visualization with matplotlib
import matplotlib.pyplot as plt
import pandas as pd

# Pull the Spark DataFrame into pandas for plotting. toPandas() collects
# every row to the driver, which is fine for this six-row demo table.
pandas_df = df.toPandas()

# Mean salary per department, drawn as a bar chart on an explicit Axes.
avg_salary_by_dept = pandas_df.groupby("department")["salary"].mean()

fig, ax = plt.subplots(figsize=(10, 6))
avg_salary_by_dept.plot(kind="bar", ax=ax)
ax.set_title("Average Salary by Department")
ax.set_ylabel("Salary")
ax.tick_params(axis="x", labelrotation=45)  # tilt department names
fig.tight_layout()
plt.show()

In [None]:
# Stop Spark session (run when done)
# The call below is deliberately left commented out, so running this cell
# only prints the reminder and leaves the session running. Uncomment it
# to shut the session down.
# spark.stop()
print("Spark session is still active. Run spark.stop() when done.")